[HUDI-1479] Use HoodieEngineContext to parallelize fetching of partition paths (#2417)
* [HUDI-1479] Use HoodieEngineContext to parallelize fetching of partition paths
* Adding testClass for FileSystemBackedTableMetadata

Co-authored-by: Nishith Agarwal <nagarwal@uber.com>
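The change threads a HoodieEngineContext through partition-path discovery so that FSUtils.getAllPartitionPaths, and the FileSystemBackedTableMetadata behind it, fan the directory listing out on the engine instead of looping serially on the driver. A minimal caller-side sketch of the new signature follows; the class name and flag values are illustrative only, and a caller without a Spark context can fall back to HoodieLocalEngineContext, just as HoodieTable#metadata() does in the diff below.

```java
// Hedged sketch (not part of this commit): how a caller picks up the new
// engine-context-first signature of FSUtils.getAllPartitionPaths. The class name
// and flag values are illustrative; HoodieLocalEngineContext is the fallback used
// when no Spark context is available.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;
import org.apache.hudi.common.fs.FSUtils;

import java.io.IOException;
import java.util.List;

class PartitionListingExample {
  List<String> listPartitions(Configuration conf, FileSystem fs, String basePath) throws IOException {
    HoodieEngineContext engineContext = new HoodieLocalEngineContext(conf);
    // Engine context first, then the file system, base path, and the existing listing flags.
    return FSUtils.getAllPartitionPaths(engineContext, fs, basePath,
        false /* useFileListingFromMetadata */, false /* verifyListings */, false /* assumeDatePartitioning */);
  }
}
```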
@@ -161,7 +161,9 @@ public class MetadataCommand implements CommandMarker {
   @CliCommand(value = "metadata list-partitions", help = "Print a list of all partitions from the metadata")
   public String listPartitions() throws IOException {
     HoodieCLI.getTableMetaClient();
-    HoodieBackedTableMetadata metadata = new HoodieBackedTableMetadata(HoodieCLI.conf, HoodieCLI.basePath, "/tmp", true, false, false);
+    initJavaSparkContext();
+    HoodieBackedTableMetadata metadata = new HoodieBackedTableMetadata(new HoodieSparkEngineContext(jsc),
+        HoodieCLI.basePath, "/tmp", true, false, false, false);
 
     StringBuffer out = new StringBuffer("\n");
     if (!metadata.enabled()) {
@@ -119,6 +119,10 @@ public abstract class AbstractHoodieClient implements Serializable, AutoCloseabl
     return config;
   }
 
+  public HoodieEngineContext getEngineContext() {
+    return context;
+  }
+
   protected void initWrapperFSMetrics() {
     // no-op.
   }
@@ -221,8 +221,9 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
   protected abstract void initialize(HoodieEngineContext engineContext, HoodieTableMetaClient datasetMetaClient);
 
   private void initTableMetadata() {
-    this.metadata = new HoodieBackedTableMetadata(hadoopConf.get(), datasetWriteConfig.getBasePath(), datasetWriteConfig.getSpillableMapBasePath(),
-        datasetWriteConfig.useFileListingMetadata(), datasetWriteConfig.getFileListingMetadataVerify(), false,
+    this.metadata = new HoodieBackedTableMetadata(engineContext, datasetWriteConfig.getBasePath(),
+        datasetWriteConfig.getSpillableMapBasePath(), datasetWriteConfig.useFileListingMetadata(),
+        datasetWriteConfig.getFileListingMetadataVerify(), false,
         datasetWriteConfig.shouldAssumeDatePartitioning());
     this.metaClient = metadata.getMetaClient();
   }
@@ -31,6 +31,7 @@ import org.apache.hudi.avro.model.HoodieRollbackMetadata;
 import org.apache.hudi.avro.model.HoodieSavepointMetadata;
 import org.apache.hudi.common.config.SerializableConfiguration;
 import org.apache.hudi.common.engine.HoodieEngineContext;
+import org.apache.hudi.common.engine.HoodieLocalEngineContext;
 import org.apache.hudi.common.engine.TaskContextSupplier;
 import org.apache.hudi.common.fs.ConsistencyGuard;
 import org.apache.hudi.common.fs.ConsistencyGuard.FileVisibility;
@@ -93,6 +94,7 @@ public abstract class HoodieTable<T extends HoodieRecordPayload, I, K, O> implem
 
   protected final HoodieWriteConfig config;
   protected final HoodieTableMetaClient metaClient;
+  protected final transient HoodieEngineContext context;
   protected final HoodieIndex<T, I, K, O> index;
 
   private SerializableConfiguration hadoopConfiguration;
@@ -108,6 +110,7 @@ public abstract class HoodieTable<T extends HoodieRecordPayload, I, K, O> implem
         config.getViewStorageConfig());
     this.metaClient = metaClient;
     this.index = getIndex(config, context);
+    this.context = context;
     this.taskContextSupplier = context.getTaskContextSupplier();
   }
 
@@ -660,8 +663,16 @@ public abstract class HoodieTable<T extends HoodieRecordPayload, I, K, O> implem
 
   public HoodieTableMetadata metadata() {
     if (metadata == null) {
-      metadata = HoodieTableMetadata.create(hadoopConfiguration.get(), config.getBasePath(), config.getSpillableMapBasePath(),
-          config.useFileListingMetadata(), config.getFileListingMetadataVerify(), config.isMetricsOn(), config.shouldAssumeDatePartitioning());
+      HoodieEngineContext engineContext = context;
+      if (engineContext == null) {
+        // This is to handle scenarios where this is called at the executor tasks which do not have access
+        // to engine context, and it ends up being null (as its not serializable and marked transient here).
+        engineContext = new HoodieLocalEngineContext(hadoopConfiguration.get());
+      }
+
+      metadata = HoodieTableMetadata.create(engineContext, config.getBasePath(), config.getSpillableMapBasePath(),
+          config.useFileListingMetadata(), config.getFileListingMetadataVerify(), config.isMetricsOn(),
+          config.shouldAssumeDatePartitioning());
     }
     return metadata;
   }
@@ -67,7 +67,7 @@ public abstract class PartitionAwareClusteringPlanStrategy<T extends HoodieRecor
     HoodieTableMetaClient metaClient = getHoodieTable().getMetaClient();
     LOG.info("Scheduling clustering for " + metaClient.getBasePath());
     HoodieWriteConfig config = getWriteConfig();
-    List<String> partitionPaths = FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(),
+    List<String> partitionPaths = FSUtils.getAllPartitionPaths(getEngineContext(), metaClient.getFs(), metaClient.getBasePath(),
         config.useFileListingMetadata(), config.getFileListingMetadataVerify(),
         config.shouldAssumeDatePartitioning());
 
@@ -91,9 +91,10 @@ public class RollbackUtils {
    * @param config instance of {@link HoodieWriteConfig} to use.
    * @return {@link List} of {@link ListingBasedRollbackRequest}s thus collected.
    */
-  public static List<ListingBasedRollbackRequest> generateRollbackRequestsByListingCOW(FileSystem fs, String basePath, HoodieWriteConfig config) {
+  public static List<ListingBasedRollbackRequest> generateRollbackRequestsByListingCOW(HoodieEngineContext engineContext,
+      FileSystem fs, String basePath, HoodieWriteConfig config) {
     try {
-      return FSUtils.getAllPartitionPaths(fs, basePath, config.useFileListingMetadata(),
+      return FSUtils.getAllPartitionPaths(engineContext, fs, basePath, config.useFileListingMetadata(),
           config.getFileListingMetadataVerify(), config.shouldAssumeDatePartitioning()).stream()
           .map(ListingBasedRollbackRequest::createRollbackRequestWithDeleteDataAndLogFilesAction)
           .collect(Collectors.toList());
@@ -113,7 +114,7 @@ public class RollbackUtils {
   public static List<ListingBasedRollbackRequest> generateRollbackRequestsUsingFileListingMOR(HoodieInstant instantToRollback, HoodieTable table, HoodieEngineContext context) throws IOException {
     String commit = instantToRollback.getTimestamp();
     HoodieWriteConfig config = table.getConfig();
-    List<String> partitions = FSUtils.getAllPartitionPaths(table.getMetaClient().getFs(), table.getMetaClient().getBasePath(),
+    List<String> partitions = FSUtils.getAllPartitionPaths(context, table.getMetaClient().getFs(), table.getMetaClient().getBasePath(),
         config.useFileListingMetadata(), config.getFileListingMetadataVerify(), config.shouldAssumeDatePartitioning());
     int sparkPartitions = Math.max(Math.min(partitions.size(), config.getRollbackParallelism()), 1);
     context.setJobStatus(RollbackUtils.class.getSimpleName(), "Generate all rollback requests");
@@ -90,7 +90,7 @@ public class SavepointActionExecutor<T extends HoodieRecordPayload, I, K, O> ext
           "Could not savepoint commit " + instantTime + " as this is beyond the lookup window " + lastCommitRetained);
 
       context.setJobStatus(this.getClass().getSimpleName(), "Collecting latest files for savepoint " + instantTime);
-      List<String> partitions = FSUtils.getAllPartitionPaths(table.getMetaClient().getFs(),
+      List<String> partitions = FSUtils.getAllPartitionPaths(context, table.getMetaClient().getFs(),
           table.getMetaClient().getBasePath(), config.useFileListingMetadata(), config.getFileListingMetadataVerify(),
           config.shouldAssumeDatePartitioning()
       );
@@ -65,7 +65,7 @@ public class FlinkCopyOnWriteRollbackActionExecutor<T extends HoodieRecordPayloa
   @Override
   protected List<HoodieRollbackStat> executeRollbackUsingFileListing(HoodieInstant instantToRollback) {
     List<ListingBasedRollbackRequest> rollbackRequests = RollbackUtils.generateRollbackRequestsByListingCOW(
-        table.getMetaClient().getFs(), table.getMetaClient().getBasePath(), config);
+        context, table.getMetaClient().getFs(), table.getMetaClient().getBasePath(), config);
     return new ListingBasedRollbackHelper(table.getMetaClient(), config).performRollback(context, instantToRollback, rollbackRequests);
   }
 }
@@ -93,8 +93,8 @@ public class ZeroToOneUpgradeHandler implements UpgradeHandler {
       // generate rollback stats
       List<ListingBasedRollbackRequest> rollbackRequests;
       if (table.getMetaClient().getTableType() == HoodieTableType.COPY_ON_WRITE) {
-        rollbackRequests = RollbackUtils.generateRollbackRequestsByListingCOW(table.getMetaClient().getFs(), table.getMetaClient().getBasePath(),
-            table.getConfig());
+        rollbackRequests = RollbackUtils.generateRollbackRequestsByListingCOW(context, table.getMetaClient().getFs(),
+            table.getMetaClient().getBasePath(), table.getConfig());
       } else {
         rollbackRequests = RollbackUtils.generateRollbackRequestsUsingFileListingMOR(commitInstantOpt.get(), table, context);
       }
@@ -63,7 +63,7 @@ public class SparkHoodieGlobalBloomIndex<T extends HoodieRecordPayload> extends
                                                 final HoodieTable hoodieTable) {
     HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
     try {
-      List<String> allPartitionPaths = FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(),
+      List<String> allPartitionPaths = FSUtils.getAllPartitionPaths(context, metaClient.getFs(), metaClient.getBasePath(),
           config.useFileListingMetadata(), config.getFileListingMetadataVerify(), config.shouldAssumeDatePartitioning());
       return super.loadInvolvedFiles(allPartitionPaths, context, hoodieTable);
     } catch (IOException e) {
@@ -104,7 +104,7 @@ public class SparkHoodieGlobalSimpleIndex<T extends HoodieRecordPayload> extends
       final HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> hoodieTable) {
     HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
     try {
-      List<String> allPartitionPaths = FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(),
+      List<String> allPartitionPaths = FSUtils.getAllPartitionPaths(context, metaClient.getFs(), metaClient.getBasePath(),
           config.useFileListingMetadata(), config.getFileListingMetadataVerify(), config.shouldAssumeDatePartitioning());
       // Obtain the latest data files from all the partitions.
       return getLatestBaseFilesForAllPartitions(allPartitionPaths, context, hoodieTable);
@@ -50,7 +50,7 @@ public class SparkInsertOverwriteTableCommitActionExecutor<T extends HoodieRecor
   protected Map<String, List<String>> getPartitionToReplacedFileIds(JavaRDD<WriteStatus> writeStatuses) {
     Map<String, List<String>> partitionToExistingFileIds = new HashMap<>();
     try {
-      List<String> partitionPaths = FSUtils.getAllPartitionPaths(table.getMetaClient().getFs(),
+      List<String> partitionPaths = FSUtils.getAllPartitionPaths(context, table.getMetaClient().getFs(),
           table.getMetaClient().getBasePath(), config.useFileListingMetadata(), config.getFileListingMetadataVerify(),
           false);
       JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context);
@@ -195,7 +195,7 @@ public class HoodieSparkMergeOnReadTableCompactor<T extends HoodieRecordPayload>
     // TODO - rollback any compactions in flight
     HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
     LOG.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommitTime);
-    List<String> partitionPaths = FSUtils.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(),
+    List<String> partitionPaths = FSUtils.getAllPartitionPaths(context, metaClient.getFs(), metaClient.getBasePath(),
         config.useFileListingMetadata(), config.getFileListingMetadataVerify(), config.shouldAssumeDatePartitioning());
 
     // filter the partition paths if needed to reduce list status
@@ -66,8 +66,8 @@ public class SparkCopyOnWriteRollbackActionExecutor<T extends HoodieRecordPayloa
 
   @Override
   protected List<HoodieRollbackStat> executeRollbackUsingFileListing(HoodieInstant instantToRollback) {
-    List<ListingBasedRollbackRequest> rollbackRequests = RollbackUtils.generateRollbackRequestsByListingCOW(table.getMetaClient().getFs(),
-        table.getMetaClient().getBasePath(), config);
+    List<ListingBasedRollbackRequest> rollbackRequests = RollbackUtils.generateRollbackRequestsByListingCOW(context,
+        table.getMetaClient().getFs(), table.getMetaClient().getBasePath(), config);
     return new ListingBasedRollbackHelper(table.getMetaClient(), config).performRollback(context, instantToRollback, rollbackRequests);
   }
 }
@@ -92,8 +92,8 @@ public class ZeroToOneUpgradeHandler implements UpgradeHandler {
      // generate rollback stats
      List<ListingBasedRollbackRequest> rollbackRequests;
      if (table.getMetaClient().getTableType() == HoodieTableType.COPY_ON_WRITE) {
-       rollbackRequests = RollbackUtils.generateRollbackRequestsByListingCOW(table.getMetaClient().getFs(), table.getMetaClient().getBasePath(),
-           table.getConfig());
+       rollbackRequests = RollbackUtils.generateRollbackRequestsByListingCOW(context, table.getMetaClient().getFs(),
+           table.getMetaClient().getBasePath(), table.getConfig());
      } else {
        rollbackRequests = RollbackUtils.generateRollbackRequestsUsingFileListingMOR(commitInstantOpt.get(), table, context);
      }
@@ -101,7 +101,7 @@ public class TestClientRollback extends HoodieClientTestBase {
       assertNoWriteErrors(statuses);
       HoodieWriteConfig config = getConfig();
       List<String> partitionPaths =
-          FSUtils.getAllPartitionPaths(fs, cfg.getBasePath(), config.useFileListingMetadata(),
+          FSUtils.getAllPartitionPaths(context, fs, cfg.getBasePath(), config.useFileListingMetadata(),
               config.getFileListingMetadataVerify(), config.shouldAssumeDatePartitioning());
       metaClient = HoodieTableMetaClient.reload(metaClient);
       HoodieSparkTable table = HoodieSparkTable.create(getConfig(), context, metaClient);
@@ -23,6 +23,7 @@ import org.apache.hudi.client.SparkRDDWriteClient;
 import org.apache.hudi.client.WriteStatus;
 import org.apache.hudi.client.common.HoodieSparkEngineContext;
 import org.apache.hudi.common.config.HoodieMetadataConfig;
+import org.apache.hudi.common.config.SerializableConfiguration;
 import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.metrics.Registry;
 import org.apache.hudi.common.model.FileSlice;
@@ -454,7 +455,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
     records = dataGen.generateUniqueUpdates(newCommitTime, 5);
     writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
     assertNoWriteErrors(writeStatuses);
-    assertFalse(metadata(client).isInSync());
+    assertTrue(metadata(client).isInSync());
 
     // updates and inserts
     newCommitTime = HoodieActiveTimeline.createNewInstantTime();
@@ -462,21 +463,21 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
     records = dataGen.generateUpdates(newCommitTime, 10);
     writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
     assertNoWriteErrors(writeStatuses);
-    assertFalse(metadata(client).isInSync());
+    assertTrue(metadata(client).isInSync());
 
     // Compaction
     if (metaClient.getTableType() == HoodieTableType.MERGE_ON_READ) {
       newCommitTime = HoodieActiveTimeline.createNewInstantTime();
       client.scheduleCompactionAtInstant(newCommitTime, Option.empty());
       client.compact(newCommitTime);
-      assertFalse(metadata(client).isInSync());
+      assertTrue(metadata(client).isInSync());
     }
 
     // Savepoint
     restoreToInstant = newCommitTime;
     if (metaClient.getTableType() == HoodieTableType.COPY_ON_WRITE) {
       client.savepoint("hoodie", "metadata test");
-      assertFalse(metadata(client).isInSync());
+      assertTrue(metadata(client).isInSync());
     }
 
     // Deletes
@@ -485,12 +486,12 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
     JavaRDD<HoodieKey> deleteKeys = jsc.parallelize(records, 1).map(r -> r.getKey());
     client.startCommitWithTime(newCommitTime);
     client.delete(deleteKeys, newCommitTime);
-    assertFalse(metadata(client).isInSync());
+    assertTrue(metadata(client).isInSync());
 
     // Clean
     newCommitTime = HoodieActiveTimeline.createNewInstantTime();
     client.clean(newCommitTime);
-    assertFalse(metadata(client).isInSync());
+    assertTrue(metadata(client).isInSync());
 
     // updates
     newCommitTime = HoodieActiveTimeline.createNewInstantTime();
@@ -498,7 +499,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
     records = dataGen.generateUniqueUpdates(newCommitTime, 10);
     writeStatuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
     assertNoWriteErrors(writeStatuses);
-    assertFalse(metadata(client).isInSync());
+    assertTrue(metadata(client).isInSync());
 
     // insert overwrite to test replacecommit
     newCommitTime = HoodieActiveTimeline.createNewInstantTime();
@@ -507,7 +508,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
       HoodieWriteResult replaceResult = client.insertOverwrite(jsc.parallelize(records, 1), newCommitTime);
       writeStatuses = replaceResult.getWriteStatuses().collect();
       assertNoWriteErrors(writeStatuses);
-      assertFalse(metadata(client).isInSync());
+      assertTrue(metadata(client).isInSync());
     }
 
     // Enable metadata table and ensure it is synced
@@ -757,7 +758,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
   private void validateMetadata(SparkRDDWriteClient client) throws IOException {
     HoodieWriteConfig config = client.getConfig();
 
-    HoodieBackedTableMetadata tableMetadata = metadata(client);
+    HoodieTableMetadata tableMetadata = metadata(client);
     assertNotNull(tableMetadata, "MetadataReader should have been initialized");
     if (!config.useFileListingMetadata()) {
       return;
@@ -767,7 +768,9 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
     HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
 
     // Partitions should match
-    List<String> fsPartitions = FSUtils.getAllFoldersWithPartitionMetaFile(fs, basePath);
+    FileSystemBackedTableMetadata fsBackedTableMetadata = new FileSystemBackedTableMetadata(engineContext,
+        new SerializableConfiguration(hadoopConf), config.getBasePath(), config.shouldAssumeDatePartitioning());
+    List<String> fsPartitions = fsBackedTableMetadata.getAllPartitionPaths();
     List<String> metadataPartitions = tableMetadata.getAllPartitionPaths();
 
     Collections.sort(fsPartitions);
@@ -849,7 +852,7 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
     // Metadata table has a fixed number of partitions
     // Cannot use FSUtils.getAllFoldersWithPartitionMetaFile for this as that function filters all directory
     // in the .hoodie folder.
-    List<String> metadataTablePartitions = FSUtils.getAllPartitionPaths(fs, HoodieTableMetadata.getMetadataTableBasePath(basePath),
+    List<String> metadataTablePartitions = FSUtils.getAllPartitionPaths(engineContext, fs, HoodieTableMetadata.getMetadataTableBasePath(basePath),
         false, false, false);
     assertEquals(MetadataPartitionType.values().length, metadataTablePartitions.size());
 
@@ -873,10 +876,11 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
         .create(hadoopConf, client.getConfig(), new HoodieSparkEngineContext(jsc));
   }
 
-  private HoodieBackedTableMetadata metadata(SparkRDDWriteClient client) {
+  private HoodieTableMetadata metadata(SparkRDDWriteClient client) {
     HoodieWriteConfig clientConfig = client.getConfig();
-    return (HoodieBackedTableMetadata) HoodieTableMetadata.create(hadoopConf, clientConfig.getBasePath(), clientConfig.getSpillableMapBasePath(),
-        clientConfig.useFileListingMetadata(), clientConfig.getFileListingMetadataVerify(), false, clientConfig.shouldAssumeDatePartitioning());
+    return HoodieTableMetadata.create(client.getEngineContext(), clientConfig.getBasePath(),
+        clientConfig.getSpillableMapBasePath(), clientConfig.useFileListingMetadata(),
+        clientConfig.getFileListingMetadataVerify(), false, clientConfig.shouldAssumeDatePartitioning());
   }
 
   // TODO: this can be moved to TestHarness after merge from master
@@ -39,6 +39,7 @@ public final class HoodieMetadataConfig extends DefaultHoodieConfig {
   // Validate contents of Metadata Table on each access against the actual filesystem
   public static final String METADATA_VALIDATE_PROP = METADATA_PREFIX + ".validate";
   public static final boolean DEFAULT_METADATA_VALIDATE = false;
+  public static final boolean DEFAULT_METADATA_ENABLE_FOR_READERS = false;
 
   // Parallelism for inserts
   public static final String METADATA_INSERT_PARALLELISM_PROP = METADATA_PREFIX + ".insert.parallelism";
@@ -62,10 +63,6 @@ public final class HoodieMetadataConfig extends DefaultHoodieConfig {
   public static final String CLEANER_COMMITS_RETAINED_PROP = METADATA_PREFIX + ".cleaner.commits.retained";
   public static final int DEFAULT_CLEANER_COMMITS_RETAINED = 3;
 
-  // We can set the default to true for readers, as it will internally default to listing from filesystem if metadata
-  // table is not found
-  public static final boolean DEFAULT_METADATA_ENABLE_FOR_READERS = true;
-
   private HoodieMetadataConfig(Properties props) {
     super(props);
   }
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.common.engine;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hudi.common.config.SerializableConfiguration;
+import org.apache.hudi.common.function.SerializableConsumer;
+import org.apache.hudi.common.function.SerializableFunction;
+import org.apache.hudi.common.function.SerializablePairFunction;
+import org.apache.hudi.common.util.Option;
+
+import org.apache.hudi.common.util.collection.Pair;
+
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
+
+import static java.util.stream.Collectors.toList;
+import static org.apache.hudi.common.function.FunctionWrapper.throwingFlatMapWrapper;
+import static org.apache.hudi.common.function.FunctionWrapper.throwingForeachWrapper;
+import static org.apache.hudi.common.function.FunctionWrapper.throwingMapToPairWrapper;
+import static org.apache.hudi.common.function.FunctionWrapper.throwingMapWrapper;
+
+/**
+ * A java based engine context, use this implementation on the query engine integrations if needed.
+ */
+public final class HoodieLocalEngineContext extends HoodieEngineContext {
+
+  public HoodieLocalEngineContext(Configuration conf) {
+    this(conf, new LocalTaskContextSupplier());
+  }
+
+  public HoodieLocalEngineContext(Configuration conf, TaskContextSupplier taskContextSupplier) {
+    super(new SerializableConfiguration(conf), taskContextSupplier);
+  }
+
+  @Override
+  public <I, O> List<O> map(List<I> data, SerializableFunction<I, O> func, int parallelism) {
+    return data.stream().parallel().map(throwingMapWrapper(func)).collect(toList());
+  }
+
+  @Override
+  public <I, O> List<O> flatMap(List<I> data, SerializableFunction<I, Stream<O>> func, int parallelism) {
+    return data.stream().parallel().flatMap(throwingFlatMapWrapper(func)).collect(toList());
+  }
+
+  @Override
+  public <I> void foreach(List<I> data, SerializableConsumer<I> consumer, int parallelism) {
+    data.stream().forEach(throwingForeachWrapper(consumer));
+  }
+
+  @Override
+  public <I, K, V> Map<K, V> mapToPair(List<I> data, SerializablePairFunction<I, K, V> func, Integer parallelism) {
+    return data.stream().map(throwingMapToPairWrapper(func)).collect(
+        Collectors.toMap(Pair::getLeft, Pair::getRight, (oldVal, newVal) -> newVal)
+    );
+  }
+
+  @Override
+  public void setProperty(EngineProperty key, String value) {
+    // no operation for now
+  }
+
+  @Override
+  public Option<String> getProperty(EngineProperty key) {
+    return Option.empty();
+  }
+
+  @Override
+  public void setJobStatus(String activeModule, String activityDescription) {
+    // no operation for now
+  }
+}
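HoodieLocalEngineContext, added above, backs the engine-context map/flatMap/foreach/mapToPair operations with parallel Java streams, so CLI tools, readers, and executor-side code that have no Spark context still take the parallel listing path. A small hedged usage sketch follows, with illustrative inputs, assuming SerializableFunction is lambda-compatible as the listing code above already uses it.

```java
// Hedged usage sketch of the new HoodieLocalEngineContext; the input values are
// illustrative. map() runs the function over the list with a parallel Java stream,
// and the parallelism hint is ignored in this local implementation.
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;

import java.util.Arrays;
import java.util.List;

class LocalEngineContextExample {
  public static void main(String[] args) {
    HoodieEngineContext context = new HoodieLocalEngineContext(new Configuration());
    List<Integer> lengths = context.map(Arrays.asList("2020/01/01", "2020/01/02"), String::length, 2);
    System.out.println(lengths); // [10, 10]
  }
}
```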
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.common.engine;
+
+import org.apache.hudi.common.util.Option;
+
+import java.util.function.Supplier;
+
+public final class LocalTaskContextSupplier extends TaskContextSupplier {
+  @Override
+  public Supplier<Integer> getPartitionIdSupplier() {
+    return () -> 0;
+  }
+
+  @Override
+  public Supplier<Integer> getStageIdSupplier() {
+    return () -> 0;
+  }
+
+  @Override
+  public Supplier<Long> getAttemptIdSupplier() {
+    return () -> 0L;
+  }
+
+  @Override
+  public Option<String> getProperty(EngineProperty prop) {
+    return Option.empty();
+  }
+}
@@ -19,6 +19,7 @@
 package org.apache.hudi.common.fs;
 
 import org.apache.hudi.common.config.SerializableConfiguration;
+import org.apache.hudi.common.engine.HoodieEngineContext;
 import org.apache.hudi.common.model.HoodieFileFormat;
 import org.apache.hudi.common.model.HoodieLogFile;
 import org.apache.hudi.common.model.HoodiePartitionMetadata;
@@ -252,13 +253,14 @@ public class FSUtils {
     }
   }
 
-  public static List<String> getAllPartitionPaths(FileSystem fs, String basePathStr, boolean useFileListingFromMetadata, boolean verifyListings,
+  public static List<String> getAllPartitionPaths(HoodieEngineContext engineContext, FileSystem fs, String basePathStr,
+                                                  boolean useFileListingFromMetadata, boolean verifyListings,
                                                   boolean assumeDatePartitioning) throws IOException {
     if (assumeDatePartitioning) {
       return getAllPartitionFoldersThreeLevelsDown(fs, basePathStr);
     } else {
-      HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(fs.getConf(), basePathStr, "/tmp/", useFileListingFromMetadata,
-          verifyListings, false, false);
+      HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(engineContext, basePathStr, "/tmp/",
+          useFileListingFromMetadata, verifyListings, false, false);
       return tableMetadata.getAllPartitionPaths();
     }
   }
@@ -19,6 +19,7 @@
 package org.apache.hudi.common.table.view;
 
 import org.apache.hudi.common.config.SerializableConfiguration;
+import org.apache.hudi.common.engine.HoodieEngineContext;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.common.table.timeline.HoodieTimeline;
 import org.apache.hudi.common.util.Functions.Function2;
@@ -159,12 +160,11 @@ public class FileSystemViewManager {
     return new HoodieTableFileSystemView(metaClient, timeline, viewConf.isIncrementalTimelineSyncEnabled());
   }
 
-  public static HoodieTableFileSystemView createInMemoryFileSystemView(HoodieTableMetaClient metaClient,
-                                                                       boolean useFileListingFromMetadata,
-                                                                       boolean verifyListings) {
+  public static HoodieTableFileSystemView createInMemoryFileSystemView(HoodieEngineContext engineContext,
+      HoodieTableMetaClient metaClient, boolean useFileListingFromMetadata, boolean verifyListings) {
     LOG.info("Creating InMemory based view for basePath " + metaClient.getBasePath());
     if (useFileListingFromMetadata) {
-      return new HoodieMetadataFileSystemView(metaClient,
+      return new HoodieMetadataFileSystemView(engineContext, metaClient,
           metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(),
           true,
           verifyListings);
@@ -23,6 +23,7 @@ import org.apache.avro.Schema;
 import org.apache.hudi.avro.HoodieAvroUtils;
 import org.apache.hudi.avro.model.HoodieMetadataRecord;
 import org.apache.hudi.common.config.SerializableConfiguration;
+import org.apache.hudi.common.engine.HoodieEngineContext;
 import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.metrics.Registry;
 import org.apache.hudi.common.model.HoodiePartitionMetadata;
@@ -33,7 +34,6 @@ import org.apache.hudi.common.table.timeline.HoodieInstant;
 import org.apache.hudi.common.util.HoodieTimer;
 import org.apache.hudi.common.util.Option;
 
-import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.Path;
 
@@ -54,6 +54,7 @@ public abstract class BaseTableMetadata implements HoodieTableMetadata {
   static final long MAX_MEMORY_SIZE_IN_BYTES = 1024 * 1024 * 1024;
   static final int BUFFER_SIZE = 10 * 1024 * 1024;
 
+  protected final transient HoodieEngineContext engineContext;
   protected final SerializableConfiguration hadoopConf;
   protected final String datasetBasePath;
   protected boolean enabled;
@@ -66,10 +67,11 @@ public abstract class BaseTableMetadata implements HoodieTableMetadata {
   protected final String spillableMapDirectory;
   private transient HoodieMetadataMergedInstantRecordScanner timelineRecordScanner;
 
-  protected BaseTableMetadata(Configuration hadoopConf, String datasetBasePath, String spillableMapDirectory,
+  protected BaseTableMetadata(HoodieEngineContext engineContext, String datasetBasePath, String spillableMapDirectory,
                               boolean enabled, boolean validateLookups, boolean enableMetrics,
                               boolean assumeDatePartitioning) {
-    this.hadoopConf = new SerializableConfiguration(hadoopConf);
+    this.engineContext = engineContext;
+    this.hadoopConf = new SerializableConfiguration(engineContext.getHadoopConf());
     this.datasetBasePath = datasetBasePath;
     this.spillableMapDirectory = spillableMapDirectory;
 
@@ -102,7 +104,8 @@ public abstract class BaseTableMetadata implements HoodieTableMetadata {
         LOG.error("Failed to retrieve list of partition from metadata", e);
       }
     }
-    return new FileSystemBackedTableMetadata(hadoopConf, datasetBasePath, assumeDatePartitioning).getAllPartitionPaths();
+    return new FileSystemBackedTableMetadata(engineContext, hadoopConf, datasetBasePath,
+        assumeDatePartitioning).getAllPartitionPaths();
   }
 
   /**
@@ -155,7 +158,8 @@ public abstract class BaseTableMetadata implements HoodieTableMetadata {
     if (validateLookups) {
       // Validate the Metadata Table data by listing the partitions from the file system
      timer.startTimer();
-      FileSystemBackedTableMetadata fileSystemBackedTableMetadata = new FileSystemBackedTableMetadata(hadoopConf, datasetBasePath, assumeDatePartitioning);
+      FileSystemBackedTableMetadata fileSystemBackedTableMetadata = new FileSystemBackedTableMetadata(engineContext,
+          hadoopConf, datasetBasePath, assumeDatePartitioning);
       List<String> actualPartitions = fileSystemBackedTableMetadata.getAllPartitionPaths();
       metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.VALIDATE_PARTITIONS_STR, timer.endTimer()));
 
@@ -290,4 +294,8 @@ public abstract class BaseTableMetadata implements HoodieTableMetadata {
   protected void closeReaders() {
     timelineRecordScanner = null;
   }
+
+  protected HoodieEngineContext getEngineContext() {
+    return engineContext;
+  }
 }
@@ -19,23 +19,36 @@
 package org.apache.hudi.metadata;
 
 import org.apache.hudi.common.config.SerializableConfiguration;
+import org.apache.hudi.common.engine.HoodieEngineContext;
 import org.apache.hudi.common.fs.FSUtils;
+import org.apache.hudi.common.model.HoodiePartitionMetadata;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.common.util.Option;
+import org.apache.hudi.common.util.collection.Pair;
 
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.LinkedList;
 import java.util.List;
+import java.util.stream.Collectors;
 
 public class FileSystemBackedTableMetadata implements HoodieTableMetadata {
 
+  private static final int DEFAULT_LISTING_PARALLELISM = 1500;
+
+  private final transient HoodieEngineContext engineContext;
   private final SerializableConfiguration hadoopConf;
   private final String datasetBasePath;
   private final boolean assumeDatePartitioning;
 
-  public FileSystemBackedTableMetadata(SerializableConfiguration conf, String datasetBasePath, boolean assumeDatePartitioning) {
+  public FileSystemBackedTableMetadata(HoodieEngineContext engineContext, SerializableConfiguration conf, String datasetBasePath,
+                                       boolean assumeDatePartitioning) {
+    this.engineContext = engineContext;
     this.hadoopConf = conf;
     this.datasetBasePath = datasetBasePath;
     this.assumeDatePartitioning = assumeDatePartitioning;
@@ -49,12 +62,47 @@ public class FileSystemBackedTableMetadata implements HoodieTableMetadata {
 
   @Override
   public List<String> getAllPartitionPaths() throws IOException {
-    FileSystem fs = new Path(datasetBasePath).getFileSystem(hadoopConf.get());
     if (assumeDatePartitioning) {
+      FileSystem fs = new Path(datasetBasePath).getFileSystem(hadoopConf.get());
       return FSUtils.getAllPartitionFoldersThreeLevelsDown(fs, datasetBasePath);
-    } else {
-      return FSUtils.getAllFoldersWithPartitionMetaFile(fs, datasetBasePath);
     }
+
+    List<Path> pathsToList = new LinkedList<>();
+    pathsToList.add(new Path(datasetBasePath));
+    List<String> partitionPaths = new ArrayList<>();
+
+    while (!pathsToList.isEmpty()) {
+      // TODO: Get the parallelism from HoodieWriteConfig
+      int listingParallelism = Math.min(DEFAULT_LISTING_PARALLELISM, pathsToList.size());
+
+      // List all directories in parallel
+      List<Pair<Path, FileStatus[]>> dirToFileListing = engineContext.map(pathsToList, path -> {
+        FileSystem fileSystem = path.getFileSystem(hadoopConf.get());
+        return Pair.of(path, fileSystem.listStatus(path));
+      }, listingParallelism);
+      pathsToList.clear();
+
+      // If the listing reveals a directory, add it to queue. If the listing reveals a hoodie partition, add it to
+      // the results.
+      dirToFileListing.forEach(p -> {
+        Option<FileStatus> partitionMetaFile = Option.fromJavaOptional(Arrays.stream(p.getRight()).parallel()
+            .filter(fs -> fs.getPath().getName().equals(HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE))
+            .findFirst());
+
+        if (partitionMetaFile.isPresent()) {
+          // Is a partition.
+          String partitionName = FSUtils.getRelativePartitionPath(new Path(datasetBasePath), p.getLeft());
+          partitionPaths.add(partitionName);
+        } else {
+          // Add sub-dirs to the queue
+          pathsToList.addAll(Arrays.stream(p.getRight())
+              .filter(fs -> fs.isDirectory() && !fs.getPath().getName().equals(HoodieTableMetaClient.METAFOLDER_NAME))
+              .map(fs -> fs.getPath())
+              .collect(Collectors.toList()));
+        }
+      });
+    }
+    return partitionPaths;
   }
 
   @Override
@@ -64,6 +112,6 @@ public class FileSystemBackedTableMetadata implements HoodieTableMetadata {
 
   @Override
   public boolean isInSync() {
-    throw new UnsupportedOperationException();
+    return true;
   }
 }
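The rewritten FileSystemBackedTableMetadata#getAllPartitionPaths above walks the table breadth-first: each pass hands the current frontier of directories to engineContext.map (bounded by DEFAULT_LISTING_PARALLELISM = 1500), keeps any directory containing a partition metafile (HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE) as a partition, and queues the remaining sub-directories while skipping the .hoodie metafolder. A hedged construction sketch, with an illustrative class name and path:

```java
// Hedged construction sketch; the table path is illustrative. The listing walks the
// table level by level, and each level's directories are listed in parallel through
// the engine context passed in here.
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;
import org.apache.hudi.metadata.FileSystemBackedTableMetadata;

import java.io.IOException;
import java.util.List;

class FsBackedListingExample {
  List<String> listPartitions(String basePath) throws IOException {
    Configuration conf = new Configuration();
    FileSystemBackedTableMetadata metadata = new FileSystemBackedTableMetadata(
        new HoodieLocalEngineContext(conf), new SerializableConfiguration(conf), basePath,
        false /* assumeDatePartitioning */);
    return metadata.getAllPartitionPaths();
  }
}
```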
@@ -21,6 +21,8 @@ package org.apache.hudi.metadata;
|
|||||||
import org.apache.hudi.avro.HoodieAvroUtils;
|
import org.apache.hudi.avro.HoodieAvroUtils;
|
||||||
import org.apache.hudi.avro.model.HoodieMetadataRecord;
|
import org.apache.hudi.avro.model.HoodieMetadataRecord;
|
||||||
import org.apache.hudi.common.config.SerializableConfiguration;
|
import org.apache.hudi.common.config.SerializableConfiguration;
|
||||||
|
import org.apache.hudi.common.engine.HoodieEngineContext;
|
||||||
|
import org.apache.hudi.common.engine.HoodieLocalEngineContext;
|
||||||
import org.apache.hudi.common.model.FileSlice;
|
import org.apache.hudi.common.model.FileSlice;
|
||||||
import org.apache.hudi.common.model.HoodieBaseFile;
|
import org.apache.hudi.common.model.HoodieBaseFile;
|
||||||
import org.apache.hudi.common.model.HoodieLogFile;
|
import org.apache.hudi.common.model.HoodieLogFile;
|
||||||
@@ -70,15 +72,15 @@ public class HoodieBackedTableMetadata extends BaseTableMetadata {
  private transient HoodieFileReader<GenericRecord> baseFileReader;
  private transient HoodieMetadataMergedLogRecordScanner logRecordScanner;

-  public HoodieBackedTableMetadata(Configuration conf, String datasetBasePath, String spillableMapDirectory,
-                                   boolean enabled, boolean validateLookups, boolean assumeDatePartitioning) {
-    this(conf, datasetBasePath, spillableMapDirectory, enabled, validateLookups, false, assumeDatePartitioning);
+  public HoodieBackedTableMetadata(Configuration conf, String datasetBasePath, String spillableMapDirectory, boolean enabled,
+                                   boolean validateLookups, boolean assumeDatePartitioning) {
+    this(new HoodieLocalEngineContext(conf), datasetBasePath, spillableMapDirectory, enabled, validateLookups,
+        false, assumeDatePartitioning);
  }

-  public HoodieBackedTableMetadata(Configuration conf, String datasetBasePath, String spillableMapDirectory,
-                                   boolean enabled, boolean validateLookups, boolean enableMetrics,
-                                   boolean assumeDatePartitioning) {
-    super(conf, datasetBasePath, spillableMapDirectory, enabled, validateLookups, enableMetrics, assumeDatePartitioning);
+  public HoodieBackedTableMetadata(HoodieEngineContext engineContext, String datasetBasePath, String spillableMapDirectory,
+                                   boolean enabled, boolean validateLookups, boolean enableMetrics, boolean assumeDatePartitioning) {
+    super(engineContext, datasetBasePath, spillableMapDirectory, enabled, validateLookups, enableMetrics, assumeDatePartitioning);
    this.metadataBasePath = HoodieTableMetadata.getMetadataTableBasePath(datasetBasePath);
    if (enabled) {
      try {
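After this change the Configuration overload is only a convenience wrapper that builds a HoodieLocalEngineContext, while engine-aware callers pass their own context explicitly. A minimal sketch (the base path and flag values below are hypothetical, not from the commit):

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;
import org.apache.hudi.metadata.HoodieBackedTableMetadata;

class MetadataReaderSketch {
  static void open(Configuration conf) {
    String basePath = "/data/hudi/my_table";   // hypothetical dataset base path
    String spillablePath = "/tmp";             // spillable map directory

    // Convenience overload for engine-agnostic callers; wraps conf in a HoodieLocalEngineContext internally.
    HoodieBackedTableMetadata viaConf =
        new HoodieBackedTableMetadata(conf, basePath, spillablePath, true, false, false);

    // Equivalent explicit form used by engine-aware callers (enableMetrics = false).
    HoodieBackedTableMetadata viaContext =
        new HoodieBackedTableMetadata(new HoodieLocalEngineContext(conf), basePath, spillablePath,
            true, false, false, false);
  }
}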
@@ -22,6 +22,8 @@ import java.io.IOException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;

+import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
@@ -40,12 +42,13 @@ public class HoodieMetadataFileSystemView extends HoodieTableFileSystemView {
    this.tableMetadata = tableMetadata;
  }

-  public HoodieMetadataFileSystemView(HoodieTableMetaClient metaClient,
+  public HoodieMetadataFileSystemView(HoodieEngineContext engineContext,
+                                      HoodieTableMetaClient metaClient,
                                      HoodieTimeline visibleActiveTimeline,
                                      boolean useFileListingFromMetadata,
                                      boolean verifyListings) {
    super(metaClient, visibleActiveTimeline);
-    this.tableMetadata = HoodieTableMetadata.create(metaClient.getHadoopConf(), metaClient.getBasePath(),
+    this.tableMetadata = HoodieTableMetadata.create(engineContext, metaClient.getBasePath(),
        FileSystemViewStorageConfig.DEFAULT_VIEW_SPILLABLE_DIR, useFileListingFromMetadata, verifyListings,
        false, false);
  }
@@ -18,10 +18,11 @@
package org.apache.hudi.metadata;

+import org.apache.hudi.common.config.SerializableConfiguration;
+import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.util.Option;

-import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
@@ -68,10 +69,16 @@ public interface HoodieTableMetadata extends Serializable {
    return basePath.endsWith(METADATA_TABLE_REL_PATH);
  }

-  static HoodieTableMetadata create(Configuration conf, String datasetBasePath, String spillableMapPath, boolean useFileListingFromMetadata,
-                                    boolean verifyListings, boolean enableMetrics, boolean shouldAssumeDatePartitioning) {
-    return new HoodieBackedTableMetadata(conf, datasetBasePath, spillableMapPath, useFileListingFromMetadata, verifyListings,
-        enableMetrics, shouldAssumeDatePartitioning);
+  static HoodieTableMetadata create(HoodieEngineContext engineContext, String datasetBasePath,
+                                    String spillableMapPath, boolean useFileListingFromMetadata, boolean verifyListings,
+                                    boolean enableMetrics, boolean shouldAssumeDatePartitioning) {
+    if (useFileListingFromMetadata) {
+      return new HoodieBackedTableMetadata(engineContext, datasetBasePath, spillableMapPath, useFileListingFromMetadata,
+          verifyListings, enableMetrics, shouldAssumeDatePartitioning);
+    } else {
+      return new FileSystemBackedTableMetadata(engineContext, new SerializableConfiguration(engineContext.getHadoopConf()),
+          datasetBasePath, shouldAssumeDatePartitioning);
+    }
  }

  /**
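The factory now takes the engine context and picks the implementation from useFileListingFromMetadata: the metadata-table-backed reader when it is enabled, otherwise the engine-context-parallelized file system listing. A minimal usage sketch (paths and flag values below are hypothetical):

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;
import org.apache.hudi.metadata.HoodieTableMetadata;

class PartitionListingSketch {
  // Sketch only: verifyListings, enableMetrics and assumeDatePartitioning are left false here.
  static List<String> listPartitions(Configuration conf, String basePath, boolean useMetadataTable) throws IOException {
    HoodieTableMetadata metadata = HoodieTableMetadata.create(new HoodieLocalEngineContext(conf), basePath,
        "/tmp", useMetadataTable, false, false, false);
    return metadata.getAllPartitionPaths();
  }
}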
@@ -0,0 +1,174 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.metadata;

import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;
import org.apache.hudi.common.testutils.HoodieCommonTestHarness;
import org.apache.hudi.common.testutils.HoodieTestTable;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.stream.IntStream;

public class TestFileSystemBackedTableMetadata extends HoodieCommonTestHarness {

  private static final String DEFAULT_PARTITION = "";
  private static final List<String> DATE_PARTITIONS = Arrays.asList("2019/01/01", "2020/01/02", "2021/03/01");
  private static final List<String> ONE_LEVEL_PARTITIONS = Arrays.asList("2019", "2020", "2021");
  private static final List<String> MULTI_LEVEL_PARTITIONS = Arrays.asList("2019/01", "2020/01", "2021/01");
  private static HoodieTestTable hoodieTestTable;

  @BeforeEach
  public void setUp() throws IOException {
    initMetaClient();
    hoodieTestTable = HoodieTestTable.of(metaClient);
  }

  @AfterEach
  public void tearDown() throws IOException {
    metaClient.getFs().delete(new Path(metaClient.getBasePath()), true);
  }

  /**
   * Test non partition hoodie table.
   * @throws Exception
   */
  @Test
  public void testNonPartitionedTable() throws Exception {
    // Generate 10 files under basepath
    hoodieTestTable.addCommit("100").withBaseFilesInPartition(DEFAULT_PARTITION, IntStream.range(0, 10).toArray());
    HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
    FileSystemBackedTableMetadata fileSystemBackedTableMetadata =
        new FileSystemBackedTableMetadata(localEngineContext, new SerializableConfiguration(metaClient.getHadoopConf()), basePath, false);
    Assertions.assertTrue(fileSystemBackedTableMetadata.getAllPartitionPaths().size() == 0);
    Assertions.assertTrue(fileSystemBackedTableMetadata.getAllFilesInPartition(new Path(basePath)).length == 10);
  }

  /**
   * Test listing of partitions result for date based partitions.
   * @throws Exception
   */
  @Test
  public void testDatePartitionedTable() throws Exception {
    String instant = "100";
    hoodieTestTable = hoodieTestTable.addCommit(instant);
    // Generate 10 files under each partition
    DATE_PARTITIONS.stream().forEach(p -> {
      try {
        hoodieTestTable = hoodieTestTable.withBaseFilesInPartition(p, IntStream.range(0, 10).toArray());
      } catch (Exception e) {
        throw new RuntimeException(e);
      }
    });
    HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
    FileSystemBackedTableMetadata fileSystemBackedTableMetadata =
        new FileSystemBackedTableMetadata(localEngineContext, new SerializableConfiguration(metaClient.getHadoopConf()), basePath, true);
    Assertions.assertTrue(fileSystemBackedTableMetadata.getAllPartitionPaths().size() == 3);
    Assertions.assertTrue(fileSystemBackedTableMetadata.getAllFilesInPartition(new Path(basePath + "/" + DATE_PARTITIONS.get(0))).length == 10);
  }

  /**
   * Test listing of partitions result for date based partitions with assumeDataPartitioning = false.
   * @throws Exception
   */
  @Test
  public void testDatePartitionedTableWithAssumeDateIsFalse() throws Exception {
    String instant = "100";
    hoodieTestTable = hoodieTestTable.addCommit(instant);
    // Generate 10 files under each partition
    DATE_PARTITIONS.stream().forEach(p -> {
      try {
        hoodieTestTable = hoodieTestTable.withBaseFilesInPartition(p, IntStream.range(0, 10).toArray());
      } catch (Exception e) {
        throw new RuntimeException(e);
      }
    });
    HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
    FileSystemBackedTableMetadata fileSystemBackedTableMetadata =
        new FileSystemBackedTableMetadata(localEngineContext, new SerializableConfiguration(metaClient.getHadoopConf()), basePath, false);
    Assertions.assertTrue(fileSystemBackedTableMetadata.getAllPartitionPaths().size() == 0);
  }

  @Test
  public void testOneLevelPartitionedTable() throws Exception {
    String instant = "100";
    hoodieTestTable = hoodieTestTable.addCommit(instant);
    // Generate 10 files under each partition
    ONE_LEVEL_PARTITIONS.stream().forEach(p -> {
      try {
        hoodieTestTable = hoodieTestTable.withPartitionMetaFiles(p)
            .withBaseFilesInPartition(p, IntStream.range(0, 10).toArray());
      } catch (Exception e) {
        throw new RuntimeException(e);
      }
    });
    HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
    FileSystemBackedTableMetadata fileSystemBackedTableMetadata =
        new FileSystemBackedTableMetadata(localEngineContext, new SerializableConfiguration(metaClient.getHadoopConf()), basePath, false);
    Assertions.assertTrue(fileSystemBackedTableMetadata.getAllPartitionPaths().size() == 3);
    Assertions.assertTrue(fileSystemBackedTableMetadata.getAllFilesInPartition(new Path(basePath + "/" + ONE_LEVEL_PARTITIONS.get(0))).length == 10);
  }

  @Test
  public void testMultiLevelPartitionedTable() throws Exception {
    String instant = "100";
    hoodieTestTable = hoodieTestTable.addCommit(instant);
    // Generate 10 files under each partition
    MULTI_LEVEL_PARTITIONS.stream().forEach(p -> {
      try {
        hoodieTestTable = hoodieTestTable.withPartitionMetaFiles(p)
            .withBaseFilesInPartition(p, IntStream.range(0, 10).toArray());
      } catch (Exception e) {
        throw new RuntimeException(e);
      }
    });
    HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
    FileSystemBackedTableMetadata fileSystemBackedTableMetadata =
        new FileSystemBackedTableMetadata(localEngineContext, new SerializableConfiguration(metaClient.getHadoopConf()), basePath, false);
    Assertions.assertTrue(fileSystemBackedTableMetadata.getAllPartitionPaths().size() == 3);
    Assertions.assertTrue(fileSystemBackedTableMetadata.getAllFilesInPartition(new Path(basePath + "/" + MULTI_LEVEL_PARTITIONS.get(0))).length == 10);
  }

  @Test
  public void testMultiLevelEmptyPartitionTable() throws Exception {
    String instant = "100";
    hoodieTestTable = hoodieTestTable.addCommit(instant);
    // Generate 10 files under each partition
    MULTI_LEVEL_PARTITIONS.stream().forEach(p -> {
      try {
        hoodieTestTable = hoodieTestTable.withPartitionMetaFiles(p);
      } catch (Exception e) {
        throw new RuntimeException(e);
      }
    });
    HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
    FileSystemBackedTableMetadata fileSystemBackedTableMetadata =
        new FileSystemBackedTableMetadata(localEngineContext, new SerializableConfiguration(metaClient.getHadoopConf()), basePath, false);
    Assertions.assertTrue(fileSystemBackedTableMetadata.getAllPartitionPaths().size() == 3);
    Assertions.assertTrue(fileSystemBackedTableMetadata.getAllFilesInPartition(new Path(basePath + "/" + MULTI_LEVEL_PARTITIONS.get(0))).length == 0);
  }

}
@@ -22,6 +22,7 @@ import java.util.Map;
import java.util.Set;
import org.apache.hadoop.conf.Configurable;
import org.apache.hudi.common.config.SerializableConfiguration;
+import org.apache.hudi.common.engine.HoodieLocalEngineContext;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodiePartitionMetadata;
@@ -172,8 +173,9 @@ public class HoodieROTablePathFilter implements Configurable, PathFilter, Serial
      boolean useFileListingFromMetadata = getConf().getBoolean(METADATA_ENABLE_PROP, DEFAULT_METADATA_ENABLE_FOR_READERS);
      boolean verifyFileListing = getConf().getBoolean(METADATA_VALIDATE_PROP, DEFAULT_METADATA_VALIDATE);
-      HoodieTableFileSystemView fsView = FileSystemViewManager.createInMemoryFileSystemView(metaClient,
-          useFileListingFromMetadata, verifyFileListing);
+      HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(conf.get());
+      HoodieTableFileSystemView fsView = FileSystemViewManager.createInMemoryFileSystemView(engineContext,
+          metaClient, useFileListingFromMetadata, verifyFileListing);
      String partition = FSUtils.getRelativePartitionPath(new Path(metaClient.getBasePath()), folder);

      List<HoodieBaseFile> latestFiles = fsView.getLatestBaseFiles(partition).collect(Collectors.toList());
@@ -18,6 +18,7 @@
package org.apache.hudi.hadoop.utils;

+import org.apache.hudi.common.engine.HoodieLocalEngineContext;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.model.HoodieCommitMetadata;
@@ -427,8 +428,9 @@ public class HoodieInputFormatUtils {
    boolean useFileListingFromMetadata = job.getBoolean(METADATA_ENABLE_PROP, DEFAULT_METADATA_ENABLE_FOR_READERS);
    boolean verifyFileListing = job.getBoolean(METADATA_VALIDATE_PROP, DEFAULT_METADATA_VALIDATE);
-    HoodieTableFileSystemView fsView = FileSystemViewManager.createInMemoryFileSystemView(metaClient,
-        useFileListingFromMetadata, verifyFileListing);
+    HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(job);
+    HoodieTableFileSystemView fsView = FileSystemViewManager.createInMemoryFileSystemView(engineContext,
+        metaClient, useFileListingFromMetadata, verifyFileListing);

    List<HoodieBaseFile> filteredBaseFiles = new ArrayList<>();
    for (Path p : paths) {
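The Hadoop reader paths (HoodieROTablePathFilter above and HoodieInputFormatUtils here) all follow the same pattern: wrap the job's Configuration in a HoodieLocalEngineContext and pass it, together with the meta client, to FileSystemViewManager.createInMemoryFileSystemView, so listing stays in-process on the read side while write-side tools can plug in a Spark context instead. A minimal sketch mirroring that pattern (the flag values are hypothetical):

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.view.FileSystemViewManager;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;

class ReaderViewSketch {
  // Sketch of the reader-side call site shown in the two hunks above.
  static HoodieTableFileSystemView openView(Configuration conf, HoodieTableMetaClient metaClient) {
    HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(conf);
    return FileSystemViewManager.createInMemoryFileSystemView(engineContext, metaClient,
        /* useFileListingFromMetadata */ false, /* verifyFileListing */ false);
  }
}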
@@ -18,6 +18,7 @@
package org.apache.hudi.hadoop.utils;

+import org.apache.hudi.common.engine.HoodieLocalEngineContext;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieBaseFile;
@@ -81,9 +82,9 @@ public class HoodieRealtimeInputFormatUtils extends HoodieInputFormatUtils {
      // for each partition path obtain the data & log file groupings, then map back to inputsplits
      HoodieTableMetaClient metaClient = partitionsToMetaClient.get(partitionPath);
      if (!fsCache.containsKey(metaClient)) {
+        HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(conf);
-        HoodieTableFileSystemView fsView = FileSystemViewManager.createInMemoryFileSystemView(metaClient,
-            useFileListingFromMetadata, verifyFileListing);
+        HoodieTableFileSystemView fsView = FileSystemViewManager.createInMemoryFileSystemView(engineContext,
+            metaClient, useFileListingFromMetadata, verifyFileListing);
        fsCache.put(metaClient, fsView);
      }
      HoodieTableFileSystemView fsView = fsCache.get(metaClient);
@@ -19,6 +19,7 @@
package org.apache.hudi.integ.testsuite.reader;

import org.apache.hudi.avro.HoodieAvroUtils;
+import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.config.HoodieMetadataConfig;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.FileSlice;
@@ -86,7 +87,8 @@ public class DFSHoodieDatasetInputReader extends DFSDeltaInputReader {
    // Using FSUtils.getFS here instead of metaClient.getFS() since we dont want to count these listStatus
    // calls in metrics as they are not part of normal HUDI operation.
    FileSystem fs = FSUtils.getFs(metaClient.getBasePath(), metaClient.getHadoopConf());
-    List<String> partitionPaths = FSUtils.getAllPartitionPaths(fs, metaClient.getBasePath(),
+    HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
+    List<String> partitionPaths = FSUtils.getAllPartitionPaths(engineContext, fs, metaClient.getBasePath(),
        HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS, HoodieMetadataConfig.DEFAULT_METADATA_VALIDATE, false);
    // Sort partition so we can pick last N partitions by default
    Collections.sort(partitionPaths);
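FSUtils.getAllPartitionPaths now takes the engine context as its first argument: Spark-side tools (this reader, the snapshot copier and exporter, TimelineServerPerf below) build a HoodieSparkEngineContext from the JavaSparkContext so the listing is distributed, while sync and Hadoop reader code passes a HoodieLocalEngineContext. A minimal sketch of the Spark-side call (the flag values are hypothetical):

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.spark.api.java.JavaSparkContext;

class AllPartitionPathsSketch {
  // Sketch only: the engine context decides whether per-directory listing runs as Spark tasks
  // or serially in the driver process.
  static List<String> partitions(JavaSparkContext jsc, FileSystem fs, String basePath) throws IOException {
    HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
    return FSUtils.getAllPartitionPaths(engineContext, fs, basePath,
        /* useFileListingFromMetadata */ false, /* verifyMetadataFileListing */ false, /* assumeDatePartitioning */ false);
  }
}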
@@ -373,7 +373,7 @@ public class TestBootstrap extends HoodieClientTestBase {
        reloadInputFormats();
        List<GenericRecord> records = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(
            jsc.hadoopConfiguration(),
-            FSUtils.getAllPartitionPaths(metaClient.getFs(), basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS,
+            FSUtils.getAllPartitionPaths(context, metaClient.getFs(), basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS,
                HoodieMetadataConfig.DEFAULT_METADATA_VALIDATE, false).stream()
                .map(f -> basePath + "/" + f).collect(Collectors.toList()),
            basePath, roJobConf, false, schema, TRIP_HIVE_COLUMN_TYPES, false, new ArrayList<>());
@@ -392,7 +392,7 @@ public class TestBootstrap extends HoodieClientTestBase {
        seenKeys = new HashSet<>();
        records = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(
            jsc.hadoopConfiguration(),
-            FSUtils.getAllPartitionPaths(metaClient.getFs(), basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS,
+            FSUtils.getAllPartitionPaths(context, metaClient.getFs(), basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS,
                HoodieMetadataConfig.DEFAULT_METADATA_VALIDATE, false).stream()
                .map(f -> basePath + "/" + f).collect(Collectors.toList()),
            basePath, rtJobConf, true, schema, TRIP_HIVE_COLUMN_TYPES, false, new ArrayList<>());
@@ -409,7 +409,7 @@ public class TestBootstrap extends HoodieClientTestBase {
        reloadInputFormats();
        records = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(
            jsc.hadoopConfiguration(),
-            FSUtils.getAllPartitionPaths(metaClient.getFs(), basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS,
+            FSUtils.getAllPartitionPaths(context, metaClient.getFs(), basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS,
                HoodieMetadataConfig.DEFAULT_METADATA_VALIDATE, false).stream()
                .map(f -> basePath + "/" + f).collect(Collectors.toList()),
            basePath, roJobConf, false, schema, TRIP_HIVE_COLUMN_TYPES,
@@ -427,7 +427,7 @@ public class TestBootstrap extends HoodieClientTestBase {
        seenKeys = new HashSet<>();
        records = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(
            jsc.hadoopConfiguration(),
-            FSUtils.getAllPartitionPaths(metaClient.getFs(), basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS,
+            FSUtils.getAllPartitionPaths(context, metaClient.getFs(), basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS,
                HoodieMetadataConfig.DEFAULT_METADATA_VALIDATE, false).stream()
                .map(f -> basePath + "/" + f).collect(Collectors.toList()),
            basePath, rtJobConf, true, schema, TRIP_HIVE_COLUMN_TYPES, true,
@@ -443,7 +443,7 @@ public class TestBootstrap extends HoodieClientTestBase {
        reloadInputFormats();
        records = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(
            jsc.hadoopConfiguration(),
-            FSUtils.getAllPartitionPaths(metaClient.getFs(), basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS,
+            FSUtils.getAllPartitionPaths(context, metaClient.getFs(), basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS,
                HoodieMetadataConfig.DEFAULT_METADATA_VALIDATE, false).stream()
                .map(f -> basePath + "/" + f).collect(Collectors.toList()),
            basePath, roJobConf, false, schema, TRIP_HIVE_COLUMN_TYPES, true,
@@ -461,7 +461,7 @@ public class TestBootstrap extends HoodieClientTestBase {
        seenKeys = new HashSet<>();
        records = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(
            jsc.hadoopConfiguration(),
-            FSUtils.getAllPartitionPaths(metaClient.getFs(), basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS,
+            FSUtils.getAllPartitionPaths(context, metaClient.getFs(), basePath, HoodieMetadataConfig.DEFAULT_METADATA_ENABLE_FOR_READERS,
                HoodieMetadataConfig.DEFAULT_METADATA_VALIDATE, false).stream()
                .map(f -> basePath + "/" + f).collect(Collectors.toList()),
            basePath, rtJobConf, true, schema, TRIP_HIVE_COLUMN_TYPES, true,
@@ -18,10 +18,7 @@
package org.apache.hudi.functional

import java.sql.{Date, Timestamp}
-import java.util.function.Supplier
-import java.util.stream.Stream

-import org.apache.hadoop.fs.Path
import org.apache.hudi.common.config.HoodieMetadataConfig
import org.apache.hudi.common.table.HoodieTableMetaClient
import org.apache.hudi.common.table.timeline.HoodieInstant
@@ -20,6 +20,8 @@ package org.apache.hudi.sync.common;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

+import org.apache.hudi.common.engine.HoodieLocalEngineContext;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
@@ -127,7 +129,9 @@ public abstract class AbstractSyncHoodieClient {
    if (!lastCommitTimeSynced.isPresent()) {
      LOG.info("Last commit time synced is not known, listing all partitions in " + basePath + ",FS :" + fs);
      try {
-        return FSUtils.getAllPartitionPaths(fs, basePath, useFileListingFromMetadata, verifyMetadataFileListing, assumeDatePartitioning);
+        HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
+        return FSUtils.getAllPartitionPaths(engineContext, fs, basePath, useFileListingFromMetadata, verifyMetadataFileListing,
+            assumeDatePartitioning);
      } catch (IOException e) {
        throw new HoodieIOException("Failed to list all partitions in " + basePath, e);
      }
@@ -86,6 +86,7 @@ public class HoodieSnapshotCopier implements Serializable {
    final HoodieTableMetaClient tableMetadata = new HoodieTableMetaClient(fs.getConf(), baseDir);
    final BaseFileOnlyView fsView = new HoodieTableFileSystemView(tableMetadata,
        tableMetadata.getActiveTimeline().getCommitsAndCompactionTimeline().filterCompletedInstants());
+    HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
    // Get the latest commit
    Option<HoodieInstant> latestCommit =
        tableMetadata.getActiveTimeline().getCommitsAndCompactionTimeline().filterCompletedInstants().lastInstant();
@@ -97,7 +98,7 @@ public class HoodieSnapshotCopier implements Serializable {
    LOG.info(String.format("Starting to snapshot latest version files which are also no-late-than %s.",
        latestCommitTimestamp));

-    List<String> partitions = FSUtils.getAllPartitionPaths(fs, baseDir, useFileListingFromMetadata, verifyMetadataFileListing, shouldAssumeDatePartitioning);
+    List<String> partitions = FSUtils.getAllPartitionPaths(context, fs, baseDir, useFileListingFromMetadata, verifyMetadataFileListing, shouldAssumeDatePartitioning);
    if (partitions.size() > 0) {
      LOG.info(String.format("The job needs to copy %d partitions.", partitions.size()));
@@ -108,7 +109,6 @@ public class HoodieSnapshotCopier implements Serializable {
      fs.delete(new Path(outputDir), true);
    }

-    HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
    context.setJobStatus(this.getClass().getSimpleName(), "Creating a snapshot");

    List<Tuple2<String, String>> filesToCopy = context.flatMap(partitions, partition -> {
@@ -117,6 +117,7 @@ public class HoodieSnapshotExporter {
  public void export(JavaSparkContext jsc, Config cfg) throws IOException {
    FileSystem fs = FSUtils.getFs(cfg.sourceBasePath, jsc.hadoopConfiguration());
+    HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);

    if (outputPathExists(fs, cfg)) {
      throw new HoodieSnapshotExporterException("The target output path already exists.");
@@ -128,7 +129,7 @@ public class HoodieSnapshotExporter {
    LOG.info(String.format("Starting to snapshot latest version files which are also no-late-than %s.",
        latestCommitTimestamp));

-    final List<String> partitions = getPartitions(fs, cfg);
+    final List<String> partitions = getPartitions(engineContext, fs, cfg);
    if (partitions.isEmpty()) {
      throw new HoodieSnapshotExporterException("The source dataset has 0 partition to snapshot.");
    }
@@ -153,8 +154,8 @@ public class HoodieSnapshotExporter {
    return latestCommit.isPresent() ? Option.of(latestCommit.get().getTimestamp()) : Option.empty();
  }

-  private List<String> getPartitions(FileSystem fs, Config cfg) throws IOException {
-    return FSUtils.getAllPartitionPaths(fs, cfg.sourceBasePath, true, false, false);
+  private List<String> getPartitions(HoodieEngineContext engineContext, FileSystem fs, Config cfg) throws IOException {
+    return FSUtils.getAllPartitionPaths(engineContext, fs, cfg.sourceBasePath, true, false, false);
  }

  private void createSuccessTag(FileSystem fs, Config cfg) throws IOException {
@@ -85,13 +85,14 @@ public class TimelineServerPerf implements Serializable {
  }

  public void run() throws IOException {
+    JavaSparkContext jsc = UtilHelpers.buildSparkContext("hudi-view-perf-" + cfg.basePath, cfg.sparkMaster);
-    List<String> allPartitionPaths = FSUtils.getAllPartitionPaths(timelineServer.getFs(), cfg.basePath, cfg.useFileListingFromMetadata,
-        cfg.verifyMetadataFileListing, true);
+    HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
+    List<String> allPartitionPaths = FSUtils.getAllPartitionPaths(engineContext, timelineServer.getFs(), cfg.basePath,
+        cfg.useFileListingFromMetadata, cfg.verifyMetadataFileListing, true);
    Collections.shuffle(allPartitionPaths);
    List<String> selected = allPartitionPaths.stream().filter(p -> !p.contains("error")).limit(cfg.maxPartitions)
        .collect(Collectors.toList());
-    JavaSparkContext jsc = UtilHelpers.buildSparkContext("hudi-view-perf-" + cfg.basePath, cfg.sparkMaster);
    if (!useExternalTimelineServer) {
      this.timelineServer.startService();
      setHostAddrFromSparkConf(jsc.getConf());