|
|
|
|
@@ -55,6 +55,7 @@ import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
|
|
|
|
|
import org.apache.hudi.common.util.HoodieTimer;
|
|
|
|
|
import org.apache.hudi.common.util.Option;
|
|
|
|
|
import org.apache.hudi.common.util.ValidationUtils;
|
|
|
|
|
import org.apache.hudi.common.util.collection.Pair;
|
|
|
|
|
import org.apache.hudi.config.HoodieCompactionConfig;
|
|
|
|
|
import org.apache.hudi.config.HoodieWriteConfig;
|
|
|
|
|
import org.apache.hudi.config.metrics.HoodieMetricsConfig;
|
|
|
|
|
@@ -81,11 +82,11 @@ import java.util.LinkedList;
|
|
|
|
|
import java.util.List;
|
|
|
|
|
import java.util.Map;
|
|
|
|
|
import java.util.Properties;
|
|
|
|
|
import java.util.concurrent.atomic.AtomicLong;
|
|
|
|
|
import java.util.stream.Collectors;
|
|
|
|
|
|
|
|
|
|
import static org.apache.hudi.common.table.HoodieTableConfig.ARCHIVELOG_FOLDER;
|
|
|
|
|
import static org.apache.hudi.metadata.HoodieTableMetadata.METADATA_TABLE_NAME_SUFFIX;
|
|
|
|
|
import static org.apache.hudi.metadata.HoodieTableMetadata.NON_PARTITIONED_NAME;
|
|
|
|
|
import static org.apache.hudi.metadata.HoodieTableMetadata.SOLO_COMMIT_TIMESTAMP;
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
@@ -121,7 +122,7 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
|
|
|
|
|
* @param hadoopConf - Hadoop configuration to use for the metadata writer
|
|
|
|
|
* @param writeConfig - Writer config
|
|
|
|
|
* @param engineContext - Engine context
|
|
|
|
|
* @param actionMetadata - Optional action metadata to help decide bootstrap operations
|
|
|
|
|
* @param actionMetadata - Optional action metadata to help decide initialize operations
|
|
|
|
|
* @param <T> - Action metadata types extending Avro generated SpecificRecordBase
|
|
|
|
|
* @param inflightInstantTimestamp - Timestamp of any instant in progress
|
|
|
|
|
*/
|
|
|
|
|
@@ -203,7 +204,7 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
|
|
|
|
|
* @param metadataConfig - Table config
|
|
|
|
|
* @param metaClient - Meta client for the metadata table
|
|
|
|
|
* @param fsView - Metadata table filesystem view to use
|
|
|
|
|
* @param isBootstrapCompleted - Is metadata table bootstrap completed
|
|
|
|
|
* @param isBootstrapCompleted - Is metadata table initializing completed
|
|
|
|
|
*/
|
|
|
|
|
private void enablePartition(final MetadataPartitionType partitionType, final HoodieMetadataConfig metadataConfig,
|
|
|
|
|
final Option<HoodieTableMetaClient> metaClient, Option<HoodieTableFileSystemView> fsView, boolean isBootstrapCompleted) {
|
|
|
|
|
@@ -319,13 +320,13 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Initialize the metadata table if it does not exist.
|
|
|
|
|
*
|
|
|
|
|
* If the metadata table does not exist, then file and partition listing is used to bootstrap the table.
|
|
|
|
|
* <p>
|
|
|
|
|
* If the metadata table does not exist, then file and partition listing is used to initialize the table.
|
|
|
|
|
*
|
|
|
|
|
* @param engineContext
|
|
|
|
|
* @param actionMetadata Action metadata types extending Avro generated SpecificRecordBase
|
|
|
|
|
* @param actionMetadata Action metadata types extending Avro generated SpecificRecordBase
|
|
|
|
|
* @param inflightInstantTimestamp Timestamp of an instant in progress on the dataset. This instant is ignored
|
|
|
|
|
* while deciding to bootstrap the metadata table.
|
|
|
|
|
* while deciding to initialize the metadata table.
|
|
|
|
|
*/
|
|
|
|
|
protected abstract <T extends SpecificRecordBase> void initialize(HoodieEngineContext engineContext,
|
|
|
|
|
Option<T> actionMetadata,
|
|
|
|
|
@@ -345,27 +346,25 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Bootstrap the metadata table if needed.
|
|
|
|
|
* Initialize the metadata table if needed.
|
|
|
|
|
*
|
|
|
|
|
* @param engineContext - Engine context
|
|
|
|
|
* @param dataMetaClient - Meta client for the data table
|
|
|
|
|
* @param actionMetadata - Optional action metadata
|
|
|
|
|
* @param <T> - Action metadata types extending Avro generated SpecificRecordBase
|
|
|
|
|
* @param inflightInstantTimestamp - Timestamp of an instant in progress on the dataset. This instant is ignored
|
|
|
|
|
* @param dataMetaClient - meta client for the data table
|
|
|
|
|
* @param actionMetadata - optional action metadata
|
|
|
|
|
* @param inflightInstantTimestamp - timestamp of an instant in progress on the dataset
|
|
|
|
|
* @param <T> - action metadata types extending Avro generated SpecificRecordBase
|
|
|
|
|
* @throws IOException
|
|
|
|
|
*/
|
|
|
|
|
protected <T extends SpecificRecordBase> void bootstrapIfNeeded(HoodieEngineContext engineContext,
|
|
|
|
|
HoodieTableMetaClient dataMetaClient,
|
|
|
|
|
Option<T> actionMetadata,
|
|
|
|
|
Option<String> inflightInstantTimestamp) throws IOException {
|
|
|
|
|
protected <T extends SpecificRecordBase> void initializeIfNeeded(HoodieTableMetaClient dataMetaClient,
|
|
|
|
|
Option<T> actionMetadata,
|
|
|
|
|
Option<String> inflightInstantTimestamp) throws IOException {
|
|
|
|
|
HoodieTimer timer = new HoodieTimer().startTimer();
|
|
|
|
|
|
|
|
|
|
boolean exists = dataMetaClient.getFs().exists(new Path(metadataWriteConfig.getBasePath(),
|
|
|
|
|
HoodieTableMetaClient.METAFOLDER_NAME));
|
|
|
|
|
boolean rebootstrap = false;
|
|
|
|
|
boolean reInitialize = false;
|
|
|
|
|
|
|
|
|
|
// If the un-synced instants have been archived, then
|
|
|
|
|
// the metadata table will need to be bootstrapped again.
|
|
|
|
|
// the metadata table will need to be initialized again.
|
|
|
|
|
if (exists) {
|
|
|
|
|
HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf.get())
|
|
|
|
|
.setBasePath(metadataWriteConfig.getBasePath()).build();
|
|
|
|
|
@@ -378,39 +377,39 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
|
|
|
|
|
final Option<HoodieInstant> latestMetadataInstant =
|
|
|
|
|
metadataMetaClient.getActiveTimeline().filterCompletedInstants().lastInstant();
|
|
|
|
|
|
|
|
|
|
rebootstrap = isBootstrapNeeded(latestMetadataInstant, actionMetadata);
|
|
|
|
|
reInitialize = isBootstrapNeeded(latestMetadataInstant, actionMetadata);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (rebootstrap) {
|
|
|
|
|
if (reInitialize) {
|
|
|
|
|
metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.REBOOTSTRAP_STR, 1));
|
|
|
|
|
LOG.info("Deleting Metadata Table directory so that it can be re-bootstrapped");
|
|
|
|
|
LOG.info("Deleting Metadata Table directory so that it can be re-initialized");
|
|
|
|
|
dataMetaClient.getFs().delete(new Path(metadataWriteConfig.getBasePath()), true);
|
|
|
|
|
exists = false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!exists) {
|
|
|
|
|
// Initialize for the first time by listing partitions and files directly from the file system
|
|
|
|
|
if (bootstrapFromFilesystem(engineContext, dataMetaClient, inflightInstantTimestamp)) {
|
|
|
|
|
if (initializeFromFilesystem(dataMetaClient, inflightInstantTimestamp)) {
|
|
|
|
|
metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.INITIALIZE_STR, timer.endTimer()));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Whether bootstrap operation needed for this metadata table.
|
|
|
|
|
* Whether initialize operation needed for this metadata table.
|
|
|
|
|
* <p>
|
|
|
|
|
* Rollback of the first commit would look like un-synced instants in the metadata table.
|
|
|
|
|
* Action metadata is needed to verify the instant time and avoid erroneous bootstrapping.
|
|
|
|
|
* Action metadata is needed to verify the instant time and avoid erroneous initializing.
|
|
|
|
|
* <p>
|
|
|
|
|
* TODO: Revisit this logic and validate that filtering for all
|
|
|
|
|
* commits timeline is the right thing to do
|
|
|
|
|
*
|
|
|
|
|
* @return True if the bootstrap is not needed, False otherwise
|
|
|
|
|
* @return True if the initialize is not needed, False otherwise
|
|
|
|
|
*/
|
|
|
|
|
private <T extends SpecificRecordBase> boolean isBootstrapNeeded(Option<HoodieInstant> latestMetadataInstant,
|
|
|
|
|
Option<T> actionMetadata) {
|
|
|
|
|
if (!latestMetadataInstant.isPresent()) {
|
|
|
|
|
LOG.warn("Metadata Table will need to be re-bootstrapped as no instants were found");
|
|
|
|
|
LOG.warn("Metadata Table will need to be re-initialized as no instants were found");
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@@ -423,7 +422,7 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
|
|
|
|
|
if (dataMetaClient.getActiveTimeline().getAllCommitsTimeline().isBeforeTimelineStarts(
|
|
|
|
|
latestMetadataInstant.get().getTimestamp())
|
|
|
|
|
&& !isCommitRevertedByInFlightAction(actionMetadata, latestMetadataInstantTimestamp)) {
|
|
|
|
|
LOG.error("Metadata Table will need to be re-bootstrapped as un-synced instants have been archived."
|
|
|
|
|
LOG.error("Metadata Table will need to be re-initialized as un-synced instants have been archived."
|
|
|
|
|
+ " latestMetadataInstant=" + latestMetadataInstant.get().getTimestamp()
|
|
|
|
|
+ ", latestDataInstant=" + dataMetaClient.getActiveTimeline().firstInstant().get().getTimestamp());
|
|
|
|
|
return true;
|
|
|
|
|
@@ -455,9 +454,7 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
|
|
|
|
|
case HoodieTimeline.ROLLBACK_ACTION:
|
|
|
|
|
List<HoodieInstantInfo> rollbackedInstants =
|
|
|
|
|
((HoodieRollbackMetadata) actionMetadata.get()).getInstantsRollback();
|
|
|
|
|
affectedInstantTimestamps = rollbackedInstants.stream().map(instant -> {
|
|
|
|
|
return instant.getCommitTime().toString();
|
|
|
|
|
}).collect(Collectors.toList());
|
|
|
|
|
affectedInstantTimestamps = rollbackedInstants.stream().map(HoodieInstantInfo::getCommitTime).collect(Collectors.toList());
|
|
|
|
|
|
|
|
|
|
if (affectedInstantTimestamps.contains(latestMetadataInstantTimestamp)) {
|
|
|
|
|
return true;
|
|
|
|
|
@@ -466,9 +463,7 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
|
|
|
|
|
case HoodieTimeline.RESTORE_ACTION:
|
|
|
|
|
List<HoodieInstantInfo> restoredInstants =
|
|
|
|
|
((HoodieRestoreMetadata) actionMetadata.get()).getRestoreInstantInfo();
|
|
|
|
|
affectedInstantTimestamps = restoredInstants.stream().map(instant -> {
|
|
|
|
|
return instant.getCommitTime().toString();
|
|
|
|
|
}).collect(Collectors.toList());
|
|
|
|
|
affectedInstantTimestamps = restoredInstants.stream().map(HoodieInstantInfo::getCommitTime).collect(Collectors.toList());
|
|
|
|
|
|
|
|
|
|
if (affectedInstantTimestamps.contains(latestMetadataInstantTimestamp)) {
|
|
|
|
|
return true;
|
|
|
|
|
@@ -484,14 +479,14 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
|
|
|
|
|
/**
|
|
|
|
|
* Initialize the Metadata Table by listing files and partitions from the file system.
|
|
|
|
|
*
|
|
|
|
|
* @param dataMetaClient {@code HoodieTableMetaClient} for the dataset.
|
|
|
|
|
* @param inflightInstantTimestamp
|
|
|
|
|
* @param dataMetaClient - {@code HoodieTableMetaClient} for the dataset.
|
|
|
|
|
* @param inflightInstantTimestamp - Current action instant responsible for this initialization
|
|
|
|
|
*/
|
|
|
|
|
private boolean bootstrapFromFilesystem(HoodieEngineContext engineContext, HoodieTableMetaClient dataMetaClient,
|
|
|
|
|
Option<String> inflightInstantTimestamp) throws IOException {
|
|
|
|
|
private boolean initializeFromFilesystem(HoodieTableMetaClient dataMetaClient,
|
|
|
|
|
Option<String> inflightInstantTimestamp) throws IOException {
|
|
|
|
|
ValidationUtils.checkState(enabled, "Metadata table cannot be initialized as it is not enabled");
|
|
|
|
|
|
|
|
|
|
// We can only bootstrap if there are no pending operations on the dataset
|
|
|
|
|
// We can only initialize if there are no pending operations on the dataset
|
|
|
|
|
List<HoodieInstant> pendingDataInstant = dataMetaClient.getActiveTimeline()
|
|
|
|
|
.getInstants().filter(i -> !i.isCompleted())
|
|
|
|
|
.filter(i -> !inflightInstantTimestamp.isPresent() || !i.getTimestamp().equals(inflightInstantTimestamp.get()))
|
|
|
|
|
@@ -499,7 +494,7 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
|
|
|
|
|
|
|
|
|
|
if (!pendingDataInstant.isEmpty()) {
|
|
|
|
|
metrics.ifPresent(m -> m.updateMetrics(HoodieMetadataMetrics.BOOTSTRAP_ERR_STR, 1));
|
|
|
|
|
LOG.warn("Cannot bootstrap metadata table as operation(s) are in progress on the dataset: "
|
|
|
|
|
LOG.warn("Cannot initialize metadata table as operation(s) are in progress on the dataset: "
|
|
|
|
|
+ Arrays.toString(pendingDataInstant.toArray()));
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
@@ -514,15 +509,10 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
|
|
|
|
|
initTableMetadata();
|
|
|
|
|
initializeEnabledFileGroups(dataMetaClient, createInstantTime);
|
|
|
|
|
|
|
|
|
|
// List all partitions in the basePath of the containing dataset
|
|
|
|
|
LOG.info("Initializing metadata table by using file listings in " + dataWriteConfig.getBasePath());
|
|
|
|
|
engineContext.setJobStatus(this.getClass().getSimpleName(), "Bootstrap: initializing metadata table by listing files and partitions");
|
|
|
|
|
List<DirectoryInfo> dirInfoList = listAllPartitions(dataMetaClient);
|
|
|
|
|
|
|
|
|
|
// During bootstrap, the list of files to be committed can be huge. So creating a HoodieCommitMetadata out of these
|
|
|
|
|
// large number of files and calling the existing update(HoodieCommitMetadata) function does not scale well.
|
|
|
|
|
// Hence, we have a special commit just for the bootstrap scenario.
|
|
|
|
|
bootstrapCommit(dirInfoList, createInstantTime);
|
|
|
|
|
// During cold startup, the list of files to be committed can be huge. So creating a HoodieCommitMetadata out
|
|
|
|
|
// of these large number of files and calling the existing update(HoodieCommitMetadata) function does not scale
|
|
|
|
|
// well. Hence, we have a special commit just for the initialization scenario.
|
|
|
|
|
initialCommit(createInstantTime);
|
|
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@@ -651,6 +641,14 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private MetadataRecordsGenerationParams getRecordsGenerationParams() {
|
|
|
|
|
return new MetadataRecordsGenerationParams(
|
|
|
|
|
dataMetaClient, enabledPartitionTypes, dataWriteConfig.getBloomFilterType(),
|
|
|
|
|
dataWriteConfig.getBloomIndexParallelism(),
|
|
|
|
|
dataWriteConfig.isMetadataIndexColumnStatsForAllColumnsEnabled(),
|
|
|
|
|
dataWriteConfig.getColumnStatsIndexParallelism());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Interface to assist in converting commit metadata to List of HoodieRecords to be written to metadata table.
|
|
|
|
|
* Updates of different commit metadata uses the same method to convert to HoodieRecords and hence.
|
|
|
|
|
@@ -681,8 +679,8 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
|
|
|
|
|
*/
|
|
|
|
|
@Override
|
|
|
|
|
public void update(HoodieCommitMetadata commitMetadata, String instantTime, boolean isTableServiceAction) {
|
|
|
|
|
processAndCommit(instantTime, () -> HoodieTableMetadataUtil.convertMetadataToRecords(engineContext, enabledPartitionTypes,
|
|
|
|
|
commitMetadata, dataMetaClient, dataWriteConfig.isMetadataIndexColumnStatsForAllColumnsEnabled(), instantTime), !isTableServiceAction);
|
|
|
|
|
processAndCommit(instantTime, () -> HoodieTableMetadataUtil.convertMetadataToRecords(
|
|
|
|
|
engineContext, commitMetadata, instantTime, getRecordsGenerationParams()), !isTableServiceAction);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
@@ -693,8 +691,8 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
|
|
|
|
|
*/
|
|
|
|
|
@Override
|
|
|
|
|
public void update(HoodieCleanMetadata cleanMetadata, String instantTime) {
|
|
|
|
|
processAndCommit(instantTime, () -> HoodieTableMetadataUtil.convertMetadataToRecords(engineContext, enabledPartitionTypes,
|
|
|
|
|
cleanMetadata, dataMetaClient, instantTime), false);
|
|
|
|
|
processAndCommit(instantTime, () -> HoodieTableMetadataUtil.convertMetadataToRecords(engineContext,
|
|
|
|
|
cleanMetadata, getRecordsGenerationParams(), instantTime), false);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
@@ -706,7 +704,7 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
|
|
|
|
|
@Override
|
|
|
|
|
public void update(HoodieRestoreMetadata restoreMetadata, String instantTime) {
|
|
|
|
|
processAndCommit(instantTime, () -> HoodieTableMetadataUtil.convertMetadataToRecords(engineContext,
|
|
|
|
|
enabledPartitionTypes, metadataMetaClient.getActiveTimeline(), restoreMetadata, dataMetaClient, instantTime,
|
|
|
|
|
metadataMetaClient.getActiveTimeline(), restoreMetadata, getRecordsGenerationParams(), instantTime,
|
|
|
|
|
metadata.getSyncedInstantTime()), false);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
@@ -732,8 +730,8 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Map<MetadataPartitionType, HoodieData<HoodieRecord>> records =
|
|
|
|
|
HoodieTableMetadataUtil.convertMetadataToRecords(engineContext, enabledPartitionTypes,
|
|
|
|
|
metadataMetaClient.getActiveTimeline(), rollbackMetadata, dataMetaClient, instantTime,
|
|
|
|
|
HoodieTableMetadataUtil.convertMetadataToRecords(engineContext, metadataMetaClient.getActiveTimeline(),
|
|
|
|
|
rollbackMetadata, getRecordsGenerationParams(), instantTime,
|
|
|
|
|
metadata.getSyncedInstantTime(), wasSynced);
|
|
|
|
|
commit(instantTime, records, false);
|
|
|
|
|
}
|
|
|
|
|
@@ -845,20 +843,29 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* This is invoked to bootstrap metadata table for a dataset. Bootstrap Commit has special handling mechanism due to its scale compared to
|
|
|
|
|
* This is invoked to initialize metadata table for a dataset. Bootstrap Commit has special handling mechanism due to its scale compared to
|
|
|
|
|
* other regular commits.
|
|
|
|
|
*
|
|
|
|
|
*/
|
|
|
|
|
protected void bootstrapCommit(List<DirectoryInfo> partitionInfoList, String createInstantTime) {
|
|
|
|
|
List<String> partitions = partitionInfoList.stream().map(p ->
|
|
|
|
|
p.getRelativePath().isEmpty() ? NON_PARTITIONED_NAME : p.getRelativePath()).collect(Collectors.toList());
|
|
|
|
|
final int totalFiles = partitionInfoList.stream().mapToInt(p -> p.getTotalFiles()).sum();
|
|
|
|
|
private void initialCommit(String createInstantTime) {
|
|
|
|
|
// List all partitions in the basePath of the containing dataset
|
|
|
|
|
LOG.info("Initializing metadata table by using file listings in " + dataWriteConfig.getBasePath());
|
|
|
|
|
engineContext.setJobStatus(this.getClass().getSimpleName(), "Initializing metadata table by listing files and partitions");
|
|
|
|
|
|
|
|
|
|
List<DirectoryInfo> partitionInfoList = listAllPartitions(dataMetaClient);
|
|
|
|
|
List<String> partitions = new ArrayList<>();
|
|
|
|
|
AtomicLong totalFiles = new AtomicLong(0);
|
|
|
|
|
Map<String, Map<String, Long>> partitionToFilesMap = partitionInfoList.stream().map(p -> {
|
|
|
|
|
final String partitionName = HoodieTableMetadataUtil.getPartition(p.getRelativePath());
|
|
|
|
|
partitions.add(partitionName);
|
|
|
|
|
totalFiles.addAndGet(p.getTotalFiles());
|
|
|
|
|
return Pair.of(partitionName, p.getFileNameToSizeMap());
|
|
|
|
|
}).collect(Collectors.toMap(Pair::getKey, Pair::getValue));
|
|
|
|
|
final Map<MetadataPartitionType, HoodieData<HoodieRecord>> partitionToRecordsMap = new HashMap<>();
|
|
|
|
|
|
|
|
|
|
// Record which saves the list of all partitions
|
|
|
|
|
HoodieRecord allPartitionRecord = HoodieMetadataPayload.createPartitionListRecord(partitions);
|
|
|
|
|
if (partitions.isEmpty()) {
|
|
|
|
|
// in case of bootstrapping of a fresh table, there won't be any partitions, but we need to make a boostrap commit
|
|
|
|
|
// in case of initializing of a fresh table, there won't be any partitions, but we need to make a boostrap commit
|
|
|
|
|
final HoodieData<HoodieRecord> allPartitionRecordsRDD = engineContext.parallelize(
|
|
|
|
|
Collections.singletonList(allPartitionRecord), 1);
|
|
|
|
|
partitionToRecordsMap.put(MetadataPartitionType.FILES, allPartitionRecordsRDD);
|
|
|
|
|
@@ -866,7 +873,7 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
HoodieData<HoodieRecord> partitionRecords = engineContext.parallelize(Arrays.asList(allPartitionRecord), 1);
|
|
|
|
|
HoodieData<HoodieRecord> filesPartitionRecords = engineContext.parallelize(Arrays.asList(allPartitionRecord), 1);
|
|
|
|
|
if (!partitionInfoList.isEmpty()) {
|
|
|
|
|
HoodieData<HoodieRecord> fileListRecords = engineContext.parallelize(partitionInfoList, partitionInfoList.size()).map(partitionInfo -> {
|
|
|
|
|
Map<String, Long> fileNameToSizeMap = partitionInfo.getFileNameToSizeMap();
|
|
|
|
|
@@ -878,29 +885,41 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
|
|
|
|
|
|
|
|
|
|
// Record which saves files within a partition
|
|
|
|
|
return HoodieMetadataPayload.createPartitionFilesRecord(
|
|
|
|
|
partitionInfo.getRelativePath().isEmpty() ? NON_PARTITIONED_NAME : partitionInfo.getRelativePath(), Option.of(validFileNameToSizeMap), Option.empty());
|
|
|
|
|
HoodieTableMetadataUtil.getPartition(partitionInfo.getRelativePath()), Option.of(validFileNameToSizeMap), Option.empty());
|
|
|
|
|
});
|
|
|
|
|
partitionRecords = partitionRecords.union(fileListRecords);
|
|
|
|
|
filesPartitionRecords = filesPartitionRecords.union(fileListRecords);
|
|
|
|
|
}
|
|
|
|
|
ValidationUtils.checkState(filesPartitionRecords.count() == (partitions.size() + 1));
|
|
|
|
|
partitionToRecordsMap.put(MetadataPartitionType.FILES, filesPartitionRecords);
|
|
|
|
|
|
|
|
|
|
if (enabledPartitionTypes.contains(MetadataPartitionType.BLOOM_FILTERS)) {
|
|
|
|
|
final HoodieData<HoodieRecord> recordsRDD = HoodieTableMetadataUtil.convertFilesToBloomFilterRecords(
|
|
|
|
|
engineContext, Collections.emptyMap(), partitionToFilesMap, getRecordsGenerationParams(), createInstantTime);
|
|
|
|
|
partitionToRecordsMap.put(MetadataPartitionType.BLOOM_FILTERS, recordsRDD);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (enabledPartitionTypes.contains(MetadataPartitionType.COLUMN_STATS)) {
|
|
|
|
|
final HoodieData<HoodieRecord> recordsRDD = HoodieTableMetadataUtil.convertFilesToColumnStatsRecords(
|
|
|
|
|
engineContext, Collections.emptyMap(), partitionToFilesMap, getRecordsGenerationParams());
|
|
|
|
|
partitionToRecordsMap.put(MetadataPartitionType.COLUMN_STATS, recordsRDD);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
LOG.info("Committing " + partitions.size() + " partitions and " + totalFiles + " files to metadata");
|
|
|
|
|
ValidationUtils.checkState(partitionRecords.count() == (partitions.size() + 1));
|
|
|
|
|
partitionToRecordsMap.put(MetadataPartitionType.FILES, partitionRecords);
|
|
|
|
|
commit(createInstantTime, partitionToRecordsMap, false);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* A class which represents a directory and the files and directories inside it.
|
|
|
|
|
*
|
|
|
|
|
* <p>
|
|
|
|
|
* A {@code PartitionFileInfo} object saves the name of the partition and various properties requires of each file
|
|
|
|
|
* required for bootstrapping the metadata table. Saving limited properties reduces the total memory footprint when
|
|
|
|
|
* a very large number of files are present in the dataset being bootstrapped.
|
|
|
|
|
* required for initializing the metadata table. Saving limited properties reduces the total memory footprint when
|
|
|
|
|
* a very large number of files are present in the dataset being initialized.
|
|
|
|
|
*/
|
|
|
|
|
static class DirectoryInfo implements Serializable {
|
|
|
|
|
// Relative path of the directory (relative to the base directory)
|
|
|
|
|
private final String relativePath;
|
|
|
|
|
// Map of filenames within this partition to their respective sizes
|
|
|
|
|
private HashMap<String, Long> filenameToSizeMap;
|
|
|
|
|
private final HashMap<String, Long> filenameToSizeMap;
|
|
|
|
|
// List of directories within this partition
|
|
|
|
|
private final List<Path> subDirectories = new ArrayList<>();
|
|
|
|
|
// Is this a hoodie partition
|
|
|
|
|
|