[HUDI-1717] Metadata Reader should merge all the un-synced but complete instants from the dataset timeline. (#3082)
This commit is contained in:
@@ -400,7 +400,7 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
|
||||
// (re) init the metadata for reading.
|
||||
initTableMetadata();
|
||||
try {
|
||||
List<HoodieInstant> instantsToSync = metadata.findInstantsToSync();
|
||||
List<HoodieInstant> instantsToSync = metadata.findInstantsToSyncForWriter();
|
||||
if (instantsToSync.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
@@ -411,7 +411,7 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
|
||||
for (HoodieInstant instant : instantsToSync) {
|
||||
LOG.info("Syncing instant " + instant + " to metadata table");
|
||||
|
||||
Option<List<HoodieRecord>> records = HoodieTableMetadataUtil.convertInstantToMetaRecords(datasetMetaClient, instant, metadata.getSyncedInstantTime());
|
||||
Option<List<HoodieRecord>> records = HoodieTableMetadataUtil.convertInstantToMetaRecords(datasetMetaClient, instant, getLatestSyncedInstantTime());
|
||||
if (records.isPresent()) {
|
||||
commit(records.get(), MetadataPartitionType.FILES.partitionPath(), instant.getTimestamp());
|
||||
}
|
||||
|
||||
@@ -23,6 +23,7 @@ import org.apache.hudi.avro.model.HoodieCleanerPlan;
|
||||
import org.apache.hudi.avro.model.HoodieRestoreMetadata;
|
||||
import org.apache.hudi.avro.model.HoodieRollbackMetadata;
|
||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
|
||||
import java.io.Serializable;
|
||||
|
||||
@@ -40,4 +41,9 @@ public interface HoodieTableMetadataWriter extends Serializable, AutoCloseable {
|
||||
void update(HoodieRestoreMetadata restoreMetadata, String instantTime);
|
||||
|
||||
void update(HoodieRollbackMetadata rollbackMetadata, String instantTime);
|
||||
|
||||
/**
|
||||
* Return the timestamp of the latest instant synced to the metadata table.
|
||||
*/
|
||||
Option<String> getLatestSyncedInstantTime();
|
||||
}
|
||||
|
||||
@@ -29,6 +29,8 @@ import org.apache.hudi.common.model.HoodieLogFile;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordLocation;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.table.view.TableFileSystemView;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.ValidationUtils;
|
||||
@@ -132,6 +134,23 @@ public class SparkHoodieBackedTableMetadataWriter extends HoodieBackedTableMetad
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the timestamp of the latest instant synced.
|
||||
*
|
||||
* To sync a instant on dataset, we create a corresponding delta-commit on the metadata table. So return the latest
|
||||
* delta-commit.
|
||||
*/
|
||||
@Override
|
||||
public Option<String> getLatestSyncedInstantTime() {
|
||||
if (!enabled) {
|
||||
return Option.empty();
|
||||
}
|
||||
|
||||
HoodieActiveTimeline timeline = metaClient.reloadActiveTimeline();
|
||||
return timeline.getDeltaCommitTimeline().filterCompletedInstants()
|
||||
.lastInstant().map(HoodieInstant::getTimestamp);
|
||||
}
|
||||
|
||||
/**
|
||||
* Tag each record with the location.
|
||||
*
|
||||
|
||||
@@ -491,6 +491,8 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
|
||||
// Various table operations without metadata table enabled
|
||||
String restoreToInstant;
|
||||
String inflightActionTimestamp;
|
||||
String beforeInflightActionTimestamp;
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false))) {
|
||||
// updates
|
||||
newCommitTime = HoodieActiveTimeline.createNewInstantTime();
|
||||
@@ -523,6 +525,10 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
assertTrue(metadata(client).isInSync());
|
||||
}
|
||||
|
||||
// Record a timestamp for creating an inflight instance for sync testing
|
||||
inflightActionTimestamp = HoodieActiveTimeline.createNewInstantTime();
|
||||
beforeInflightActionTimestamp = newCommitTime;
|
||||
|
||||
// Deletes
|
||||
newCommitTime = HoodieActiveTimeline.createNewInstantTime();
|
||||
records = dataGen.generateDeletes(newCommitTime, 5);
|
||||
@@ -554,9 +560,41 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
|
||||
assertTrue(metadata(client).isInSync());
|
||||
}
|
||||
|
||||
// If there is an incomplete operation, the Metadata Table is not updated beyond that operations but the
|
||||
// in-memory merge should consider all the completed operations.
|
||||
Path inflightCleanPath = new Path(metaClient.getMetaPath(), HoodieTimeline.makeInflightCleanerFileName(inflightActionTimestamp));
|
||||
fs.create(inflightCleanPath).close();
|
||||
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
|
||||
// Restore cannot be done until the metadata table is in sync. See HUDI-1502 for details
|
||||
client.syncTableMetadata();
|
||||
|
||||
// Table should sync only before the inflightActionTimestamp
|
||||
HoodieBackedTableMetadataWriter writer =
|
||||
(HoodieBackedTableMetadataWriter)SparkHoodieBackedTableMetadataWriter.create(hadoopConf, client.getConfig(), context);
|
||||
assertEquals(writer.getLatestSyncedInstantTime().get(), beforeInflightActionTimestamp);
|
||||
|
||||
// Reader should sync to all the completed instants
|
||||
HoodieTableMetadata metadata = HoodieTableMetadata.create(context, client.getConfig().getMetadataConfig(),
|
||||
client.getConfig().getBasePath(), FileSystemViewStorageConfig.DEFAULT_VIEW_SPILLABLE_DIR);
|
||||
assertEquals(metadata.getSyncedInstantTime().get(), newCommitTime);
|
||||
|
||||
// Remove the inflight instance holding back table sync
|
||||
fs.delete(inflightCleanPath, false);
|
||||
client.syncTableMetadata();
|
||||
|
||||
writer =
|
||||
(HoodieBackedTableMetadataWriter)SparkHoodieBackedTableMetadataWriter.create(hadoopConf, client.getConfig(), context);
|
||||
assertEquals(writer.getLatestSyncedInstantTime().get(), newCommitTime);
|
||||
|
||||
// Reader should sync to all the completed instants
|
||||
metadata = HoodieTableMetadata.create(context, client.getConfig().getMetadataConfig(),
|
||||
client.getConfig().getBasePath(), FileSystemViewStorageConfig.DEFAULT_VIEW_SPILLABLE_DIR);
|
||||
assertEquals(metadata.getSyncedInstantTime().get(), newCommitTime);
|
||||
}
|
||||
|
||||
// Enable metadata table and ensure it is synced
|
||||
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
|
||||
client.restoreToInstant(restoreToInstant);
|
||||
assertFalse(metadata(client).isInSync());
|
||||
|
||||
|
||||
Reference in New Issue
Block a user