1
0

[HUDI-1717] Metadata Reader should merge all the un-synced but complete instants from the dataset timeline. (#3082)

This commit is contained in:
Prashant Wason
2021-06-22 08:52:18 -07:00
committed by GitHub
parent 062d5baf84
commit 11e64b2db0
6 changed files with 114 additions and 22 deletions

View File

@@ -400,7 +400,7 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
// (re) init the metadata for reading.
initTableMetadata();
try {
List<HoodieInstant> instantsToSync = metadata.findInstantsToSync();
List<HoodieInstant> instantsToSync = metadata.findInstantsToSyncForWriter();
if (instantsToSync.isEmpty()) {
return;
}
@@ -411,7 +411,7 @@ public abstract class HoodieBackedTableMetadataWriter implements HoodieTableMeta
for (HoodieInstant instant : instantsToSync) {
LOG.info("Syncing instant " + instant + " to metadata table");
Option<List<HoodieRecord>> records = HoodieTableMetadataUtil.convertInstantToMetaRecords(datasetMetaClient, instant, metadata.getSyncedInstantTime());
Option<List<HoodieRecord>> records = HoodieTableMetadataUtil.convertInstantToMetaRecords(datasetMetaClient, instant, getLatestSyncedInstantTime());
if (records.isPresent()) {
commit(records.get(), MetadataPartitionType.FILES.partitionPath(), instant.getTimestamp());
}

View File

@@ -23,6 +23,7 @@ import org.apache.hudi.avro.model.HoodieCleanerPlan;
import org.apache.hudi.avro.model.HoodieRestoreMetadata;
import org.apache.hudi.avro.model.HoodieRollbackMetadata;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.util.Option;
import java.io.Serializable;
@@ -40,4 +41,9 @@ public interface HoodieTableMetadataWriter extends Serializable, AutoCloseable {
void update(HoodieRestoreMetadata restoreMetadata, String instantTime);
void update(HoodieRollbackMetadata rollbackMetadata, String instantTime);
/**
* Return the timestamp of the latest instant synced to the metadata table.
*
* @return the timestamp of the latest synced instant, wrapped in an {@link Option}. NOTE(review): the Option is
*         presumably empty when no instant has been synced yet or the metadata table is disabled - confirm
*         against the implementations.
*/
Option<String> getLatestSyncedInstantTime();
}

View File

@@ -29,6 +29,8 @@ import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordLocation;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.view.TableFileSystemView;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ValidationUtils;
@@ -132,6 +134,23 @@ public class SparkHoodieBackedTableMetadataWriter extends HoodieBackedTableMetad
});
}
/**
 * Returns the timestamp of the most recent instant that has been synced.
 *
 * Syncing an instant of the dataset produces a corresponding delta-commit on the metadata table, so the
 * timestamp of the latest completed delta-commit is reported as the latest synced instant time.
 *
 * @return the timestamp of the latest completed delta-commit, or an empty Option when this writer is not
 *         enabled
 */
@Override
public Option<String> getLatestSyncedInstantTime() {
  if (enabled) {
    // Reload the active timeline first so the lookup does not use a previously loaded timeline.
    return metaClient.reloadActiveTimeline()
        .getDeltaCommitTimeline()
        .filterCompletedInstants()
        .lastInstant()
        .map(HoodieInstant::getTimestamp);
  }
  return Option.empty();
}
/**
* Tag each record with the location.
*

View File

@@ -491,6 +491,8 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
// Various table operations without metadata table enabled
String restoreToInstant;
String inflightActionTimestamp;
String beforeInflightActionTimestamp;
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, false))) {
// updates
newCommitTime = HoodieActiveTimeline.createNewInstantTime();
@@ -523,6 +525,10 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
assertTrue(metadata(client).isInSync());
}
// Record a timestamp for creating an inflight instant for sync testing
inflightActionTimestamp = HoodieActiveTimeline.createNewInstantTime();
beforeInflightActionTimestamp = newCommitTime;
// Deletes
newCommitTime = HoodieActiveTimeline.createNewInstantTime();
records = dataGen.generateDeletes(newCommitTime, 5);
@@ -554,9 +560,41 @@ public class TestHoodieBackedMetadata extends HoodieClientTestHarness {
assertTrue(metadata(client).isInSync());
}
// If there is an incomplete operation, the Metadata Table is not updated beyond that operation, but the
// in-memory merge should consider all the completed operations.
Path inflightCleanPath = new Path(metaClient.getMetaPath(), HoodieTimeline.makeInflightCleanerFileName(inflightActionTimestamp));
fs.create(inflightCleanPath).close();
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
// Restore cannot be done until the metadata table is in sync. See HUDI-1502 for details
client.syncTableMetadata();
// Table should sync only before the inflightActionTimestamp
HoodieBackedTableMetadataWriter writer =
(HoodieBackedTableMetadataWriter)SparkHoodieBackedTableMetadataWriter.create(hadoopConf, client.getConfig(), context);
assertEquals(writer.getLatestSyncedInstantTime().get(), beforeInflightActionTimestamp);
// Reader should sync to all the completed instants
HoodieTableMetadata metadata = HoodieTableMetadata.create(context, client.getConfig().getMetadataConfig(),
client.getConfig().getBasePath(), FileSystemViewStorageConfig.DEFAULT_VIEW_SPILLABLE_DIR);
assertEquals(metadata.getSyncedInstantTime().get(), newCommitTime);
// Remove the inflight instant holding back table sync
fs.delete(inflightCleanPath, false);
client.syncTableMetadata();
writer =
(HoodieBackedTableMetadataWriter)SparkHoodieBackedTableMetadataWriter.create(hadoopConf, client.getConfig(), context);
assertEquals(writer.getLatestSyncedInstantTime().get(), newCommitTime);
// Reader should sync to all the completed instants
metadata = HoodieTableMetadata.create(context, client.getConfig().getMetadataConfig(),
client.getConfig().getBasePath(), FileSystemViewStorageConfig.DEFAULT_VIEW_SPILLABLE_DIR);
assertEquals(metadata.getSyncedInstantTime().get(), newCommitTime);
}
// Enable metadata table and ensure it is synced
try (SparkRDDWriteClient client = new SparkRDDWriteClient(engineContext, getWriteConfig(true, true))) {
client.restoreToInstant(restoreToInstant);
assertFalse(metadata(client).isInSync());