diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestArchivedCommitsCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestArchivedCommitsCommand.java
index ef303adbb..313c1bcd3 100644
--- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestArchivedCommitsCommand.java
+++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestArchivedCommitsCommand.java
@@ -91,8 +91,8 @@ public class TestArchivedCommitsCommand extends AbstractShellIntegrationTest {
         metaClient.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants();
 
     // archive
-    HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, metaClient);
-    archiveLog.archiveIfRequired(hadoopConf);
+    HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, hadoopConf);
+    archiveLog.archiveIfRequired();
   }
 
   @AfterEach
diff --git a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestCommitsCommand.java b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestCommitsCommand.java
index 8bacf4fe0..476125211 100644
--- a/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestCommitsCommand.java
+++ b/hudi-cli/src/test/java/org/apache/hudi/cli/commands/TestCommitsCommand.java
@@ -175,8 +175,8 @@ public class TestCommitsCommand extends AbstractShellIntegrationTest {
     // archive
     metaClient = HoodieTableMetaClient.reload(HoodieCLI.getTableMetaClient());
-    HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, metaClient);
-    archiveLog.archiveIfRequired(jsc.hadoopConfiguration());
+    HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, jsc.hadoopConfiguration());
+    archiveLog.archiveIfRequired();
 
     CommandResult cr = getShell().executeCommand(String.format("commits showarchived --startTs %s --endTs %s", "100", "104"));
     assertTrue(cr.isSuccess());
diff --git a/hudi-client/src/main/java/org/apache/hudi/client/AbstractHoodieWriteClient.java b/hudi-client/src/main/java/org/apache/hudi/client/AbstractHoodieWriteClient.java
index 096bc2e33..b922caa9d 100644
--- a/hudi-client/src/main/java/org/apache/hudi/client/AbstractHoodieWriteClient.java
+++ b/hudi-client/src/main/java/org/apache/hudi/client/AbstractHoodieWriteClient.java
@@ -117,7 +117,7 @@ public abstract class AbstractHoodieWriteClient<T extends HoodieRecordPayload> extends AbstractHoodieClient {
     try {
       activeTimeline.saveAsComplete(new HoodieInstant(true, actionType, instantTime),
           Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
-      postCommit(metadata, instantTime, extraMetadata);
+      postCommit(table, metadata, instantTime, extraMetadata);
       emitCommitMetrics(instantTime, metadata, actionType);
       LOG.info("Committed " + instantTime);
     } catch (IOException e) {
@@ -144,11 +144,13 @@ public abstract class AbstractHoodieWriteClient<T extends HoodieRecordPayload> extends AbstractHoodieClient {
   /**
    * Post Commit Hook. Derived classes use this method to perform post-commit processing
+   *
+   * @param table table to commit on
    * @param metadata Commit Metadata corresponding to committed instant
    * @param instantTime Instant Time
    * @param extraMetadata Additional Metadata passed by user
    */
-  protected abstract void postCommit(HoodieCommitMetadata metadata, String instantTime, Option<Map<String, String>> extraMetadata);
+  protected abstract void postCommit(HoodieTable table, HoodieCommitMetadata metadata, String instantTime, Option<Map<String, String>> extraMetadata);
 
   /**
    * Finalize Write operation.
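For orientation, and not part of the patch: postCommit now receives the already-constructed HoodieTable, so hooks no longer need to re-create a meta client per commit. A minimal sketch of what an override of the new hook can look like, mirroring the HoodieWriteClient change below (generics and imports elided; config and hadoopConf are inherited client fields):

@Override
protected void postCommit(HoodieTable table, HoodieCommitMetadata metadata, String instantTime,
    Option<Map<String, String>> extraMetadata) {
  try {
    // Markers are only needed while the instant is in flight; drop them once committed.
    new MarkerFiles(table, instantTime).quietDeleteMarkerDir();
    // Bound the active timeline, as HoodieWriteClient.postCommit does.
    new HoodieTimelineArchiveLog(config, hadoopConf).archiveIfRequired();
  } catch (IOException ioe) {
    throw new HoodieIOException(ioe.getMessage(), ioe);
  }
}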
diff --git a/hudi-client/src/main/java/org/apache/hudi/client/HoodieWriteClient.java b/hudi-client/src/main/java/org/apache/hudi/client/HoodieWriteClient.java index a6d9d0da9..30dfecbe1 100644 --- a/hudi-client/src/main/java/org/apache/hudi/client/HoodieWriteClient.java +++ b/hudi-client/src/main/java/org/apache/hudi/client/HoodieWriteClient.java @@ -47,6 +47,7 @@ import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.metrics.HoodieMetrics; import org.apache.hudi.table.HoodieTable; import org.apache.hudi.table.HoodieTimelineArchiveLog; +import org.apache.hudi.table.MarkerFiles; import org.apache.hudi.table.UserDefinedBulkInsertPartitioner; import org.apache.hudi.table.action.HoodieWriteMetadata; import org.apache.hudi.table.action.compact.CompactHelpers; @@ -323,7 +324,7 @@ public class HoodieWriteClient extends AbstractHo result.getWriteStats().get().size()); } - postCommit(result.getCommitMetadata().get(), instantTime, Option.empty()); + postCommit(hoodieTable, result.getCommitMetadata().get(), instantTime, Option.empty()); emitCommitMetrics(instantTime, result.getCommitMetadata().get(), hoodieTable.getMetaClient().getCommitActionType()); @@ -332,9 +333,12 @@ public class HoodieWriteClient extends AbstractHo } @Override - protected void postCommit(HoodieCommitMetadata metadata, String instantTime, - Option> extraMetadata) { + protected void postCommit(HoodieTable table, HoodieCommitMetadata metadata, String instantTime, Option> extraMetadata) { try { + + // Delete the marker directory for the instant. + new MarkerFiles(table, instantTime).quietDeleteMarkerDir(); + // Do an inline compaction if enabled if (config.isInlineCompaction()) { metadata.addMetadata(HoodieCompactionConfig.INLINE_COMPACT_PROP, "true"); @@ -343,8 +347,8 @@ public class HoodieWriteClient extends AbstractHo metadata.addMetadata(HoodieCompactionConfig.INLINE_COMPACT_PROP, "false"); } // We cannot have unbounded commit files. 
Archive commits if we have to archive - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(config, createMetaClient(true)); - archiveLog.archiveIfRequired(hadoopConf); + HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(config, hadoopConf); + archiveLog.archiveIfRequired(); autoCleanOnCommit(instantTime); } catch (IOException ioe) { throw new HoodieIOException(ioe.getMessage(), ioe); diff --git a/hudi-client/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java b/hudi-client/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java index 3b822f0a1..fb679f2ff 100644 --- a/hudi-client/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java +++ b/hudi-client/src/main/java/org/apache/hudi/config/HoodieWriteConfig.java @@ -54,6 +54,8 @@ import java.util.stream.Collectors; public class HoodieWriteConfig extends DefaultHoodieConfig { public static final String TABLE_NAME = "hoodie.table.name"; + public static final String DEFAULT_ROLLBACK_USING_MARKERS = "false"; + public static final String ROLLBACK_USING_MARKERS = "hoodie.rollback.using.markers"; public static final String TIMELINE_LAYOUT_VERSION = "hoodie.timeline.layout.version"; public static final String BASE_PATH_PROP = "hoodie.base.path"; public static final String AVRO_SCHEMA = "hoodie.avro.schema"; @@ -197,6 +199,10 @@ public class HoodieWriteConfig extends DefaultHoodieConfig { return Integer.parseInt(props.getProperty(ROLLBACK_PARALLELISM)); } + public boolean shouldRollbackUsingMarkers() { + return Boolean.parseBoolean(props.getProperty(ROLLBACK_USING_MARKERS)); + } + public int getWriteBufferLimitBytes() { return Integer.parseInt(props.getProperty(WRITE_BUFFER_LIMIT_BYTES, DEFAULT_WRITE_BUFFER_LIMIT_BYTES)); } @@ -710,6 +716,11 @@ public class HoodieWriteConfig extends DefaultHoodieConfig { return this; } + public Builder withRollbackUsingMarkers(boolean rollbackUsingMarkers) { + props.setProperty(ROLLBACK_USING_MARKERS, String.valueOf(rollbackUsingMarkers)); + return this; + } + public Builder withWriteBufferLimitBytes(int writeBufferLimit) { props.setProperty(WRITE_BUFFER_LIMIT_BYTES, String.valueOf(writeBufferLimit)); return this; @@ -807,6 +818,8 @@ public class HoodieWriteConfig extends DefaultHoodieConfig { setDefaultOnCondition(props, !props.containsKey(DELETE_PARALLELISM), DELETE_PARALLELISM, DEFAULT_PARALLELISM); setDefaultOnCondition(props, !props.containsKey(ROLLBACK_PARALLELISM), ROLLBACK_PARALLELISM, DEFAULT_ROLLBACK_PARALLELISM); + setDefaultOnCondition(props, !props.containsKey(ROLLBACK_USING_MARKERS), ROLLBACK_USING_MARKERS, + DEFAULT_ROLLBACK_USING_MARKERS); setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_INSERT_PROP), COMBINE_BEFORE_INSERT_PROP, DEFAULT_COMBINE_BEFORE_INSERT); setDefaultOnCondition(props, !props.containsKey(COMBINE_BEFORE_UPSERT_PROP), COMBINE_BEFORE_UPSERT_PROP, diff --git a/hudi-client/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java b/hudi-client/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java index 12e3a0778..e81c4add7 100644 --- a/hudi-client/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java +++ b/hudi-client/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java @@ -119,10 +119,11 @@ public class HoodieAppendHandle extends HoodieWri SliceView rtView = hoodieTable.getSliceView(); Option fileSlice = rtView.getLatestFileSlice(partitionPath, fileId); // Set the base commit time as the current instantTime for new inserts into log files - String baseInstantTime = instantTime; + String baseInstantTime; if 
(fileSlice.isPresent()) { baseInstantTime = fileSlice.get().getBaseInstantTime(); } else { + baseInstantTime = instantTime; // This means there is no base data file, start appending to a new log file fileSlice = Option.of(new FileSlice(partitionPath, baseInstantTime, this.fileId)); LOG.info("New InsertHandle for partition :" + partitionPath); @@ -138,6 +139,12 @@ public class HoodieAppendHandle extends HoodieWri HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(fs, baseInstantTime, new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath)); partitionMetadata.trySave(getPartitionId()); + + // Since the actual log file written to can be different based on when rollover happens, we use the + // base file to denote some log appends happened on a slice. writeToken will still fence concurrent + // writers. + createMarkerFile(partitionPath, FSUtils.makeDataFileName(baseInstantTime, writeToken, fileId, hoodieTable.getBaseFileExtension())); + this.writer = createLogWriter(fileSlice, baseInstantTime); this.currentLogFile = writer.getLogFile(); ((HoodieDeltaWriteStat) writeStatus.getStat()).setLogVersion(currentLogFile.getLogVersion()); @@ -278,6 +285,11 @@ public class HoodieAppendHandle extends HoodieWri return writeStatus; } + @Override + public IOType getIOType() { + return IOType.APPEND; + } + private Writer createLogWriter(Option fileSlice, String baseCommitTime) throws IOException, InterruptedException { Option latestLogFile = fileSlice.get().getLatestLogFile(); @@ -288,7 +300,8 @@ public class HoodieAppendHandle extends HoodieWri .withLogVersion(latestLogFile.map(HoodieLogFile::getLogVersion).orElse(HoodieLogFile.LOGFILE_BASE_VERSION)) .withSizeThreshold(config.getLogFileMaxSize()).withFs(fs) .withLogWriteToken(latestLogFile.map(x -> FSUtils.getWriteTokenFromLogPath(x.getPath())).orElse(writeToken)) - .withRolloverLogWriteToken(writeToken).withFileExtension(HoodieLogFile.DELTA_EXTENSION).build(); + .withRolloverLogWriteToken(writeToken) + .withFileExtension(HoodieLogFile.DELTA_EXTENSION).build(); } private void writeToBuffer(HoodieRecord record) { @@ -327,5 +340,4 @@ public class HoodieAppendHandle extends HoodieWri numberOfRecords = 0; } } - } diff --git a/hudi-client/src/main/java/org/apache/hudi/io/HoodieCreateHandle.java b/hudi-client/src/main/java/org/apache/hudi/io/HoodieCreateHandle.java index c437f5422..dfa63b079 100644 --- a/hudi-client/src/main/java/org/apache/hudi/io/HoodieCreateHandle.java +++ b/hudi-client/src/main/java/org/apache/hudi/io/HoodieCreateHandle.java @@ -31,6 +31,7 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieInsertException; import org.apache.hudi.io.storage.HoodieFileWriter; +import org.apache.hudi.io.storage.HoodieFileWriterFactory; import org.apache.hudi.table.HoodieTable; import org.apache.avro.generic.GenericRecord; @@ -66,9 +67,9 @@ public class HoodieCreateHandle extends HoodieWri HoodiePartitionMetadata partitionMetadata = new HoodiePartitionMetadata(fs, instantTime, new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath)); partitionMetadata.trySave(getPartitionId()); - createMarkerFile(partitionPath); - this.fileWriter = createNewFileWriter(instantTime, path, hoodieTable, config, writerSchema, - this.sparkTaskContextSupplier); + createMarkerFile(partitionPath, FSUtils.makeDataFileName(this.instantTime, this.writeToken, this.fileId, hoodieTable.getBaseFileExtension())); + 
this.fileWriter = + HoodieFileWriterFactory.getFileWriter(instantTime, path, hoodieTable, config, writerSchema, this.sparkTaskContextSupplier); } catch (IOException e) { throw new HoodieInsertException("Failed to initialize HoodieStorageWriter for path " + path, e); } @@ -146,6 +147,11 @@ public class HoodieCreateHandle extends HoodieWri return writeStatus; } + @Override + public IOType getIOType() { + return IOType.CREATE; + } + /** * Performs actions to durably, persist the current changes and returns a WriteStatus object. */ diff --git a/hudi-client/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java b/hudi-client/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java index 0eaf3f2d9..e87cf3c01 100644 --- a/hudi-client/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java +++ b/hudi-client/src/main/java/org/apache/hudi/io/HoodieMergeHandle.java @@ -18,7 +18,6 @@ package org.apache.hudi.io; -import org.apache.hudi.avro.HoodieAvroUtils; import org.apache.hudi.client.SparkTaskContextSupplier; import org.apache.hudi.client.WriteStatus; import org.apache.hudi.client.utils.SparkConfigUtils; @@ -88,10 +87,6 @@ public class HoodieMergeHandle extends HoodieWrit init(fileId, this.partitionPath, dataFileToBeMerged); } - public static Schema createHoodieWriteSchema(Schema originalSchema) { - return HoodieAvroUtils.addMetadataFields(originalSchema); - } - @Override public Schema getWriterSchema() { return writerSchema; @@ -113,8 +108,9 @@ public class HoodieMergeHandle extends HoodieWrit partitionMetadata.trySave(getPartitionId()); oldFilePath = new Path(config.getBasePath() + "/" + partitionPath + "/" + latestValidFilePath); + String newFileName = FSUtils.makeDataFileName(instantTime, writeToken, fileId, hoodieTable.getBaseFileExtension()); String relativePath = new Path((partitionPath.isEmpty() ? 
"" : partitionPath + "/") - + FSUtils.makeDataFileName(instantTime, writeToken, fileId, hoodieTable.getBaseFileExtension())).toString(); + + newFileName).toString(); newFilePath = new Path(config.getBasePath(), relativePath); LOG.info(String.format("Merging new data into oldPath %s, as newPath %s", oldFilePath.toString(), @@ -127,7 +123,7 @@ public class HoodieMergeHandle extends HoodieWrit writeStatus.getStat().setPath(new Path(config.getBasePath()), newFilePath); // Create Marker file - createMarkerFile(partitionPath); + createMarkerFile(partitionPath, newFileName); // Create the writer for writing the new version file fileWriter = createNewFileWriter(instantTime, newFilePath, hoodieTable, config, writerSchema, sparkTaskContextSupplier); @@ -311,4 +307,9 @@ public class HoodieMergeHandle extends HoodieWrit public WriteStatus getWriteStatus() { return writeStatus; } + + @Override + public IOType getIOType() { + return IOType.MERGE; + } } diff --git a/hudi-client/src/main/java/org/apache/hudi/io/HoodieWriteHandle.java b/hudi-client/src/main/java/org/apache/hudi/io/HoodieWriteHandle.java index 4470b76f1..7fd3b42ae 100644 --- a/hudi-client/src/main/java/org/apache/hudi/io/HoodieWriteHandle.java +++ b/hudi-client/src/main/java/org/apache/hudi/io/HoodieWriteHandle.java @@ -28,7 +28,6 @@ import org.apache.hudi.common.util.HoodieTimer; import org.apache.hudi.common.util.Option; import org.apache.hudi.common.util.ReflectionUtils; import org.apache.hudi.config.HoodieWriteConfig; -import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.io.storage.HoodieFileWriter; import org.apache.hudi.io.storage.HoodieFileWriterFactory; @@ -39,6 +38,7 @@ import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; +import org.apache.hudi.table.MarkerFiles; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -50,6 +50,7 @@ import java.io.IOException; public abstract class HoodieWriteHandle extends HoodieIOHandle { private static final Logger LOG = LogManager.getLogger(HoodieWriteHandle.class); + protected final Schema originalSchema; protected final Schema writerSchema; protected HoodieTimer timer; @@ -97,28 +98,9 @@ public abstract class HoodieWriteHandle extends H * * @param partitionPath Partition path */ - protected void createMarkerFile(String partitionPath) { - Path markerPath = makeNewMarkerPath(partitionPath); - try { - LOG.info("Creating Marker Path=" + markerPath); - fs.create(markerPath, false).close(); - } catch (IOException e) { - throw new HoodieException("Failed to create marker file " + markerPath, e); - } - } - - /** - * THe marker path will be /.hoodie/.temp//2019/04/25/filename. - */ - private Path makeNewMarkerPath(String partitionPath) { - Path markerRootPath = new Path(hoodieTable.getMetaClient().getMarkerFolderPath(instantTime)); - Path path = FSUtils.getPartitionPath(markerRootPath, partitionPath); - try { - fs.mkdirs(path); // create a new partition as needed. 
- } catch (IOException e) { - throw new HoodieIOException("Failed to make dir " + path, e); - } - return new Path(path.toString(), FSUtils.makeMarkerFile(instantTime, writeToken, fileId)); + protected void createMarkerFile(String partitionPath, String dataFileName) { + MarkerFiles markerFiles = new MarkerFiles(hoodieTable, instantTime); + markerFiles.create(partitionPath, dataFileName, getIOType()); } public Schema getWriterSchema() { @@ -167,6 +149,8 @@ public abstract class HoodieWriteHandle extends H public abstract WriteStatus getWriteStatus(); + public abstract IOType getIOType(); + @Override protected FileSystem getFileSystem() { return hoodieTable.getMetaClient().getFs(); diff --git a/hudi-client/src/main/java/org/apache/hudi/io/IOType.java b/hudi-client/src/main/java/org/apache/hudi/io/IOType.java new file mode 100644 index 000000000..aa6660e72 --- /dev/null +++ b/hudi-client/src/main/java/org/apache/hudi/io/IOType.java @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.io; + +/** + * Types of lower level I/O operations done on each file slice. + */ +public enum IOType { + MERGE, + CREATE, + APPEND +} diff --git a/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java b/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java index b545eae3e..14dd16870 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/HoodieTable.java @@ -32,7 +32,6 @@ import org.apache.hudi.client.SparkTaskContextSupplier; import org.apache.hudi.common.config.SerializableConfiguration; import org.apache.hudi.common.fs.ConsistencyGuard; import org.apache.hudi.common.fs.ConsistencyGuard.FileVisibility; -import org.apache.hudi.common.fs.FSUtils; import org.apache.hudi.common.fs.FailSafeConsistencyGuard; import org.apache.hudi.common.model.HoodieFileFormat; import org.apache.hudi.common.model.HoodieKey; @@ -378,26 +377,29 @@ public abstract class HoodieTable implements Seri * @throws HoodieIOException if some paths can't be finalized on storage */ public void finalizeWrite(JavaSparkContext jsc, String instantTs, List stats) throws HoodieIOException { - cleanFailedWrites(jsc, instantTs, stats, config.getConsistencyGuardConfig().isConsistencyCheckEnabled()); + reconcileAgainstMarkers(jsc, instantTs, stats, config.getConsistencyGuardConfig().isConsistencyCheckEnabled()); } - /** - * Delete Marker directory corresponding to an instant. 
- * - * @param instantTs Instant Time - */ - public void deleteMarkerDir(String instantTs) { - try { - FileSystem fs = getMetaClient().getFs(); - Path markerDir = new Path(metaClient.getMarkerFolderPath(instantTs)); - if (fs.exists(markerDir)) { - // For append only case, we do not write to marker dir. Hence, the above check - LOG.info("Removing marker directory=" + markerDir); - fs.delete(markerDir, true); - } - } catch (IOException ioe) { - throw new HoodieIOException(ioe.getMessage(), ioe); - } + private void deleteInvalidFilesByPartitions(JavaSparkContext jsc, Map>> invalidFilesByPartition) { + // Now delete partially written files + jsc.parallelize(new ArrayList<>(invalidFilesByPartition.values()), config.getFinalizeWriteParallelism()) + .map(partitionWithFileList -> { + final FileSystem fileSystem = metaClient.getFs(); + LOG.info("Deleting invalid data files=" + partitionWithFileList); + if (partitionWithFileList.isEmpty()) { + return true; + } + // Delete + partitionWithFileList.stream().map(Pair::getValue).forEach(file -> { + try { + fileSystem.delete(new Path(file), false); + } catch (IOException e) { + throw new HoodieIOException(e.getMessage(), e); + } + }); + + return true; + }).collect(); } /** @@ -410,72 +412,54 @@ public abstract class HoodieTable implements Seri * @param consistencyCheckEnabled Consistency Check Enabled * @throws HoodieIOException */ - protected void cleanFailedWrites(JavaSparkContext jsc, String instantTs, List stats, - boolean consistencyCheckEnabled) throws HoodieIOException { + protected void reconcileAgainstMarkers(JavaSparkContext jsc, + String instantTs, + List stats, + boolean consistencyCheckEnabled) throws HoodieIOException { try { // Reconcile marker and data files with WriteStats so that partially written data-files due to failed // (but succeeded on retry) tasks are removed. String basePath = getMetaClient().getBasePath(); - FileSystem fs = getMetaClient().getFs(); - Path markerDir = new Path(metaClient.getMarkerFolderPath(instantTs)); + MarkerFiles markers = new MarkerFiles(this, instantTs); - if (!fs.exists(markerDir)) { - // Happens when all writes are appends + if (!markers.doesMarkerDirExist()) { + // can happen if it was an empty write say. return; } - final String baseFileExtension = getBaseFileFormat().getFileExtension(); - List invalidDataPaths = FSUtils.getAllDataFilesForMarkers(fs, basePath, instantTs, markerDir.toString(), - baseFileExtension); - List validDataPaths = stats.stream().map(w -> String.format("%s/%s", basePath, w.getPath())) - .filter(p -> p.endsWith(baseFileExtension)).collect(Collectors.toList()); + // we are not including log appends here, since they are already fail-safe. + List invalidDataPaths = markers.createdAndMergedDataPaths(); + List validDataPaths = stats.stream() + .map(HoodieWriteStat::getPath) + .filter(p -> p.endsWith(this.getBaseFileExtension())) + .collect(Collectors.toList()); // Contains list of partially created files. These needs to be cleaned up. invalidDataPaths.removeAll(validDataPaths); if (!invalidDataPaths.isEmpty()) { - LOG.info( - "Removing duplicate data files created due to spark retries before committing. Paths=" + invalidDataPaths); + LOG.info("Removing duplicate data files created due to spark retries before committing. 
Paths=" + invalidDataPaths); } + Map>> invalidPathsByPartition = invalidDataPaths.stream() + .map(dp -> Pair.of(new Path(dp).getParent().toString(), new Path(basePath, dp).toString())) + .collect(Collectors.groupingBy(Pair::getKey)); - Map>> groupByPartition = invalidDataPaths.stream() - .map(dp -> Pair.of(new Path(dp).getParent().toString(), dp)).collect(Collectors.groupingBy(Pair::getKey)); - - if (!groupByPartition.isEmpty()) { + if (!invalidPathsByPartition.isEmpty()) { // Ensure all files in delete list is actually present. This is mandatory for an eventually consistent FS. // Otherwise, we may miss deleting such files. If files are not found even after retries, fail the commit if (consistencyCheckEnabled) { // This will either ensure all files to be deleted are present. - waitForAllFiles(jsc, groupByPartition, FileVisibility.APPEAR); + waitForAllFiles(jsc, invalidPathsByPartition, FileVisibility.APPEAR); } // Now delete partially written files jsc.setJobGroup(this.getClass().getSimpleName(), "Delete all partially written files"); - jsc.parallelize(new ArrayList<>(groupByPartition.values()), config.getFinalizeWriteParallelism()) - .map(partitionWithFileList -> { - final FileSystem fileSystem = metaClient.getFs(); - LOG.info("Deleting invalid data files=" + partitionWithFileList); - if (partitionWithFileList.isEmpty()) { - return true; - } - // Delete - partitionWithFileList.stream().map(Pair::getValue).forEach(file -> { - try { - fileSystem.delete(new Path(file), false); - } catch (IOException e) { - throw new HoodieIOException(e.getMessage(), e); - } - }); - - return true; - }).collect(); + deleteInvalidFilesByPartitions(jsc, invalidPathsByPartition); // Now ensure the deleted files disappear if (consistencyCheckEnabled) { // This will either ensure all files to be deleted are absent. 
- waitForAllFiles(jsc, groupByPartition, FileVisibility.DISAPPEAR); + waitForAllFiles(jsc, invalidPathsByPartition, FileVisibility.DISAPPEAR); } } - // Now delete the marker directory - deleteMarkerDir(instantTs); } catch (IOException ioe) { throw new HoodieIOException(ioe.getMessage(), ioe); } diff --git a/hudi-client/src/main/java/org/apache/hudi/table/HoodieTimelineArchiveLog.java b/hudi-client/src/main/java/org/apache/hudi/table/HoodieTimelineArchiveLog.java index c94f30b3d..98d3e05fc 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/HoodieTimelineArchiveLog.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/HoodieTimelineArchiveLog.java @@ -19,6 +19,12 @@ package org.apache.hudi.table; import org.apache.hadoop.conf.Configuration; +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.avro.Schema; +import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.fs.Path; + import org.apache.hudi.avro.model.HoodieArchivedMetaEntry; import org.apache.hudi.avro.model.HoodieCompactionPlan; import org.apache.hudi.avro.model.HoodieRollbackMetadata; @@ -46,12 +52,6 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieCommitException; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; - -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.ObjectMapper; -import org.apache.avro.Schema; -import org.apache.avro.generic.IndexedRecord; -import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -67,6 +67,9 @@ import java.util.Map; import java.util.stream.Collectors; import java.util.stream.Stream; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.GREATER_THAN; +import static org.apache.hudi.common.table.timeline.HoodieTimeline.LESSER_THAN_OR_EQUALS; + /** * Archiver to bound the growth of files under .hoodie meta path. */ @@ -75,14 +78,20 @@ public class HoodieTimelineArchiveLog { private static final Logger LOG = LogManager.getLogger(HoodieTimelineArchiveLog.class); private final Path archiveFilePath; - private final HoodieTableMetaClient metaClient; private final HoodieWriteConfig config; private Writer writer; + private final int maxInstantsToKeep; + private final int minInstantsToKeep; + private final HoodieTable table; + private final HoodieTableMetaClient metaClient; - public HoodieTimelineArchiveLog(HoodieWriteConfig config, HoodieTableMetaClient metaClient) { + public HoodieTimelineArchiveLog(HoodieWriteConfig config, Configuration configuration) { this.config = config; - this.metaClient = metaClient; + this.table = HoodieTable.create(config, configuration); + this.metaClient = table.getMetaClient(); this.archiveFilePath = HoodieArchivedTimeline.getArchiveLogPath(metaClient.getArchivePath()); + this.maxInstantsToKeep = config.getMaxCommitsToKeep(); + this.minInstantsToKeep = config.getMinCommitsToKeep(); } private Writer openWriter() { @@ -112,9 +121,9 @@ public class HoodieTimelineArchiveLog { /** * Check if commits need to be archived. If yes, archive commits. 
*/ - public boolean archiveIfRequired(final Configuration hadoopConf) throws IOException { + public boolean archiveIfRequired() throws IOException { try { - List instantsToArchive = getInstantsToArchive(hadoopConf).collect(Collectors.toList()); + List instantsToArchive = getInstantsToArchive().collect(Collectors.toList()); boolean success = true; if (!instantsToArchive.isEmpty()) { @@ -133,28 +142,21 @@ public class HoodieTimelineArchiveLog { } } - private Stream getInstantsToArchive(Configuration hadoopConf) { - - // TODO : rename to max/minInstantsToKeep - int maxCommitsToKeep = config.getMaxCommitsToKeep(); - int minCommitsToKeep = config.getMinCommitsToKeep(); - - HoodieTable table = HoodieTable.create(metaClient, config, hadoopConf); - - // GroupBy each action and limit each action timeline to maxCommitsToKeep - // TODO: Handle ROLLBACK_ACTION in future - // ROLLBACK_ACTION is currently not defined in HoodieActiveTimeline + private Stream getCleanInstantsToArchive() { HoodieTimeline cleanAndRollbackTimeline = table.getActiveTimeline() .getTimelineOfActions(Collections.singleton(HoodieTimeline.CLEAN_ACTION)).filterCompletedInstants(); - Stream instants = cleanAndRollbackTimeline.getInstants() - .collect(Collectors.groupingBy(HoodieInstant::getAction)).values().stream().map(hoodieInstants -> { - if (hoodieInstants.size() > maxCommitsToKeep) { - return hoodieInstants.subList(0, hoodieInstants.size() - minCommitsToKeep); + return cleanAndRollbackTimeline.getInstants() + .collect(Collectors.groupingBy(HoodieInstant::getAction)).values().stream() + .map(hoodieInstants -> { + if (hoodieInstants.size() > this.maxInstantsToKeep) { + return hoodieInstants.subList(0, hoodieInstants.size() - this.minInstantsToKeep); } else { return new ArrayList(); } }).flatMap(Collection::stream); + } + private Stream getCommitInstantsToArchive() { // TODO (na) : Add a way to return actions associated with a timeline and then merge/unify // with logic above to avoid Stream.concats HoodieTimeline commitTimeline = table.getCompletedCommitsTimeline(); @@ -164,17 +166,26 @@ public class HoodieTimelineArchiveLog { // We cannot have any holes in the commit timeline. We cannot archive any commits which are // made after the first savepoint present. 
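// Illustrative sketch, not part of the patch: the retention window used by
// getCleanInstantsToArchive() above and getCommitInstantsToArchive() below. Nothing is
// archived until more than maxInstantsToKeep instants exist; then all but the newest
// minInstantsToKeep are archived.
List<String> instants = Arrays.asList("100", "101", "102", "103", "104", "105");
int maxInstantsToKeep = 5;
int minInstantsToKeep = 3;
List<String> toArchive = instants.size() > maxInstantsToKeep
    ? instants.subList(0, instants.size() - minInstantsToKeep)  // ["100", "101", "102"]
    : Collections.emptyList();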
Option firstSavepoint = table.getCompletedSavepointTimeline().firstInstant(); - if (!commitTimeline.empty() && commitTimeline.countInstants() > maxCommitsToKeep) { + if (!commitTimeline.empty() && commitTimeline.countInstants() > maxInstantsToKeep) { // Actually do the commits - instants = Stream.concat(instants, commitTimeline.getInstants().filter(s -> { - // if no savepoint present, then dont filter - return !(firstSavepoint.isPresent() && HoodieTimeline.compareTimestamps(firstSavepoint.get().getTimestamp(), - HoodieTimeline.LESSER_THAN_OR_EQUALS, s.getTimestamp())); - }).filter(s -> { - // Ensure commits >= oldest pending compaction commit is retained - return oldestPendingCompactionInstant.map(instant -> HoodieTimeline.compareTimestamps(instant.getTimestamp(), HoodieTimeline.GREATER_THAN, s.getTimestamp())).orElse(true); - }).limit(commitTimeline.countInstants() - minCommitsToKeep)); + return commitTimeline.getInstants() + .filter(s -> { + // if no savepoint present, then dont filter + return !(firstSavepoint.isPresent() && HoodieTimeline.compareTimestamps(firstSavepoint.get().getTimestamp(), LESSER_THAN_OR_EQUALS, s.getTimestamp())); + }).filter(s -> { + // Ensure commits >= oldest pending compaction commit is retained + return oldestPendingCompactionInstant + .map(instant -> HoodieTimeline.compareTimestamps(instant.getTimestamp(), GREATER_THAN, s.getTimestamp())) + .orElse(true); + }).limit(commitTimeline.countInstants() - minInstantsToKeep); + } else { + return Stream.empty(); } + } + + private Stream getInstantsToArchive() { + // TODO: Handle ROLLBACK_ACTION in future + Stream instants = Stream.concat(getCleanInstantsToArchive(), getCommitInstantsToArchive()); // For archiving and cleaning instants, we need to include intermediate state files if they exist HoodieActiveTimeline rawActiveTimeline = new HoodieActiveTimeline(metaClient, false); @@ -243,7 +254,7 @@ public class HoodieTimelineArchiveLog { List instantsToBeDeleted = instants.stream().filter(instant1 -> HoodieTimeline.compareTimestamps(instant1.getTimestamp(), - HoodieTimeline.LESSER_THAN_OR_EQUALS, thresholdInstant.getTimestamp())).collect(Collectors.toList()); + LESSER_THAN_OR_EQUALS, thresholdInstant.getTimestamp())).collect(Collectors.toList()); for (HoodieInstant deleteInstant : instantsToBeDeleted) { LOG.info("Deleting instant " + deleteInstant + " in auxiliary meta path " + metaClient.getMetaAuxiliaryPath()); @@ -264,6 +275,7 @@ public class HoodieTimelineArchiveLog { List records = new ArrayList<>(); for (HoodieInstant hoodieInstant : instants) { try { + deleteAnyLeftOverMarkerFiles(hoodieInstant); records.add(convertToAvroRecord(commitTimeline, hoodieInstant)); if (records.size() >= this.config.getCommitArchivalBatchSize()) { writeToFile(wrapperSchema, records); @@ -281,8 +293,11 @@ public class HoodieTimelineArchiveLog { } } - public Path getArchiveFilePath() { - return archiveFilePath; + private void deleteAnyLeftOverMarkerFiles(HoodieInstant instant) { + MarkerFiles markerFiles = new MarkerFiles(table, instant.getTimestamp()); + if (markerFiles.deleteMarkerDir()) { + LOG.info("Cleaned up left over marker directory for instant :" + instant); + } } private void writeToFile(Schema wrapperSchema, List records) throws Exception { diff --git a/hudi-client/src/main/java/org/apache/hudi/table/MarkerFiles.java b/hudi-client/src/main/java/org/apache/hudi/table/MarkerFiles.java new file mode 100644 index 000000000..00eb7df7a --- /dev/null +++ b/hudi-client/src/main/java/org/apache/hudi/table/MarkerFiles.java @@ -0,0 
+1,153 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table; + +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.table.HoodieTableMetaClient; +import org.apache.hudi.common.util.ValidationUtils; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.io.IOType; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; + +/** + * Operates on marker files for a given write action (commit, delta commit, compaction). + */ +public class MarkerFiles { + + private static final Logger LOG = LogManager.getLogger(MarkerFiles.class); + + public static String stripMarkerSuffix(String path) { + return path.substring(0, path.indexOf(HoodieTableMetaClient.MARKER_EXTN)); + } + + private final String instantTime; + private final FileSystem fs; + private final Path markerDirPath; + private final String basePath; + + public MarkerFiles(FileSystem fs, String basePath, String markerFolderPath, String instantTime) { + this.instantTime = instantTime; + this.fs = fs; + this.markerDirPath = new Path(markerFolderPath); + this.basePath = basePath; + } + + public MarkerFiles(HoodieTable table, String instantTime) { + this(table.getMetaClient().getFs(), + table.getMetaClient().getBasePath(), + table.getMetaClient().getMarkerFolderPath(instantTime), + instantTime); + } + + public void quietDeleteMarkerDir() { + try { + deleteMarkerDir(); + } catch (HoodieIOException ioe) { + LOG.warn("Error deleting marker directory for instant " + instantTime, ioe); + } + } + + /** + * Delete Marker directory corresponding to an instant. 
+ */ + public boolean deleteMarkerDir() { + try { + boolean result = fs.delete(markerDirPath, true); + if (result) { + LOG.info("Removing marker directory at " + markerDirPath); + } else { + LOG.info("No marker directory to delete at " + markerDirPath); + } + return result; + } catch (IOException ioe) { + throw new HoodieIOException(ioe.getMessage(), ioe); + } + } + + public boolean doesMarkerDirExist() throws IOException { + return fs.exists(markerDirPath); + } + + public List createdAndMergedDataPaths() throws IOException { + List dataFiles = new LinkedList<>(); + FSUtils.processFiles(fs, markerDirPath.toString(), (status) -> { + String pathStr = status.getPath().toString(); + if (pathStr.contains(HoodieTableMetaClient.MARKER_EXTN) && !pathStr.endsWith(IOType.APPEND.name())) { + dataFiles.add(translateMarkerToDataPath(pathStr)); + } + return true; + }, false); + return dataFiles; + } + + private String translateMarkerToDataPath(String markerPath) { + String rPath = stripMarkerFolderPrefix(markerPath); + return MarkerFiles.stripMarkerSuffix(rPath); + } + + public List allMarkerFilePaths() throws IOException { + List markerFiles = new ArrayList<>(); + FSUtils.processFiles(fs, markerDirPath.toString(), fileStatus -> { + markerFiles.add(stripMarkerFolderPrefix(fileStatus.getPath().toString())); + return true; + }, false); + return markerFiles; + } + + private String stripMarkerFolderPrefix(String fullMarkerPath) { + ValidationUtils.checkArgument(fullMarkerPath.contains(HoodieTableMetaClient.MARKER_EXTN)); + String markerRootPath = Path.getPathWithoutSchemeAndAuthority( + new Path(String.format("%s/%s/%s", basePath, HoodieTableMetaClient.TEMPFOLDER_NAME, instantTime))).toString(); + int begin = fullMarkerPath.indexOf(markerRootPath); + ValidationUtils.checkArgument(begin >= 0, + "Not in marker dir. Marker Path=" + fullMarkerPath + ", Expected Marker Root=" + markerRootPath); + return fullMarkerPath.substring(begin + markerRootPath.length() + 1); + } + + /** + * The marker path will be /.hoodie/.temp//2019/04/25/filename.marker.writeIOType. + */ + public Path create(String partitionPath, String dataFileName, IOType type) { + Path path = FSUtils.getPartitionPath(markerDirPath, partitionPath); + try { + fs.mkdirs(path); // create a new partition as needed. 
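// Illustrative sketch, not part of the patch: round-tripping a marker back to its data file.
// createdAndMergedDataPaths() above keeps only CREATE/MERGE markers; APPEND markers are
// skipped because log files are rolled back via command blocks rather than deletion.
String marker = "2019/04/25/f1_1-0-1_001.parquet.marker.MERGE";  // relative to the marker dir
String dataPath = MarkerFiles.stripMarkerSuffix(marker);
// -> "2019/04/25/f1_1-0-1_001.parquet", the partition-relative base file path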
+ } catch (IOException e) { + throw new HoodieIOException("Failed to make dir " + path, e); + } + String markerFileName = String.format("%s%s.%s", dataFileName, HoodieTableMetaClient.MARKER_EXTN, type.name()); + Path markerPath = new Path(path, markerFileName); + try { + LOG.info("Creating Marker Path=" + markerPath); + fs.create(markerPath, false).close(); + } catch (IOException e) { + throw new HoodieException("Failed to create marker file " + markerPath, e); + } + return markerPath; + } + +} diff --git a/hudi-client/src/main/java/org/apache/hudi/table/action/restore/CopyOnWriteRestoreActionExecutor.java b/hudi-client/src/main/java/org/apache/hudi/table/action/restore/CopyOnWriteRestoreActionExecutor.java index 53e94c648..2c9c2bcc5 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/action/restore/CopyOnWriteRestoreActionExecutor.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/action/restore/CopyOnWriteRestoreActionExecutor.java @@ -48,7 +48,8 @@ public class CopyOnWriteRestoreActionExecutor extends BaseRestoreActionExecutor HoodieActiveTimeline.createNewInstantTime(), instantToRollback, true, - true); + true, + false); if (!instantToRollback.getAction().equals(HoodieTimeline.COMMIT_ACTION)) { throw new HoodieRollbackException("Unsupported action in rollback instant:" + instantToRollback); } diff --git a/hudi-client/src/main/java/org/apache/hudi/table/action/restore/MergeOnReadRestoreActionExecutor.java b/hudi-client/src/main/java/org/apache/hudi/table/action/restore/MergeOnReadRestoreActionExecutor.java index 804086117..827d8e239 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/action/restore/MergeOnReadRestoreActionExecutor.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/action/restore/MergeOnReadRestoreActionExecutor.java @@ -47,7 +47,8 @@ public class MergeOnReadRestoreActionExecutor extends BaseRestoreActionExecutor HoodieActiveTimeline.createNewInstantTime(), instantToRollback, true, - true); + true, + false); switch (instantToRollback.getAction()) { case HoodieTimeline.COMMIT_ACTION: diff --git a/hudi-client/src/main/java/org/apache/hudi/table/action/rollback/BaseRollbackActionExecutor.java b/hudi-client/src/main/java/org/apache/hudi/table/action/rollback/BaseRollbackActionExecutor.java index bb12bdd3c..846e8a853 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/action/rollback/BaseRollbackActionExecutor.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/action/rollback/BaseRollbackActionExecutor.java @@ -27,16 +27,19 @@ import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; import org.apache.hudi.common.util.HoodieTimer; import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.HoodieRollbackException; import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.MarkerFiles; import org.apache.hudi.table.action.BaseActionExecutor; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaSparkContext; import java.io.IOException; +import java.io.Serializable; import java.util.Collections; import java.util.List; import java.util.Objects; @@ -46,9 +49,14 @@ public abstract class BaseRollbackActionExecutor extends BaseActionExecutor execute(HoodieInstant instantToRollback); + } + protected final HoodieInstant 
instantToRollback; protected final boolean deleteInstants; protected final boolean skipTimelinePublish; + protected final boolean useMarkerBasedStrategy; public BaseRollbackActionExecutor(JavaSparkContext jsc, HoodieWriteConfig config, @@ -56,7 +64,8 @@ public abstract class BaseRollbackActionExecutor extends BaseActionExecutor executeRollback() throws IOException; + protected abstract List executeRollbackUsingFileListing(HoodieInstant instantToRollback); + @Override public HoodieRollbackMetadata execute() { HoodieTimer rollbackTimer = new HoodieTimer().startTimer(); + List stats = doRollbackAndGetStats(); HoodieRollbackMetadata rollbackMetadata = TimelineMetadataUtils.convertRollbackMetadata( instantTime, Option.of(rollbackTimer.endTimer()), Collections.singletonList(instantToRollback.getTimestamp()), - doRollbackAndGetStats()); + stats); if (!skipTimelinePublish) { finishRollback(rollbackMetadata); } + + // Finally, remove the marker files post rollback. + new MarkerFiles(table, instantToRollback.getTimestamp()).quietDeleteMarkerDir(); + return rollbackMetadata; } + private void validateSavepointRollbacks() { + // Check if any of the commits is a savepoint - do not allow rollback on those commits + List savepoints = table.getCompletedSavepointTimeline().getInstants() + .map(HoodieInstant::getTimestamp) + .collect(Collectors.toList()); + savepoints.forEach(s -> { + if (s.contains(instantToRollback.getTimestamp())) { + throw new HoodieRollbackException( + "Could not rollback a savepointed commit. Delete savepoint first before rolling back" + s); + } + }); + } + + private void validateRollbackCommitSequence() { + final String instantTimeToRollback = instantToRollback.getTimestamp(); + HoodieTimeline commitTimeline = table.getCompletedCommitsTimeline(); + HoodieTimeline inflightAndRequestedCommitTimeline = table.getPendingCommitTimeline(); + // Make sure only the last n commits are being rolled back + // If there is a commit in-between or after that is not rolled back, then abort + if ((instantTimeToRollback != null) && !commitTimeline.empty() + && !commitTimeline.findInstantsAfter(instantTimeToRollback, Integer.MAX_VALUE).empty()) { + throw new HoodieRollbackException( + "Found commits after time :" + instantTimeToRollback + ", please rollback greater commits first"); + } + + List inflights = inflightAndRequestedCommitTimeline.getInstants().map(HoodieInstant::getTimestamp) + .collect(Collectors.toList()); + if ((instantTimeToRollback != null) && !inflights.isEmpty() + && (inflights.indexOf(instantTimeToRollback) != inflights.size() - 1)) { + throw new HoodieRollbackException( + "Found in-flight commits after time :" + instantTimeToRollback + ", please rollback greater commits first"); + } + } + + private void rollBackIndex() { + if (!table.getIndex().rollbackCommit(instantToRollback.getTimestamp())) { + throw new HoodieRollbackException("Rollback index changes failed, for time :" + instantToRollback); + } + LOG.info("Index rolled back for commits " + instantToRollback); + } + public List doRollbackAndGetStats() { final String instantTimeToRollback = instantToRollback.getTimestamp(); final boolean isPendingCompaction = Objects.equals(HoodieTimeline.COMPACTION_ACTION, instantToRollback.getAction()) && !instantToRollback.isCompleted(); - HoodieTimeline inflightAndRequestedCommitTimeline = table.getPendingCommitTimeline(); - HoodieTimeline commitTimeline = table.getCompletedCommitsTimeline(); - // Check if any of the commits is a savepoint - do not allow rollback on those commits - List 
savepoints = table.getCompletedSavepointTimeline().getInstants() - .map(HoodieInstant::getTimestamp) - .collect(Collectors.toList()); - savepoints.forEach(s -> { - if (s.contains(instantTimeToRollback)) { - throw new HoodieRollbackException( - "Could not rollback a savepointed commit. Delete savepoint first before rolling back" + s); - } - }); - - if (commitTimeline.empty() && inflightAndRequestedCommitTimeline.empty()) { - LOG.info("No commits to rollback " + instantTimeToRollback); - } - - // Make sure only the last n commits are being rolled back - // If there is a commit in-between or after that is not rolled back, then abort + validateSavepointRollbacks(); if (!isPendingCompaction) { - if ((instantTimeToRollback != null) && !commitTimeline.empty() - && !commitTimeline.findInstantsAfter(instantTimeToRollback, Integer.MAX_VALUE).empty()) { - throw new HoodieRollbackException( - "Found commits after time :" + instantTimeToRollback + ", please rollback greater commits first"); - } - - List inflights = inflightAndRequestedCommitTimeline.getInstants().map(HoodieInstant::getTimestamp) - .collect(Collectors.toList()); - if ((instantTimeToRollback != null) && !inflights.isEmpty() - && (inflights.indexOf(instantTimeToRollback) != inflights.size() - 1)) { - throw new HoodieRollbackException( - "Found in-flight commits after time :" + instantTimeToRollback + ", please rollback greater commits first"); - } + validateRollbackCommitSequence(); } try { List stats = executeRollback(); LOG.info("Rolled back inflight instant " + instantTimeToRollback); if (!isPendingCompaction) { - if (!table.getIndex().rollbackCommit(instantTimeToRollback)) { - throw new HoodieRollbackException("Rollback index changes failed, for time :" + instantTimeToRollback); - } - LOG.info("Index rolled back for commits " + instantTimeToRollback); + rollBackIndex(); } return stats; } catch (IOException e) { @@ -171,9 +209,6 @@ public abstract class BaseRollbackActionExecutor extends BaseActionExecutor rollbackRequests = generateRollbackRequests(resolvedInstant); - - //TODO: We need to persist this as rollback workload and use it in case of partial failures - stats = new RollbackHelper(table.getMetaClient(), config).performRollback(jsc, resolvedInstant, rollbackRequests); + LOG.info("Clean out all base files generated for commit: " + resolvedInstant); + stats = getRollbackStrategy().execute(resolvedInstant); } // Delete Inflight instant if enabled deleteInflightAndRequestedInstant(deleteInstants, activeTimeline, resolvedInstant); - LOG.info("Time(in ms) taken to finish rollback " + (System.currentTimeMillis() - startTime)); + LOG.info("Time(in ms) taken to finish rollback " + rollbackTimer.endTimer()); return stats; } - private List generateRollbackRequests(HoodieInstant instantToRollback) - throws IOException { - return FSUtils.getAllPartitionPaths(table.getMetaClient().getFs(), table.getMetaClient().getBasePath(), - config.shouldAssumeDatePartitioning()).stream() - .map(partitionPath -> RollbackRequest.createRollbackRequestWithDeleteDataAndLogFilesAction(partitionPath, instantToRollback)) - .collect(Collectors.toList()); + private List generateRollbackRequestsByListing() { + try { + return FSUtils.getAllPartitionPaths(table.getMetaClient().getFs(), table.getMetaClient().getBasePath(), + config.shouldAssumeDatePartitioning()).stream() + .map(ListingBasedRollbackRequest::createRollbackRequestWithDeleteDataAndLogFilesAction) + .collect(Collectors.toList()); + } catch (IOException e) { + throw new HoodieIOException("Error generating 
rollback requests", e); + } + } + + @Override + protected List executeRollbackUsingFileListing(HoodieInstant instantToRollback) { + List rollbackRequests = generateRollbackRequestsByListing(); + return new ListingBasedRollbackHelper(table.getMetaClient(), config).performRollback(jsc, instantToRollback, rollbackRequests); } } diff --git a/hudi-client/src/main/java/org/apache/hudi/table/action/rollback/RollbackHelper.java b/hudi-client/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackHelper.java similarity index 71% rename from hudi-client/src/main/java/org/apache/hudi/table/action/rollback/RollbackHelper.java rename to hudi-client/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackHelper.java index f5b92e760..d7304fa1c 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/action/rollback/RollbackHelper.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackHelper.java @@ -28,9 +28,8 @@ import org.apache.hudi.common.table.log.block.HoodieCommandBlock; import org.apache.hudi.common.table.log.block.HoodieCommandBlock.HoodieCommandBlockTypeEnum; import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType; import org.apache.hudi.common.table.timeline.HoodieInstant; -import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.HoodieRollbackException; import org.apache.hadoop.fs.FileStatus; @@ -42,8 +41,7 @@ import org.apache.spark.api.java.JavaSparkContext; import java.io.IOException; import java.io.Serializable; -import java.io.UncheckedIOException; -import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -54,14 +52,14 @@ import scala.Tuple2; /** * Performs Rollback of Hoodie Tables. */ -public class RollbackHelper implements Serializable { +public class ListingBasedRollbackHelper implements Serializable { - private static final Logger LOG = LogManager.getLogger(RollbackHelper.class); + private static final Logger LOG = LogManager.getLogger(ListingBasedRollbackHelper.class); private final HoodieTableMetaClient metaClient; private final HoodieWriteConfig config; - public RollbackHelper(HoodieTableMetaClient metaClient, HoodieWriteConfig config) { + public ListingBasedRollbackHelper(HoodieTableMetaClient metaClient, HoodieWriteConfig config) { this.metaClient = metaClient; this.config = config; } @@ -69,14 +67,12 @@ public class RollbackHelper implements Serializable { /** * Performs all rollback actions that we have collected in parallel. 
*/ - public List performRollback(JavaSparkContext jsc, HoodieInstant instantToRollback, List rollbackRequests) { - - String basefileExtension = metaClient.getTableConfig().getBaseFileFormat().getFileExtension(); + public List performRollback(JavaSparkContext jsc, HoodieInstant instantToRollback, List rollbackRequests) { SerializablePathFilter filter = (path) -> { - if (path.toString().contains(basefileExtension)) { + if (path.toString().endsWith(this.metaClient.getTableConfig().getBaseFileFormat().getFileExtension())) { String fileCommitTime = FSUtils.getCommitTime(path.getName()); return instantToRollback.getTimestamp().equals(fileCommitTime); - } else if (path.toString().contains(".log")) { + } else if (FSUtils.isLogFile(path)) { // Since the baseCommitTime is the only commit for new log files, it's okay here String fileCommitTime = FSUtils.getBaseCommitTimeFromLogPath(path); return instantToRollback.getTimestamp().equals(fileCommitTime); @@ -87,17 +83,16 @@ public class RollbackHelper implements Serializable { int sparkPartitions = Math.max(Math.min(rollbackRequests.size(), config.getRollbackParallelism()), 1); jsc.setJobGroup(this.getClass().getSimpleName(), "Perform rollback actions"); return jsc.parallelize(rollbackRequests, sparkPartitions).mapToPair(rollbackRequest -> { - final Map filesToDeletedStatus = new HashMap<>(); - switch (rollbackRequest.getRollbackAction()) { + switch (rollbackRequest.getType()) { case DELETE_DATA_FILES_ONLY: { - deleteCleanedFiles(metaClient, config, filesToDeletedStatus, instantToRollback.getTimestamp(), + final Map filesToDeletedStatus = deleteCleanedFiles(metaClient, config, instantToRollback.getTimestamp(), rollbackRequest.getPartitionPath()); return new Tuple2<>(rollbackRequest.getPartitionPath(), HoodieRollbackStat.newBuilder().withPartitionPath(rollbackRequest.getPartitionPath()) .withDeletedFileResults(filesToDeletedStatus).build()); } case DELETE_DATA_AND_LOG_FILES: { - deleteCleanedFiles(metaClient, config, filesToDeletedStatus, rollbackRequest.getPartitionPath(), filter); + final Map filesToDeletedStatus = deleteCleanedFiles(metaClient, config, rollbackRequest.getPartitionPath(), filter); return new Tuple2<>(rollbackRequest.getPartitionPath(), HoodieRollbackStat.newBuilder().withPartitionPath(rollbackRequest.getPartitionPath()) .withDeletedFileResults(filesToDeletedStatus).build()); @@ -123,15 +118,17 @@ public class RollbackHelper implements Serializable { writer.close(); } } catch (IOException io) { - throw new UncheckedIOException(io); + throw new HoodieIOException("Error appending rollback block..", io); } } // This step is intentionally done after writer is closed. 
Guarantees that // getFileStatus would reflect correct stats and FileNotFoundException is not thrown in // cloud-storage : HUDI-168 - Map filesToNumBlocksRollback = new HashMap<>(); - filesToNumBlocksRollback.put(metaClient.getFs().getFileStatus(Objects.requireNonNull(writer).getLogFile().getPath()), 1L); + Map filesToNumBlocksRollback = Collections.singletonMap( + metaClient.getFs().getFileStatus(Objects.requireNonNull(writer).getLogFile().getPath()), + 1L + ); return new Tuple2<>(rollbackRequest.getPartitionPath(), HoodieRollbackStat.newBuilder().withPartitionPath(rollbackRequest.getPartitionPath()) .withRollbackBlockAppendResults(filesToNumBlocksRollback).build()); @@ -139,36 +136,18 @@ public class RollbackHelper implements Serializable { default: throw new IllegalStateException("Unknown Rollback action " + rollbackRequest); } - }).reduceByKey(this::mergeRollbackStat).map(Tuple2::_2).collect(); + }).reduceByKey(RollbackUtils::mergeRollbackStat).map(Tuple2::_2).collect(); } - /** - * Helper to merge 2 rollback-stats for a given partition. - * - * @param stat1 HoodieRollbackStat - * @param stat2 HoodieRollbackStat - * @return Merged HoodieRollbackStat - */ - private HoodieRollbackStat mergeRollbackStat(HoodieRollbackStat stat1, HoodieRollbackStat stat2) { - ValidationUtils.checkArgument(stat1.getPartitionPath().equals(stat2.getPartitionPath())); - final List successDeleteFiles = new ArrayList<>(); - final List failedDeleteFiles = new ArrayList<>(); - final Map commandBlocksCount = new HashMap<>(); - Option.ofNullable(stat1.getSuccessDeleteFiles()).ifPresent(successDeleteFiles::addAll); - Option.ofNullable(stat2.getSuccessDeleteFiles()).ifPresent(successDeleteFiles::addAll); - Option.ofNullable(stat1.getFailedDeleteFiles()).ifPresent(failedDeleteFiles::addAll); - Option.ofNullable(stat2.getFailedDeleteFiles()).ifPresent(failedDeleteFiles::addAll); - Option.ofNullable(stat1.getCommandBlocksCount()).ifPresent(commandBlocksCount::putAll); - Option.ofNullable(stat2.getCommandBlocksCount()).ifPresent(commandBlocksCount::putAll); - return new HoodieRollbackStat(stat1.getPartitionPath(), successDeleteFiles, failedDeleteFiles, commandBlocksCount); - } + /** - * Common method used for cleaning out parquet files under a partition path during rollback of a set of commits. + * Common method used for cleaning out base files under a partition path during rollback of a set of commits. */ private Map deleteCleanedFiles(HoodieTableMetaClient metaClient, HoodieWriteConfig config, - Map results, String partitionPath, PathFilter filter) throws IOException { + String partitionPath, PathFilter filter) throws IOException { LOG.info("Cleaning path " + partitionPath); + final Map results = new HashMap<>(); FileSystem fs = metaClient.getFs(); FileStatus[] toBeDeleted = fs.listStatus(FSUtils.getPartitionPath(config.getBasePath(), partitionPath), filter); for (FileStatus file : toBeDeleted) { @@ -180,10 +159,11 @@ public class RollbackHelper implements Serializable { } /** - * Common method used for cleaning out parquet files under a partition path during rollback of a set of commits. + * Common method used for cleaning out base files under a partition path during rollback of a set of commits. 
*/ private Map<FileStatus, Boolean> deleteCleanedFiles(HoodieTableMetaClient metaClient, HoodieWriteConfig config, - Map<FileStatus, Boolean> results, String commit, String partitionPath) throws IOException { + String commit, String partitionPath) throws IOException { + final Map<FileStatus, Boolean> results = new HashMap<>(); LOG.info("Cleaning path " + partitionPath); FileSystem fs = metaClient.getFs(); String basefileExtension = metaClient.getTableConfig().getBaseFileFormat().getFileExtension(); @@ -214,6 +194,5 @@ public class RollbackHelper implements Serializable { } public interface SerializablePathFilter extends PathFilter, Serializable { - } } diff --git a/hudi-client/src/main/java/org/apache/hudi/table/action/rollback/RollbackRequest.java b/hudi-client/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackRequest.java similarity index 51% rename from hudi-client/src/main/java/org/apache/hudi/table/action/rollback/RollbackRequest.java rename to hudi-client/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackRequest.java index 71cb57d26..fc369a467 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/action/rollback/RollbackRequest.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/action/rollback/ListingBasedRollbackRequest.java @@ -18,18 +18,17 @@ package org.apache.hudi.table.action.rollback; -import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.util.Option; /** * Request for performing one rollback action. */ -public class RollbackRequest { +public class ListingBasedRollbackRequest { /** - * Rollback Action Types. + * Rollback commands that trigger specific handling during rollback. */ - public enum RollbackAction { + public enum Type { DELETE_DATA_FILES_ONLY, DELETE_DATA_AND_LOG_FILES, APPEND_ROLLBACK_BLOCK @@ -40,11 +39,6 @@ public class RollbackRequest { */ private final String partitionPath; - /** - * Rollback Instant. - */ - private final HoodieInstant rollbackInstant; - /** * FileId in case of appending rollback block. */ @@ -55,46 +49,38 @@ public class RollbackRequest { */ private final Option<String> latestBaseInstant; - /** - * Rollback Action.
- */ - private final RollbackAction rollbackAction; + private final Type type; - public RollbackRequest(String partitionPath, HoodieInstant rollbackInstant, Option<String> fileId, - Option<String> latestBaseInstant, RollbackAction rollbackAction) { + public ListingBasedRollbackRequest(String partitionPath, + Option<String> fileId, + Option<String> latestBaseInstant, + Type type) { this.partitionPath = partitionPath; - this.rollbackInstant = rollbackInstant; this.fileId = fileId; this.latestBaseInstant = latestBaseInstant; - this.rollbackAction = rollbackAction; + this.type = type; } - public static RollbackRequest createRollbackRequestWithDeleteDataFilesOnlyAction(String partitionPath, - HoodieInstant rollbackInstant) { - return new RollbackRequest(partitionPath, rollbackInstant, Option.empty(), Option.empty(), - RollbackAction.DELETE_DATA_FILES_ONLY); + public static ListingBasedRollbackRequest createRollbackRequestWithDeleteDataFilesOnlyAction(String partitionPath) { + return new ListingBasedRollbackRequest(partitionPath, Option.empty(), Option.empty(), + Type.DELETE_DATA_FILES_ONLY); } - public static RollbackRequest createRollbackRequestWithDeleteDataAndLogFilesAction(String partitionPath, - HoodieInstant rollbackInstant) { - return new RollbackRequest(partitionPath, rollbackInstant, Option.empty(), Option.empty(), - RollbackAction.DELETE_DATA_AND_LOG_FILES); + public static ListingBasedRollbackRequest createRollbackRequestWithDeleteDataAndLogFilesAction(String partitionPath) { + return new ListingBasedRollbackRequest(partitionPath, Option.empty(), Option.empty(), + Type.DELETE_DATA_AND_LOG_FILES); } - public static RollbackRequest createRollbackRequestWithAppendRollbackBlockAction(String partitionPath, String fileId, - String baseInstant, HoodieInstant rollbackInstant) { - return new RollbackRequest(partitionPath, rollbackInstant, Option.of(fileId), Option.of(baseInstant), - RollbackAction.APPEND_ROLLBACK_BLOCK); + public static ListingBasedRollbackRequest createRollbackRequestWithAppendRollbackBlockAction(String partitionPath, String fileId, + String baseInstant) { + return new ListingBasedRollbackRequest(partitionPath, Option.of(fileId), Option.of(baseInstant), + Type.APPEND_ROLLBACK_BLOCK); } public String getPartitionPath() { return partitionPath; } - public HoodieInstant getRollbackInstant() { - return rollbackInstant; - } - public Option<String> getFileId() { return fileId; } @@ -103,7 +89,7 @@ public class RollbackRequest { return latestBaseInstant; } - public RollbackAction getRollbackAction() { - return rollbackAction; + public Type getType() { + return type; } } diff --git a/hudi-client/src/main/java/org/apache/hudi/table/action/rollback/MarkerBasedRollbackStrategy.java b/hudi-client/src/main/java/org/apache/hudi/table/action/rollback/MarkerBasedRollbackStrategy.java new file mode 100644 index 000000000..40b81a213 --- /dev/null +++ b/hudi-client/src/main/java/org/apache/hudi/table/action/rollback/MarkerBasedRollbackStrategy.java @@ -0,0 +1,161 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.rollback; + +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.HoodieRollbackStat; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.table.log.HoodieLogFormat; +import org.apache.hudi.common.table.log.block.HoodieCommandBlock; +import org.apache.hudi.common.table.log.block.HoodieLogBlock; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIOException; +import org.apache.hudi.exception.HoodieRollbackException; +import org.apache.hudi.io.IOType; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.MarkerFiles; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaSparkContext; +import scala.Tuple2; + +import java.io.IOException; +import java.util.Collections; +import java.util.List; +import java.util.Map; + +/** + * Performs rollback using marker files generated during the write. + */ +public class MarkerBasedRollbackStrategy implements BaseRollbackActionExecutor.RollbackStrategy { + + private static final Logger LOG = LogManager.getLogger(MarkerBasedRollbackStrategy.class); + + private final HoodieTable table; + + private final transient JavaSparkContext jsc; + + private final HoodieWriteConfig config; + + private final String basePath; + + private final String instantTime; + + public MarkerBasedRollbackStrategy(HoodieTable table, JavaSparkContext jsc, HoodieWriteConfig config, String instantTime) { + this.table = table; + this.jsc = jsc; + this.basePath = table.getMetaClient().getBasePath(); + this.config = config; + this.instantTime = instantTime; + } + + private HoodieRollbackStat undoMerge(String mergedBaseFilePath) throws IOException { + LOG.info("Rolling back by deleting the merged base file: " + mergedBaseFilePath); + return deleteBaseFile(mergedBaseFilePath); + } + + private HoodieRollbackStat undoCreate(String createdBaseFilePath) throws IOException { + LOG.info("Rolling back by deleting the created base file: " + createdBaseFilePath); + return deleteBaseFile(createdBaseFilePath); + } + + private HoodieRollbackStat deleteBaseFile(String baseFilePath) throws IOException { + Path fullDeletePath = new Path(basePath, baseFilePath); + String partitionPath = FSUtils.getRelativePartitionPath(new Path(basePath), fullDeletePath.getParent()); + boolean isDeleted = table.getMetaClient().getFs().delete(fullDeletePath); + return HoodieRollbackStat.newBuilder() + .withPartitionPath(partitionPath) + .withDeletedFileResult(baseFilePath, isDeleted) + .build(); + } + + private HoodieRollbackStat undoAppend(String appendBaseFilePath, HoodieInstant instantToRollback) throws IOException, InterruptedException { + Path baseFilePathForAppend = new Path(basePath, appendBaseFilePath); + String fileId = FSUtils.getFileIdFromFilePath(baseFilePathForAppend); + String baseCommitTime = FSUtils.getCommitTime(baseFilePathForAppend.getName()); + String partitionPath =
FSUtils.getRelativePartitionPath(new Path(basePath), new Path(basePath, appendBaseFilePath).getParent()); + + HoodieLogFormat.Writer writer = null; + try { + Path partitionFullPath = FSUtils.getPartitionPath(basePath, partitionPath); + + if (!table.getMetaClient().getFs().exists(partitionFullPath)) { + return HoodieRollbackStat.newBuilder() + .withPartitionPath(partitionPath) + .build(); + } + writer = HoodieLogFormat.newWriterBuilder() + .onParentPath(partitionFullPath) + .withFileId(fileId) + .overBaseCommit(baseCommitTime) + .withFs(table.getMetaClient().getFs()) + .withFileExtension(HoodieLogFile.DELTA_EXTENSION).build(); + + // generate metadata + Map<HoodieLogBlock.HeaderMetadataType, String> header = RollbackUtils.generateHeader(instantToRollback.getTimestamp(), instantTime); + // append a rollback command block to the existing log file + writer = writer.appendBlock(new HoodieCommandBlock(header)); + } finally { + try { + if (writer != null) { + writer.close(); + } + } catch (IOException io) { + throw new HoodieIOException("Error closing append of rollback block.", io); + } + } + + return HoodieRollbackStat.newBuilder() + .withPartitionPath(partitionPath) + // we don't use this field per se. Avoiding the extra file status call. + .withRollbackBlockAppendResults(Collections.emptyMap()) + .build(); + } + + @Override + public List<HoodieRollbackStat> execute(HoodieInstant instantToRollback) { + try { + MarkerFiles markerFiles = new MarkerFiles(table, instantToRollback.getTimestamp()); + List<String> markerFilePaths = markerFiles.allMarkerFilePaths(); + int parallelism = Math.max(Math.min(markerFilePaths.size(), config.getRollbackParallelism()), 1); + return jsc.parallelize(markerFilePaths, parallelism) + .map(markerFilePath -> { + String typeStr = markerFilePath.substring(markerFilePath.lastIndexOf(".") + 1); + IOType type = IOType.valueOf(typeStr); + switch (type) { + case MERGE: + return undoMerge(MarkerFiles.stripMarkerSuffix(markerFilePath)); + case APPEND: + return undoAppend(MarkerFiles.stripMarkerSuffix(markerFilePath), instantToRollback); + case CREATE: + return undoCreate(MarkerFiles.stripMarkerSuffix(markerFilePath)); + default: + throw new HoodieRollbackException("Unknown marker type during rollback of " + instantToRollback); + } + }) + .mapToPair(rollbackStat -> new Tuple2<>(rollbackStat.getPartitionPath(), rollbackStat)) + .reduceByKey(RollbackUtils::mergeRollbackStat) + .map(Tuple2::_2).collect(); + } catch (Exception e) { + throw new HoodieRollbackException("Error rolling back using marker files written for " + instantToRollback, e); + } + } +} diff --git a/hudi-client/src/main/java/org/apache/hudi/table/action/rollback/MergeOnReadRollbackActionExecutor.java b/hudi-client/src/main/java/org/apache/hudi/table/action/rollback/MergeOnReadRollbackActionExecutor.java index 481fb2d7b..d11b7adde 100644 --- a/hudi-client/src/main/java/org/apache/hudi/table/action/rollback/MergeOnReadRollbackActionExecutor.java +++ b/hudi-client/src/main/java/org/apache/hudi/table/action/rollback/MergeOnReadRollbackActionExecutor.java @@ -26,15 +26,16 @@ import org.apache.hudi.common.model.HoodieWriteStat; import org.apache.hudi.common.table.timeline.HoodieActiveTimeline; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.HoodieTimer; import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.table.HoodieTable; import org.apache.log4j.LogManager; import
org.apache.log4j.Logger; import org.apache.spark.api.java.JavaSparkContext; import java.io.IOException; -import java.io.UncheckedIOException; import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -60,14 +61,17 @@ public class MergeOnReadRollbackActionExecutor extends BaseRollbackActionExecuto String instantTime, HoodieInstant commitInstant, boolean deleteInstants, - boolean skipTimelinePublish) { - super(jsc, config, table, instantTime, commitInstant, deleteInstants, skipTimelinePublish); + boolean skipTimelinePublish, + boolean useMarkerBasedStrategy) { + super(jsc, config, table, instantTime, commitInstant, deleteInstants, skipTimelinePublish, useMarkerBasedStrategy); } @Override protected List<HoodieRollbackStat> executeRollback() throws IOException { - long startTime = System.currentTimeMillis(); - LOG.info("Rolling back instant " + instantToRollback.getTimestamp()); + HoodieTimer rollbackTimer = new HoodieTimer(); + rollbackTimer.startTimer(); + + LOG.info("Rolling back instant " + instantToRollback); HoodieInstant resolvedInstant = instantToRollback; // Atomically un-publish all non-inflight commits @@ -85,35 +89,40 @@ public class MergeOnReadRollbackActionExecuto // (commitToRollback). // NOTE {@link HoodieCompactionConfig#withCompactionLazyBlockReadEnabled} needs to be set to TRUE. This is // required to avoid OOM when merging multiple LogBlocks performed during nested rollbacks. - // Atomically un-publish all non-inflight commits + + // For Requested State (like failure during index lookup), there is nothing to roll back other than // deleting the timeline file if (!resolvedInstant.isRequested()) { - LOG.info("Un-published " + resolvedInstant); - List<RollbackRequest> rollbackRequests = generateRollbackRequests(jsc, resolvedInstant); - // TODO: We need to persist this as rollback workload and use it in case of partial failures - allRollbackStats = new RollbackHelper(table.getMetaClient(), config).performRollback(jsc, resolvedInstant, rollbackRequests); + LOG.info("Unpublished " + resolvedInstant); + allRollbackStats = getRollbackStrategy().execute(resolvedInstant); } // Delete Inflight instants if enabled deleteInflightAndRequestedInstant(deleteInstants, table.getActiveTimeline(), resolvedInstant); - - LOG.info("Time(in ms) taken to finish rollback " + (System.currentTimeMillis() - startTime)); - + LOG.info("Time(in ms) taken to finish rollback " + rollbackTimer.endTimer()); return allRollbackStats; } + @Override + protected List<HoodieRollbackStat> executeRollbackUsingFileListing(HoodieInstant resolvedInstant) { + List<ListingBasedRollbackRequest> rollbackRequests; + try { + rollbackRequests = generateRollbackRequestsUsingFileListing(resolvedInstant); + } catch (IOException e) { + throw new HoodieIOException("Error generating rollback requests by file listing.", e); + } + return new ListingBasedRollbackHelper(table.getMetaClient(), config).performRollback(jsc, resolvedInstant, rollbackRequests); + } + /** * Generate all rollback requests that need to be performed for rolling back this action, without actually performing * the rollback.
* - * @param jsc JavaSparkContext * @param instantToRollback Instant to Rollback * @return list of rollback requests - * @throws IOException */ - private List<RollbackRequest> generateRollbackRequests(JavaSparkContext jsc, HoodieInstant instantToRollback) - throws IOException { + private List<ListingBasedRollbackRequest> generateRollbackRequestsUsingFileListing(HoodieInstant instantToRollback) throws IOException { String commit = instantToRollback.getTimestamp(); List<String> partitions = FSUtils.getAllPartitionPaths(table.getMetaClient().getFs(), table.getMetaClient().getBasePath(), config.shouldAssumeDatePartitioning()); @@ -121,12 +130,12 @@ public class MergeOnReadRollbackActionExecuto jsc.setJobGroup(this.getClass().getSimpleName(), "Generate all rollback requests"); return jsc.parallelize(partitions, Math.min(partitions.size(), sparkPartitions)).flatMap(partitionPath -> { HoodieActiveTimeline activeTimeline = table.getMetaClient().reloadActiveTimeline(); - List<RollbackRequest> partitionRollbackRequests = new ArrayList<>(); + List<ListingBasedRollbackRequest> partitionRollbackRequests = new ArrayList<>(); switch (instantToRollback.getAction()) { case HoodieTimeline.COMMIT_ACTION: - LOG.info("Rolling back commit action. There are higher delta commits. So only rolling back this instant"); + LOG.info("Rolling back commit action."); partitionRollbackRequests.add( - RollbackRequest.createRollbackRequestWithDeleteDataAndLogFilesAction(partitionPath, instantToRollback)); + ListingBasedRollbackRequest.createRollbackRequestWithDeleteDataAndLogFilesAction(partitionPath)); break; case HoodieTimeline.COMPACTION_ACTION: // If there is no delta commit present after the current commit (if compaction), no action, else we @@ -141,7 +150,7 @@ public class MergeOnReadRollbackActionExecuto // have been written to the log files. LOG.info("Rolling back compaction. There are higher delta commits. So only deleting data files"); partitionRollbackRequests.add( - RollbackRequest.createRollbackRequestWithDeleteDataFilesOnlyAction(partitionPath, instantToRollback)); + ListingBasedRollbackRequest.createRollbackRequestWithDeleteDataFilesOnlyAction(partitionPath)); } else { // No deltacommits present after this compaction commit (inflight or requested). In this case, we // can also delete any log files that were created with this compaction commit as base @@ -149,7 +158,7 @@ public class MergeOnReadRollbackActionExecuto LOG.info("Rolling back compaction plan. There are NO higher delta commits.
So deleting both data and" + " log files"); partitionRollbackRequests.add( - RollbackRequest.createRollbackRequestWithDeleteDataAndLogFilesAction(partitionPath, instantToRollback)); + ListingBasedRollbackRequest.createRollbackRequestWithDeleteDataAndLogFilesAction(partitionPath)); } break; case HoodieTimeline.DELTA_COMMIT_ACTION: @@ -179,8 +188,7 @@ public class MergeOnReadRollbackActionExecuto try { HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes( table.getMetaClient().getCommitTimeline() - .getInstantDetails( - new HoodieInstant(true, instantToRollback.getAction(), instantToRollback.getTimestamp())) + .getInstantDetails(new HoodieInstant(true, instantToRollback.getAction(), instantToRollback.getTimestamp())) .get(), HoodieCommitMetadata.class); @@ -188,7 +196,7 @@ public class MergeOnReadRollbackActionExecuto // We do not know fileIds for inserts (first inserts are either log files or parquet files), // delete all files for the corresponding failed commit, if present (same as COW) partitionRollbackRequests.add( - RollbackRequest.createRollbackRequestWithDeleteDataAndLogFilesAction(partitionPath, instantToRollback)); + ListingBasedRollbackRequest.createRollbackRequestWithDeleteDataAndLogFilesAction(partitionPath)); // append rollback blocks for updates if (commitMetadata.getPartitionToWriteStats().containsKey(partitionPath)) { @@ -197,7 +205,7 @@ public class MergeOnReadRollbackActionExecuto } break; } catch (IOException io) { - throw new UncheckedIOException("Failed to collect rollback actions for commit " + commit, io); + throw new HoodieIOException("Failed to collect rollback actions for commit " + commit, io); } default: break; @@ -206,8 +214,8 @@ public class MergeOnReadRollbackActionExecuto }).filter(Objects::nonNull).collect(); } - private List<RollbackRequest> generateAppendRollbackBlocksAction(String partitionPath, HoodieInstant rollbackInstant, - HoodieCommitMetadata commitMetadata) { + private List<ListingBasedRollbackRequest> generateAppendRollbackBlocksAction(String partitionPath, HoodieInstant rollbackInstant, + HoodieCommitMetadata commitMetadata) { ValidationUtils.checkArgument(rollbackInstant.getAction().equals(HoodieTimeline.DELTA_COMMIT_ACTION)); // wStat.getPrevCommit() might not give the right commit time in the following @@ -236,8 +244,8 @@ public class MergeOnReadRollbackActionExecuto wStat.getFileId()), HoodieTimeline.LESSER_THAN, rollbackInstant.getTimestamp()); }).map(wStat -> { String baseCommitTime = fileIdToBaseCommitTimeForLogMap.get(wStat.getFileId()); - return RollbackRequest.createRollbackRequestWithAppendRollbackBlockAction(partitionPath, wStat.getFileId(), - baseCommitTime, rollbackInstant); + return ListingBasedRollbackRequest.createRollbackRequestWithAppendRollbackBlockAction(partitionPath, wStat.getFileId(), + baseCommitTime); }).collect(Collectors.toList()); } } diff --git a/hudi-client/src/main/java/org/apache/hudi/table/action/rollback/RollbackUtils.java b/hudi-client/src/main/java/org/apache/hudi/table/action/rollback/RollbackUtils.java new file mode 100644 index 000000000..cbb5a2c82 --- /dev/null +++ b/hudi-client/src/main/java/org/apache/hudi/table/action/rollback/RollbackUtils.java @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements.
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.rollback; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hudi.common.HoodieRollbackStat; +import org.apache.hudi.common.table.log.block.HoodieCommandBlock; +import org.apache.hudi.common.table.log.block.HoodieLogBlock; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.common.util.ValidationUtils; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class RollbackUtils { + + static Map<HoodieLogBlock.HeaderMetadataType, String> generateHeader(String instantToRollback, String rollbackInstantTime) { + // generate metadata for the rollback command block + Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>(3); + header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, rollbackInstantTime); + header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, instantToRollback); + header.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE, + String.valueOf(HoodieCommandBlock.HoodieCommandBlockTypeEnum.ROLLBACK_PREVIOUS_BLOCK.ordinal())); + return header; + } + + /** + * Helper to merge 2 rollback-stats for a given partition.
+ * + * @param stat1 HoodieRollbackStat + * @param stat2 HoodieRollbackStat + * @return Merged HoodieRollbackStat + */ + static HoodieRollbackStat mergeRollbackStat(HoodieRollbackStat stat1, HoodieRollbackStat stat2) { + ValidationUtils.checkArgument(stat1.getPartitionPath().equals(stat2.getPartitionPath())); + final List<String> successDeleteFiles = new ArrayList<>(); + final List<String> failedDeleteFiles = new ArrayList<>(); + final Map<FileStatus, Long> commandBlocksCount = new HashMap<>(); + Option.ofNullable(stat1.getSuccessDeleteFiles()).ifPresent(successDeleteFiles::addAll); + Option.ofNullable(stat2.getSuccessDeleteFiles()).ifPresent(successDeleteFiles::addAll); + Option.ofNullable(stat1.getFailedDeleteFiles()).ifPresent(failedDeleteFiles::addAll); + Option.ofNullable(stat2.getFailedDeleteFiles()).ifPresent(failedDeleteFiles::addAll); + Option.ofNullable(stat1.getCommandBlocksCount()).ifPresent(commandBlocksCount::putAll); + Option.ofNullable(stat2.getCommandBlocksCount()).ifPresent(commandBlocksCount::putAll); + return new HoodieRollbackStat(stat1.getPartitionPath(), successDeleteFiles, failedDeleteFiles, commandBlocksCount); + } + +} diff --git a/hudi-client/src/test/java/org/apache/hudi/client/TestHoodieClientOnCopyOnWriteStorage.java b/hudi-client/src/test/java/org/apache/hudi/client/TestHoodieClientOnCopyOnWriteStorage.java index 5518a3f4a..e9a70a8de 100644 --- a/hudi-client/src/test/java/org/apache/hudi/client/TestHoodieClientOnCopyOnWriteStorage.java +++ b/hudi-client/src/test/java/org/apache/hudi/client/TestHoodieClientOnCopyOnWriteStorage.java @@ -44,7 +44,9 @@ import org.apache.hudi.exception.HoodieCommitException; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.index.HoodieIndex.IndexType; +import org.apache.hudi.io.IOType; import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.MarkerFiles; import org.apache.hudi.table.action.commit.WriteHelper; import org.apache.hudi.testutils.HoodieClientTestBase; import org.apache.hudi.testutils.HoodieClientTestUtils; @@ -1063,11 +1065,10 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase { assertFalse(metaClient.getFs().exists(new Path(metaClient.getMarkerFolderPath(instantTime)))); } - @Test - public void testRollbackAfterConsistencyCheckFailure() throws Exception { + private void testRollbackAfterConsistencyCheckFailureUsingFileList(boolean rollbackUsingMarkers) throws Exception { String instantTime = "000"; HoodieTableMetaClient metaClient = new HoodieTableMetaClient(hadoopConf, basePath); - HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).build(); + HoodieWriteConfig cfg = getConfigBuilder().withRollbackUsingMarkers(rollbackUsingMarkers).withAutoCommit(false).build(); HoodieWriteClient client = getHoodieWriteClient(cfg); testConsistencyCheck(metaClient, instantTime); @@ -1079,6 +1080,16 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase { assertFalse(metaClient.getFs().exists(new Path(metaClient.getMarkerFolderPath(instantTime)))); } + @Test + public void testRollbackAfterConsistencyCheckFailureUsingFileList() throws Exception { + testRollbackAfterConsistencyCheckFailureUsingFileList(false); + } + + @Test + public void testRollbackAfterConsistencyCheckFailureUsingMarkers() throws Exception { + testRollbackAfterConsistencyCheckFailureUsingFileList(true); + } + private Pair<Path, JavaRDD<WriteStatus>> testConsistencyCheck(HoodieTableMetaClient metaClient, String instantTime) throws Exception { HoodieWriteConfig cfg =
getConfigBuilder().withAutoCommit(false) @@ -1096,11 +1107,13 @@ public class TestHoodieClientOnCopyOnWriteStorage extends HoodieClientTestBase { // This should fail the commit String partitionPath = Arrays .stream(fs.globStatus(new Path(String.format("%s/*/*/*/*", metaClient.getMarkerFolderPath(instantTime))), - path -> path.toString().endsWith(HoodieTableMetaClient.MARKER_EXTN))) + path -> path.toString().contains(HoodieTableMetaClient.MARKER_EXTN))) .limit(1).map(status -> status.getPath().getParent().toString()).collect(Collectors.toList()).get(0); - Path markerFilePath = new Path(String.format("%s/%s", partitionPath, - FSUtils.makeMarkerFile(instantTime, "1-0-1", UUID.randomUUID().toString()))); - metaClient.getFs().create(markerFilePath); + + Path markerFilePath = new MarkerFiles(fs, basePath, metaClient.getMarkerFolderPath(instantTime), instantTime) + .create(partitionPath, + FSUtils.makeDataFileName(instantTime, "1-0-1", UUID.randomUUID().toString()), + IOType.MERGE); LOG.info("Created a dummy marker path=" + markerFilePath); Exception e = assertThrows(HoodieCommitException.class, () -> { diff --git a/hudi-client/src/test/java/org/apache/hudi/io/TestHoodieCommitArchiveLog.java b/hudi-client/src/test/java/org/apache/hudi/io/TestHoodieTimelineArchiveLog.java similarity index 95% rename from hudi-client/src/test/java/org/apache/hudi/io/TestHoodieCommitArchiveLog.java rename to hudi-client/src/test/java/org/apache/hudi/io/TestHoodieTimelineArchiveLog.java index 127cface3..7b2114fb3 100644 --- a/hudi-client/src/test/java/org/apache/hudi/io/TestHoodieCommitArchiveLog.java +++ b/hudi-client/src/test/java/org/apache/hudi/io/TestHoodieTimelineArchiveLog.java @@ -51,7 +51,7 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; -public class TestHoodieCommitArchiveLog extends HoodieClientTestHarness { +public class TestHoodieTimelineArchiveLog extends HoodieClientTestHarness { private Configuration hadoopConf; private HoodieTableMetaClient metaClient; @@ -78,8 +78,8 @@ public class TestHoodieCommitArchiveLog extends HoodieClientTestHarness { HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) .withParallelism(2, 2).forTable("test-trip-table").build(); metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, metaClient); - boolean result = archiveLog.archiveIfRequired(hadoopConf); + HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, hadoopConf); + boolean result = archiveLog.archiveIfRequired(); assertTrue(result); } @@ -156,9 +156,8 @@ public class TestHoodieCommitArchiveLog extends HoodieClientTestHarness { verifyInflightInstants(metaClient, 2); metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, metaClient); - - assertTrue(archiveLog.archiveIfRequired(hadoopConf)); + HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, hadoopConf); + assertTrue(archiveLog.archiveIfRequired()); // reload the timeline and remove the remaining commits timeline = metaClient.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants(); @@ -215,7 +214,7 @@ public class TestHoodieCommitArchiveLog extends HoodieClientTestHarness { .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 
5).build()) .build(); metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, metaClient); + HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, hadoopConf); // Requested Compaction HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "100"), dfs.getConf()); @@ -247,7 +246,7 @@ public class TestHoodieCommitArchiveLog extends HoodieClientTestHarness { HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); assertEquals(4, timeline.countInstants(), "Loaded 4 commits and the count should match"); - boolean result = archiveLog.archiveIfRequired(hadoopConf); + boolean result = archiveLog.archiveIfRequired(); assertTrue(result); timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants(); assertEquals(4, timeline.countInstants(), "Should not archive commits when maxCommitsToKeep is 5"); @@ -280,7 +279,7 @@ public class TestHoodieCommitArchiveLog extends HoodieClientTestHarness { .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 5).build()) .build(); metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, metaClient); + HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, hadoopConf); HoodieTestDataGenerator.createCommitFile(basePath, "100", dfs.getConf()); HoodieTestDataGenerator.createCommitFile(basePath, "101", dfs.getConf()); HoodieTestDataGenerator.createCommitFile(basePath, "102", dfs.getConf()); @@ -290,7 +289,7 @@ public class TestHoodieCommitArchiveLog extends HoodieClientTestHarness { HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); assertEquals(6, timeline.countInstants(), "Loaded 6 commits and the count should match"); - boolean result = archiveLog.archiveIfRequired(hadoopConf); + boolean result = archiveLog.archiveIfRequired(); assertTrue(result); timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants(); assertTrue(timeline.containsOrBeforeTimelineStarts("100"), "Archived commits should always be safe"); @@ -305,8 +304,6 @@ public class TestHoodieCommitArchiveLog extends HoodieClientTestHarness { .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2).forTable("test-trip-table") .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 5).build()) .build(); - metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, metaClient); HoodieTestDataGenerator.createCommitFile(basePath, "100", dfs.getConf()); HoodieTestDataGenerator.createCommitFile(basePath, "101", dfs.getConf()); HoodieTestDataGenerator.createSavepointFile(basePath, "101", dfs.getConf()); @@ -314,11 +311,11 @@ public class TestHoodieCommitArchiveLog extends HoodieClientTestHarness { HoodieTestDataGenerator.createCommitFile(basePath, "103", dfs.getConf()); HoodieTestDataGenerator.createCommitFile(basePath, "104", dfs.getConf()); HoodieTestDataGenerator.createCommitFile(basePath, "105", dfs.getConf()); + HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, hadoopConf); HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); 
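// Six completed commits are expected at this point; since 101 is savepointed, archival must stop there and can trim only commit 100, as the assertions below verify.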
assertEquals(6, timeline.countInstants(), "Loaded 6 commits and the count should match"); - boolean result = archiveLog.archiveIfRequired(hadoopConf); - assertTrue(result); + assertTrue(archiveLog.archiveIfRequired()); timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants(); assertEquals(5, timeline.countInstants(), "Since we have a savepoint at 101, we should never archive any commit after 101 (we only archive 100)"); @@ -336,8 +333,6 @@ public class TestHoodieCommitArchiveLog extends HoodieClientTestHarness { .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2).forTable("test-trip-table") .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 5).build()) .build(); - metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, metaClient); HoodieTestDataGenerator.createCommitFile(basePath, "100", dfs.getConf()); HoodieTestDataGenerator.createCompactionRequestedFile(basePath, "101", dfs.getConf()); HoodieTestDataGenerator.createCompactionAuxiliaryMetadata(basePath, @@ -350,10 +345,11 @@ public class TestHoodieCommitArchiveLog extends HoodieClientTestHarness { HoodieTestDataGenerator.createCommitFile(basePath, "105", dfs.getConf()); HoodieTestDataGenerator.createCommitFile(basePath, "106", dfs.getConf()); HoodieTestDataGenerator.createCommitFile(basePath, "107", dfs.getConf()); + HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, dfs.getConf()); HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsAndCompactionTimeline(); assertEquals(8, timeline.countInstants(), "Loaded 8 commits and the count should match"); - boolean result = archiveLog.archiveIfRequired(hadoopConf); + boolean result = archiveLog.archiveIfRequired(); assertTrue(result); timeline = metaClient.getActiveTimeline().reload().getCommitsAndCompactionTimeline(); assertFalse(timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "100")), @@ -378,34 +374,35 @@ public class TestHoodieCommitArchiveLog extends HoodieClientTestHarness { } @Test - public void checkArchiveCommitTimeline() throws IOException { + public void testArchiveCommitTimeline() throws IOException { HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) .withParallelism(2, 2).forTable("test-trip-table") .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 3).build()) .build(); metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, metaClient); HoodieTestDataGenerator.createCommitFile(basePath, "1", dfs.getConf()); HoodieInstant instant1 = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "1"); HoodieTestDataGenerator.createCommitFile(basePath, "2", dfs.getConf()); + Path markerPath = new Path(metaClient.getMarkerFolderPath("2")); + dfs.mkdirs(markerPath); HoodieInstant instant2 = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "2"); HoodieTestDataGenerator.createCommitFile(basePath, "3", dfs.getConf()); HoodieInstant instant3 = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "3"); // add 2 more instants to pass filter criteria set in compaction config above HoodieTestDataGenerator.createCommitFile(basePath, "4", dfs.getConf()); - HoodieInstant instant4 = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "4");
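// The marker directory created above for instant "2" should be cleaned up once that instant is archived; the assertFalse(dfs.exists(markerPath)) assertion below verifies this.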
HoodieTestDataGenerator.createCommitFile(basePath, "5", dfs.getConf()); - HoodieInstant instant5 = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "5"); - boolean result = archiveLog.archiveIfRequired(hadoopConf); + + HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, dfs.getConf()); + boolean result = archiveLog.archiveIfRequired(); assertTrue(result); - HoodieArchivedTimeline archivedTimeline = metaClient.getArchivedTimeline(); List archivedInstants = Arrays.asList(instant1, instant2, instant3); assertEquals(new HashSet<>(archivedInstants), archivedTimeline.getInstants().collect(Collectors.toSet())); + assertFalse(dfs.exists(markerPath)); } private void verifyInflightInstants(HoodieTableMetaClient metaClient, int expectedTotalInstants) { @@ -425,7 +422,7 @@ public class TestHoodieCommitArchiveLog extends HoodieClientTestHarness { .withCompactionConfig(HoodieCompactionConfig.newBuilder().retainCommits(1).archiveCommitsWith(2, 5).build()) .build(); metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, metaClient); + HoodieTimelineArchiveLog archiveLog = new HoodieTimelineArchiveLog(cfg, hadoopConf); org.apache.hudi.avro.model.HoodieCommitMetadata expectedCommitMetadata = archiveLog.convertCommitMetadata(hoodieCommitMetadata); assertEquals(expectedCommitMetadata.getOperationType(), WriteOperationType.INSERT.toString()); diff --git a/hudi-client/src/test/java/org/apache/hudi/table/TestCleaner.java b/hudi-client/src/test/java/org/apache/hudi/table/TestCleaner.java index 3c32bbce4..dc360051b 100644 --- a/hudi-client/src/test/java/org/apache/hudi/table/TestCleaner.java +++ b/hudi-client/src/test/java/org/apache/hudi/table/TestCleaner.java @@ -43,6 +43,7 @@ import org.apache.hudi.common.table.timeline.HoodieTimeline; import org.apache.hudi.common.table.timeline.TimelineMetadataUtils; import org.apache.hudi.common.table.timeline.versioning.clean.CleanMetadataMigrator; import org.apache.hudi.common.table.view.TableFileSystemView; +import org.apache.hudi.common.testutils.FileSystemTestUtils; import org.apache.hudi.common.testutils.HoodieTestUtils; import org.apache.hudi.common.util.CleanerUtils; import org.apache.hudi.common.util.CollectionUtils; @@ -54,10 +55,10 @@ import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.index.HoodieIndex; import org.apache.hudi.testutils.HoodieClientTestBase; +import org.apache.hudi.testutils.HoodieClientTestUtils; import org.apache.hudi.testutils.HoodieTestDataGenerator; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.RemoteIterator; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.api.java.JavaRDD; @@ -1128,7 +1129,7 @@ public class TestCleaner extends HoodieClientTestBase { private List createMarkerFiles(String instantTime, int numFiles) throws IOException { List files = new ArrayList<>(); for (int i = 0; i < numFiles; i++) { - files.add(HoodieTestUtils.createNewMarkerFile(basePath, "2019/03/29", instantTime)); + files.add(HoodieClientTestUtils.createNewMarkerFile(basePath, "2019/03/29", instantTime)); } return files; } @@ -1140,13 +1141,8 @@ public class TestCleaner extends HoodieClientTestBase { * @throws IOException in case of error */ private int getTotalTempFiles() throws IOException { - RemoteIterator itr = fs.listFiles(new Path(basePath, HoodieTableMetaClient.TEMPFOLDER_NAME), true); - int count = 0; - while (itr.hasNext()) { - 
count++; - itr.next(); - } - return count; + return FileSystemTestUtils.listRecursive(fs, new Path(basePath, HoodieTableMetaClient.TEMPFOLDER_NAME)) + .size(); } private Stream> convertPathToFileIdWithCommitTime(final HoodieTableMetaClient metaClient, diff --git a/hudi-client/src/test/java/org/apache/hudi/table/TestHoodieMergeOnReadTable.java b/hudi-client/src/test/java/org/apache/hudi/table/TestHoodieMergeOnReadTable.java index 3bc1f5a42..1551df156 100644 --- a/hudi-client/src/test/java/org/apache/hudi/table/TestHoodieMergeOnReadTable.java +++ b/hudi-client/src/test/java/org/apache/hudi/table/TestHoodieMergeOnReadTable.java @@ -387,15 +387,12 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness { } } - @ParameterizedTest - @MethodSource("argumentsProvider") - public void testCOWToMORConvertedTableRollback(HoodieFileFormat baseFileFormat) throws Exception { + private void testCOWToMORConvertedTableRollback(HoodieFileFormat baseFileFormat, Boolean rollbackUsingMarkers) throws Exception { init(baseFileFormat); - // Set TableType to COW HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.COPY_ON_WRITE, baseFileFormat); - HoodieWriteConfig cfg = getConfig(true); + HoodieWriteConfig cfg = getConfig(false, rollbackUsingMarkers); try (HoodieWriteClient client = getHoodieWriteClient(cfg);) { /** @@ -410,6 +407,7 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness { List statuses = client.upsert(writeRecords, newCommitTime).collect(); // verify there are no errors assertNoWriteErrors(statuses); + client.commit(newCommitTime, jsc.parallelize(statuses)); metaClient = getHoodieMetaClient(hadoopConf, cfg.getBasePath()); Option commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant(); @@ -446,10 +444,20 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness { @ParameterizedTest @MethodSource("argumentsProvider") - public void testRollbackWithDeltaAndCompactionCommit(HoodieFileFormat baseFileFormat) throws Exception { + public void testCOWToMORConvertedTableRollbackUsingFileList(HoodieFileFormat baseFileFormat) throws Exception { + testCOWToMORConvertedTableRollback(baseFileFormat, false); + } + + @ParameterizedTest + @MethodSource("argumentsProvider") + public void testCOWToMORConvertedTableRollbackUsingMarkers(HoodieFileFormat baseFileFormat) throws Exception { + testCOWToMORConvertedTableRollback(baseFileFormat, true); + } + + private void testRollbackWithDeltaAndCompactionCommit(HoodieFileFormat baseFileFormat, Boolean rollbackUsingMarkers) throws Exception { init(baseFileFormat); - HoodieWriteConfig cfg = getConfig(false); + HoodieWriteConfig cfg = getConfig(false, rollbackUsingMarkers); try (HoodieWriteClient client = getHoodieWriteClient(cfg);) { // Test delta commit rollback @@ -538,7 +546,6 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness { writeRecords = jsc.parallelize(copyOfRecords, 1); writeStatusJavaRDD = thirdClient.upsert(writeRecords, commitTime2); - thirdClient.commit(commitTime2, writeStatusJavaRDD); statuses = writeStatusJavaRDD.collect(); // Verify there are no errors assertNoWriteErrors(statuses); @@ -574,20 +581,17 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness { metaClient = HoodieTableMetaClient.reload(metaClient); String compactionInstantTime = thirdClient.scheduleCompaction(Option.empty()).get().toString(); - JavaRDD ws = thirdClient.compact(compactionInstantTime); - thirdClient.commitCompaction(compactionInstantTime, ws, 
Option.empty()); + thirdClient.compact(compactionInstantTime); allFiles = listAllDataFilesInPath(hoodieTable, cfg.getBasePath()); metaClient = HoodieTableMetaClient.reload(metaClient); tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitsTimeline(), allFiles); - final String compactedCommitTime = - metaClient.getActiveTimeline().reload().getCommitsTimeline().lastInstant().get().getTimestamp(); - - assertTrue(tableView.getLatestBaseFiles().anyMatch(file -> compactedCommitTime.equals(file.getCommitTime()))); - - thirdClient.rollback(compactedCommitTime); - + final String compactedCommitTime = metaClient.getActiveTimeline().reload().lastInstant().get().getTimestamp(); + assertTrue(Arrays.stream(listAllDataFilesInPath(hoodieTable, cfg.getBasePath())) + .anyMatch(file -> compactedCommitTime.equals(new HoodieBaseFile(file).getCommitTime()))); + thirdClient.rollbackInflightCompaction(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, compactedCommitTime), + hoodieTable); allFiles = listAllDataFilesInPath(hoodieTable, cfg.getBasePath()); metaClient = HoodieTableMetaClient.reload(metaClient); tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitsTimeline(), allFiles); @@ -597,6 +601,18 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness { } } + @ParameterizedTest + @MethodSource("argumentsProvider") + public void testRollbackWithDeltaAndCompactionCommitUsingFileList(HoodieFileFormat baseFileFormat) throws Exception { + testRollbackWithDeltaAndCompactionCommit(baseFileFormat, false); + } + + @ParameterizedTest + @MethodSource("argumentsProvider") + public void testRollbackWithDeltaAndCompactionCommitUsingMarkers(HoodieFileFormat baseFileFormat) throws Exception { + testRollbackWithDeltaAndCompactionCommit(baseFileFormat, true); + } + @ParameterizedTest @MethodSource("argumentsProvider") public void testMultiRollbackWithDeltaAndCompactionCommit(HoodieFileFormat baseFileFormat) throws Exception { @@ -960,15 +976,14 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness { } } - @ParameterizedTest - @MethodSource("argumentsProvider") - public void testInsertsGeneratedIntoLogFilesRollback(HoodieFileFormat baseFileFormat) throws Exception { + private void testInsertsGeneratedIntoLogFilesRollback(HoodieFileFormat baseFileFormat, + Boolean rollbackUsingMarkers) throws Exception { init(baseFileFormat); // insert 100 records // Setting IndexType to be InMemory to simulate Global Index nature - HoodieWriteConfig config = getConfigBuilder(false, IndexType.INMEMORY).build(); - try (HoodieWriteClient writeClient = getHoodieWriteClient(config);) { + HoodieWriteConfig config = getConfigBuilder(false, rollbackUsingMarkers, IndexType.INMEMORY).build(); + try (HoodieWriteClient writeClient = getHoodieWriteClient(config)) { String newCommitTime = "100"; writeClient.startCommitWithTime(newCommitTime); @@ -987,14 +1002,13 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness { // rollback a failed commit boolean rollback = writeClient.rollback(newCommitTime); assertTrue(rollback); - newCommitTime = "101"; - writeClient.startCommitWithTime(newCommitTime); // insert 100 records + newCommitTime = "101"; + writeClient.startCommitWithTime(newCommitTime); records = dataGen.generateInserts(newCommitTime, 100); recordsRDD = jsc.parallelize(records, 1); - statuses = writeClient.insert(recordsRDD, newCommitTime); - writeClient.commit(newCommitTime, statuses); + writeClient.insert(recordsRDD, 
newCommitTime).collect(); // Sleep for small interval (at least 1 second) to force a new rollback start time. Thread.sleep(1000); @@ -1003,19 +1017,27 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness { // and calling rollback twice final String lastCommitTime = newCommitTime; metaClient = getHoodieMetaClient(hadoopConf, basePath); - HoodieInstant last = metaClient.getCommitsTimeline().getInstants() - .filter(instant -> instant.getTimestamp().equals(lastCommitTime)).findFirst().get(); - String fileName = last.getFileName(); + // Save the .commit file to local directory. // Rollback will be called twice to test the case where rollback failed first time and retried. // We got the "BaseCommitTime cannot be null" exception before the fix - File file = Files.createTempFile(tempFolder, null, null).toFile(); - metaClient.getFs().copyToLocalFile(new Path(metaClient.getMetaPath(), fileName), - new Path(file.getAbsolutePath())); - writeClient.rollback(newCommitTime); + Map fileNameMap = new HashMap<>(); + for (State state : Arrays.asList(State.REQUESTED, State.INFLIGHT)) { + HoodieInstant toCopy = new HoodieInstant(state, HoodieTimeline.DELTA_COMMIT_ACTION, lastCommitTime); + File file = Files.createTempFile(tempFolder, null, null).toFile(); + metaClient.getFs().copyToLocalFile(new Path(metaClient.getMetaPath(), toCopy.getFileName()), + new Path(file.getAbsolutePath())); + fileNameMap.put(file.getAbsolutePath(), toCopy.getFileName()); + } + Path markerDir = new Path(Files.createTempDirectory(tempFolder,null).toAbsolutePath().toString()); + if (rollbackUsingMarkers) { + metaClient.getFs().copyToLocalFile(new Path(metaClient.getMarkerFolderPath(lastCommitTime)), + markerDir); + } + writeClient.rollback(newCommitTime); metaClient = HoodieTableMetaClient.reload(metaClient); - HoodieTable table = HoodieTable.create(metaClient, config, hadoopConf); + HoodieTable table = HoodieTable.create(config, hadoopConf); SliceView tableRTFileSystemView = table.getSliceView(); long numLogFiles = 0; @@ -1026,22 +1048,43 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness { .filter(fileSlice -> fileSlice.getLogFiles().count() > 0).count(); } assertEquals(0, numLogFiles); - metaClient.getFs().copyFromLocalFile(new Path(file.getAbsolutePath()), - new Path(metaClient.getMetaPath(), fileName)); + fileNameMap.forEach((key, value) -> { + try { + metaClient.getFs().copyFromLocalFile(new Path(key), + new Path(metaClient.getMetaPath(), value)); + } catch (IOException e) { + throw new HoodieIOException("Error copying state from local disk.", e); + } + }); + if (rollbackUsingMarkers) { + metaClient.getFs().copyFromLocalFile(markerDir, + new Path(metaClient.getMarkerFolderPath(lastCommitTime))); + } Thread.sleep(1000); - // Rollback again to pretend the first rollback failed partially. This should not error our + // Rollback again to pretend the first rollback failed partially. 
This should not error out writeClient.rollback(newCommitTime); } } @ParameterizedTest @MethodSource("argumentsProvider") - public void testInsertsGeneratedIntoLogFilesRollbackAfterCompaction(HoodieFileFormat baseFileFormat) throws Exception { + public void testInsertsGeneratedIntoLogFilesRollbackUsingFileList(HoodieFileFormat baseFileFormat) throws Exception { + testInsertsGeneratedIntoLogFilesRollback(baseFileFormat, false); + } + + @ParameterizedTest + @MethodSource("argumentsProvider") + public void testInsertsGeneratedIntoLogFilesRollbackUsingMarkers(HoodieFileFormat baseFileFormat) throws Exception { + testInsertsGeneratedIntoLogFilesRollback(baseFileFormat, true); + } + + private void testInsertsGeneratedIntoLogFilesRollbackAfterCompaction(HoodieFileFormat baseFileFormat, + Boolean rollbackUsingMarkers) throws Exception { init(baseFileFormat); // insert 100 records // Setting IndexType to be InMemory to simulate Global Index nature - HoodieWriteConfig config = getConfigBuilder(false, IndexType.INMEMORY).build(); + HoodieWriteConfig config = getConfigBuilder(false, rollbackUsingMarkers, IndexType.INMEMORY).build(); try (HoodieWriteClient writeClient = getHoodieWriteClient(config);) { String newCommitTime = "100"; writeClient.startCommitWithTime(newCommitTime); @@ -1053,8 +1096,7 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness { // trigger an action statuses.collect(); - HoodieTable table = - HoodieTable.create(getHoodieMetaClient(hadoopConf, basePath), config, hadoopConf); + HoodieTable table = HoodieTable.create(getHoodieMetaClient(hadoopConf, basePath), config, hadoopConf); SliceView tableRTFileSystemView = table.getSliceView(); long numLogFiles = 0; @@ -1072,30 +1114,43 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness { // Ensure all log files have been compacted into parquet files assertEquals(statuses.map(status -> status.getStat().getPath().contains("parquet")).count(), numLogFiles); assertEquals(statuses.count(), numLogFiles); - writeClient.commitCompaction(newCommitTime, statuses, Option.empty()); + //writeClient.commitCompaction(newCommitTime, statuses, Option.empty()); // Trigger a rollback of compaction - writeClient.rollback(newCommitTime); + table.getActiveTimeline().reload(); + writeClient.rollbackInflightCompaction(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, newCommitTime), table); + table = HoodieTable.create(getHoodieMetaClient(hadoopConf, basePath), config, hadoopConf); tableRTFileSystemView = table.getSliceView(); ((SyncableFileSystemView) tableRTFileSystemView).reset(); - Option lastInstant = ((SyncableFileSystemView) tableRTFileSystemView).getLastInstant(); - System.out.println("Last Instant =" + lastInstant); + for (String partitionPath : dataGen.getPartitionPaths()) { - assertTrue(tableRTFileSystemView.getLatestFileSlices(partitionPath).noneMatch(fileSlice -> fileSlice.getBaseFile().isPresent())); - assertTrue(tableRTFileSystemView.getLatestFileSlices(partitionPath).anyMatch(fileSlice -> fileSlice.getLogFiles().count() > 0)); + List fileSlices = getFileSystemViewWithUnCommittedSlices(getHoodieMetaClient(hadoopConf, basePath)) + .getAllFileSlices(partitionPath).filter(fs -> fs.getBaseInstantTime().equals("100")).collect(Collectors.toList()); + assertTrue(fileSlices.stream().noneMatch(fileSlice -> fileSlice.getBaseFile().isPresent())); + assertTrue(fileSlices.stream().anyMatch(fileSlice -> fileSlice.getLogFiles().count() > 0)); } } } + @ParameterizedTest + 
@MethodSource("argumentsProvider") + public void testInsertsGeneratedIntoLogFilesRollbackAfterCompactionUsingFileList(HoodieFileFormat baseFileFormat) throws Exception { + testInsertsGeneratedIntoLogFilesRollbackAfterCompaction(baseFileFormat, false); + } + + @ParameterizedTest + @MethodSource("argumentsProvider") + public void testInsertsGeneratedIntoLogFilesRollbackAfterCompactionUsingMarkers(HoodieFileFormat baseFileFormat) throws Exception { + testInsertsGeneratedIntoLogFilesRollbackAfterCompaction(baseFileFormat, true); + } + /** * Test to ensure metadata stats are correctly written to metadata file. */ - @ParameterizedTest - @MethodSource("argumentsProvider") - public void testMetadataStatsOnCommit(HoodieFileFormat baseFileFormat) throws Exception { + public void testMetadataStatsOnCommit(HoodieFileFormat baseFileFormat, Boolean rollbackUsingMarkers) throws Exception { init(baseFileFormat); - - HoodieWriteConfig cfg = getConfigBuilder(false, IndexType.INMEMORY).withAutoCommit(false).build(); + HoodieWriteConfig cfg = getConfigBuilder(false, rollbackUsingMarkers, IndexType.INMEMORY) + .withAutoCommit(false).build(); try (HoodieWriteClient client = getHoodieWriteClient(cfg);) { metaClient = getHoodieMetaClient(hadoopConf, basePath); HoodieTable table = HoodieTable.create(metaClient, cfg, hadoopConf); @@ -1136,24 +1191,16 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness { records = dataGen.generateUpdates(instantTime, records); writeRecords = jsc.parallelize(records, 1); statuses = client.upsert(writeRecords, instantTime); - assertTrue(client.commit(instantTime, statuses), "Commit should succeed"); - - // Read from commit file - table = HoodieTable.create(cfg, hadoopConf); - metadata = HoodieCommitMetadata.fromBytes( - table.getActiveTimeline() - .getInstantDetails(table.getActiveTimeline().getDeltaCommitTimeline().lastInstant().get()).get(), - HoodieCommitMetadata.class); - + //assertTrue(client.commit(instantTime, statuses), "Commit should succeed"); inserts = 0; int upserts = 0; - for (Map.Entry> pstat : metadata.getPartitionToWriteStats().entrySet()) { - for (HoodieWriteStat stat : pstat.getValue()) { - inserts += stat.getNumInserts(); - upserts += stat.getNumUpdateWrites(); - } + List writeStatusList = statuses.collect(); + for (WriteStatus ws: writeStatusList) { + inserts += ws.getStat().getNumInserts(); + upserts += ws.getStat().getNumUpdateWrites(); } + // Read from commit file assertEquals(0, inserts); assertEquals(200, upserts); @@ -1179,7 +1226,22 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness { } /** - * Test to ensure metadata stats are correctly written to the metadata file, identifies small files and corrects them. + * Test to ensure rolling stats are correctly written to metadata file. + */ + @ParameterizedTest + @MethodSource("argumentsProvider") + public void testMetadataStatsOnCommitUsingFileList(HoodieFileFormat baseFileFormat) throws Exception { + testMetadataStatsOnCommit(baseFileFormat, false); + } + + @ParameterizedTest + @MethodSource("argumentsProvider") + public void testMetadataStatsOnCommitUsingMarkers(HoodieFileFormat baseFileFormat) throws Exception { + testMetadataStatsOnCommit(baseFileFormat, true); + } + + /** + * Test to ensure rolling stats are correctly written to the metadata file, identifies small files and corrects them. 
*/ @ParameterizedTest @MethodSource("argumentsProvider") @@ -1385,11 +1447,19 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness { return getConfigBuilder(autoCommit).build(); } + private HoodieWriteConfig getConfig(Boolean autoCommit, Boolean rollbackUsingMarkers) { + return getConfigBuilder(autoCommit, rollbackUsingMarkers, IndexType.BLOOM).build(); + } + protected HoodieWriteConfig.Builder getConfigBuilder(Boolean autoCommit) { return getConfigBuilder(autoCommit, IndexType.BLOOM); } protected HoodieWriteConfig.Builder getConfigBuilder(Boolean autoCommit, HoodieIndex.IndexType indexType) { + return getConfigBuilder(autoCommit, false, indexType); + } + + protected HoodieWriteConfig.Builder getConfigBuilder(Boolean autoCommit, Boolean rollbackUsingMarkers, HoodieIndex.IndexType indexType) { return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) .withAutoCommit(autoCommit).withAssumeDatePartitioning(true) .withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024) @@ -1398,7 +1468,8 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness { .withEmbeddedTimelineServerEnabled(true).forTable("test-trip-table") .withFileSystemViewConfig(new FileSystemViewStorageConfig.Builder() .withEnableBackupForRemoteFileSystemView(false).build()) - .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(indexType).build()); + .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(indexType).build()) + .withRollbackUsingMarkers(rollbackUsingMarkers); } private FileStatus[] insertAndGetFilePaths(List records, HoodieWriteClient client, diff --git a/hudi-client/src/test/java/org/apache/hudi/table/TestMarkerFiles.java b/hudi-client/src/test/java/org/apache/hudi/table/TestMarkerFiles.java new file mode 100644 index 000000000..723d9e17a --- /dev/null +++ b/hudi-client/src/test/java/org/apache/hudi/table/TestMarkerFiles.java @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
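Aside: the new rollbackUsingMarkers flag threaded through the builder overloads above bottoms out on the write config. A minimal sketch of setting it directly (basePath and TRIP_EXAMPLE_SCHEMA are the values this test class already uses):

    HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder()
        .withPath(basePath)
        .withSchema(TRIP_EXAMPLE_SCHEMA)
        .withRollbackUsingMarkers(true) // false keeps listing-based rollback, and is the default wired in above
        .build();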
+ */ + +package org.apache.hudi.table; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hudi.common.fs.FSUtils; +import org.apache.hudi.common.testutils.FileSystemTestUtils; +import org.apache.hudi.common.testutils.HoodieCommonTestHarness; + +import org.apache.hudi.common.util.CollectionUtils; +import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.io.IOType; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.util.List; +import java.util.stream.Collectors; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertIterableEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestMarkerFiles extends HoodieCommonTestHarness { + + private MarkerFiles markerFiles; + private FileSystem fs; + private Path markerFolderPath; + + @BeforeEach + public void setup() throws IOException { + initPath(); + initMetaClient(); + this.fs = FSUtils.getFs(metaClient.getBasePath(), metaClient.getHadoopConf()); + this.markerFolderPath = new Path(metaClient.getMarkerFolderPath("000")); + this.markerFiles = new MarkerFiles(fs, metaClient.getBasePath(), markerFolderPath.toString(), "000"); + } + + private void createSomeMarkerFiles() { + markerFiles.create("2020/06/01", "file1", IOType.MERGE); + markerFiles.create("2020/06/02", "file2", IOType.APPEND); + markerFiles.create("2020/06/03", "file3", IOType.CREATE); + } + + private void createInvalidFile(String partitionPath, String invalidFileName) { + Path path = FSUtils.getPartitionPath(markerFolderPath.toString(), partitionPath); + Path invalidFilePath = new Path(path, invalidFileName); + try { + fs.create(invalidFilePath, false).close(); + } catch (IOException e) { + throw new HoodieException("Failed to create invalid file " + invalidFilePath, e); + } + } + + @Test + public void testCreation() throws Exception { + // when + createSomeMarkerFiles(); + + // then + assertTrue(fs.exists(markerFolderPath)); + List markerFiles = FileSystemTestUtils.listRecursive(fs, markerFolderPath) + .stream().filter(status -> status.getPath().getName().contains(".marker")) + .sorted().collect(Collectors.toList()); + assertEquals(3, markerFiles.size()); + assertIterableEquals(CollectionUtils.createImmutableList( + "file:" + markerFolderPath.toString() + "/2020/06/01/file1.marker.MERGE", + "file:" + markerFolderPath.toString() + "/2020/06/02/file2.marker.APPEND", + "file:" + markerFolderPath.toString() + "/2020/06/03/file3.marker.CREATE"), + markerFiles.stream().map(m -> m.getPath().toString()).collect(Collectors.toList()) + ); + } + + @Test + public void testDeletionWhenMarkerDirExists() throws IOException { + // when + markerFiles.create("2020/06/01", "file1", IOType.MERGE); + + // then + assertTrue(markerFiles.doesMarkerDirExist()); + assertTrue(markerFiles.deleteMarkerDir()); + assertFalse(markerFiles.doesMarkerDirExist()); + } + + @Test + public void testDeletionWhenMarkerDirNotExists() throws IOException { + // then + assertFalse(markerFiles.doesMarkerDirExist()); + assertFalse(markerFiles.deleteMarkerDir()); + } + + @Test + public void testDataPathsWhenCreatingOrMerging() throws IOException { + // add marker files + createSomeMarkerFiles(); + // add invalid file + createInvalidFile("2020/06/01", "invalid_file3"); + int fileCount = 
FileSystemTestUtils.listRecursive(fs, markerFolderPath).size(); + assertEquals(4, fileCount); + + // then + assertIterableEquals(CollectionUtils.createImmutableList( + "2020/06/01/file1", "2020/06/03/file3"), + markerFiles.createdAndMergedDataPaths().stream().sorted().collect(Collectors.toList()) + ); + } + + @Test + public void testAllMarkerPaths() throws IOException { + // given + createSomeMarkerFiles(); + + // then + assertIterableEquals(CollectionUtils.createImmutableList("2020/06/01/file1.marker.MERGE", + "2020/06/02/file2.marker.APPEND", "2020/06/03/file3.marker.CREATE"), + markerFiles.allMarkerFilePaths().stream().sorted().collect(Collectors.toList()) + ); + } + + @Test + public void testStripMarkerSuffix() { + // Given + final String pathPrefix = "file://" + metaClient.getMetaPath() + "/file"; + final String markerFilePath = pathPrefix + ".marker.APPEND"; + + // when-then + assertEquals(pathPrefix, MarkerFiles.stripMarkerSuffix(markerFilePath)); + } +} diff --git a/hudi-client/src/test/java/org/apache/hudi/table/action/commit/TestUpsertPartitioner.java b/hudi-client/src/test/java/org/apache/hudi/table/action/commit/TestUpsertPartitioner.java index 91b3bb084..09be8f184 100644 --- a/hudi-client/src/test/java/org/apache/hudi/table/action/commit/TestUpsertPartitioner.java +++ b/hudi-client/src/test/java/org/apache/hudi/table/action/commit/TestUpsertPartitioner.java @@ -68,7 +68,7 @@ public class TestUpsertPartitioner extends HoodieClientTestBase { .insertSplitSize(100).autoTuneInsertSplits(autoSplitInserts).build()) .withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1000 * 1024).build()).build(); - HoodieClientTestUtils.fakeCommitFile(basePath, "001"); + HoodieClientTestUtils.fakeCommit(basePath, "001"); HoodieClientTestUtils.fakeDataFile(basePath, testPartitionPath, "001", "file1", fileSize); metaClient = HoodieTableMetaClient.reload(metaClient); HoodieCopyOnWriteTable table = (HoodieCopyOnWriteTable) HoodieTable.create(metaClient, config, hadoopConf); diff --git a/hudi-client/src/test/java/org/apache/hudi/table/action/rollback/HoodieClientRollbackTestBase.java b/hudi-client/src/test/java/org/apache/hudi/table/action/rollback/HoodieClientRollbackTestBase.java new file mode 100644 index 000000000..24aa8697b --- /dev/null +++ b/hudi-client/src/test/java/org/apache/hudi/table/action/rollback/HoodieClientRollbackTestBase.java @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
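Aside: taken together, the tests above walk the full marker lifecycle. A condensed sketch, using the table-based constructor seen elsewhere in this patch (the instant "000" and the partition/file names are illustrative):

    MarkerFiles markers = new MarkerFiles(table, "000");
    markers.create("2020/06/01", "file1", IOType.MERGE);           // writes file1.marker.MERGE under the instant's marker dir
    List<String> dataPaths = markers.createdAndMergedDataPaths();  // CREATE/MERGE markers mapped back to data file paths
    List<String> allMarkers = markers.allMarkerFilePaths();        // every marker, relative to the marker dir
    markers.quietDeleteMarkerDir();                                // cleanup once the instant completes; the quiet variant swallows failures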
+ */ + +package org.apache.hudi.table.action.rollback; + +import org.apache.hudi.client.HoodieWriteClient; +import org.apache.hudi.client.WriteStatus; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieFileGroup; +import org.apache.hudi.common.model.HoodieRecord; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.view.SyncableFileSystemView; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.testutils.Assertions; +import org.apache.hudi.testutils.HoodieClientTestBase; +import org.apache.hudi.testutils.HoodieTestDataGenerator; +import org.apache.spark.api.java.JavaRDD; + +import java.io.IOException; +import java.util.List; +import java.util.stream.Collectors; + +import static org.apache.hudi.testutils.HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH; +import static org.apache.hudi.testutils.HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH; +import static org.junit.jupiter.api.Assertions.assertEquals; + +public class HoodieClientRollbackTestBase extends HoodieClientTestBase { + protected void twoUpsertCommitDataWithTwoPartitions(List firstPartitionCommit2FileSlices, + List secondPartitionCommit2FileSlices, + HoodieWriteConfig cfg, + boolean commitSecondUpsert) throws IOException { + //just generate two partitions + dataGen = new HoodieTestDataGenerator(new String[]{DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH}); + //1. prepare data + HoodieTestDataGenerator.writePartitionMetadata(fs, new String[]{DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH}, basePath); + HoodieWriteClient client = getHoodieWriteClient(cfg); + /** + * Write 1 (only inserts) + */ + String newCommitTime = "001"; + client.startCommitWithTime(newCommitTime); + List records = dataGen.generateInsertsContainsAllPartitions(newCommitTime, 2); + JavaRDD writeRecords = jsc.parallelize(records, 1); + JavaRDD statuses = client.upsert(writeRecords, newCommitTime); + Assertions.assertNoWriteErrors(statuses.collect()); + client.commit(newCommitTime, statuses); + + /** + * Write 2 (updates) + */ + newCommitTime = "002"; + client.startCommitWithTime(newCommitTime); + records = dataGen.generateUpdates(newCommitTime, records); + statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime); + Assertions.assertNoWriteErrors(statuses.collect()); + if (commitSecondUpsert) { + client.commit(newCommitTime, statuses); + } + + + //2. assert filegroup and get the first partition fileslice + HoodieTable table = this.getHoodieTable(metaClient, cfg); + SyncableFileSystemView fsView = getFileSystemViewWithUnCommittedSlices(table.getMetaClient()); + List firstPartitionCommit2FileGroups = fsView.getAllFileGroups(DEFAULT_FIRST_PARTITION_PATH).collect(Collectors.toList()); + assertEquals(1, firstPartitionCommit2FileGroups.size()); + firstPartitionCommit2FileSlices.addAll(firstPartitionCommit2FileGroups.get(0).getAllFileSlices().collect(Collectors.toList())); + //3. assert filegroup and get the second partition fileslice + List secondPartitionCommit2FileGroups = fsView.getAllFileGroups(DEFAULT_SECOND_PARTITION_PATH).collect(Collectors.toList()); + assertEquals(1, secondPartitionCommit2FileGroups.size()); + secondPartitionCommit2FileSlices.addAll(secondPartitionCommit2FileGroups.get(0).getAllFileSlices().collect(Collectors.toList())); + + //4. 
assert fileslice + HoodieTableType tableType = this.getTableType(); + if (tableType.equals(HoodieTableType.COPY_ON_WRITE)) { + assertEquals(2, firstPartitionCommit2FileSlices.size()); + assertEquals(2, secondPartitionCommit2FileSlices.size()); + } else { + assertEquals(1, firstPartitionCommit2FileSlices.size()); + assertEquals(1, secondPartitionCommit2FileSlices.size()); + } + } +} diff --git a/hudi-client/src/test/java/org/apache/hudi/table/action/rollback/TestCopyOnWriteRollbackActionExecutor.java b/hudi-client/src/test/java/org/apache/hudi/table/action/rollback/TestCopyOnWriteRollbackActionExecutor.java new file mode 100644 index 000000000..1c5a85a59 --- /dev/null +++ b/hudi-client/src/test/java/org/apache/hudi/table/action/rollback/TestCopyOnWriteRollbackActionExecutor.java @@ -0,0 +1,198 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.rollback; + +import org.apache.hudi.avro.model.HoodieRollbackPartitionMetadata; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieFileGroup; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.HoodieRollbackStat; +import org.apache.hudi.common.testutils.HoodieTestUtils; +import org.apache.hudi.common.util.Option; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.MarkerFiles; +import org.apache.hudi.testutils.HoodieTestDataGenerator; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static org.apache.hudi.testutils.HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH; +import static org.apache.hudi.testutils.HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestCopyOnWriteRollbackActionExecutor extends HoodieClientRollbackTestBase { + @BeforeEach + public void setUp() throws Exception { + initPath(); + initSparkContexts(); + initFileSystem(); + initMetaClient(); + } + + @AfterEach + public void tearDown() throws Exception { + cleanupResources(); + } + + @Test + public void testCopyOnWriteRollbackActionExecutorForFileListingAsGenerateFile() throws IOException { + // Let's 
create some commit files and parquet files + String commitTime1 = "001"; + String commitTime2 = "002"; + new File(basePath + "/.hoodie").mkdirs(); + HoodieTestDataGenerator.writePartitionMetadata(fs, new String[]{"2015/03/16", "2015/03/17", "2016/03/15"}, + basePath); + HoodieTestUtils.createCommitFiles(basePath, commitTime1, commitTime2); + + // Make commit1 + String file11 = HoodieTestUtils.createDataFile(basePath, "2015/03/16", commitTime1, "id11"); + HoodieTestUtils.createNewLogFile(fs, basePath, "2015/03/16", + commitTime1, "id11", Option.of(3)); + String file12 = HoodieTestUtils.createDataFile(basePath, "2015/03/17", commitTime1, "id12"); + + // Make commit2 + String file21 = HoodieTestUtils.createDataFile(basePath, "2015/03/16", commitTime2, "id21"); + String file22 = HoodieTestUtils.createDataFile(basePath, "2015/03/17", commitTime2, "id22"); + HoodieTable table = this.getHoodieTable(metaClient, getConfig()); + HoodieInstant needRollBackInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "002"); + + // execute CopyOnWriteRollbackActionExecutor with file-listing mode + CopyOnWriteRollbackActionExecutor copyOnWriteRollbackActionExecutor = new CopyOnWriteRollbackActionExecutor(jsc, table.getConfig(), table, "003", needRollBackInstant, true); + assertFalse(copyOnWriteRollbackActionExecutor.getRollbackStrategy() instanceof MarkerBasedRollbackStrategy); + List hoodieRollbackStats = copyOnWriteRollbackActionExecutor.executeRollback(); + + // assert hoodieRollbackStats + assertEquals(3, hoodieRollbackStats.size()); + hoodieRollbackStats.forEach(stat -> { + if (stat.getPartitionPath().equals("2015/03/16")) { + assertEquals(1, stat.getSuccessDeleteFiles().size()); + assertEquals(0, stat.getFailedDeleteFiles().size()); + assertEquals(Collections.EMPTY_MAP, stat.getCommandBlocksCount()); + assertEquals("file:" + HoodieTestUtils.getDataFilePath(basePath, "2015/03/16", commitTime2, file21), + stat.getSuccessDeleteFiles().get(0)); + } else if (stat.getPartitionPath().equals("2015/03/17")) { + assertEquals(1, stat.getSuccessDeleteFiles().size()); + assertEquals(0, stat.getFailedDeleteFiles().size()); + assertEquals(Collections.EMPTY_MAP, stat.getCommandBlocksCount()); + assertEquals("file:" + HoodieTestUtils.getDataFilePath(basePath, "2015/03/17", commitTime2, file22), + stat.getSuccessDeleteFiles().get(0)); + } else if (stat.getPartitionPath().equals("2016/03/15")) { + assertEquals(0, stat.getSuccessDeleteFiles().size()); + assertEquals(0, stat.getFailedDeleteFiles().size()); + assertEquals(Collections.EMPTY_MAP, stat.getCommandBlocksCount()); + } + }); + + assertTrue(HoodieTestUtils.doesCommitExist(basePath, "001")); + assertTrue(HoodieTestUtils.doesInflightExist(basePath, "001")); + assertFalse(HoodieTestUtils.doesCommitExist(basePath, "002")); + assertFalse(HoodieTestUtils.doesInflightExist(basePath, "002")); + assertTrue(HoodieTestUtils.doesDataFileExist(basePath, "2015/03/16", commitTime1, file11) + && HoodieTestUtils.doesDataFileExist(basePath, "2015/03/17", commitTime1, file12)); + assertFalse(HoodieTestUtils.doesDataFileExist(basePath, "2015/03/16", commitTime2, file21) + || HoodieTestUtils.doesDataFileExist(basePath, "2015/03/17", commitTime2, file22)); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testCopyOnWriteRollbackActionExecutor(boolean isUsingMarkers) throws IOException { + //1. 
prepare data and assert data result + List firstPartitionCommit2FileSlices = new ArrayList<>(); + List secondPartitionCommit2FileSlices = new ArrayList<>(); + HoodieWriteConfig cfg = getConfigBuilder().withRollbackUsingMarkers(isUsingMarkers).withAutoCommit(false).build(); + this.twoUpsertCommitDataWithTwoPartitions(firstPartitionCommit2FileSlices, secondPartitionCommit2FileSlices, cfg, !isUsingMarkers); + HoodieTable table = this.getHoodieTable(metaClient, cfg); + + //2. rollback + HoodieInstant commitInstant; + if (isUsingMarkers) { + commitInstant = table.getActiveTimeline().getCommitTimeline().filterInflights().lastInstant().get(); + } else { + commitInstant = table.getCompletedCommitTimeline().lastInstant().get(); + } + + CopyOnWriteRollbackActionExecutor copyOnWriteRollbackActionExecutor = new CopyOnWriteRollbackActionExecutor(jsc, cfg, table, "003", commitInstant, false); + if (!isUsingMarkers) { + assertFalse(copyOnWriteRollbackActionExecutor.getRollbackStrategy() instanceof MarkerBasedRollbackStrategy); + } else { + assertTrue(copyOnWriteRollbackActionExecutor.getRollbackStrategy() instanceof MarkerBasedRollbackStrategy); + } + Map rollbackMetadata = copyOnWriteRollbackActionExecutor.execute().getPartitionMetadata(); + + //3. assert the rollback stat + assertEquals(2, rollbackMetadata.size()); + for (Map.Entry entry : rollbackMetadata.entrySet()) { + HoodieRollbackPartitionMetadata meta = entry.getValue(); + assertTrue(meta.getFailedDeleteFiles() == null + || meta.getFailedDeleteFiles().size() == 0); + assertTrue(meta.getSuccessDeleteFiles() == null + || meta.getSuccessDeleteFiles().size() == 1); + } + + //4. assert file groups after rollback, and compare to the rollback stats + // assert the first partition file group and file slice + List firstPartitionRollBack1FileGroups = table.getFileSystemView().getAllFileGroups(DEFAULT_FIRST_PARTITION_PATH).collect(Collectors.toList()); + assertEquals(1, firstPartitionRollBack1FileGroups.size()); + List firstPartitionRollBack1FileSlices = firstPartitionRollBack1FileGroups.get(0).getAllFileSlices().collect(Collectors.toList()); + assertEquals(1, firstPartitionRollBack1FileSlices.size()); + + if (!isUsingMarkers) { + firstPartitionCommit2FileSlices.removeAll(firstPartitionRollBack1FileSlices); + assertEquals(1, firstPartitionCommit2FileSlices.size()); + assertEquals(firstPartitionCommit2FileSlices.get(0).getBaseFile().get().getPath(), + rollbackMetadata.get(DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles().get(0)); + } else { + assertEquals(firstPartitionCommit2FileSlices.get(0).getBaseFile().get().getPath(), + String.format("%s:%s/%s", this.fs.getScheme(), basePath, rollbackMetadata.get(DEFAULT_FIRST_PARTITION_PATH).getSuccessDeleteFiles().get(0))); + } + + // assert the second partition file group and file slice + List secondPartitionRollBack1FileGroups = table.getFileSystemView().getAllFileGroups(DEFAULT_SECOND_PARTITION_PATH).collect(Collectors.toList()); + assertEquals(1, secondPartitionRollBack1FileGroups.size()); + List secondPartitionRollBack1FileSlices = secondPartitionRollBack1FileGroups.get(0).getAllFileSlices().collect(Collectors.toList()); + assertEquals(1, secondPartitionRollBack1FileSlices.size()); + + // assert the rolled-back file in the second partition matches its rollback stat + if (!isUsingMarkers) { + secondPartitionCommit2FileSlices.removeAll(secondPartitionRollBack1FileSlices); + assertEquals(1, secondPartitionCommit2FileSlices.size()); + 
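// The instanceof checks earlier in this test pin down the flag-to-strategy mapping.
// Conceptually (a sketch; the interface and getter names here are assumptions, and
// the listing-based strategy class is not shown in this patch):
//
//   RollbackStrategy strategy = config.shouldRollbackUsingMarkers()
//       ? new MarkerBasedRollbackStrategy(table, jsc, config, instantTime)
//       : listingBasedStrategy; // derived by listing files against the commit metadata
//
// The MOR executor tested further below asserts the same mapping for merge-on-read tables.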
assertEquals(secondPartitionCommit2FileSlices.get(0).getBaseFile().get().getPath(), + rollbackMetadata.get(DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles().get(0)); + } else { + assertEquals(secondPartitionCommit2FileSlices.get(0).getBaseFile().get().getPath(), + String.format("%s:%s/%s", this.fs.getScheme(), basePath, rollbackMetadata.get(DEFAULT_SECOND_PARTITION_PATH).getSuccessDeleteFiles().get(0))); + } + + assertFalse(new MarkerFiles(table, commitInstant.getTimestamp()).doesMarkerDirExist()); + } +} diff --git a/hudi-client/src/test/java/org/apache/hudi/table/action/rollback/TestMarkerBasedRollbackStrategy.java b/hudi-client/src/test/java/org/apache/hudi/table/action/rollback/TestMarkerBasedRollbackStrategy.java new file mode 100644 index 000000000..c6652ed79 --- /dev/null +++ b/hudi-client/src/test/java/org/apache/hudi/table/action/rollback/TestMarkerBasedRollbackStrategy.java @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.rollback; + +import org.apache.hudi.common.HoodieRollbackStat; +import org.apache.hudi.common.model.HoodieFileFormat; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.testutils.FileSystemTestUtils; +import org.apache.hudi.io.IOType; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.testutils.HoodieClientTestBase; +import org.apache.hudi.testutils.HoodieClientTestUtils; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class TestMarkerBasedRollbackStrategy extends HoodieClientTestBase { + + @BeforeEach + public void setUp() throws Exception { + initPath(); + initSparkContexts(); + initFileSystem(); + initMetaClient(); + initDFS(); + } + + @AfterEach + public void tearDown() throws Exception { + cleanupResources(); + } + + private void givenCommit0(boolean isDeltaCommit) throws Exception { + HoodieClientTestUtils.fakeDataFile(basePath, "partA", "000", "f2"); + if (isDeltaCommit) { + HoodieClientTestUtils.fakeDeltaCommit(basePath, "000"); + } else { + HoodieClientTestUtils.fakeCommit(basePath, "000"); + } + } + + private void givenInflightCommit1(boolean isDeltaCommit) throws Exception { + HoodieClientTestUtils.fakeDataFile(basePath, "partB", "001", "f1"); + HoodieClientTestUtils.createMarkerFile(basePath, "partB", "001", "f1", IOType.CREATE); + + HoodieClientTestUtils.createMarkerFile(basePath, "partA", "001", "f3", 
IOType.CREATE); + + if (isDeltaCommit) { + HoodieClientTestUtils.fakeLogFile(basePath, "partA", "001", "f2", 0); + HoodieClientTestUtils.createMarkerFile(basePath, "partA", "001", "f2", IOType.APPEND); + HoodieClientTestUtils.createMarkerFile(basePath, "partB", "001", "f4", IOType.APPEND); + HoodieClientTestUtils.fakeInflightDeltaCommit(basePath, "001"); + } else { + HoodieClientTestUtils.fakeDataFile(basePath, "partA", "001", "f2"); + HoodieClientTestUtils.createMarkerFile(basePath, "partA", "001", "f2", IOType.MERGE); + HoodieClientTestUtils.fakeInFlightCommit(basePath, "001"); + } + } + + @Test + public void testCopyOnWriteRollback() throws Exception { + // given: wrote some base files and corresponding markers + givenCommit0(false); + givenInflightCommit1(false); + + // when + List stats = new MarkerBasedRollbackStrategy(HoodieTable.create(metaClient, getConfig(), hadoopConf), jsc, getConfig(), "002") + .execute(new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "001")); + + // then: ensure files are deleted correctly, non-existent files reported as failed deletes + assertEquals(2, stats.size()); + + List partAFiles = FileSystemTestUtils.listRecursive(fs, new Path(basePath + "/partA")); + List partBFiles = FileSystemTestUtils.listRecursive(fs, new Path(basePath + "/partB")); + + assertEquals(0, partBFiles.size()); + assertEquals(1, partAFiles.size()); + assertEquals(2, stats.stream().mapToInt(r -> r.getSuccessDeleteFiles().size()).sum()); + assertEquals(1, stats.stream().mapToInt(r -> r.getFailedDeleteFiles().size()).sum()); + } + + @Test + public void testMergeOnReadRollback() throws Exception { + // given: wrote some base + log files and corresponding markers + givenCommit0(true); + givenInflightCommit1(true); + + // when + List stats = new MarkerBasedRollbackStrategy(HoodieTable.create(metaClient, getConfig(), hadoopConf), jsc, getConfig(), "002") + .execute(new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.DELTA_COMMIT_ACTION, "001")); + + // then: ensure files are deleted, rollback block is appended (even if append does not exist) + assertEquals(2, stats.size()); + // will have the log file + List partBFiles = FileSystemTestUtils.listRecursive(fs, new Path(basePath + "/partB")); + assertEquals(1, partBFiles.size()); + assertTrue(partBFiles.get(0).getPath().getName().contains(HoodieFileFormat.HOODIE_LOG.getFileExtension())); + assertTrue(partBFiles.get(0).getLen() > 0); + + List partAFiles = FileSystemTestUtils.listRecursive(fs, new Path(basePath + "/partA")); + assertEquals(3, partAFiles.size()); + assertEquals(2, partAFiles.stream().filter(s -> s.getPath().getName().contains(HoodieFileFormat.HOODIE_LOG.getFileExtension())).count()); + assertEquals(1, partAFiles.stream().filter(s -> s.getPath().getName().contains(HoodieFileFormat.HOODIE_LOG.getFileExtension())).filter(f -> f.getLen() > 0).count()); + + // only partB/f1_001 will be deleted + assertEquals(1, stats.stream().mapToInt(r -> r.getSuccessDeleteFiles().size()).sum()); + // partA/f3_001 is non existent + assertEquals(1, stats.stream().mapToInt(r -> r.getFailedDeleteFiles().size()).sum()); + } +} diff --git a/hudi-client/src/test/java/org/apache/hudi/table/action/rollback/TestMergeOnReadRollbackActionExecutor.java b/hudi-client/src/test/java/org/apache/hudi/table/action/rollback/TestMergeOnReadRollbackActionExecutor.java new file mode 100644 index 000000000..759e7f6b1 --- /dev/null +++ 
b/hudi-client/src/test/java/org/apache/hudi/table/action/rollback/TestMergeOnReadRollbackActionExecutor.java @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.table.action.rollback; + +import org.apache.hudi.avro.model.HoodieRollbackPartitionMetadata; +import org.apache.hudi.common.model.FileSlice; +import org.apache.hudi.common.model.HoodieFileGroup; +import org.apache.hudi.common.model.HoodieLogFile; +import org.apache.hudi.common.model.HoodieTableType; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.config.HoodieWriteConfig; +import org.apache.hudi.table.HoodieTable; +import org.apache.hudi.table.MarkerFiles; +import org.apache.hudi.testutils.HoodieTestDataGenerator; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static org.apache.hudi.testutils.HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH; +import static org.apache.hudi.testutils.HoodieTestDataGenerator.DEFAULT_SECOND_PARTITION_PATH; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.assertFalse; + +public class TestMergeOnReadRollbackActionExecutor extends HoodieClientRollbackTestBase { + @Override + protected HoodieTableType getTableType() { + return HoodieTableType.MERGE_ON_READ; + } + + @BeforeEach + public void setUp() throws Exception { + initPath(); + initSparkContexts(); + // just generate two partitions + dataGen = new HoodieTestDataGenerator(new String[]{DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH}); + initFileSystem(); + initMetaClient(); + } + + @AfterEach + public void tearDown() throws Exception { + cleanupResources(); + } + + @ParameterizedTest + @ValueSource(booleans = {true, false}) + public void testMergeOnReadRollbackActionExecutor(boolean isUsingMarkers) throws IOException { + //1. 
prepare data and assert data result + List firstPartitionCommit2FileSlices = new ArrayList<>(); + List secondPartitionCommit2FileSlices = new ArrayList<>(); + HoodieWriteConfig cfg = getConfigBuilder().withRollbackUsingMarkers(isUsingMarkers).withAutoCommit(false).build(); + twoUpsertCommitDataWithTwoPartitions(firstPartitionCommit2FileSlices, secondPartitionCommit2FileSlices, cfg, !isUsingMarkers); + List firstPartitionCommit2LogFiles = new ArrayList<>(); + List secondPartitionCommit2LogFiles = new ArrayList<>(); + firstPartitionCommit2FileSlices.get(0).getLogFiles().collect(Collectors.toList()).forEach(logFile -> firstPartitionCommit2LogFiles.add(logFile)); + assertEquals(1, firstPartitionCommit2LogFiles.size()); + secondPartitionCommit2FileSlices.get(0).getLogFiles().collect(Collectors.toList()).forEach(logFile -> secondPartitionCommit2LogFiles.add(logFile)); + assertEquals(1, secondPartitionCommit2LogFiles.size()); + HoodieTable table = this.getHoodieTable(metaClient, cfg); + + //2. rollback + HoodieInstant rollBackInstant = new HoodieInstant(isUsingMarkers, HoodieTimeline.DELTA_COMMIT_ACTION, "002"); + MergeOnReadRollbackActionExecutor mergeOnReadRollbackActionExecutor = new MergeOnReadRollbackActionExecutor( + jsc, + cfg, + table, + "003", + rollBackInstant, + true); + // assert the expected rollback strategy is chosen (file listing vs markers) + if (!isUsingMarkers) { + assertFalse(mergeOnReadRollbackActionExecutor.getRollbackStrategy() instanceof MarkerBasedRollbackStrategy); + } else { + assertTrue(mergeOnReadRollbackActionExecutor.getRollbackStrategy() instanceof MarkerBasedRollbackStrategy); + } + + //3. assert the rollback stat + Map rollbackMetadata = mergeOnReadRollbackActionExecutor.execute().getPartitionMetadata(); + assertEquals(2, rollbackMetadata.size()); + + for (Map.Entry entry : rollbackMetadata.entrySet()) { + HoodieRollbackPartitionMetadata meta = entry.getValue(); + assertTrue(meta.getFailedDeleteFiles() == null || meta.getFailedDeleteFiles().size() == 0); + assertTrue(meta.getSuccessDeleteFiles() == null || meta.getSuccessDeleteFiles().size() == 0); + } + + //4. 
assert file groups after rollback, and compare to the rollback stats + // assert the first partition data and log file size + List firstPartitionRollBack1FileGroups = table.getFileSystemView().getAllFileGroups(DEFAULT_FIRST_PARTITION_PATH).collect(Collectors.toList()); + assertEquals(1, firstPartitionRollBack1FileGroups.size()); + List firstPartitionRollBack1FileSlices = firstPartitionRollBack1FileGroups.get(0).getAllFileSlices().collect(Collectors.toList()); + assertEquals(1, firstPartitionRollBack1FileSlices.size()); + FileSlice firstPartitionRollBack1FileSlice = firstPartitionRollBack1FileSlices.get(0); + List firstPartitionRollBackLogFiles = firstPartitionRollBack1FileSlice.getLogFiles().collect(Collectors.toList()); + assertEquals(2, firstPartitionRollBackLogFiles.size()); + + firstPartitionRollBackLogFiles.removeAll(firstPartitionCommit2LogFiles); + assertEquals(1, firstPartitionRollBackLogFiles.size()); + + // assert the second partition data and log file size + List secondPartitionRollBack1FileGroups = table.getFileSystemView().getAllFileGroups(DEFAULT_SECOND_PARTITION_PATH).collect(Collectors.toList()); + assertEquals(1, secondPartitionRollBack1FileGroups.size()); + List secondPartitionRollBack1FileSlices = secondPartitionRollBack1FileGroups.get(0).getAllFileSlices().collect(Collectors.toList()); + assertEquals(1, secondPartitionRollBack1FileSlices.size()); + FileSlice secondPartitionRollBack1FileSlice = secondPartitionRollBack1FileSlices.get(0); + List secondPartitionRollBackLogFiles = secondPartitionRollBack1FileSlice.getLogFiles().collect(Collectors.toList()); + assertEquals(2, secondPartitionRollBackLogFiles.size()); + + secondPartitionRollBackLogFiles.removeAll(secondPartitionCommit2LogFiles); + assertEquals(1, secondPartitionRollBackLogFiles.size()); + + assertFalse(new MarkerFiles(table, "002").doesMarkerDirExist()); + } + + @Test + public void testFailForCompletedInstants() { + Assertions.assertThrows(IllegalArgumentException.class, () -> { + HoodieInstant rollBackInstant = new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, "002"); + new MergeOnReadRollbackActionExecutor( + jsc, + getConfigBuilder().build(), + getHoodieTable(metaClient, getConfigBuilder().build()), + "003", + rollBackInstant, + true, + true, + true); + }); + } +} diff --git a/hudi-client/src/test/java/org/apache/hudi/table/action/rollback/TestRollbackUtils.java b/hudi-client/src/test/java/org/apache/hudi/table/action/rollback/TestRollbackUtils.java new file mode 100644 index 000000000..8db2069b0 --- /dev/null +++ b/hudi-client/src/test/java/org/apache/hudi/table/action/rollback/TestRollbackUtils.java @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
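Aside: the command-block header asserted in testGenerateHeader below is what gets stamped onto the rollback block appended to a log file. A sketch of the helper's output (reading "0" as the ordinal of the rollback command type is an assumption):

    Map<HoodieLogBlock.HeaderMetadataType, String> header = RollbackUtils.generateHeader("1", "101");
    // INSTANT_TIME        -> "101"  (the instant performing the rollback)
    // TARGET_INSTANT_TIME -> "1"    (the instant being rolled back)
    // COMMAND_BLOCK_TYPE  -> "0"    (rollback-previous-block command)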
+ */ + +package org.apache.hudi.table.action.rollback; + +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.permission.FsPermission; +import org.apache.hudi.common.HoodieRollbackStat; +import org.apache.hudi.common.table.log.block.HoodieLogBlock; +import org.apache.hudi.common.table.timeline.HoodieInstant; +import org.apache.hudi.common.table.timeline.HoodieTimeline; +import org.apache.hudi.common.util.CollectionUtils; +import org.junit.jupiter.api.Test; + +import java.util.HashMap; +import java.util.Map; +import java.util.stream.Collectors; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertIterableEquals; + +public class TestRollbackUtils { + + private FileStatus generateFileStatus(String filePath) { + Path dataFile1Path = new Path(filePath); + return new FileStatus(1, true, 1, 1, 1, 1, + FsPermission.valueOf("-rw-rw-rw-"), "one", "one", null, dataFile1Path); + } + + @Test + public void testGenerateHeader() { + HoodieInstant hoodieInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "101"); + String instantToRollback = "1"; + Map header = RollbackUtils.generateHeader(instantToRollback, hoodieInstant.getTimestamp()); + Map headerExpect = new HashMap<>(3); + headerExpect.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "101"); + headerExpect.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, "1"); + headerExpect.put(HoodieLogBlock.HeaderMetadataType.COMMAND_BLOCK_TYPE, "0"); + assertEquals(headerExpect, header); + } + + @Test + public void testMergeRollbackStat() { + String partitionPath1 = "/partitionPath1/"; + String partitionPath2 = "/partitionPath2/"; + // prepare a HoodieRollbackStat for each partition + Map dataFilesOnlyStat1Files = new HashMap<>(); + dataFilesOnlyStat1Files.put(generateFileStatus(partitionPath1 + "dataFile1.parquet"), true); + dataFilesOnlyStat1Files.put(generateFileStatus(partitionPath1 + "dataFile2.parquet"), true); + HoodieRollbackStat dataFilesOnlyStat1 = HoodieRollbackStat.newBuilder() + .withPartitionPath(partitionPath1) + .withDeletedFileResults(dataFilesOnlyStat1Files).build(); + + Map dataFilesOnlyStat2Files = new HashMap<>(); + dataFilesOnlyStat2Files.put(generateFileStatus(partitionPath2 + "dataFile1.parquet"), true); + dataFilesOnlyStat2Files.put(generateFileStatus(partitionPath2 + "dataFile2.parquet"), true); + HoodieRollbackStat dataFilesOnlyStat2 = HoodieRollbackStat.newBuilder() + .withPartitionPath(partitionPath2) + .withDeletedFileResults(dataFilesOnlyStat2Files).build(); + //1. 
test that merging stats from different partition paths fails + assertThrows(IllegalArgumentException.class, () -> { + RollbackUtils.mergeRollbackStat(dataFilesOnlyStat1, + dataFilesOnlyStat2); + }, "merging rollback stats from different partitions should fail"); + + // prepare HoodieRollbackStats with a failed delete and a rollback-block append + Map dataFilesOnlyStat3Files = new HashMap<>(); + dataFilesOnlyStat3Files.put(generateFileStatus(partitionPath1 + "dataFile1.log"), true); + dataFilesOnlyStat3Files.put(generateFileStatus(partitionPath1 + "dataFile3.parquet"), false); + HoodieRollbackStat dataFilesOnlyStat3 = HoodieRollbackStat.newBuilder() + .withPartitionPath(partitionPath1) + .withDeletedFileResults(dataFilesOnlyStat3Files).build(); + + Map dataFilesOnlyStat4Files = new HashMap<>(); + dataFilesOnlyStat4Files.put(generateFileStatus(partitionPath1 + "dataFile1.log"), 10L); + HoodieRollbackStat dataFilesOnlyStat4 = HoodieRollbackStat.newBuilder() + .withPartitionPath(partitionPath1) + .withRollbackBlockAppendResults(dataFilesOnlyStat4Files).build(); + + //2. test merge dataFilesOnlyStat1 and dataFilesOnlyStat3 + HoodieRollbackStat dataFilesOnlyStatMerge1 = + RollbackUtils.mergeRollbackStat(dataFilesOnlyStat1, dataFilesOnlyStat3); + assertEquals(partitionPath1, dataFilesOnlyStatMerge1.getPartitionPath()); + assertIterableEquals(CollectionUtils.createImmutableList(partitionPath1 + "dataFile3.parquet"), + dataFilesOnlyStatMerge1.getFailedDeleteFiles()); + assertIterableEquals(CollectionUtils.createImmutableList(partitionPath1 + "dataFile1.parquet", + partitionPath1 + "dataFile2.parquet", partitionPath1 + "dataFile1.log").stream().sorted().collect(Collectors.toList()), + dataFilesOnlyStatMerge1.getSuccessDeleteFiles().stream().sorted().collect(Collectors.toList())); + assertEquals(0, dataFilesOnlyStatMerge1.getCommandBlocksCount().size()); + + //3. 
test merge dataFilesOnlyStatMerge1 and dataFilesOnlyStat4 + HoodieRollbackStat dataFilesOnlyStatMerge2 = + RollbackUtils.mergeRollbackStat(dataFilesOnlyStatMerge1, dataFilesOnlyStat4); + assertEquals(partitionPath1, dataFilesOnlyStatMerge2.getPartitionPath()); + assertIterableEquals(CollectionUtils.createImmutableList(partitionPath1 + "dataFile3.parquet").stream().sorted().collect(Collectors.toList()), + dataFilesOnlyStatMerge2.getFailedDeleteFiles().stream().sorted().collect(Collectors.toList())); + assertIterableEquals(CollectionUtils.createImmutableList(partitionPath1 + "dataFile1.parquet", + partitionPath1 + "dataFile2.parquet", partitionPath1 + "dataFile1.log").stream().sorted().collect(Collectors.toList()), + dataFilesOnlyStatMerge2.getSuccessDeleteFiles().stream().sorted().collect(Collectors.toList())); + assertEquals(CollectionUtils.createImmutableMap(generateFileStatus(partitionPath1 + "dataFile1.log"), 10L), + dataFilesOnlyStatMerge2.getCommandBlocksCount()); + } +} diff --git a/hudi-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestHarness.java b/hudi-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestHarness.java index a3b534cf5..71be8f330 100644 --- a/hudi-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestHarness.java +++ b/hudi-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestHarness.java @@ -37,7 +37,6 @@ import org.apache.hadoop.hdfs.DistributedFileSystem; import org.apache.hadoop.hdfs.MiniDFSCluster; import org.apache.hudi.config.HoodieWriteConfig; import org.apache.hudi.index.HoodieIndex; -import org.apache.hudi.table.HoodieTable; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.SQLContext; @@ -71,7 +70,6 @@ public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness im protected transient HoodieWriteClient writeClient; protected transient HoodieReadClient readClient; protected transient HoodieTableFileSystemView tableView; - protected transient HoodieTable hoodieTable; protected final SparkTaskContextSupplier supplier = new SparkTaskContextSupplier(); diff --git a/hudi-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestUtils.java b/hudi-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestUtils.java index 5c57c25ea..dddc04d7f 100644 --- a/hudi-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestUtils.java +++ b/hudi-client/src/test/java/org/apache/hudi/testutils/HoodieClientTestUtils.java @@ -40,6 +40,7 @@ import org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView; import org.apache.hudi.common.testutils.HoodieTestUtils; import org.apache.hudi.config.HoodieStorageConfig; import org.apache.hudi.exception.HoodieException; +import org.apache.hudi.io.IOType; import org.apache.hudi.io.storage.HoodieParquetConfig; import org.apache.hudi.io.storage.HoodieParquetWriter; @@ -80,6 +81,7 @@ public class HoodieClientTestUtils { private static final Logger LOG = LogManager.getLogger(HoodieClientTestUtils.class); private static final Random RANDOM = new Random(); + public static final String DEFAULT_WRITE_TOKEN = "1-0-1"; public static List collectStatuses(Iterator> statusListItr) { List statuses = new ArrayList<>(); @@ -124,11 +126,19 @@ public class HoodieClientTestUtils { new File(parentPath + "/" + instantTime + suffix).createNewFile(); } - public static void fakeCommitFile(String basePath, String instantTime) throws IOException { + public static void fakeCommit(String basePath, String instantTime) throws IOException { 
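// Together, these fake* helpers let a test stage a timeline without running a
// real write. For example, TestMarkerBasedRollbackStrategy above builds its
// fixture roughly as follows (a sketch; the partitions/instants are that test's values):
//
//   HoodieClientTestUtils.fakeDataFile(basePath, "partA", "000", "f2");   // base file at instant 000
//   HoodieClientTestUtils.fakeCommit(basePath, "000");                    // completed commit 000
//   HoodieClientTestUtils.createMarkerFile(basePath, "partB", "001", "f1", IOType.CREATE);
//   HoodieClientTestUtils.fakeInFlightCommit(basePath, "001");            // commit 001 left inflight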
fakeMetaFile(basePath, instantTime, HoodieTimeline.COMMIT_EXTENSION); } - public static void fakeInFlightFile(String basePath, String instantTime) throws IOException { + public static void fakeDeltaCommit(String basePath, String instantTime) throws IOException { + fakeMetaFile(basePath, instantTime, HoodieTimeline.DELTA_COMMIT_EXTENSION); + } + + public static void fakeInflightDeltaCommit(String basePath, String instantTime) throws IOException { + fakeMetaFile(basePath, instantTime, HoodieTimeline.INFLIGHT_DELTA_COMMIT_EXTENSION); + } + + public static void fakeInFlightCommit(String basePath, String instantTime) throws IOException { fakeMetaFile(basePath, instantTime, HoodieTimeline.INFLIGHT_EXTENSION); } @@ -146,6 +156,20 @@ public class HoodieClientTestUtils { new RandomAccessFile(path, "rw").setLength(length); } + public static void fakeLogFile(String basePath, String partitionPath, String baseInstantTime, String fileId, int version) + throws Exception { + fakeLogFile(basePath, partitionPath, baseInstantTime, fileId, version, 0); + } + + public static void fakeLogFile(String basePath, String partitionPath, String baseInstantTime, String fileId, int version, int length) + throws Exception { + String parentPath = String.format("%s/%s", basePath, partitionPath); + new File(parentPath).mkdirs(); + String path = String.format("%s/%s", parentPath, FSUtils.makeLogFileName(fileId, HoodieFileFormat.HOODIE_LOG.getFileExtension(), baseInstantTime, version, "1-0-1")); + new File(path).createNewFile(); + new RandomAccessFile(path, "rw").setLength(length); + } + /** * Returns a Spark config for this test. * @@ -308,4 +332,25 @@ public class HoodieClientTestUtils { return HoodieClientTestUtils.writeParquetFile(basePath, partitionPath, filename, records, schema, filter, createCommitTime); } + + public static String createNewMarkerFile(String basePath, String partitionPath, String instantTime) + throws IOException { + return createMarkerFile(basePath, partitionPath, instantTime); + } + + public static String createMarkerFile(String basePath, String partitionPath, String instantTime) + throws IOException { + return createMarkerFile(basePath, partitionPath, instantTime, UUID.randomUUID().toString(), IOType.MERGE); + } + + public static String createMarkerFile(String basePath, String partitionPath, String instantTime, String fileID, IOType ioType) + throws IOException { + String folderPath = basePath + "/" + HoodieTableMetaClient.TEMPFOLDER_NAME + "/" + instantTime + "/" + partitionPath + "/"; + new File(folderPath).mkdirs(); + String markerFileName = String.format("%s_%s_%s%s%s.%s", fileID, DEFAULT_WRITE_TOKEN, instantTime, + HoodieFileFormat.PARQUET.getFileExtension(), HoodieTableMetaClient.MARKER_EXTN, ioType); + File f = new File(folderPath + markerFileName); + f.createNewFile(); + return f.getAbsolutePath(); + } } diff --git a/hudi-client/src/test/java/org/apache/hudi/testutils/HoodieMergeOnReadTestUtils.java b/hudi-client/src/test/java/org/apache/hudi/testutils/HoodieMergeOnReadTestUtils.java index 20bcafde3..b88cfffff 100644 --- a/hudi-client/src/test/java/org/apache/hudi/testutils/HoodieMergeOnReadTestUtils.java +++ b/hudi-client/src/test/java/org/apache/hudi/testutils/HoodieMergeOnReadTestUtils.java @@ -90,7 +90,7 @@ public class HoodieMergeOnReadTestUtils { }).reduce((a, b) -> { a.addAll(b); return a; - }).orElse(new ArrayList()); + }).orElse(new ArrayList<>()); } private static void setPropsForInputFormat(FileInputFormat inputFormat, JobConf jobConf, Schema schema, diff --git 
a/hudi-client/src/test/java/org/apache/hudi/testutils/HoodieTestDataGenerator.java b/hudi-client/src/test/java/org/apache/hudi/testutils/HoodieTestDataGenerator.java index 0eb88e24d..2b413101b 100644 --- a/hudi-client/src/test/java/org/apache/hudi/testutils/HoodieTestDataGenerator.java +++ b/hudi-client/src/test/java/org/apache/hudi/testutils/HoodieTestDataGenerator.java @@ -427,15 +427,33 @@ public class HoodieTestDataGenerator { return generateInsertsStream(commitTime, n, false, schemaStr); } + public List generateInsertsContainsAllPartitions(String instantTime, Integer n) { + if (n < partitionPaths.length) { + throw new HoodieIOException("n must be at least the number of partition paths"); + } + return generateInsertsStream( + instantTime, n, false, TRIP_EXAMPLE_SCHEMA, true).collect(Collectors.toList()); + } + /** * Generates new inserts, uniformly across the partition paths above. It also updates the list of existing keys. */ public Stream generateInsertsStream( - String instantTime, Integer n, boolean isFlattened, String schemaStr) { - int currSize = getNumExistingKeys(schemaStr); + String instantTime, Integer n, boolean isFlattened, String schemaStr) { + return generateInsertsStream(instantTime, n, isFlattened, schemaStr, false); + } + /** + * Generates new inserts, uniformly across the partition paths above. It also updates the list of existing keys. + */ + public Stream generateInsertsStream( + String instantTime, Integer n, boolean isFlattened, String schemaStr, boolean containsAllPartitions) { + int currSize = getNumExistingKeys(schemaStr); return IntStream.range(0, n).boxed().map(i -> { String partitionPath = partitionPaths[RAND.nextInt(partitionPaths.length)]; + if (containsAllPartitions && i < partitionPaths.length) { + partitionPath = partitionPaths[i]; + } HoodieKey key = new HoodieKey(UUID.randomUUID().toString(), partitionPath); KeyPartition kp = new KeyPartition(); kp.key = key; diff --git a/hudi-common/src/main/java/org/apache/hudi/common/HoodieRollbackStat.java b/hudi-common/src/main/java/org/apache/hudi/common/HoodieRollbackStat.java index 68ea5318f..a3191fa02 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/HoodieRollbackStat.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/HoodieRollbackStat.java @@ -21,6 +21,7 @@ package org.apache.hudi.common; import org.apache.hadoop.fs.FileStatus; import java.io.Serializable; +import java.util.Collections; import java.util.List; import java.util.Map; import java.util.stream.Collectors; @@ -85,6 +86,15 @@ public class HoodieRollbackStat implements Serializable { return this; } + public Builder withDeletedFileResult(String fileName, boolean isDeleted) { + if (isDeleted) { + successDeleteFiles = Collections.singletonList(fileName); + } else { + failedDeleteFiles = Collections.singletonList(fileName); + } + return this; + } + public Builder withRollbackBlockAppendResults(Map commandBlocksCount) { this.commandBlocksCount = commandBlocksCount; return this; @@ -96,6 +106,15 @@ public class HoodieRollbackStat implements Serializable { } public HoodieRollbackStat build() { + if (successDeleteFiles == null) { + successDeleteFiles = Collections.emptyList(); + } + if (failedDeleteFiles == null) { + failedDeleteFiles = Collections.emptyList(); + } + if (commandBlocksCount == null) { + commandBlocksCount = Collections.emptyMap(); + } return new HoodieRollbackStat(partitionPath, successDeleteFiles, failedDeleteFiles, commandBlocksCount); } } diff --git a/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java 
b/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java index fffd43f59..6807f6f5d 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/fs/FSUtils.java @@ -24,7 +24,6 @@ import org.apache.hudi.common.model.HoodiePartitionMetadata; import org.apache.hudi.common.table.HoodieTableMetaClient; import org.apache.hudi.common.table.timeline.HoodieInstant; import org.apache.hudi.common.util.Option; -import org.apache.hudi.common.util.ValidationUtils; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; @@ -46,7 +45,6 @@ import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; -import java.util.LinkedList; import java.util.List; import java.util.Map.Entry; import java.util.Objects; @@ -116,22 +114,6 @@ public class FSUtils { return String.format("%s_%s_%s%s", fileId, writeToken, instantTime, fileExtension); } - public static String makeMarkerFile(String instantTime, String writeToken, String fileId) { - return String.format("%s_%s_%s%s", fileId, writeToken, instantTime, HoodieTableMetaClient.MARKER_EXTN); - } - - public static String translateMarkerToDataPath(String basePath, String markerPath, String instantTs, - String baseFileExtension) { - ValidationUtils.checkArgument(markerPath.endsWith(HoodieTableMetaClient.MARKER_EXTN)); - String markerRootPath = Path.getPathWithoutSchemeAndAuthority( - new Path(String.format("%s/%s/%s", basePath, HoodieTableMetaClient.TEMPFOLDER_NAME, instantTs))).toString(); - int begin = markerPath.indexOf(markerRootPath); - ValidationUtils.checkArgument(begin >= 0, - "Not in marker dir. Marker Path=" + markerPath + ", Expected Marker Root=" + markerRootPath); - String rPath = markerPath.substring(begin + markerRootPath.length() + 1); - return String.format("%s/%s%s", basePath, rPath.replace(HoodieTableMetaClient.MARKER_EXTN, ""), baseFileExtension); - } - public static String maskWithoutFileId(String instantTime, int taskPartitionId) { return String.format("*_%s_%s%s", taskPartitionId, instantTime, HoodieFileFormat.PARQUET.getFileExtension()); } @@ -171,15 +153,15 @@ public class FSUtils { /** * Given a base partition and a partition path, return relative path of partition path to the base path. */ - public static String getRelativePartitionPath(Path basePath, Path partitionPath) { + public static String getRelativePartitionPath(Path basePath, Path fullPartitionPath) { basePath = Path.getPathWithoutSchemeAndAuthority(basePath); - partitionPath = Path.getPathWithoutSchemeAndAuthority(partitionPath); - String partitionFullPath = partitionPath.toString(); - int partitionStartIndex = partitionFullPath.indexOf(basePath.getName(), + fullPartitionPath = Path.getPathWithoutSchemeAndAuthority(fullPartitionPath); + String fullPartitionPathStr = fullPartitionPath.toString(); + int partitionStartIndex = fullPartitionPathStr.indexOf(basePath.getName(), basePath.getParent() == null ? 0 : basePath.getParent().toString().length()); // Partition-Path could be empty for non-partitioned tables - return partitionStartIndex + basePath.getName().length() == partitionFullPath.length() ? "" - : partitionFullPath.substring(partitionStartIndex + basePath.getName().length() + 1); + return partitionStartIndex + basePath.getName().length() == fullPartitionPathStr.length() ? 
"" + : fullPartitionPathStr.substring(partitionStartIndex + basePath.getName().length() + 1); } /** @@ -199,19 +181,6 @@ public class FSUtils { return partitions; } - public static List getAllDataFilesForMarkers(FileSystem fs, String basePath, String instantTs, - String markerDir, String baseFileExtension) throws IOException { - List dataFiles = new LinkedList<>(); - processFiles(fs, markerDir, (status) -> { - String pathStr = status.getPath().toString(); - if (pathStr.endsWith(HoodieTableMetaClient.MARKER_EXTN)) { - dataFiles.add(FSUtils.translateMarkerToDataPath(basePath, pathStr, instantTs, baseFileExtension)); - } - return true; - }, false); - return dataFiles; - } - /** * Recursively processes all files in the base-path. If excludeMetaFolder is set, the meta-folder and all its subdirs * are skipped @@ -222,8 +191,8 @@ public class FSUtils { * @param excludeMetaFolder Exclude .hoodie folder * @throws IOException - */ - static void processFiles(FileSystem fs, String basePathStr, Function consumer, - boolean excludeMetaFolder) throws IOException { + public static void processFiles(FileSystem fs, String basePathStr, Function consumer, + boolean excludeMetaFolder) throws IOException { PathFilter pathFilter = excludeMetaFolder ? getExcludeMetaPathFilter() : ALLOW_ALL_FILTER; FileStatus[] topLevelStatuses = fs.listStatus(new Path(basePathStr)); for (FileStatus child : topLevelStatuses) { @@ -390,7 +359,7 @@ public class FSUtils { public static boolean isLogFile(Path logPath) { Matcher matcher = LOG_FILE_PATTERN.matcher(logPath.getName()); - return matcher.find(); + return matcher.find() && logPath.getName().contains(".log"); } /** @@ -501,18 +470,6 @@ public class FSUtils { } } - public static void deleteOlderRestoreMetaFiles(FileSystem fs, String metaPath, Stream instants) { - // TODO - this should be archived when archival is made general for all meta-data - // skip MIN_ROLLBACK_TO_KEEP and delete rest - instants.skip(MIN_ROLLBACK_TO_KEEP).map(s -> { - try { - return fs.delete(new Path(metaPath, s.getFileName()), false); - } catch (IOException e) { - throw new HoodieIOException("Could not delete restore meta files " + s.getFileName(), e); - } - }); - } - public static void createPathIfNotExists(FileSystem fs, Path partitionPath) throws IOException { if (!fs.exists(partitionPath)) { fs.mkdirs(partitionPath); @@ -535,8 +492,8 @@ public class FSUtils { /** * Get DFS full partition path (e.g. 
diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileSystemTestUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileSystemTestUtils.java
index 8432b26f8..76fdf18d4 100644
--- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileSystemTestUtils.java
+++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/FileSystemTestUtils.java
@@ -18,6 +18,10 @@
 
 package org.apache.hudi.common.testutils;
 
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
+import org.apache.hadoop.fs.RemoteIterator;
 import org.apache.hudi.common.fs.inline.InLineFSUtils;
 import org.apache.hudi.common.fs.inline.InLineFileSystem;
 import org.apache.hudi.common.fs.inline.InMemoryFileSystem;
@@ -26,6 +30,8 @@ import org.apache.hadoop.fs.Path;
 
 import java.io.File;
 import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
 import java.util.Random;
 import java.util.UUID;
 
@@ -67,4 +73,13 @@ public class FileSystemTestUtils {
       throw new IOException(message);
     }
   }
+
+  public static List<FileStatus> listRecursive(FileSystem fs, Path path) throws IOException {
+    RemoteIterator<LocatedFileStatus> itr = fs.listFiles(path, true);
+    List<FileStatus> statuses = new ArrayList<>();
+    while (itr.hasNext()) {
+      statuses.add(itr.next());
+    }
+    return statuses;
+  }
 }
diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieCommonTestHarness.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieCommonTestHarness.java
index 8e1d080bc..31986b908 100644
--- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieCommonTestHarness.java
+++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieCommonTestHarness.java
@@ -24,6 +24,7 @@ import org.apache.hudi.common.table.timeline.HoodieTimeline;
 import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
 import org.apache.hudi.common.table.view.SyncableFileSystemView;
 
+import org.apache.hudi.exception.HoodieIOException;
 import org.junit.jupiter.api.io.TempDir;
 
 import java.io.IOException;
@@ -77,6 +78,17 @@ public class HoodieCommonTestHarness {
     return getFileSystemView(timeline, true);
   }
 
+  protected SyncableFileSystemView getFileSystemViewWithUnCommittedSlices(HoodieTableMetaClient metaClient) {
+    try {
+      return new HoodieTableFileSystemView(metaClient,
+          metaClient.getActiveTimeline(),
+          HoodieTestUtils.listAllDataFilesAndLogFilesInPath(metaClient.getFs(), metaClient.getBasePath())
+      );
+    } catch (IOException ioe) {
+      throw new HoodieIOException("Error getting file system view", ioe);
+    }
+  }
+
   /**
    * Gets a default {@link HoodieTableType#COPY_ON_WRITE} table type. Sub-classes can override this method to specify a
    * new table type.
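A rough sketch of how a test built on this harness could combine the two new helpers; the partition literal is invented, and getAllFileSlices is the existing file-system-view API:

import java.io.IOException;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.table.view.SyncableFileSystemView;
import org.apache.hudi.common.testutils.FileSystemTestUtils;

// Inside a test class extending HoodieCommonTestHarness (metaClient initialized):
protected void inspectUncommittedState() throws IOException {
  // The view is built from a raw recursive listing, so file slices written by
  // in-flight (uncommitted) instants are visible, unlike the committed-only view.
  SyncableFileSystemView view = getFileSystemViewWithUnCommittedSlices(metaClient);
  view.getAllFileSlices("2020/05/01").forEach(System.out::println);

  // listRecursive replaces hand-rolled RemoteIterator loops in test code.
  for (FileStatus status : FileSystemTestUtils.listRecursive(
      metaClient.getFs(), new Path(metaClient.getBasePath()))) {
    System.out.println(status.getPath());
  }
}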
diff --git a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java
index caae246ec..cbf3af6b7 100644
--- a/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java
+++ b/hudi-common/src/test/java/org/apache/hudi/common/testutils/HoodieTestUtils.java
@@ -18,6 +18,12 @@
 
 package org.apache.hudi.common.testutils;
 
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.LocatedFileStatus;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.RemoteIterator;
 import org.apache.hudi.avro.HoodieAvroUtils;
 import org.apache.hudi.avro.model.HoodieActionInstant;
 import org.apache.hudi.avro.model.HoodieCleanMetadata;
@@ -60,12 +66,6 @@ import org.apache.avro.Schema;
 import org.apache.avro.generic.GenericRecord;
 import org.apache.avro.generic.IndexedRecord;
 import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataOutputStream;
-import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.LocatedFileStatus;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.RemoteIterator;
 import org.apache.hadoop.util.StringUtils;
 
 import java.io.ByteArrayInputStream;
@@ -268,12 +268,6 @@ public class HoodieTestUtils {
     return createDataFileFixLength(basePath, partitionPath, instantTime, fileID, length);
   }
 
-  public static String createNewMarkerFile(String basePath, String partitionPath, String instantTime)
-      throws IOException {
-    String fileID = UUID.randomUUID().toString();
-    return createMarkerFile(basePath, partitionPath, instantTime, fileID);
-  }
-
   public static String createDataFile(String basePath, String partitionPath, String instantTime, String fileID)
       throws IOException {
     String folderPath = basePath + "/" + partitionPath + "/";
@@ -294,16 +288,6 @@ public class HoodieTestUtils {
     return fileID;
   }
 
-  public static String createMarkerFile(String basePath, String partitionPath, String instantTime, String fileID)
-      throws IOException {
-    String folderPath =
-        basePath + "/" + HoodieTableMetaClient.TEMPFOLDER_NAME + "/" + instantTime + "/" + partitionPath + "/";
-    new File(folderPath).mkdirs();
-    File f = new File(folderPath + FSUtils.makeMarkerFile(instantTime, DEFAULT_WRITE_TOKEN, fileID));
-    f.createNewFile();
-    return f.getAbsolutePath();
-  }
-
   public static String createNewLogFile(FileSystem fs, String basePath, String partitionPath, String instantTime,
       String fileID, Option<Integer> version) throws IOException {
     String folderPath = basePath + "/" + partitionPath + "/";
@@ -465,7 +449,7 @@ public class HoodieTestUtils {
 
   // TODO: should be removed
   public static FileStatus[] listAllDataFilesInPath(FileSystem fs, String basePath) throws IOException {
-    return listAllDataFilesInPath(fs, basePath, ".parquet");
+    return listAllDataFilesInPath(fs, basePath, HoodieFileFormat.PARQUET.getFileExtension());
   }
 
   public static FileStatus[] listAllDataFilesInPath(FileSystem fs, String basePath, String datafileExtension)
@@ -474,26 +458,31 @@ public class HoodieTestUtils {
       throws IOException {
     RemoteIterator<LocatedFileStatus> itr = fs.listFiles(new Path(basePath), true);
     List<FileStatus> returns = new ArrayList<>();
     while (itr.hasNext()) {
       LocatedFileStatus status = itr.next();
-      if (status.getPath().getName().contains(datafileExtension)) {
+      if (status.getPath().getName().contains(datafileExtension) && !status.getPath().getName().contains(".marker")) {
         returns.add(status);
       }
     }
     return returns.toArray(new FileStatus[returns.size()]);
   }
 
-  public static FileStatus[] listAllLogFilesInPath(FileSystem fs, String basePath, String logfileExtension)
+  public static FileStatus[] listAllLogFilesInPath(FileSystem fs, String basePath)
       throws IOException {
     RemoteIterator<LocatedFileStatus> itr = fs.listFiles(new Path(basePath), true);
     List<FileStatus> returns = new ArrayList<>();
     while (itr.hasNext()) {
       LocatedFileStatus status = itr.next();
-      if (status.getPath().getName().contains(logfileExtension)) {
+      if (status.getPath().getName().contains(HoodieFileFormat.HOODIE_LOG.getFileExtension())) {
         returns.add(status);
       }
     }
     return returns.toArray(new FileStatus[returns.size()]);
   }
 
+  public static FileStatus[] listAllDataFilesAndLogFilesInPath(FileSystem fs, String basePath) throws IOException {
+    return Stream.concat(Arrays.stream(listAllDataFilesInPath(fs, basePath)), Arrays.stream(listAllLogFilesInPath(fs, basePath)))
+        .toArray(FileStatus[]::new);
+  }
+
   public static List<String> monotonicIncreasingCommitTimestamps(int numTimestamps, int startSecsDelta) {
     Calendar cal = Calendar.getInstance();
     cal.add(Calendar.SECOND, startSecsDelta);
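Together with the generator change further up, these listing helpers support tests that must touch every partition and every storage artifact. A sketch under the assumption of an initialized table (class name invented, write path elided):

import java.util.List;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.testutils.HoodieTestUtils;
import org.apache.hudi.testutils.HoodieTestDataGenerator;

public class AllPartitionsSketch {
  public static void run(FileSystem fs, String basePath) throws Exception {
    HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
    // n must be at least partitionPaths.length: the first records are pinned
    // to distinct partitions so that each partition receives an insert.
    List<HoodieRecord> records = dataGen.generateInsertsContainsAllPartitions("001", 10);
    System.out.println(records.size()); // 10

    // After writing the records (not shown), base files and log files can be
    // fetched in one call; marker files are excluded by the data-file listing.
    FileStatus[] all = HoodieTestUtils.listAllDataFilesAndLogFilesInPath(fs, basePath);
    System.out.println(all.length);
  }
}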
diff --git a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieSnapshotExporter.java b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieSnapshotExporter.java
index 6cadbc552..20c781177 100644
--- a/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieSnapshotExporter.java
+++ b/hudi-utilities/src/test/java/org/apache/hudi/utilities/functional/TestHoodieSnapshotExporter.java
@@ -18,6 +18,8 @@
 
 package org.apache.hudi.utilities.functional;
 
+import org.apache.hadoop.fs.LocatedFileStatus;
+import org.apache.hadoop.fs.RemoteIterator;
 import org.apache.hudi.client.HoodieWriteClient;
 import org.apache.hudi.common.model.HoodieAvroPayload;
 import org.apache.hudi.common.model.HoodieRecord;
@@ -35,9 +37,7 @@ import org.apache.hudi.utilities.HoodieSnapshotExporter.Partitioner;
 import org.apache.hudi.utilities.exception.HoodieSnapshotExporterException;
 
 import org.apache.hadoop.fs.FileStatus;
-import org.apache.hadoop.fs.LocatedFileStatus;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.fs.RemoteIterator;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
 import org.apache.spark.api.java.JavaRDD;
@@ -92,7 +92,6 @@ public class TestHoodieSnapshotExporter extends FunctionalTestHarness {
     JavaRDD<HoodieRecord> recordsRDD = jsc().parallelize(records, 1);
     hdfsWriteClient.bulkInsert(recordsRDD, COMMIT_TIME);
     hdfsWriteClient.close();
-
     RemoteIterator<LocatedFileStatus> itr = dfs().listFiles(new Path(sourcePath), true);
     while (itr.hasNext()) {
       LOG.info(">>> Prepared test file: " + itr.next().getPath());