
[HUDI-711] Refactor exporter main logic (#1436)

* Refactor exporter main logic
* break the main method into multiple readable methods
* fix a bug where the wrong file list was passed
* avoid deleting the output path when it already exists
* throw exceptions to abort early in multiple failure cases
* use JavaSparkContext instead of SparkSession (driver flow sketched below)
* improve unit tests for expected exceptions
Authored by: Raymond Xu
Date: 2020-03-25 03:02:24 -07:00
Committed by: GitHub
parent cafc87041b
commit bc82e2be6c
3 changed files with 154 additions and 88 deletions
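
Before the per-file diffs, a quick orientation on the new driver flow: the entry point now builds a JavaSparkContext directly instead of a SparkSession and always stops it in a finally block. The following is a minimal sketch of that flow, not the committed code verbatim; the class name and the runExport placeholder are illustrative.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

public class SnapshotExporterDriverSketch {

  public static void main(String[] args) {
    // Same Spark settings the commit uses: an app name plus Kryo serialization.
    SparkConf sparkConf = new SparkConf().setAppName("Hoodie-snapshot-exporter");
    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    try {
      // In the commit this is new HoodieSnapshotExporter().export(jsc, cfg),
      // with cfg parsed from args by JCommander.
      runExport(jsc, args);
    } finally {
      // The context is stopped even when the export aborts with an exception.
      jsc.stop();
    }
  }

  private static void runExport(JavaSparkContext jsc, String[] args) {
    // Placeholder for the export logic shown in the first file's diff below.
  }
}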

org/apache/hudi/utilities/HoodieSnapshotExporter.java

@@ -19,18 +19,20 @@
 package org.apache.hudi.utilities;

 import org.apache.hudi.common.SerializableConfiguration;
+import org.apache.hudi.common.model.HoodieBaseFile;
 import org.apache.hudi.common.model.HoodiePartitionMetadata;
 import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.table.HoodieTableConfig;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.common.table.HoodieTimeline;
-import org.apache.hudi.common.table.TableFileSystemView;
+import org.apache.hudi.common.table.TableFileSystemView.BaseFileOnlyView;
 import org.apache.hudi.common.table.timeline.HoodieInstant;
 import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
 import org.apache.hudi.common.util.FSUtils;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.common.util.ReflectionUtils;
 import org.apache.hudi.common.util.StringUtils;
+import org.apache.hudi.utilities.exception.HoodieSnapshotExporterException;

 import com.beust.jcommander.IValueValidator;
 import com.beust.jcommander.JCommander;
@@ -43,19 +45,21 @@ import org.apache.hadoop.fs.FileUtil;
 import org.apache.hadoop.fs.Path;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
+import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaSparkContext;
 import org.apache.spark.sql.Column;
 import org.apache.spark.sql.DataFrameWriter;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SQLContext;
 import org.apache.spark.sql.SaveMode;
-import org.apache.spark.sql.SparkSession;

 import java.io.IOException;
 import java.io.Serializable;
 import java.util.ArrayList;
+import java.util.Iterator;
 import java.util.List;
-import java.util.stream.Collectors;
+import java.util.stream.Stream;

 import scala.Tuple2;
 import scala.collection.JavaConversions;
@@ -109,46 +113,56 @@ public class HoodieSnapshotExporter {
     String outputPartitioner = null;
   }

-  public void export(SparkSession spark, Config cfg) throws IOException {
-    JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
+  public void export(JavaSparkContext jsc, Config cfg) throws IOException {
     FileSystem fs = FSUtils.getFs(cfg.sourceBasePath, jsc.hadoopConfiguration());

-    final SerializableConfiguration serConf = new SerializableConfiguration(jsc.hadoopConfiguration());
-    final HoodieTableMetaClient tableMetadata = new HoodieTableMetaClient(fs.getConf(), cfg.sourceBasePath);
-    final TableFileSystemView.BaseFileOnlyView fsView = new HoodieTableFileSystemView(tableMetadata,
-        tableMetadata.getActiveTimeline().getCommitsTimeline().filterCompletedInstants());
-    // Get the latest commit
-    Option<HoodieInstant> latestCommit =
-        tableMetadata.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().lastInstant();
-    if (!latestCommit.isPresent()) {
-      LOG.error("No commits present. Nothing to snapshot");
-      return;
+    if (outputPathExists(fs, cfg)) {
+      throw new HoodieSnapshotExporterException("The target output path already exists.");
     }
-    final String latestCommitTimestamp = latestCommit.get().getTimestamp();
+
+    final String latestCommitTimestamp = getLatestCommitTimestamp(fs, cfg).<HoodieSnapshotExporterException>orElseThrow(() -> {
+      throw new HoodieSnapshotExporterException("No commits present. Nothing to snapshot.");
+    });
     LOG.info(String.format("Starting to snapshot latest version files which are also no-late-than %s.",
         latestCommitTimestamp));

-    List<String> partitions = FSUtils.getAllPartitionPaths(fs, cfg.sourceBasePath, false);
-    if (partitions.size() > 0) {
-      List<String> dataFiles = new ArrayList<>();
-      for (String partition : partitions) {
-        dataFiles.addAll(fsView.getLatestBaseFilesBeforeOrOn(partition, latestCommitTimestamp).map(f -> f.getPath()).collect(Collectors.toList()));
-      }
+    final List<String> partitions = getPartitions(fs, cfg);
+    if (partitions.isEmpty()) {
+      throw new HoodieSnapshotExporterException("The source dataset has 0 partition to snapshot.");
+    }
+    LOG.info(String.format("The job needs to export %d partitions.", partitions.size()));

-      if (!cfg.outputFormat.equals(OutputFormatValidator.HUDI)) {
-        exportAsNonHudi(spark, cfg, dataFiles);
-      } else {
-        // No transformation is needed for output format "HUDI", just copy the original files.
-        copySnapshot(jsc, fs, cfg, partitions, dataFiles, latestCommitTimestamp, serConf);
-      }
-      createSuccessTag(fs, cfg.targetOutputPath);
-    } else {
-      LOG.info("The job has 0 partition to copy.");
+    if (cfg.outputFormat.equals(OutputFormatValidator.HUDI)) {
+      exportAsHudi(jsc, cfg, partitions, latestCommitTimestamp);
+    } else {
+      exportAsNonHudi(jsc, cfg, partitions, latestCommitTimestamp);
     }
+    createSuccessTag(fs, cfg);
+  }
+
+  private boolean outputPathExists(FileSystem fs, Config cfg) throws IOException {
+    return fs.exists(new Path(cfg.targetOutputPath));
+  }
+
+  private Option<String> getLatestCommitTimestamp(FileSystem fs, Config cfg) {
+    final HoodieTableMetaClient tableMetadata = new HoodieTableMetaClient(fs.getConf(), cfg.sourceBasePath);
+    Option<HoodieInstant> latestCommit = tableMetadata.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().lastInstant();
+    return latestCommit.isPresent() ? Option.of(latestCommit.get().getTimestamp()) : Option.empty();
+  }
+
+  private List<String> getPartitions(FileSystem fs, Config cfg) throws IOException {
+    return FSUtils.getAllPartitionPaths(fs, cfg.sourceBasePath, false);
+  }
+
+  private void createSuccessTag(FileSystem fs, Config cfg) throws IOException {
+    Path successTagPath = new Path(cfg.targetOutputPath + "/_SUCCESS");
+    if (!fs.exists(successTagPath)) {
+      LOG.info(String.format("Creating _SUCCESS under target output path: %s", cfg.targetOutputPath));
+      fs.createNewFile(successTagPath);
+    }
   }

-  private void exportAsNonHudi(SparkSession spark, Config cfg, List<String> dataFiles) {
+  private void exportAsNonHudi(JavaSparkContext jsc, Config cfg, List<String> partitions, String latestCommitTimestamp) {
     Partitioner defaultPartitioner = dataset -> {
       Dataset<Row> hoodieDroppedDataset = dataset.drop(JavaConversions.asScalaIterator(HoodieRecord.HOODIE_META_COLUMNS.iterator()).toSeq());
       return StringUtils.isNullOrEmpty(cfg.outputPartitionField)
@@ -160,37 +174,35 @@ public class HoodieSnapshotExporter {
         ? defaultPartitioner
         : ReflectionUtils.loadClass(cfg.outputPartitioner);

-    Dataset<Row> sourceDataset = spark.read().parquet(JavaConversions.asScalaIterator(dataFiles.iterator()).toSeq());
+    final BaseFileOnlyView fsView = getBaseFileOnlyView(jsc, cfg);
+    Iterator<String> exportingFilePaths = jsc
+        .parallelize(partitions, partitions.size())
+        .flatMap(partition -> fsView
+            .getLatestBaseFilesBeforeOrOn(partition, latestCommitTimestamp)
+            .map(HoodieBaseFile::getPath).iterator())
+        .toLocalIterator();
+
+    Dataset<Row> sourceDataset = new SQLContext(jsc).read().parquet(JavaConversions.asScalaIterator(exportingFilePaths).toSeq());
     partitioner.partition(sourceDataset)
         .format(cfg.outputFormat)
         .mode(SaveMode.Overwrite)
         .save(cfg.targetOutputPath);
   }

-  private void copySnapshot(JavaSparkContext jsc,
-                            FileSystem fs,
-                            Config cfg,
-                            List<String> partitions,
-                            List<String> dataFiles,
-                            String latestCommitTimestamp,
-                            SerializableConfiguration serConf) throws IOException {
-    // Make sure the output directory is empty
-    Path outputPath = new Path(cfg.targetOutputPath);
-    if (fs.exists(outputPath)) {
-      LOG.warn(String.format("The output path %s targetBasePath already exists, deleting", outputPath));
-      fs.delete(new Path(cfg.targetOutputPath), true);
-    }
+  private void exportAsHudi(JavaSparkContext jsc, Config cfg, List<String> partitions, String latestCommitTimestamp) throws IOException {
+    final BaseFileOnlyView fsView = getBaseFileOnlyView(jsc, cfg);
+    final SerializableConfiguration serConf = new SerializableConfiguration(jsc.hadoopConfiguration());

     jsc.parallelize(partitions, partitions.size()).flatMap(partition -> {
       // Only take latest version files <= latestCommit.
-      FileSystem fs1 = FSUtils.getFs(cfg.sourceBasePath, serConf.newCopy());
       List<Tuple2<String, String>> filePaths = new ArrayList<>();
-      dataFiles.forEach(hoodieDataFile -> filePaths.add(new Tuple2<>(partition, hoodieDataFile)));
+      Stream<HoodieBaseFile> dataFiles = fsView.getLatestBaseFilesBeforeOrOn(partition, latestCommitTimestamp);
+      dataFiles.forEach(hoodieDataFile -> filePaths.add(new Tuple2<>(partition, hoodieDataFile.getPath())));

       // also need to copy over partition metadata
       Path partitionMetaFile =
           new Path(new Path(cfg.sourceBasePath, partition), HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE);
-      if (fs1.exists(partitionMetaFile)) {
+      FileSystem fs = FSUtils.getFs(cfg.sourceBasePath, serConf.newCopy());
+      if (fs.exists(partitionMetaFile)) {
         filePaths.add(new Tuple2<>(partition, partitionMetaFile.toString()));
       }
@@ -199,19 +211,20 @@ public class HoodieSnapshotExporter {
           String partition = tuple._1();
           Path sourceFilePath = new Path(tuple._2());
           Path toPartitionPath = new Path(cfg.targetOutputPath, partition);
-          FileSystem ifs = FSUtils.getFs(cfg.targetOutputPath, serConf.newCopy());
+          FileSystem fs = FSUtils.getFs(cfg.targetOutputPath, serConf.newCopy());

-          if (!ifs.exists(toPartitionPath)) {
-            ifs.mkdirs(toPartitionPath);
+          if (!fs.exists(toPartitionPath)) {
+            fs.mkdirs(toPartitionPath);
           }
-          FileUtil.copy(ifs, sourceFilePath, ifs, new Path(toPartitionPath, sourceFilePath.getName()), false,
-              ifs.getConf());
+          FileUtil.copy(fs, sourceFilePath, fs, new Path(toPartitionPath, sourceFilePath.getName()), false,
+              fs.getConf());
         });

     // Also copy the .commit files
     LOG.info(String.format("Copying .commit files which are no-late-than %s.", latestCommitTimestamp));
+    final FileSystem fileSystem = FSUtils.getFs(cfg.sourceBasePath, jsc.hadoopConfiguration());
     FileStatus[] commitFilesToCopy =
-        fs.listStatus(new Path(cfg.sourceBasePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME), (commitFilePath) -> {
+        fileSystem.listStatus(new Path(cfg.sourceBasePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME), (commitFilePath) -> {
           if (commitFilePath.getName().equals(HoodieTableConfig.HOODIE_PROPERTIES_FILE)) {
             return true;
           } else {
@@ -223,39 +236,37 @@ public class HoodieSnapshotExporter {
     for (FileStatus commitStatus : commitFilesToCopy) {
       Path targetFilePath =
           new Path(cfg.targetOutputPath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitStatus.getPath().getName());
-      if (!fs.exists(targetFilePath.getParent())) {
-        fs.mkdirs(targetFilePath.getParent());
+      if (!fileSystem.exists(targetFilePath.getParent())) {
+        fileSystem.mkdirs(targetFilePath.getParent());
       }
-      if (fs.exists(targetFilePath)) {
+      if (fileSystem.exists(targetFilePath)) {
         LOG.error(
             String.format("The target output commit file (%s targetBasePath) already exists.", targetFilePath));
       }
-      FileUtil.copy(fs, commitStatus.getPath(), fs, targetFilePath, false, fs.getConf());
+      FileUtil.copy(fileSystem, commitStatus.getPath(), fileSystem, targetFilePath, false, fileSystem.getConf());
     }
   }

-  private void createSuccessTag(FileSystem fs, String targetOutputPath) throws IOException {
-    Path successTagPath = new Path(targetOutputPath + "/_SUCCESS");
-    if (!fs.exists(successTagPath)) {
-      LOG.info(String.format("Creating _SUCCESS under target output path: %s", targetOutputPath));
-      fs.createNewFile(successTagPath);
-    }
+  private BaseFileOnlyView getBaseFileOnlyView(JavaSparkContext jsc, Config cfg) {
+    FileSystem fs = FSUtils.getFs(cfg.sourceBasePath, jsc.hadoopConfiguration());
+    HoodieTableMetaClient tableMetadata = new HoodieTableMetaClient(fs.getConf(), cfg.sourceBasePath);
+    return new HoodieTableFileSystemView(tableMetadata, tableMetadata
+        .getActiveTimeline().getCommitsTimeline().filterCompletedInstants());
   }

   public static void main(String[] args) throws IOException {
-    // Take input configs
     final Config cfg = new Config();
     new JCommander(cfg, null, args);
-    // Create a spark job to do the snapshot export
-    SparkSession spark = SparkSession.builder().appName("Hoodie-snapshot-exporter")
-        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").getOrCreate();
+
+    SparkConf sparkConf = new SparkConf().setAppName("Hoodie-snapshot-exporter");
+    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
+    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
+
     LOG.info("Initializing spark job.");
-    HoodieSnapshotExporter hoodieSnapshotExporter = new HoodieSnapshotExporter();
-    hoodieSnapshotExporter.export(spark, cfg);
-
-    // Stop the job
-    spark.stop();
+    try {
+      new HoodieSnapshotExporter().export(jsc, cfg);
+    } finally {
+      jsc.stop();
+    }
   }
 }
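
The reworked non-Hudi export path gathers the latest base-file paths with an RDD flatMap and pulls them back to the driver through toLocalIterator() before reading them as parquet. A minimal, self-contained sketch of that Spark pattern follows; the partition values, file names, and local[2] master are made up for illustration.

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

public class ToLocalIteratorSketch {

  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("to-local-iterator-sketch").setMaster("local[2]");
    try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
      List<String> partitions = Arrays.asList("2020/03/24", "2020/03/25");
      // One Spark task per partition lists that partition's files; toLocalIterator()
      // then streams the results back to the driver one partition at a time.
      Iterator<String> filePaths = jsc
          .parallelize(partitions, partitions.size())
          .flatMap(partition -> Arrays.asList(partition + "/file1.parquet", partition + "/file2.parquet").iterator())
          .toLocalIterator();
      filePaths.forEachRemaining(System.out::println);
    }
  }
}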

org/apache/hudi/utilities/exception/HoodieSnapshotExporterException.java (new file)

@@ -0,0 +1,10 @@
+package org.apache.hudi.utilities.exception;
+
+import org.apache.hudi.exception.HoodieException;
+
+public class HoodieSnapshotExporterException extends HoodieException {
+
+  public HoodieSnapshotExporterException(String msg) {
+    super(msg);
+  }
+}
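
This new exception type is what lets export() abort early instead of logging and returning, including through the orElseThrow call in the refactored export(). A minimal sketch of that early-abort pattern follows, using java.util.Optional and a stand-in exception class so it runs without Hudi on the classpath; the commit itself uses Hudi's Option and HoodieSnapshotExporterException.

import java.util.Optional;

public class EarlyAbortSketch {

  // Stand-in for HoodieSnapshotExporterException; unchecked, so callers need no try/catch.
  static class SnapshotExportAbortException extends RuntimeException {
    SnapshotExportAbortException(String msg) {
      super(msg);
    }
  }

  static String latestCommitOrAbort(Optional<String> latestCommit) {
    // A missing commit timestamp means there is nothing to snapshot, so fail fast.
    return latestCommit.orElseThrow(
        () -> new SnapshotExportAbortException("No commits present. Nothing to snapshot."));
  }

  public static void main(String[] args) {
    System.out.println(latestCommitOrAbort(Optional.of("20200325030224")));
    // latestCommitOrAbort(Optional.empty()) would throw SnapshotExportAbortException instead.
  }
}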

TestHoodieSnapshotExporter.java

@@ -31,8 +31,10 @@ import org.apache.hudi.index.HoodieIndex.IndexType;
 import org.apache.hudi.utilities.HoodieSnapshotExporter.Config;
 import org.apache.hudi.utilities.HoodieSnapshotExporter.OutputFormatValidator;
 import org.apache.hudi.utilities.HoodieSnapshotExporter.Partitioner;
+import org.apache.hudi.utilities.exception.HoodieSnapshotExporterException;

 import com.beust.jcommander.ParameterException;
+import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.LocatedFileStatus;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.RemoteIterator;
@@ -43,11 +45,12 @@ import org.apache.spark.sql.Column;
 import org.apache.spark.sql.DataFrameWriter;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SparkSession;
 import org.junit.After;
 import org.junit.Before;
+import org.junit.Rule;
 import org.junit.Test;
 import org.junit.experimental.runners.Enclosed;
+import org.junit.rules.ExpectedException;
 import org.junit.runner.RunWith;
 import org.junit.runners.Parameterized;
 import org.junit.runners.Parameterized.Parameter;
@@ -56,9 +59,9 @@ import org.junit.runners.Parameterized.Parameters;
 import java.io.IOException;
 import java.util.Arrays;
 import java.util.List;
+import java.util.stream.Collectors;

 import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
@@ -85,7 +88,6 @@ public class TestHoodieSnapshotExporter {
     sourcePath = dfsBasePath + "/source/";
     targetPath = dfsBasePath + "/target/";
     dfs.mkdirs(new Path(sourcePath));
-    dfs.mkdirs(new Path(targetPath));
     HoodieTableMetaClient
         .initTableType(jsc.hadoopConfiguration(), sourcePath, HoodieTableType.COPY_ON_WRITE, TABLE_NAME,
             HoodieAvroPayload.class.getName());
@@ -140,7 +142,7 @@ public class TestHoodieSnapshotExporter {
     @Test
     public void testExportAsHudi() throws IOException {
-      new HoodieSnapshotExporter().export(SparkSession.builder().config(jsc.getConf()).getOrCreate(), cfg);
+      new HoodieSnapshotExporter().export(jsc, cfg);

       // Check results
       assertTrue(dfs.exists(new Path(targetPath + "/.hoodie/" + COMMIT_TIME + ".clean")));
@@ -159,18 +161,61 @@ public class TestHoodieSnapshotExporter {
       assertTrue(dfs.exists(new Path(partition + "/.hoodie_partition_metadata")));
       assertTrue(dfs.exists(new Path(targetPath + "/_SUCCESS")));
     }
+  }
+
+  public static class TestHoodieSnapshotExporterForEarlyAbort extends ExporterTestHarness {
+
+    private HoodieSnapshotExporter.Config cfg;
+
+    @Rule
+    public ExpectedException exceptionRule = ExpectedException.none();
+
+    @Before
+    public void setUp() throws Exception {
+      super.setUp();
+      cfg = new Config();
+      cfg.sourceBasePath = sourcePath;
+      cfg.targetOutputPath = targetPath;
+      cfg.outputFormat = OutputFormatValidator.HUDI;
+    }

     @Test
-    public void testExportEmptyDataset() throws IOException {
+    public void testExportWhenTargetPathExists() throws IOException {
+      // make target output path present
+      dfs.mkdirs(new Path(targetPath));
+
+      // export
+      exceptionRule.expect(HoodieSnapshotExporterException.class);
+      exceptionRule.expectMessage("The target output path already exists.");
+      new HoodieSnapshotExporter().export(jsc, cfg);
+    }
+
+    @Test
+    public void testExportDatasetWithNoCommit() throws IOException {
+      // delete commit files
+      List<Path> commitFiles = Arrays.stream(dfs.listStatus(new Path(sourcePath + "/.hoodie")))
+          .map(FileStatus::getPath)
+          .filter(filePath -> filePath.getName().endsWith(".commit"))
+          .collect(Collectors.toList());
+      for (Path p : commitFiles) {
+        dfs.delete(p, false);
+      }
+
+      // export
+      exceptionRule.expect(HoodieSnapshotExporterException.class);
+      exceptionRule.expectMessage("No commits present. Nothing to snapshot.");
+      new HoodieSnapshotExporter().export(jsc, cfg);
+    }
+
+    @Test
+    public void testExportDatasetWithNoPartition() throws IOException {
       // delete all source data
       dfs.delete(new Path(sourcePath + "/" + PARTITION_PATH), true);

       // export
-      new HoodieSnapshotExporter().export(SparkSession.builder().config(jsc.getConf()).getOrCreate(), cfg);
-
-      // Check results
-      assertEquals("Target path should be empty.", 0, dfs.listStatus(new Path(targetPath)).length);
-      assertFalse(dfs.exists(new Path(targetPath + "/_SUCCESS")));
+      exceptionRule.expect(HoodieSnapshotExporterException.class);
+      exceptionRule.expectMessage("The source dataset has 0 partition to snapshot.");
+      new HoodieSnapshotExporter().export(jsc, cfg);
     }
   }
@@ -191,7 +236,7 @@ public class TestHoodieSnapshotExporter {
       cfg.sourceBasePath = sourcePath;
       cfg.targetOutputPath = targetPath;
       cfg.outputFormat = format;
-      new HoodieSnapshotExporter().export(SparkSession.builder().config(jsc.getConf()).getOrCreate(), cfg);
+      new HoodieSnapshotExporter().export(jsc, cfg);
       assertEquals(NUM_RECORDS, sqlContext.read().format(format).load(targetPath).count());
       assertTrue(dfs.exists(new Path(targetPath + "/_SUCCESS")));
     }
@@ -228,7 +273,7 @@ public class TestHoodieSnapshotExporter {
     public void testExportWithPartitionField() throws IOException {
       // `driver` field is set in HoodieTestDataGenerator
       cfg.outputPartitionField = "driver";
-      new HoodieSnapshotExporter().export(SparkSession.builder().config(jsc.getConf()).getOrCreate(), cfg);
+      new HoodieSnapshotExporter().export(jsc, cfg);

       assertEquals(NUM_RECORDS, sqlContext.read().format("json").load(targetPath).count());
       assertTrue(dfs.exists(new Path(targetPath + "/_SUCCESS")));
@@ -238,7 +283,7 @@ public class TestHoodieSnapshotExporter {
     @Test
     public void testExportForUserDefinedPartitioner() throws IOException {
       cfg.outputPartitioner = UserDefinedPartitioner.class.getName();
-      new HoodieSnapshotExporter().export(SparkSession.builder().config(jsc.getConf()).getOrCreate(), cfg);
+      new HoodieSnapshotExporter().export(jsc, cfg);

       assertEquals(NUM_RECORDS, sqlContext.read().format("json").load(targetPath).count());
       assertTrue(dfs.exists(new Path(targetPath + "/_SUCCESS")));