[HUDI-92] Provide reasonable names for Spark DAG stages in HUDI. (#1289)
This commit is contained in:
12
README.md
12
README.md
@@ -92,6 +92,18 @@ spark-2.4.4-bin-hadoop2.7/bin/spark-shell \
|
|||||||
--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer'
|
--conf 'spark.serializer=org.apache.spark.serializer.KryoSerializer'
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Running Tests
|
||||||
|
|
||||||
|
All tests can be run with maven
|
||||||
|
```
|
||||||
|
mvn test
|
||||||
|
```
|
||||||
|
|
||||||
|
To run tests with spark event logging enabled, define the Spark event log directory. This allows visualizing test DAG and stages using Spark History Server UI.
|
||||||
|
```
|
||||||
|
mvn test -DSPARK_EVLOG_DIR=/path/for/spark/event/log
|
||||||
|
```
|
||||||
|
|
||||||
## Quickstart
|
## Quickstart
|
||||||
|
|
||||||
Please visit [https://hudi.apache.org/docs/quick-start-guide.html](https://hudi.apache.org/docs/quick-start-guide.html) to quickly explore Hudi's capabilities using spark-shell.
|
Please visit [https://hudi.apache.org/docs/quick-start-guide.html](https://hudi.apache.org/docs/quick-start-guide.html) to quickly explore Hudi's capabilities using spark-shell.
|
||||||
|
|||||||
@@ -85,6 +85,7 @@ public class CompactionAdminClient extends AbstractHoodieClient {
|
|||||||
if (plan.getOperations() != null) {
|
if (plan.getOperations() != null) {
|
||||||
List<CompactionOperation> ops = plan.getOperations().stream()
|
List<CompactionOperation> ops = plan.getOperations().stream()
|
||||||
.map(CompactionOperation::convertFromAvroRecordInstance).collect(Collectors.toList());
|
.map(CompactionOperation::convertFromAvroRecordInstance).collect(Collectors.toList());
|
||||||
|
jsc.setJobGroup(this.getClass().getSimpleName(), "Validate compaction operations");
|
||||||
return jsc.parallelize(ops, parallelism).map(op -> {
|
return jsc.parallelize(ops, parallelism).map(op -> {
|
||||||
try {
|
try {
|
||||||
return validateCompactionOperation(metaClient, compactionInstant, op, Option.of(fsView));
|
return validateCompactionOperation(metaClient, compactionInstant, op, Option.of(fsView));
|
||||||
@@ -350,6 +351,7 @@ public class CompactionAdminClient extends AbstractHoodieClient {
|
|||||||
} else {
|
} else {
|
||||||
LOG.info("The following compaction renaming operations needs to be performed to un-schedule");
|
LOG.info("The following compaction renaming operations needs to be performed to un-schedule");
|
||||||
if (!dryRun) {
|
if (!dryRun) {
|
||||||
|
jsc.setJobGroup(this.getClass().getSimpleName(), "Execute unschedule operations");
|
||||||
return jsc.parallelize(renameActions, parallelism).map(lfPair -> {
|
return jsc.parallelize(renameActions, parallelism).map(lfPair -> {
|
||||||
try {
|
try {
|
||||||
LOG.info("RENAME " + lfPair.getLeft().getPath() + " => " + lfPair.getRight().getPath());
|
LOG.info("RENAME " + lfPair.getLeft().getPath() + " => " + lfPair.getRight().getPath());
|
||||||
@@ -392,6 +394,7 @@ public class CompactionAdminClient extends AbstractHoodieClient {
|
|||||||
"Number of Compaction Operations :" + plan.getOperations().size() + " for instant :" + compactionInstant);
|
"Number of Compaction Operations :" + plan.getOperations().size() + " for instant :" + compactionInstant);
|
||||||
List<CompactionOperation> ops = plan.getOperations().stream()
|
List<CompactionOperation> ops = plan.getOperations().stream()
|
||||||
.map(CompactionOperation::convertFromAvroRecordInstance).collect(Collectors.toList());
|
.map(CompactionOperation::convertFromAvroRecordInstance).collect(Collectors.toList());
|
||||||
|
jsc.setJobGroup(this.getClass().getSimpleName(), "Generate compaction unscheduling operations");
|
||||||
return jsc.parallelize(ops, parallelism).flatMap(op -> {
|
return jsc.parallelize(ops, parallelism).flatMap(op -> {
|
||||||
try {
|
try {
|
||||||
return getRenamingActionsForUnschedulingCompactionOperation(metaClient, compactionInstant, op,
|
return getRenamingActionsForUnschedulingCompactionOperation(metaClient, compactionInstant, op,
|
||||||
|
|||||||
@@ -49,6 +49,7 @@ public class HoodieIndexUtils {
|
|||||||
public static List<Pair<String, HoodieBaseFile>> getLatestBaseFilesForAllPartitions(final List<String> partitions,
|
public static List<Pair<String, HoodieBaseFile>> getLatestBaseFilesForAllPartitions(final List<String> partitions,
|
||||||
final JavaSparkContext jsc,
|
final JavaSparkContext jsc,
|
||||||
final HoodieTable hoodieTable) {
|
final HoodieTable hoodieTable) {
|
||||||
|
jsc.setJobGroup(HoodieIndexUtils.class.getSimpleName(), "Load latest base files from all partitions");
|
||||||
return jsc.parallelize(partitions, Math.max(partitions.size(), 1))
|
return jsc.parallelize(partitions, Math.max(partitions.size(), 1))
|
||||||
.flatMap(partitionPath -> {
|
.flatMap(partitionPath -> {
|
||||||
Option<HoodieInstant> latestCommitTime = hoodieTable.getMetaClient().getCommitsTimeline()
|
Option<HoodieInstant> latestCommitTime = hoodieTable.getMetaClient().getCommitsTimeline()
|
||||||
|
|||||||
@@ -199,6 +199,7 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
|
|||||||
|
|
||||||
if (config.getBloomIndexPruneByRanges()) {
|
if (config.getBloomIndexPruneByRanges()) {
|
||||||
// also obtain file ranges, if range pruning is enabled
|
// also obtain file ranges, if range pruning is enabled
|
||||||
|
jsc.setJobDescription("Obtain key ranges for file slices (range pruning=on)");
|
||||||
return jsc.parallelize(partitionPathFileIDList, Math.max(partitionPathFileIDList.size(), 1)).mapToPair(pf -> {
|
return jsc.parallelize(partitionPathFileIDList, Math.max(partitionPathFileIDList.size(), 1)).mapToPair(pf -> {
|
||||||
try {
|
try {
|
||||||
HoodieRangeInfoHandle<T> rangeInfoHandle = new HoodieRangeInfoHandle<T>(config, hoodieTable, pf);
|
HoodieRangeInfoHandle<T> rangeInfoHandle = new HoodieRangeInfoHandle<T>(config, hoodieTable, pf);
|
||||||
|
|||||||
@@ -448,6 +448,7 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Now delete partially written files
|
// Now delete partially written files
|
||||||
|
jsc.setJobGroup(this.getClass().getSimpleName(), "Delete all partially written files");
|
||||||
jsc.parallelize(new ArrayList<>(groupByPartition.values()), config.getFinalizeWriteParallelism())
|
jsc.parallelize(new ArrayList<>(groupByPartition.values()), config.getFinalizeWriteParallelism())
|
||||||
.map(partitionWithFileList -> {
|
.map(partitionWithFileList -> {
|
||||||
final FileSystem fileSystem = metaClient.getFs();
|
final FileSystem fileSystem = metaClient.getFs();
|
||||||
@@ -489,6 +490,7 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
|||||||
*/
|
*/
|
||||||
private void waitForAllFiles(JavaSparkContext jsc, Map<String, List<Pair<String, String>>> groupByPartition, FileVisibility visibility) {
|
private void waitForAllFiles(JavaSparkContext jsc, Map<String, List<Pair<String, String>>> groupByPartition, FileVisibility visibility) {
|
||||||
// This will either ensure all files to be deleted are present.
|
// This will either ensure all files to be deleted are present.
|
||||||
|
jsc.setJobGroup(this.getClass().getSimpleName(), "Wait for all files to appear/disappear");
|
||||||
boolean checkPassed =
|
boolean checkPassed =
|
||||||
jsc.parallelize(new ArrayList<>(groupByPartition.entrySet()), config.getFinalizeWriteParallelism())
|
jsc.parallelize(new ArrayList<>(groupByPartition.entrySet()), config.getFinalizeWriteParallelism())
|
||||||
.map(partitionWithFileList -> waitForCondition(partitionWithFileList.getKey(),
|
.map(partitionWithFileList -> waitForCondition(partitionWithFileList.getKey(),
|
||||||
|
|||||||
@@ -81,6 +81,7 @@ public class CleanActionExecutor extends BaseActionExecutor<HoodieCleanMetadata>
|
|||||||
int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism());
|
int cleanerParallelism = Math.min(partitionsToClean.size(), config.getCleanerParallelism());
|
||||||
LOG.info("Using cleanerParallelism: " + cleanerParallelism);
|
LOG.info("Using cleanerParallelism: " + cleanerParallelism);
|
||||||
|
|
||||||
|
jsc.setJobGroup(this.getClass().getSimpleName(), "Generates list of file slices to be cleaned");
|
||||||
Map<String, List<String>> cleanOps = jsc
|
Map<String, List<String>> cleanOps = jsc
|
||||||
.parallelize(partitionsToClean, cleanerParallelism)
|
.parallelize(partitionsToClean, cleanerParallelism)
|
||||||
.map(partitionPathToClean -> Pair.of(partitionPathToClean, planner.getDeletePaths(partitionPathToClean)))
|
.map(partitionPathToClean -> Pair.of(partitionPathToClean, planner.getDeletePaths(partitionPathToClean)))
|
||||||
@@ -147,6 +148,8 @@ public class CleanActionExecutor extends BaseActionExecutor<HoodieCleanMetadata>
|
|||||||
(int) (cleanerPlan.getFilesToBeDeletedPerPartition().values().stream().mapToInt(List::size).count()),
|
(int) (cleanerPlan.getFilesToBeDeletedPerPartition().values().stream().mapToInt(List::size).count()),
|
||||||
config.getCleanerParallelism());
|
config.getCleanerParallelism());
|
||||||
LOG.info("Using cleanerParallelism: " + cleanerParallelism);
|
LOG.info("Using cleanerParallelism: " + cleanerParallelism);
|
||||||
|
|
||||||
|
jsc.setJobGroup(this.getClass().getSimpleName(), "Perform cleaning of partitions");
|
||||||
List<Tuple2<String, PartitionCleanStat>> partitionCleanStats = jsc
|
List<Tuple2<String, PartitionCleanStat>> partitionCleanStats = jsc
|
||||||
.parallelize(cleanerPlan.getFilesToBeDeletedPerPartition().entrySet().stream()
|
.parallelize(cleanerPlan.getFilesToBeDeletedPerPartition().entrySet().stream()
|
||||||
.flatMap(x -> x.getValue().stream().map(y -> new Tuple2<>(x.getKey(), y)))
|
.flatMap(x -> x.getValue().stream().map(y -> new Tuple2<>(x.getKey(), y)))
|
||||||
|
|||||||
@@ -210,6 +210,7 @@ public class UpsertPartitioner<T extends HoodieRecordPayload<T>> extends Partiti
|
|||||||
|
|
||||||
Map<String, List<SmallFile>> partitionSmallFilesMap = new HashMap<>();
|
Map<String, List<SmallFile>> partitionSmallFilesMap = new HashMap<>();
|
||||||
if (partitionPaths != null && partitionPaths.size() > 0) {
|
if (partitionPaths != null && partitionPaths.size() > 0) {
|
||||||
|
jsc.setJobGroup(this.getClass().getSimpleName(), "Getting small files from partitions");
|
||||||
JavaRDD<String> partitionPathRdds = jsc.parallelize(partitionPaths, partitionPaths.size());
|
JavaRDD<String> partitionPathRdds = jsc.parallelize(partitionPaths, partitionPaths.size());
|
||||||
partitionSmallFilesMap = partitionPathRdds.mapToPair((PairFunction<String, String, List<SmallFile>>)
|
partitionSmallFilesMap = partitionPathRdds.mapToPair((PairFunction<String, String, List<SmallFile>>)
|
||||||
partitionPath -> new Tuple2<>(partitionPath, getSmallFiles(partitionPath))).collectAsMap();
|
partitionPath -> new Tuple2<>(partitionPath, getSmallFiles(partitionPath))).collectAsMap();
|
||||||
|
|||||||
@@ -94,6 +94,7 @@ public class HoodieMergeOnReadTableCompactor implements HoodieCompactor {
|
|||||||
.map(CompactionOperation::convertFromAvroRecordInstance).collect(toList());
|
.map(CompactionOperation::convertFromAvroRecordInstance).collect(toList());
|
||||||
LOG.info("Compactor compacting " + operations + " files");
|
LOG.info("Compactor compacting " + operations + " files");
|
||||||
|
|
||||||
|
jsc.setJobGroup(this.getClass().getSimpleName(), "Compacting file slices");
|
||||||
return jsc.parallelize(operations, operations.size())
|
return jsc.parallelize(operations, operations.size())
|
||||||
.map(s -> compact(table, metaClient, config, s, compactionInstantTime)).flatMap(List::iterator);
|
.map(s -> compact(table, metaClient, config, s, compactionInstantTime)).flatMap(List::iterator);
|
||||||
}
|
}
|
||||||
@@ -192,6 +193,7 @@ public class HoodieMergeOnReadTableCompactor implements HoodieCompactor {
|
|||||||
|
|
||||||
SliceView fileSystemView = hoodieTable.getSliceView();
|
SliceView fileSystemView = hoodieTable.getSliceView();
|
||||||
LOG.info("Compaction looking for files to compact in " + partitionPaths + " partitions");
|
LOG.info("Compaction looking for files to compact in " + partitionPaths + " partitions");
|
||||||
|
jsc.setJobGroup(this.getClass().getSimpleName(), "Looking for files to compact");
|
||||||
List<HoodieCompactionOperation> operations = jsc.parallelize(partitionPaths, partitionPaths.size())
|
List<HoodieCompactionOperation> operations = jsc.parallelize(partitionPaths, partitionPaths.size())
|
||||||
.flatMap((FlatMapFunction<String, CompactionOperation>) partitionPath -> fileSystemView
|
.flatMap((FlatMapFunction<String, CompactionOperation>) partitionPath -> fileSystemView
|
||||||
.getLatestFileSlices(partitionPath)
|
.getLatestFileSlices(partitionPath)
|
||||||
|
|||||||
@@ -118,6 +118,7 @@ public class MergeOnReadRollbackActionExecutor extends BaseRollbackActionExecuto
|
|||||||
List<String> partitions = FSUtils.getAllPartitionPaths(table.getMetaClient().getFs(), table.getMetaClient().getBasePath(),
|
List<String> partitions = FSUtils.getAllPartitionPaths(table.getMetaClient().getFs(), table.getMetaClient().getBasePath(),
|
||||||
config.shouldAssumeDatePartitioning());
|
config.shouldAssumeDatePartitioning());
|
||||||
int sparkPartitions = Math.max(Math.min(partitions.size(), config.getRollbackParallelism()), 1);
|
int sparkPartitions = Math.max(Math.min(partitions.size(), config.getRollbackParallelism()), 1);
|
||||||
|
jsc.setJobGroup(this.getClass().getSimpleName(), "Generate all rollback requests");
|
||||||
return jsc.parallelize(partitions, Math.min(partitions.size(), sparkPartitions)).flatMap(partitionPath -> {
|
return jsc.parallelize(partitions, Math.min(partitions.size(), sparkPartitions)).flatMap(partitionPath -> {
|
||||||
HoodieActiveTimeline activeTimeline = table.getMetaClient().reloadActiveTimeline();
|
HoodieActiveTimeline activeTimeline = table.getMetaClient().reloadActiveTimeline();
|
||||||
List<RollbackRequest> partitionRollbackRequests = new ArrayList<>();
|
List<RollbackRequest> partitionRollbackRequests = new ArrayList<>();
|
||||||
|
|||||||
@@ -85,6 +85,7 @@ public class RollbackHelper implements Serializable {
|
|||||||
};
|
};
|
||||||
|
|
||||||
int sparkPartitions = Math.max(Math.min(rollbackRequests.size(), config.getRollbackParallelism()), 1);
|
int sparkPartitions = Math.max(Math.min(rollbackRequests.size(), config.getRollbackParallelism()), 1);
|
||||||
|
jsc.setJobGroup(this.getClass().getSimpleName(), "Perform rollback actions");
|
||||||
return jsc.parallelize(rollbackRequests, sparkPartitions).mapToPair(rollbackRequest -> {
|
return jsc.parallelize(rollbackRequests, sparkPartitions).mapToPair(rollbackRequest -> {
|
||||||
final Map<FileStatus, Boolean> filesToDeletedStatus = new HashMap<>();
|
final Map<FileStatus, Boolean> filesToDeletedStatus = new HashMap<>();
|
||||||
switch (rollbackRequest.getRollbackAction()) {
|
switch (rollbackRequest.getRollbackAction()) {
|
||||||
|
|||||||
@@ -87,6 +87,7 @@ public class SavepointActionExecutor extends BaseActionExecutor<HoodieSavepointM
|
|||||||
ValidationUtils.checkArgument(HoodieTimeline.compareTimestamps(instantTime, HoodieTimeline.GREATER_THAN_OR_EQUALS, lastCommitRetained),
|
ValidationUtils.checkArgument(HoodieTimeline.compareTimestamps(instantTime, HoodieTimeline.GREATER_THAN_OR_EQUALS, lastCommitRetained),
|
||||||
"Could not savepoint commit " + instantTime + " as this is beyond the lookup window " + lastCommitRetained);
|
"Could not savepoint commit " + instantTime + " as this is beyond the lookup window " + lastCommitRetained);
|
||||||
|
|
||||||
|
jsc.setJobGroup(this.getClass().getSimpleName(), "Collecting latest files for savepoint " + instantTime);
|
||||||
Map<String, List<String>> latestFilesMap = jsc.parallelize(FSUtils.getAllPartitionPaths(table.getMetaClient().getFs(),
|
Map<String, List<String>> latestFilesMap = jsc.parallelize(FSUtils.getAllPartitionPaths(table.getMetaClient().getFs(),
|
||||||
table.getMetaClient().getBasePath(), config.shouldAssumeDatePartitioning()))
|
table.getMetaClient().getBasePath(), config.shouldAssumeDatePartitioning()))
|
||||||
.mapToPair(partitionPath -> {
|
.mapToPair(partitionPath -> {
|
||||||
|
|||||||
@@ -86,7 +86,7 @@ public class TestHoodieBloomIndex extends HoodieClientTestHarness {
|
|||||||
|
|
||||||
@BeforeEach
|
@BeforeEach
|
||||||
public void setUp() throws Exception {
|
public void setUp() throws Exception {
|
||||||
initSparkContexts("TestHoodieBloomIndex");
|
initSparkContexts();
|
||||||
initPath();
|
initPath();
|
||||||
initFileSystem();
|
initFileSystem();
|
||||||
// We have some records to be tagged (two different partitions)
|
// We have some records to be tagged (two different partitions)
|
||||||
|
|||||||
@@ -71,7 +71,7 @@ public class TestHoodieGlobalBloomIndex extends HoodieClientTestHarness {
|
|||||||
|
|
||||||
@BeforeEach
|
@BeforeEach
|
||||||
public void setUp() throws Exception {
|
public void setUp() throws Exception {
|
||||||
initSparkContexts("TestHoodieGlobalBloomIndex");
|
initSparkContexts();
|
||||||
initPath();
|
initPath();
|
||||||
// We have some records to be tagged (two different partitions)
|
// We have some records to be tagged (two different partitions)
|
||||||
String schemaStr = FileIOUtils.readAsUTFString(getClass().getResourceAsStream("/exampleSchema.txt"));
|
String schemaStr = FileIOUtils.readAsUTFString(getClass().getResourceAsStream("/exampleSchema.txt"));
|
||||||
|
|||||||
@@ -60,7 +60,7 @@ public class TestHoodieCommitArchiveLog extends HoodieClientTestHarness {
|
|||||||
public void init() throws Exception {
|
public void init() throws Exception {
|
||||||
initDFS();
|
initDFS();
|
||||||
initPath();
|
initPath();
|
||||||
initSparkContexts("TestHoodieCommitArchiveLog");
|
initSparkContexts();
|
||||||
hadoopConf = dfs.getConf();
|
hadoopConf = dfs.getConf();
|
||||||
hadoopConf.addResource(dfs.getConf());
|
hadoopConf.addResource(dfs.getConf());
|
||||||
dfs.mkdirs(new Path(basePath));
|
dfs.mkdirs(new Path(basePath));
|
||||||
|
|||||||
@@ -57,7 +57,7 @@ public class TestHoodieMergeHandle extends HoodieClientTestHarness {
|
|||||||
|
|
||||||
@BeforeEach
|
@BeforeEach
|
||||||
public void setUp() throws Exception {
|
public void setUp() throws Exception {
|
||||||
initSparkContexts("TestHoodieMergeHandle");
|
initSparkContexts();
|
||||||
initPath();
|
initPath();
|
||||||
initFileSystem();
|
initFileSystem();
|
||||||
initTestDataGenerator();
|
initTestDataGenerator();
|
||||||
|
|||||||
@@ -66,7 +66,7 @@ public class TestHoodieCompactor extends HoodieClientTestHarness {
|
|||||||
@BeforeEach
|
@BeforeEach
|
||||||
public void setUp() throws Exception {
|
public void setUp() throws Exception {
|
||||||
// Initialize a local spark env
|
// Initialize a local spark env
|
||||||
initSparkContexts("TestHoodieCompactor");
|
initSparkContexts();
|
||||||
|
|
||||||
// Create a temp folder as the base path
|
// Create a temp folder as the base path
|
||||||
initPath();
|
initPath();
|
||||||
|
|||||||
@@ -41,6 +41,8 @@ import org.apache.hudi.table.HoodieTable;
|
|||||||
|
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
import org.apache.spark.sql.SQLContext;
|
import org.apache.spark.sql.SQLContext;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.TestInfo;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
@@ -56,7 +58,8 @@ import java.util.concurrent.atomic.AtomicInteger;
|
|||||||
public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness implements Serializable {
|
public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness implements Serializable {
|
||||||
|
|
||||||
private static final Logger LOG = LoggerFactory.getLogger(HoodieClientTestHarness.class);
|
private static final Logger LOG = LoggerFactory.getLogger(HoodieClientTestHarness.class);
|
||||||
|
|
||||||
|
private String testMethodName;
|
||||||
protected transient JavaSparkContext jsc = null;
|
protected transient JavaSparkContext jsc = null;
|
||||||
protected transient Configuration hadoopConf = null;
|
protected transient Configuration hadoopConf = null;
|
||||||
protected transient SQLContext sqlContext;
|
protected transient SQLContext sqlContext;
|
||||||
@@ -82,6 +85,15 @@ public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness im
|
|||||||
protected transient MiniDFSCluster dfsCluster;
|
protected transient MiniDFSCluster dfsCluster;
|
||||||
protected transient DistributedFileSystem dfs;
|
protected transient DistributedFileSystem dfs;
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
public void setTestMethodName(TestInfo testInfo) {
|
||||||
|
if (testInfo.getTestMethod().isPresent()) {
|
||||||
|
testMethodName = testInfo.getTestMethod().get().getName();
|
||||||
|
} else {
|
||||||
|
testMethodName = "Unknown";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Initializes resource group for the subclasses of {@link HoodieClientTestBase}.
|
* Initializes resource group for the subclasses of {@link HoodieClientTestBase}.
|
||||||
*/
|
*/
|
||||||
@@ -113,7 +125,7 @@ public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness im
|
|||||||
*/
|
*/
|
||||||
protected void initSparkContexts(String appName) {
|
protected void initSparkContexts(String appName) {
|
||||||
// Initialize a local spark env
|
// Initialize a local spark env
|
||||||
jsc = new JavaSparkContext(HoodieClientTestUtils.getSparkConfForTest(appName));
|
jsc = new JavaSparkContext(HoodieClientTestUtils.getSparkConfForTest(appName + "#" + testMethodName));
|
||||||
jsc.setLogLevel("ERROR");
|
jsc.setLogLevel("ERROR");
|
||||||
hadoopConf = jsc.hadoopConfiguration();
|
hadoopConf = jsc.hadoopConfiguration();
|
||||||
|
|
||||||
@@ -122,11 +134,11 @@ public abstract class HoodieClientTestHarness extends HoodieCommonTestHarness im
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Initializes the Spark contexts ({@link JavaSparkContext} and {@link SQLContext}) with a default name
|
* Initializes the Spark contexts ({@link JavaSparkContext} and {@link SQLContext})
|
||||||
* <b>TestHoodieClient</b>.
|
* with a default name matching the name of the class.
|
||||||
*/
|
*/
|
||||||
protected void initSparkContexts() {
|
protected void initSparkContexts() {
|
||||||
initSparkContexts("TestHoodieClient");
|
initSparkContexts(this.getClass().getSimpleName());
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
@@ -146,9 +146,30 @@ public class HoodieClientTestUtils {
|
|||||||
new RandomAccessFile(path, "rw").setLength(length);
|
new RandomAccessFile(path, "rw").setLength(length);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Returns a Spark config for this test.
|
||||||
|
*
|
||||||
|
* The following properties may be set to customize the Spark context:
|
||||||
|
* SPARK_EVLOG_DIR: Local directory where event logs should be saved. This
|
||||||
|
* allows viewing the logs with spark-history-server.
|
||||||
|
*
|
||||||
|
* @note When running the tests using maven, use the following syntax to set
|
||||||
|
* a property:
|
||||||
|
* mvn -DSPARK_XXX=yyy ...
|
||||||
|
*
|
||||||
|
* @param appName A name for the Spark application. Shown in the Spark web UI.
|
||||||
|
* @return A Spark config
|
||||||
|
*/
|
||||||
public static SparkConf getSparkConfForTest(String appName) {
|
public static SparkConf getSparkConfForTest(String appName) {
|
||||||
SparkConf sparkConf = new SparkConf().setAppName(appName)
|
SparkConf sparkConf = new SparkConf().setAppName(appName)
|
||||||
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer").setMaster("local[8]");
|
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer").setMaster("local[8]");
|
||||||
|
|
||||||
|
String evlogDir = System.getProperty("SPARK_EVLOG_DIR");
|
||||||
|
if (evlogDir != null) {
|
||||||
|
sparkConf.set("spark.eventLog.enabled", "true");
|
||||||
|
sparkConf.set("spark.eventLog.dir", evlogDir);
|
||||||
|
}
|
||||||
|
|
||||||
return HoodieReadClient.addHoodieSupport(sparkConf);
|
return HoodieReadClient.addHoodieSupport(sparkConf);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -166,6 +166,7 @@ public class HDFSParquetImporter implements Serializable {
|
|||||||
AvroReadSupport.setAvroReadSchema(jsc.hadoopConfiguration(), (new Schema.Parser().parse(schemaStr)));
|
AvroReadSupport.setAvroReadSchema(jsc.hadoopConfiguration(), (new Schema.Parser().parse(schemaStr)));
|
||||||
ParquetInputFormat.setReadSupportClass(job, (AvroReadSupport.class));
|
ParquetInputFormat.setReadSupportClass(job, (AvroReadSupport.class));
|
||||||
|
|
||||||
|
jsc.setJobGroup(this.getClass().getSimpleName(), "Build records for import");
|
||||||
return jsc.newAPIHadoopFile(cfg.srcPath, ParquetInputFormat.class, Void.class, GenericRecord.class,
|
return jsc.newAPIHadoopFile(cfg.srcPath, ParquetInputFormat.class, Void.class, GenericRecord.class,
|
||||||
job.getConfiguration())
|
job.getConfiguration())
|
||||||
// To reduce large number of tasks.
|
// To reduce large number of tasks.
|
||||||
|
|||||||
@@ -97,6 +97,7 @@ public class HoodieSnapshotCopier implements Serializable {
|
|||||||
fs.delete(new Path(outputDir), true);
|
fs.delete(new Path(outputDir), true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
jsc.setJobGroup(this.getClass().getSimpleName(), "Creating a snapshot");
|
||||||
jsc.parallelize(partitions, partitions.size()).flatMap(partition -> {
|
jsc.parallelize(partitions, partitions.size()).flatMap(partition -> {
|
||||||
// Only take latest version files <= latestCommit.
|
// Only take latest version files <= latestCommit.
|
||||||
FileSystem fs1 = FSUtils.getFs(baseDir, serConf.newCopy());
|
FileSystem fs1 = FSUtils.getFs(baseDir, serConf.newCopy());
|
||||||
|
|||||||
@@ -175,6 +175,7 @@ public class HoodieSnapshotExporter {
|
|||||||
? defaultPartitioner
|
? defaultPartitioner
|
||||||
: ReflectionUtils.loadClass(cfg.outputPartitioner);
|
: ReflectionUtils.loadClass(cfg.outputPartitioner);
|
||||||
|
|
||||||
|
jsc.setJobGroup(this.getClass().getSimpleName(), "Exporting as non-HUDI dataset");
|
||||||
final BaseFileOnlyView fsView = getBaseFileOnlyView(jsc, cfg);
|
final BaseFileOnlyView fsView = getBaseFileOnlyView(jsc, cfg);
|
||||||
Iterator<String> exportingFilePaths = jsc
|
Iterator<String> exportingFilePaths = jsc
|
||||||
.parallelize(partitions, partitions.size())
|
.parallelize(partitions, partitions.size())
|
||||||
@@ -193,6 +194,7 @@ public class HoodieSnapshotExporter {
|
|||||||
private void exportAsHudi(JavaSparkContext jsc, Config cfg, List<String> partitions, String latestCommitTimestamp) throws IOException {
|
private void exportAsHudi(JavaSparkContext jsc, Config cfg, List<String> partitions, String latestCommitTimestamp) throws IOException {
|
||||||
final BaseFileOnlyView fsView = getBaseFileOnlyView(jsc, cfg);
|
final BaseFileOnlyView fsView = getBaseFileOnlyView(jsc, cfg);
|
||||||
final SerializableConfiguration serConf = new SerializableConfiguration(jsc.hadoopConfiguration());
|
final SerializableConfiguration serConf = new SerializableConfiguration(jsc.hadoopConfiguration());
|
||||||
|
jsc.setJobGroup(this.getClass().getSimpleName(), "Exporting as HUDI dataset");
|
||||||
jsc.parallelize(partitions, partitions.size()).flatMap(partition -> {
|
jsc.parallelize(partitions, partitions.size()).flatMap(partition -> {
|
||||||
// Only take latest version files <= latestCommit.
|
// Only take latest version files <= latestCommit.
|
||||||
List<Tuple2<String, String>> filePaths = new ArrayList<>();
|
List<Tuple2<String, String>> filePaths = new ArrayList<>();
|
||||||
|
|||||||
@@ -86,6 +86,7 @@ public class HoodieWithTimelineServer implements Serializable {
|
|||||||
System.out.println("Driver Hostname is :" + driverHost);
|
System.out.println("Driver Hostname is :" + driverHost);
|
||||||
List<String> messages = new ArrayList<>();
|
List<String> messages = new ArrayList<>();
|
||||||
IntStream.range(0, cfg.numPartitions).forEach(i -> messages.add("Hello World"));
|
IntStream.range(0, cfg.numPartitions).forEach(i -> messages.add("Hello World"));
|
||||||
|
jsc.setJobGroup(this.getClass().getSimpleName(), "Sending requests to driver host");
|
||||||
List<String> gotMessages = jsc.parallelize(messages).map(msg -> sendRequest(driverHost, cfg.serverPort)).collect();
|
List<String> gotMessages = jsc.parallelize(messages).map(msg -> sendRequest(driverHost, cfg.serverPort)).collect();
|
||||||
System.out.println("Got Messages :" + gotMessages);
|
System.out.println("Got Messages :" + gotMessages);
|
||||||
ValidationUtils.checkArgument(gotMessages.equals(messages), "Got expected reply from Server");
|
ValidationUtils.checkArgument(gotMessages.equals(messages), "Got expected reply from Server");
|
||||||
|
|||||||
@@ -132,6 +132,7 @@ public class TimelineServerPerf implements Serializable {
|
|||||||
|
|
||||||
public List<PerfStats> runLookups(JavaSparkContext jsc, List<String> partitionPaths, SyncableFileSystemView fsView,
|
public List<PerfStats> runLookups(JavaSparkContext jsc, List<String> partitionPaths, SyncableFileSystemView fsView,
|
||||||
int numIterations, int concurrency) {
|
int numIterations, int concurrency) {
|
||||||
|
jsc.setJobGroup(this.getClass().getSimpleName(), "Lookup all performance stats");
|
||||||
return jsc.parallelize(partitionPaths, cfg.numExecutors).flatMap(p -> {
|
return jsc.parallelize(partitionPaths, cfg.numExecutors).flatMap(p -> {
|
||||||
ScheduledThreadPoolExecutor executor = new ScheduledThreadPoolExecutor(100);
|
ScheduledThreadPoolExecutor executor = new ScheduledThreadPoolExecutor(100);
|
||||||
final List<PerfStats> result = new ArrayList<>();
|
final List<PerfStats> result = new ArrayList<>();
|
||||||
|
|||||||
@@ -56,6 +56,7 @@ public class AvroDFSSource extends AvroSource {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private JavaRDD<GenericRecord> fromFiles(String pathStr) {
|
private JavaRDD<GenericRecord> fromFiles(String pathStr) {
|
||||||
|
sparkContext.setJobGroup(this.getClass().getSimpleName(), "Fetch Avro data from files");
|
||||||
JavaPairRDD<AvroKey, NullWritable> avroRDD = sparkContext.newAPIHadoopFile(pathStr, AvroKeyInputFormat.class,
|
JavaPairRDD<AvroKey, NullWritable> avroRDD = sparkContext.newAPIHadoopFile(pathStr, AvroKeyInputFormat.class,
|
||||||
AvroKey.class, NullWritable.class, sparkContext.hadoopConfiguration());
|
AvroKey.class, NullWritable.class, sparkContext.hadoopConfiguration());
|
||||||
return avroRDD.keys().map(r -> ((GenericRecord) r.datum()));
|
return avroRDD.keys().map(r -> ((GenericRecord) r.datum()));
|
||||||
|
|||||||
@@ -128,6 +128,7 @@ public class HiveIncrPullSource extends AvroSource {
|
|||||||
String pathStr = commitDeltaFiles.stream().map(f -> f.getPath().toString()).collect(Collectors.joining(","));
|
String pathStr = commitDeltaFiles.stream().map(f -> f.getPath().toString()).collect(Collectors.joining(","));
|
||||||
JavaPairRDD<AvroKey, NullWritable> avroRDD = sparkContext.newAPIHadoopFile(pathStr, AvroKeyInputFormat.class,
|
JavaPairRDD<AvroKey, NullWritable> avroRDD = sparkContext.newAPIHadoopFile(pathStr, AvroKeyInputFormat.class,
|
||||||
AvroKey.class, NullWritable.class, sparkContext.hadoopConfiguration());
|
AvroKey.class, NullWritable.class, sparkContext.hadoopConfiguration());
|
||||||
|
sparkContext.setJobGroup(this.getClass().getSimpleName(), "Fetch new data");
|
||||||
return new InputBatch<>(Option.of(avroRDD.keys().map(r -> ((GenericRecord) r.datum()))),
|
return new InputBatch<>(Option.of(avroRDD.keys().map(r -> ((GenericRecord) r.datum()))),
|
||||||
String.valueOf(commitToPull.get()));
|
String.valueOf(commitToPull.get()));
|
||||||
} catch (IOException ioe) {
|
} catch (IOException ioe) {
|
||||||
|
|||||||
Reference in New Issue
Block a user