[HUDI-394] Provide a basic implementation of test suite
@@ -0,0 +1,77 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer;
import org.apache.hudi.utilities.schema.SchemaProvider;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

/**
 * Extends {@link HoodieDeltaStreamer} to expose certain operations helpful in running the test suite.
 * This is done to achieve two things: 1) leverage some components of {@link HoodieDeltaStreamer} and
 * 2) piggyback on the suite to test {@link HoodieDeltaStreamer} itself.
 */
public class HoodieDeltaStreamerWrapper extends HoodieDeltaStreamer {

  public HoodieDeltaStreamerWrapper(Config cfg, JavaSparkContext jssc) throws Exception {
    super(cfg, jssc);
  }

  public HoodieDeltaStreamerWrapper(Config cfg, JavaSparkContext jssc, FileSystem fs, HiveConf conf) throws Exception {
    super(cfg, jssc, fs, conf);
  }

  public JavaRDD<WriteStatus> upsert(Operation operation) throws Exception {
    cfg.operation = operation;
    return deltaSyncService.getDeltaSync().syncOnce().getRight();
  }

  public JavaRDD<WriteStatus> insert() throws Exception {
    return upsert(Operation.INSERT);
  }

  public JavaRDD<WriteStatus> bulkInsert() throws Exception {
    return upsert(Operation.BULK_INSERT);
  }

  public void scheduleCompact() throws Exception {
    // The delta streamer does not expose a scheduleCompact() operation, so perform an upsert without any new data,
    // which will trigger scheduling of a compaction
    upsert(Operation.UPSERT);
  }

  public JavaRDD<WriteStatus> compact() throws Exception {
    // The delta streamer does not expose a compact() operation, so perform an upsert without any new data, which
    // will trigger inline compaction
    return upsert(Operation.UPSERT);
  }

  public Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> fetchSource() throws Exception {
    return deltaSyncService.getDeltaSync().readFromSource(deltaSyncService.getDeltaSync().getCommitTimelineOpt());
  }

}
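For orientation, a minimal sketch of driving the wrapper by hand. This is illustrative only: the local Spark master, paths, and properties file below are placeholder assumptions, and in practice HoodieTestSuiteJob and HoodieTestSuiteWriter do this wiring.

import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.integ.testsuite.HoodieDeltaStreamerWrapper;
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class WrapperUsageSketch {
  public static void main(String[] args) throws Exception {
    // Placeholder config; a real run points these at an actual source, target table, and properties file.
    HoodieDeltaStreamer.Config cfg = new HoodieDeltaStreamer.Config();
    cfg.targetBasePath = "file:///tmp/hudi-test-suite/output";
    cfg.targetTableName = "table1";
    cfg.propsFilePath = "file:///tmp/hudi-test-suite/test-source.properties";

    JavaSparkContext jssc = new JavaSparkContext("local[2]", "wrapper-sketch");
    HoodieDeltaStreamerWrapper wrapper = new HoodieDeltaStreamerWrapper(cfg, jssc);

    // One synchronous ingest round; insert()/bulkInsert() are just upsert() with a different Operation.
    JavaRDD<WriteStatus> statuses = wrapper.insert();
    long failed = statuses.filter(WriteStatus::hasErrors).count();
    System.out.println("Write statuses with errors: " + failed);
    jssc.stop();
  }
}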
@@ -0,0 +1,186 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite;

import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hudi.DataSourceUtils;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig;
import org.apache.hudi.integ.testsuite.dag.DagUtils;
import org.apache.hudi.integ.testsuite.dag.WorkflowDag;
import org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator;
import org.apache.hudi.integ.testsuite.dag.scheduler.DagScheduler;
import org.apache.hudi.integ.testsuite.generator.DeltaGenerator;
import org.apache.hudi.integ.testsuite.reader.DeltaInputType;
import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode;
import org.apache.hudi.keygen.KeyGenerator;
import org.apache.hudi.utilities.UtilHelpers;
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer;
import org.apache.hudi.utilities.schema.SchemaProvider;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * This is the entry point for running a Hudi Test Suite. Although this class is similar to
 * {@link HoodieDeltaStreamer}, it does not extend it, since we do not want to create a dependency on changes in
 * DeltaStreamer.
 */
public class HoodieTestSuiteJob {

  private static volatile Logger log = LoggerFactory.getLogger(HoodieTestSuiteJob.class);

  private final HoodieTestSuiteConfig cfg;
  /**
   * Bag of properties with source, hoodie client, key generator etc.
   */
  TypedProperties props;
  /**
   * Schema provider that supplies the schema for the generated payloads.
   */
  private transient SchemaProvider schemaProvider;
  /**
   * Filesystem used.
   */
  private transient FileSystem fs;
  /**
   * Spark context.
   */
  private transient JavaSparkContext jsc;
  /**
   * Spark Session.
   */
  private transient SparkSession sparkSession;
  /**
   * Hive Config.
   */
  private transient HiveConf hiveConf;

  private KeyGenerator keyGenerator;

  public HoodieTestSuiteJob(HoodieTestSuiteConfig cfg, JavaSparkContext jsc) throws IOException {
    this.cfg = cfg;
    this.jsc = jsc;
    this.sparkSession = SparkSession.builder().config(jsc.getConf()).getOrCreate();
    this.fs = FSUtils.getFs(cfg.inputBasePath, jsc.hadoopConfiguration());
    this.props = UtilHelpers.readConfig(fs, new Path(cfg.propsFilePath), cfg.configs).getConfig();
    log.info("Creating workload generator with configs : {}", props.toString());
    this.schemaProvider = UtilHelpers.createSchemaProvider(cfg.schemaProviderClassName, props, jsc);
    this.hiveConf = getDefaultHiveConf(jsc.hadoopConfiguration());
    this.keyGenerator = DataSourceUtils.createKeyGenerator(props);
    if (!fs.exists(new Path(cfg.targetBasePath))) {
      HoodieTableMetaClient.initTableType(jsc.hadoopConfiguration(), cfg.targetBasePath,
          HoodieTableType.valueOf(cfg.tableType), cfg.targetTableName, "archived");
    }
  }

  private static HiveConf getDefaultHiveConf(Configuration cfg) {
    HiveConf hiveConf = new HiveConf();
    hiveConf.addResource(cfg);
    return hiveConf;
  }

  public static void main(String[] args) throws Exception {
    final HoodieTestSuiteConfig cfg = new HoodieTestSuiteConfig();
    JCommander cmd = new JCommander(cfg, args);
    if (cfg.help || args.length == 0) {
      cmd.usage();
      System.exit(1);
    }

    JavaSparkContext jssc = UtilHelpers.buildSparkContext("workload-generator-" + cfg.outputTypeName
        + "-" + cfg.inputFormatName, cfg.sparkMaster);
    new HoodieTestSuiteJob(cfg, jssc).runTestSuite();
  }

  public void runTestSuite() {
    try {
      WorkflowDag workflowDag = this.cfg.workloadYamlPath == null ? ((WorkflowDagGenerator) ReflectionUtils
          .loadClass((this.cfg).workloadDagGenerator)).build()
          : DagUtils.convertYamlPathToDag(this.fs, this.cfg.workloadYamlPath);
      log.info("Workflow Dag => " + DagUtils.convertDagToYaml(workflowDag));
      long startTime = System.currentTimeMillis();
      String schemaStr = schemaProvider.getSourceSchema().toString();
      final HoodieTestSuiteWriter writer = new HoodieTestSuiteWriter(jsc, props, cfg, schemaStr);
      final DeltaGenerator deltaGenerator = new DeltaGenerator(
          new DFSDeltaConfig(DeltaOutputMode.valueOf(cfg.outputTypeName), DeltaInputType.valueOf(cfg.inputFormatName),
              new SerializableConfiguration(jsc.hadoopConfiguration()), cfg.inputBasePath, cfg.targetBasePath,
              schemaStr, cfg.limitFileSize), jsc, sparkSession, schemaStr, keyGenerator);
      DagScheduler dagScheduler = new DagScheduler(workflowDag, writer, deltaGenerator);
      dagScheduler.schedule();
      log.info("Finished scheduling all tasks, Time taken {}", System.currentTimeMillis() - startTime);
    } catch (Exception e) {
      log.error("Failed to run Test Suite ", e);
      throw new HoodieException("Failed to run Test Suite ", e);
    } finally {
      jsc.stop();
    }
  }

  /**
   * The Hudi test suite uses {@link HoodieDeltaStreamer} to run some operations, hence it extends the delta streamer config.
   */
  public static class HoodieTestSuiteConfig extends HoodieDeltaStreamer.Config {

    @Parameter(names = {"--input-base-path"}, description = "base path for input data "
        + "(will be created if it does not exist the first time around; if it exists, more data will be added to that path)",
        required = true)
    public String inputBasePath;

    @Parameter(names = {
        "--workload-generator-classname"}, description = "WorkflowDag of operations to generate the workload",
        required = true)
    public String workloadDagGenerator = WorkflowDagGenerator.class.getName();

    @Parameter(names = {
        "--workload-yaml-path"}, description = "Workflow Dag yaml path to generate the workload")
    public String workloadYamlPath;

    @Parameter(names = {"--delta-output-type"}, description = "Subclass of "
        + "org.apache.hudi.integ.testsuite.writer.DeltaOutputMode to write the generated data.")
    public String outputTypeName = DeltaOutputMode.DFS.name();

    @Parameter(names = {"--delta-input-format"}, description = "Subclass of "
        + "org.apache.hudi.integ.testsuite.reader.DeltaInputType to read avro data.")
    public String inputFormatName = DeltaInputType.AVRO.name();

    @Parameter(names = {"--input-file-size"}, description = "The min/max size of the input files to generate",
        required = true)
    public Long limitFileSize = 1024 * 1024 * 120L;

    @Parameter(names = {"--use-deltastreamer"}, description = "Choose whether to use HoodieDeltaStreamer to "
        + "perform ingestion. If set to false, HoodieWriteClient will be used")
    public Boolean useDeltaStreamer = false;

  }
}
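The job is normally launched via main() with the JCommander flags above, but the same wiring can be exercised programmatically. A hedged sketch follows, in which every path, the table settings, and the schema provider class are placeholder assumptions:

import org.apache.hudi.integ.testsuite.HoodieTestSuiteJob;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteJob.HoodieTestSuiteConfig;
import org.apache.hudi.utilities.UtilHelpers;
import org.apache.spark.api.java.JavaSparkContext;

public class TestSuiteJobSketch {
  public static void main(String[] args) throws Exception {
    HoodieTestSuiteConfig cfg = new HoodieTestSuiteConfig();
    cfg.inputBasePath = "file:///tmp/hudi-test-suite/input";      // where generated input batches land
    cfg.targetBasePath = "file:///tmp/hudi-test-suite/output";    // the hudi dataset under test
    cfg.targetTableName = "table1";
    cfg.tableType = "COPY_ON_WRITE";
    cfg.propsFilePath = "file:///tmp/hudi-test-suite/test.properties";
    cfg.schemaProviderClassName = "org.apache.hudi.utilities.schema.FilebasedSchemaProvider";
    cfg.workloadYamlPath = "file:///tmp/hudi-test-suite/simple-workflow.yaml"; // or null to use workloadDagGenerator
    cfg.useDeltaStreamer = false;
    cfg.sparkMaster = "local[2]";

    JavaSparkContext jssc = UtilHelpers.buildSparkContext(
        "workload-generator-" + cfg.outputTypeName + "-" + cfg.inputFormatName, cfg.sparkMaster);
    new HoodieTestSuiteJob(cfg, jssc).runTestSuite();
  }
}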
@@ -0,0 +1,219 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite;

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.avro.model.HoodieCompactionPlan;
import org.apache.hudi.client.HoodieReadClient;
import org.apache.hudi.client.HoodieWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.integ.testsuite.dag.nodes.CleanNode;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.RollbackNode;
import org.apache.hudi.integ.testsuite.dag.nodes.ScheduleCompactNode;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteJob.HoodieTestSuiteConfig;
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.Operation;
import org.apache.hudi.utilities.schema.SchemaProvider;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

/**
 * A writer abstraction for the Hudi test suite. This class wraps different implementations of writers used to perform
 * write operations into the target hudi dataset. Currently supported writers are {@link HoodieDeltaStreamerWrapper}
 * and {@link HoodieWriteClient}.
 */
public class HoodieTestSuiteWriter {

  private HoodieDeltaStreamerWrapper deltaStreamerWrapper;
  private HoodieWriteClient writeClient;
  protected HoodieTestSuiteConfig cfg;
  private Option<String> lastCheckpoint;
  private HoodieReadClient hoodieReadClient;
  private Properties props;
  private String schema;
  private transient Configuration configuration;
  private transient JavaSparkContext sparkContext;
  private static Set<String> VALID_DAG_NODES_TO_ALLOW_WRITE_CLIENT_IN_DELTASTREAMER_MODE = new HashSet<>(
      Arrays.asList(RollbackNode.class.getName(), CleanNode.class.getName(), ScheduleCompactNode.class.getName()));

  public HoodieTestSuiteWriter(JavaSparkContext jsc, Properties props, HoodieTestSuiteConfig cfg, String schema)
      throws Exception {
    this(jsc, props, cfg, schema, true);
  }

  public HoodieTestSuiteWriter(JavaSparkContext jsc, Properties props, HoodieTestSuiteConfig cfg, String schema,
      boolean rollbackInflight) throws Exception {
    // We ensure that only 1 instance of HoodieWriteClient is instantiated for a HoodieTestSuiteWriter.
    // This does not instantiate a HoodieWriteClient until
    // {@link HoodieDeltaStreamer#commit(HoodieWriteClient, JavaRDD, Option)} is invoked.
    this.deltaStreamerWrapper = new HoodieDeltaStreamerWrapper(cfg, jsc);
    this.hoodieReadClient = new HoodieReadClient(jsc, cfg.targetBasePath);
    if (!cfg.useDeltaStreamer) {
      this.writeClient = new HoodieWriteClient(jsc, getHoodieClientConfig(cfg, props, schema), rollbackInflight);
    }
    this.cfg = cfg;
    this.configuration = jsc.hadoopConfiguration();
    this.sparkContext = jsc;
    this.props = props;
    this.schema = schema;
  }

  private HoodieWriteConfig getHoodieClientConfig(HoodieTestSuiteConfig cfg, Properties props, String schema) {
    HoodieWriteConfig.Builder builder =
        HoodieWriteConfig.newBuilder().combineInput(true, true).withPath(cfg.targetBasePath)
            .withAutoCommit(false)
            .withCompactionConfig(HoodieCompactionConfig.newBuilder().withPayloadClass(cfg.payloadClassName).build())
            .forTable(cfg.targetTableName)
            .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
            .withProps(props);
    builder = builder.withSchema(schema);
    return builder.build();
  }

  private boolean allowWriteClientAccess(DagNode dagNode) {
    if (VALID_DAG_NODES_TO_ALLOW_WRITE_CLIENT_IN_DELTASTREAMER_MODE.contains(dagNode.getClass().getName())) {
      return true;
    }
    return false;
  }

  public Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> fetchSource() throws Exception {
    return this.deltaStreamerWrapper.fetchSource();
  }

  public Option<String> startCommit() {
    if (cfg.useDeltaStreamer) {
      return Option.of(HoodieActiveTimeline.createNewInstantTime());
    } else {
      return Option.of(writeClient.startCommit());
    }
  }

  public JavaRDD<WriteStatus> upsert(Option<String> instantTime) throws Exception {
    if (cfg.useDeltaStreamer) {
      return deltaStreamerWrapper.upsert(Operation.UPSERT);
    } else {
      Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> nextBatch = fetchSource();
      lastCheckpoint = Option.of(nextBatch.getValue().getLeft());
      return writeClient.upsert(nextBatch.getRight().getRight(), instantTime.get());
    }
  }

  public JavaRDD<WriteStatus> insert(Option<String> instantTime) throws Exception {
    if (cfg.useDeltaStreamer) {
      return deltaStreamerWrapper.insert();
    } else {
      Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> nextBatch = fetchSource();
      lastCheckpoint = Option.of(nextBatch.getValue().getLeft());
      return writeClient.insert(nextBatch.getRight().getRight(), instantTime.get());
    }
  }

  public JavaRDD<WriteStatus> bulkInsert(Option<String> instantTime) throws Exception {
    if (cfg.useDeltaStreamer) {
      return deltaStreamerWrapper.bulkInsert();
    } else {
      Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> nextBatch = fetchSource();
      lastCheckpoint = Option.of(nextBatch.getValue().getLeft());
      return writeClient.bulkInsert(nextBatch.getRight().getRight(), instantTime.get());
    }
  }

  public JavaRDD<WriteStatus> compact(Option<String> instantTime) throws Exception {
    if (cfg.useDeltaStreamer) {
      return deltaStreamerWrapper.compact();
    } else {
      if (!instantTime.isPresent()) {
        Option<Pair<String, HoodieCompactionPlan>> compactionPlanPair = Option
            .fromJavaOptional(hoodieReadClient.getPendingCompactions()
                .stream().findFirst());
        if (compactionPlanPair.isPresent()) {
          instantTime = Option.of(compactionPlanPair.get().getLeft());
        }
      }
      if (instantTime.isPresent()) {
        return writeClient.compact(instantTime.get());
      } else {
        return null;
      }
    }
  }

  public Option<String> scheduleCompaction(Option<Map<String, String>> previousCommitExtraMetadata) throws Exception {
    if (cfg.useDeltaStreamer) {
      deltaStreamerWrapper.scheduleCompact();
      return Option.empty();
    } else {
      return writeClient.scheduleCompaction(previousCommitExtraMetadata);
    }
  }

  public void commit(JavaRDD<WriteStatus> records, Option<String> instantTime) {
    if (!cfg.useDeltaStreamer) {
      Map<String, String> extraMetadata = new HashMap<>();
      // Store the checkpoint in the commit metadata just like
      // {@link HoodieDeltaStreamer#commit(HoodieWriteClient, JavaRDD, Option)}
      extraMetadata.put(HoodieDeltaStreamerWrapper.CHECKPOINT_KEY, lastCheckpoint.get());
      writeClient.commit(instantTime.get(), records, Option.of(extraMetadata));
    }
  }

  public HoodieWriteClient getWriteClient(DagNode dagNode) throws IllegalAccessException {
    if (cfg.useDeltaStreamer && !allowWriteClientAccess(dagNode)) {
      throw new IllegalAccessException("cannot access write client when testing in deltastreamer mode");
    }
    synchronized (this) {
      if (writeClient == null) {
        this.writeClient = new HoodieWriteClient(this.sparkContext, getHoodieClientConfig(cfg, props, schema), false);
      }
    }
    return writeClient;
  }

  public HoodieDeltaStreamerWrapper getDeltaStreamerWrapper() {
    return deltaStreamerWrapper;
  }

  public HoodieTestSuiteConfig getCfg() {
    return cfg;
  }

  public Configuration getConfiguration() {
    return configuration;
  }

  public JavaSparkContext getSparkContext() {
    return sparkContext;
  }
}
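When --use-deltastreamer is false, a dag node is expected to drive this writer through an explicit start-commit / write / commit cycle. A sketch of one such round, assuming the writer has already been constructed by HoodieTestSuiteJob (the error-handling policy here is an illustrative assumption):

import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter;
import org.apache.spark.api.java.JavaRDD;

public class WriterRoundSketch {

  // Illustrative helper: one insert round against an already-constructed writer.
  static void runOneInsertRound(HoodieTestSuiteWriter writer) throws Exception {
    Option<String> instantTime = writer.startCommit();          // new instant (synthetic in deltastreamer mode)
    JavaRDD<WriteStatus> statuses = writer.insert(instantTime); // in write-client mode this pulls the next batch via fetchSource()
    writer.commit(statuses, instantTime);                       // no-op when useDeltaStreamer=true; the delta sync commits itself
    long failed = statuses.filter(WriteStatus::hasErrors).count();
    if (failed > 0) {
      throw new IllegalStateException(failed + " write statuses reported errors");
    }
  }
}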
@@ -0,0 +1,73 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite.configuration;

import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.integ.testsuite.reader.DeltaInputType;
import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode;

/**
 * Configuration to hold details about a DFS based output type; extends {@link DeltaConfig}.
 */
public class DFSDeltaConfig extends DeltaConfig {

  // The base path where the generated data should be written to. This data will in turn be used to write into a hudi
  // dataset
  private final String deltaBasePath;
  private final String datasetOutputPath;
  private final String schemaStr;
  // Maximum file size for the files generated
  private final Long maxFileSize;
  // The current batch id
  private Integer batchId;

  public DFSDeltaConfig(DeltaOutputMode deltaOutputMode, DeltaInputType deltaInputType,
      SerializableConfiguration configuration,
      String deltaBasePath, String targetBasePath, String schemaStr, Long maxFileSize) {
    super(deltaOutputMode, deltaInputType, configuration);
    this.deltaBasePath = deltaBasePath;
    this.schemaStr = schemaStr;
    this.maxFileSize = maxFileSize;
    this.datasetOutputPath = targetBasePath;
  }

  public String getDeltaBasePath() {
    return deltaBasePath;
  }

  public String getDatasetOutputPath() {
    return datasetOutputPath;
  }

  public String getSchemaStr() {
    return schemaStr;
  }

  public Long getMaxFileSize() {
    return maxFileSize;
  }

  public Integer getBatchId() {
    return batchId;
  }

  public void setBatchId(Integer batchId) {
    this.batchId = batchId;
  }
}
@@ -0,0 +1,256 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite.configuration;

import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.integ.testsuite.reader.DeltaInputType;
import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode;

/**
 * Configuration to hold the delta output type and delta input format.
 */
public class DeltaConfig implements Serializable {

  private final DeltaOutputMode deltaOutputMode;
  private final DeltaInputType deltaInputType;
  private final SerializableConfiguration configuration;

  public DeltaConfig(DeltaOutputMode deltaOutputMode, DeltaInputType deltaInputType,
      SerializableConfiguration configuration) {
    this.deltaOutputMode = deltaOutputMode;
    this.deltaInputType = deltaInputType;
    this.configuration = configuration;
  }

  public DeltaOutputMode getDeltaOutputMode() {
    return deltaOutputMode;
  }

  public DeltaInputType getDeltaInputType() {
    return deltaInputType;
  }

  public Configuration getConfiguration() {
    return configuration.get();
  }

  /**
   * Represents any kind of workload operation for new data. Each workload also contains an optional sequence of
   * actions that can be executed in parallel.
   */
  public static class Config {

    public static final String CONFIG_NAME = "config";
    public static final String TYPE = "type";
    public static final String NODE_NAME = "name";
    public static final String DEPENDENCIES = "deps";
    public static final String CHILDREN = "children";
    public static final String HIVE_QUERIES = "hive_queries";
    public static final String HIVE_PROPERTIES = "hive_props";
    private static String NUM_RECORDS_INSERT = "num_records_insert";
    private static String NUM_RECORDS_UPSERT = "num_records_upsert";
    private static String REPEAT_COUNT = "repeat_count";
    private static String RECORD_SIZE = "record_size";
    private static String NUM_PARTITIONS_INSERT = "num_partitions_insert";
    private static String NUM_PARTITIONS_UPSERT = "num_partitions_upsert";
    private static String NUM_FILES_UPSERT = "num_files_upsert";
    private static String FRACTION_UPSERT_PER_FILE = "fraction_upsert_per_file";
    private static String DISABLE_GENERATE = "disable_generate";
    private static String DISABLE_INGEST = "disable_ingest";
    private static String HIVE_LOCAL = "hive_local";

    private Map<String, Object> configsMap;

    public Config(Map<String, Object> configsMap) {
      this.configsMap = configsMap;
    }

    public static Builder newBuilder() {
      return new Builder();
    }

    public long getNumRecordsInsert() {
      return Long.valueOf(configsMap.getOrDefault(NUM_RECORDS_INSERT, 0).toString());
    }

    public long getNumRecordsUpsert() {
      return Long.valueOf(configsMap.getOrDefault(NUM_RECORDS_UPSERT, 0).toString());
    }

    public int getRecordSize() {
      return Integer.valueOf(configsMap.getOrDefault(RECORD_SIZE, 1024).toString());
    }

    public int getNumInsertPartitions() {
      return Integer.valueOf(configsMap.getOrDefault(NUM_PARTITIONS_INSERT, 1).toString());
    }

    public int getRepeatCount() {
      return Integer.valueOf(configsMap.getOrDefault(REPEAT_COUNT, 1).toString());
    }

    public int getNumUpsertPartitions() {
      return Integer.valueOf(configsMap.getOrDefault(NUM_PARTITIONS_UPSERT, 0).toString());
    }

    public int getNumUpsertFiles() {
      return Integer.valueOf(configsMap.getOrDefault(NUM_FILES_UPSERT, 1).toString());
    }

    public double getFractionUpsertPerFile() {
      return Double.valueOf(configsMap.getOrDefault(FRACTION_UPSERT_PER_FILE, 0.0).toString());
    }

    public boolean isDisableGenerate() {
      return Boolean.valueOf(configsMap.getOrDefault(DISABLE_GENERATE, false).toString());
    }

    public boolean isDisableIngest() {
      return Boolean.valueOf(configsMap.getOrDefault(DISABLE_INGEST, false).toString());
    }

    public Map<String, Object> getOtherConfigs() {
      if (configsMap == null) {
        return new HashMap<>();
      }
      return configsMap;
    }

    public List<Pair<String, Integer>> getHiveQueries() {
      try {
        return (List<Pair<String, Integer>>) this.configsMap.getOrDefault(HIVE_QUERIES, new ArrayList<>());
      } catch (Exception e) {
        throw new RuntimeException("unable to get hive queries from configs");
      }
    }

    public boolean isHiveLocal() {
      return Boolean.valueOf(configsMap.getOrDefault(HIVE_LOCAL, false).toString());
    }

    public List<String> getHiveProperties() {
      return (List<String>) this.configsMap.getOrDefault(HIVE_PROPERTIES, new ArrayList<>());
    }

    @Override
    public String toString() {
      try {
        return new ObjectMapper().writeValueAsString(this.configsMap);
      } catch (Exception e) {
        throw new RuntimeException("unable to generate string representation of config");
      }
    }

    public static class Builder {

      private Map<String, Object> configsMap = new HashMap<>();

      public Builder() {
      }

      public Builder withNumRecordsToInsert(long numRecordsInsert) {
        this.configsMap.put(NUM_RECORDS_INSERT, numRecordsInsert);
        return this;
      }

      public Builder withNumRecordsToUpdate(long numRecordsUpsert) {
        this.configsMap.put(NUM_RECORDS_UPSERT, numRecordsUpsert);
        return this;
      }

      public Builder withNumInsertPartitions(int numInsertPartitions) {
        this.configsMap.put(NUM_PARTITIONS_INSERT, numInsertPartitions);
        return this;
      }

      public Builder withNumUpsertPartitions(int numUpsertPartitions) {
        this.configsMap.put(NUM_PARTITIONS_UPSERT, numUpsertPartitions);
        return this;
      }

      public Builder withNumUpsertFiles(int numUpsertFiles) {
        this.configsMap.put(NUM_FILES_UPSERT, numUpsertFiles);
        return this;
      }

      public Builder withFractionUpsertPerFile(double fractionUpsertPerFile) {
        this.configsMap.put(FRACTION_UPSERT_PER_FILE, fractionUpsertPerFile);
        return this;
      }

      public Builder withNumTimesToRepeat(int repeatCount) {
        this.configsMap.put(REPEAT_COUNT, repeatCount);
        return this;
      }

      public Builder withRecordSize(int recordSize) {
        this.configsMap.put(RECORD_SIZE, recordSize);
        return this;
      }

      public Builder disableGenerate(boolean generate) {
        this.configsMap.put(DISABLE_GENERATE, generate);
        return this;
      }

      public Builder disableIngest(boolean ingest) {
        this.configsMap.put(DISABLE_INGEST, ingest);
        return this;
      }

      public Builder withConfig(String name, Object value) {
        this.configsMap.put(name, value);
        return this;
      }

      public Builder withHiveQueryAndResults(List<Pair<String, Integer>> hiveQueries) {
        this.configsMap.put(HIVE_QUERIES, hiveQueries);
        return this;
      }

      public Builder withHiveLocal(boolean startHiveLocal) {
        this.configsMap.put(HIVE_LOCAL, startHiveLocal);
        return this;
      }

      public Builder withHiveProperties(List<String> hiveProperties) {
        this.configsMap.put(HIVE_PROPERTIES, hiveProperties);
        return this;
      }

      public Builder withConfigsMap(Map<String, Object> configsMap) {
        this.configsMap = configsMap;
        return this;
      }

      public Config build() {
        return new Config(configsMap);
      }

    }
  }
}
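The nested Config/Builder above is what individual dag nodes are constructed from. A small sketch of assembling one upsert workload config; the values are arbitrary, and the comments show how each builder call maps onto the YAML config keys:

import org.apache.hudi.integ.testsuite.configuration.DeltaConfig;

public class WorkloadConfigSketch {
  public static void main(String[] args) {
    DeltaConfig.Config upsertWorkload = DeltaConfig.Config.newBuilder()
        .withNumRecordsToUpdate(500)      // num_records_upsert
        .withNumUpsertPartitions(2)       // num_partitions_upsert
        .withNumUpsertFiles(5)            // num_files_upsert
        .withFractionUpsertPerFile(0.2)   // fraction_upsert_per_file
        .withRecordSize(1024)             // record_size, approximate bytes per generated record
        .withNumTimesToRepeat(3)          // repeat_count
        .build();
    // toString() serializes the underlying map as JSON, which DagUtils reuses when writing a dag back out as YAML.
    System.out.println(upsertWorkload);
  }
}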
@@ -0,0 +1,33 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite.converter;

import java.io.Serializable;
import org.apache.spark.api.java.JavaRDD;

/**
 * Implementations of {@link Converter} will convert data from one format to another.
 *
 * @param <I> Input Data Type
 * @param <O> Output Data Type
 */
public interface Converter<I, O> extends Serializable {

  JavaRDD<O> convert(JavaRDD<I> inputRDD);
}
@@ -0,0 +1,53 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite.converter;

import java.util.List;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.integ.testsuite.generator.LazyRecordGeneratorIterator;
import org.apache.hudi.integ.testsuite.generator.UpdateGeneratorIterator;
import org.apache.spark.api.java.JavaRDD;

/**
 * This converter creates an update {@link GenericRecord} from an existing {@link GenericRecord}.
 */
public class UpdateConverter implements Converter<GenericRecord, GenericRecord> {

  private final String schemaStr;
  // The fields that should not be mutated when converting an insert record to an update record, generally the
  // record key and partition path fields
  private final List<String> partitionPathFields;
  private final List<String> recordKeyFields;
  private final int minPayloadSize;

  public UpdateConverter(String schemaStr, int minPayloadSize, List<String> partitionPathFields,
      List<String> recordKeyFields) {
    this.schemaStr = schemaStr;
    this.partitionPathFields = partitionPathFields;
    this.recordKeyFields = recordKeyFields;
    this.minPayloadSize = minPayloadSize;
  }

  @Override
  public JavaRDD<GenericRecord> convert(JavaRDD<GenericRecord> inputRDD) {
    return inputRDD.mapPartitions(recordItr -> new LazyRecordGeneratorIterator(new UpdateGeneratorIterator(recordItr,
        schemaStr, partitionPathFields, recordKeyFields, minPayloadSize)));
  }

}
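A sketch of deriving an update batch from a previously generated insert batch. The schema string, the field names, and the source RDD are assumptions standing in for values that would come from the delta generator and its source schema:

import java.util.Arrays;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.integ.testsuite.converter.UpdateConverter;
import org.apache.spark.api.java.JavaRDD;

public class UpdateConverterSketch {

  // Illustrative only: "_row_key" and "timestamp" are placeholder field names from a hypothetical source schema.
  static JavaRDD<GenericRecord> toUpdates(JavaRDD<GenericRecord> inserts, String schemaStr) {
    UpdateConverter converter = new UpdateConverter(schemaStr, 1024,
        Arrays.asList("timestamp"), Arrays.asList("_row_key"));
    // Remaining fields are mutated lazily, partition by partition, producing records that upsert onto the originals.
    return converter.convert(inserts);
  }
}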
@@ -0,0 +1,217 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite.dag;

import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.fasterxml.jackson.dataformat.yaml.YAMLFactory;
import com.fasterxml.jackson.dataformat.yaml.YAMLGenerator.Feature;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;

/**
 * Utility class to serialize and deserialize the workflow dag.
 */
public class DagUtils {

  static final ObjectMapper MAPPER = new ObjectMapper();

  /**
   * Converts a YAML path to {@link WorkflowDag}.
   */
  public static WorkflowDag convertYamlPathToDag(FileSystem fs, String path) throws IOException {
    InputStream is = fs.open(new Path(path));
    return convertYamlToDag(toString(is));
  }

  /**
   * Converts a YAML representation to {@link WorkflowDag}.
   */
  public static WorkflowDag convertYamlToDag(String yaml) throws IOException {
    Map<String, DagNode> allNodes = new HashMap<>();
    final ObjectMapper yamlReader = new ObjectMapper(new YAMLFactory());
    final JsonNode jsonNode = yamlReader.readTree(yaml);
    Iterator<Entry<String, JsonNode>> itr = jsonNode.fields();
    while (itr.hasNext()) {
      Entry<String, JsonNode> dagNode = itr.next();
      allNodes.put(dagNode.getKey(), convertJsonToDagNode(allNodes, dagNode.getValue()));
    }
    return new WorkflowDag(findRootNodes(allNodes));
  }

  /**
   * Converts a {@link WorkflowDag} to a YAML representation.
   */
  public static String convertDagToYaml(WorkflowDag dag) throws IOException {
    final ObjectMapper yamlWriter = new ObjectMapper(new YAMLFactory().disable(Feature.WRITE_DOC_START_MARKER)
        .enable(Feature.MINIMIZE_QUOTES).enable(JsonParser.Feature.ALLOW_UNQUOTED_FIELD_NAMES));
    JsonNode yamlNode = MAPPER.createObjectNode();
    convertDagToYaml(yamlNode, dag.getNodeList());
    return yamlWriter.writerWithDefaultPrettyPrinter().writeValueAsString(yamlNode);
  }

  private static void convertDagToYaml(JsonNode yamlNode, List<DagNode> dagNodes) throws IOException {
    for (DagNode dagNode : dagNodes) {
      String name = dagNode.getConfig().getOtherConfigs().getOrDefault(DeltaConfig.Config.NODE_NAME, dagNode.getName()).toString();
      ((ObjectNode) yamlNode).put(name, convertDagNodeToJsonNode(dagNode));
      if (dagNode.getChildNodes().size() > 0) {
        convertDagToYaml(yamlNode, dagNode.getChildNodes());
      }
    }
  }

  private static DagNode convertJsonToDagNode(Map<String, DagNode> allNodes, JsonNode node) throws IOException {
    String type = node.get(DeltaConfig.Config.TYPE).asText();
    final DagNode retNode = convertJsonToDagNode(node, type);
    Arrays.asList(node.get(DeltaConfig.Config.DEPENDENCIES).textValue().split(",")).stream().forEach(dep -> {
      DagNode parentNode = allNodes.get(dep);
      if (parentNode != null) {
        parentNode.addChildNode(retNode);
      }
    });
    return retNode;
  }

  private static List<DagNode> findRootNodes(Map<String, DagNode> allNodes) {
    final List<DagNode> rootNodes = new ArrayList<>();
    allNodes.entrySet().stream().forEach(entry -> {
      if (entry.getValue().getParentNodes().size() < 1) {
        rootNodes.add(entry.getValue());
      }
    });
    return rootNodes;
  }

  private static DagNode convertJsonToDagNode(JsonNode node, String type) {
    try {
      DeltaConfig.Config config = DeltaConfig.Config.newBuilder().withConfigsMap(convertJsonNodeToMap(node)).build();
      return (DagNode) ReflectionUtils.loadClass(generateFQN(type), config);
    } catch (ClassNotFoundException e) {
      throw new RuntimeException(e);
    }
  }

  private static String generateFQN(String name) throws ClassNotFoundException {
    return Class.forName(StringUtils.joinUsingDelim(".",
        DagNode.class.getName().substring(0, DagNode.class.getName().lastIndexOf(".")), name)).getName();
  }

  private static JsonNode convertDagNodeToJsonNode(DagNode node) throws IOException {
    return createJsonNode(node, node.getClass().getSimpleName());
  }

  private static Map<String, Object> convertJsonNodeToMap(JsonNode node) {
    Map<String, Object> configsMap = new HashMap<>();
    Iterator<Entry<String, JsonNode>> itr = node.get(DeltaConfig.Config.CONFIG_NAME).fields();
    while (itr.hasNext()) {
      Entry<String, JsonNode> entry = itr.next();
      switch (entry.getKey()) {
        case DeltaConfig.Config.HIVE_QUERIES:
          configsMap.put(DeltaConfig.Config.HIVE_QUERIES, getHiveQueries(entry));
          break;
        case DeltaConfig.Config.HIVE_PROPERTIES:
          configsMap.put(DeltaConfig.Config.HIVE_PROPERTIES, getProperties(entry));
          break;
        default:
          configsMap.put(entry.getKey(), getValue(entry.getValue()));
          break;
      }
    }
    return configsMap;
  }

  private static List<Pair<String, Integer>> getHiveQueries(Entry<String, JsonNode> entry) {
    List<Pair<String, Integer>> queries = new ArrayList<>();
    Iterator<Entry<String, JsonNode>> queriesItr = entry.getValue().fields();
    while (queriesItr.hasNext()) {
      queries.add(Pair.of(queriesItr.next().getValue().textValue(), queriesItr.next().getValue().asInt()));
    }
    return queries;
  }

  private static List<String> getProperties(Entry<String, JsonNode> entry) {
    List<String> properties = new ArrayList<>();
    Iterator<Entry<String, JsonNode>> queriesItr = entry.getValue().fields();
    while (queriesItr.hasNext()) {
      properties.add(queriesItr.next().getValue().textValue());
    }
    return properties;
  }

  private static Object getValue(JsonNode node) {
    if (node.isInt()) {
      return node.asInt();
    } else if (node.isLong()) {
      return node.asLong();
    } else if (node.isShort()) {
      return node.asInt();
    } else if (node.isBoolean()) {
      return node.asBoolean();
    } else if (node.isDouble()) {
      return node.asDouble();
    } else if (node.isFloat()) {
      return node.asDouble();
    }
    return node.textValue();
  }

  private static JsonNode createJsonNode(DagNode node, String type) throws IOException {
    JsonNode configNode = MAPPER.readTree(node.getConfig().toString());
    JsonNode jsonNode = MAPPER.createObjectNode();
    ((ObjectNode) jsonNode).put(DeltaConfig.Config.CONFIG_NAME, configNode);
    ((ObjectNode) jsonNode).put(DeltaConfig.Config.TYPE, type);
    ((ObjectNode) jsonNode).put(DeltaConfig.Config.DEPENDENCIES, getDependencyNames(node));
    return jsonNode;
  }

  private static String getDependencyNames(DagNode node) {
    return node.getParentNodes().stream()
        .map(e -> ((DagNode) e).getConfig().getOtherConfigs().getOrDefault(DeltaConfig.Config.NODE_NAME, node.getName()).toString())
        .collect(Collectors.joining(","));
  }

  public static String toString(InputStream inputStream) throws IOException {
    ByteArrayOutputStream result = new ByteArrayOutputStream();
    byte[] buffer = new byte[1024];
    int length;
    while ((length = inputStream.read(buffer)) != -1) {
      result.write(buffer, 0, length);
    }
    return result.toString("utf-8");
  }

}
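A round-trip sketch of the YAML shape this utility expects: node names are arbitrary top-level keys, "type" is the simple class name of a node under dag.nodes, and "deps" names the parent nodes (a value that matches no node, such as "none", leaves the node as a root). The YAML content below is illustrative, not taken from this commit:

import org.apache.hudi.integ.testsuite.dag.DagUtils;
import org.apache.hudi.integ.testsuite.dag.WorkflowDag;

public class DagYamlSketch {
  public static void main(String[] args) throws Exception {
    String yaml =
          "first_insert:\n"
        + "  config:\n"
        + "    num_records_insert: 100\n"
        + "    num_partitions_insert: 1\n"
        + "    record_size: 1000\n"
        + "    repeat_count: 2\n"
        + "  type: InsertNode\n"
        + "  deps: none\n"
        + "first_upsert:\n"
        + "  config:\n"
        + "    num_records_upsert: 50\n"
        + "    num_partitions_upsert: 1\n"
        + "  type: UpsertNode\n"
        + "  deps: first_insert\n";

    WorkflowDag dag = DagUtils.convertYamlToDag(yaml);
    // Round-trip back to YAML, as HoodieTestSuiteJob does when logging the dag it is about to run.
    System.out.println(DagUtils.convertDagToYaml(dag));
  }
}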
@@ -0,0 +1,54 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite.dag;

import java.io.Serializable;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.generator.DeltaGenerator;
import org.apache.spark.api.java.JavaSparkContext;

/**
 * This wraps the context needed for an execution of a {@link DagNode#execute(ExecutionContext)}.
 */
public class ExecutionContext implements Serializable {

  private HoodieTestSuiteWriter hoodieTestSuiteWriter;
  private DeltaGenerator deltaGenerator;
  private transient JavaSparkContext jsc;

  public ExecutionContext(JavaSparkContext jsc, HoodieTestSuiteWriter hoodieTestSuiteWriter, DeltaGenerator deltaGenerator) {
    this.hoodieTestSuiteWriter = hoodieTestSuiteWriter;
    this.deltaGenerator = deltaGenerator;
    this.jsc = jsc;
  }

  public HoodieTestSuiteWriter getHoodieTestSuiteWriter() {
    return hoodieTestSuiteWriter;
  }

  public DeltaGenerator getDeltaGenerator() {
    return deltaGenerator;
  }

  public JavaSparkContext getJsc() {
    return jsc;
  }
}
@@ -0,0 +1,75 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite.dag;

import java.util.ArrayList;
import java.util.List;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.HiveQueryNode;
import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode;
import org.apache.hudi.integ.testsuite.dag.nodes.UpsertNode;

/**
 * An example of how to generate a workflow dag programmatically. This is also used as the default workflow dag if
 * none is provided.
 */
public class SimpleWorkflowDagGenerator implements WorkflowDagGenerator {

  @Override
  public WorkflowDag build() {

    DagNode root = new InsertNode(DeltaConfig.Config.newBuilder()
        .withNumRecordsToInsert(100)
        .withNumInsertPartitions(1)
        .withNumTimesToRepeat(2)
        .withRecordSize(1000).build());

    DagNode child1 = new InsertNode(DeltaConfig.Config.newBuilder()
        .withNumRecordsToInsert(100)
        .withNumInsertPartitions(1)
        .withNumTimesToRepeat(2)
        .withRecordSize(1000).build());

    root.addChildNode(child1);

    DagNode child1OfChild1 = new UpsertNode(DeltaConfig.Config.newBuilder()
        .withNumRecordsToUpdate(100)
        .withNumUpsertPartitions(2)
        .withNumTimesToRepeat(1)
        .withRecordSize(1000).build());

    // Tests running 2 nodes in parallel
    child1.addChildNode(child1OfChild1);

    List<Pair<String, Integer>> queryAndResult = new ArrayList<>();
    queryAndResult.add(Pair.of("select count(*) from testdb1.table1 group "
        + "by rider having count(*) < 1", 0));
    DagNode child2OfChild1 = new HiveQueryNode(DeltaConfig.Config.newBuilder()
        .withHiveQueryAndResults(queryAndResult).withHiveLocal(true).build());
    child1.addChildNode(child2OfChild1);

    List<DagNode> rootNodes = new ArrayList<>();
    rootNodes.add(root);

    return new WorkflowDag(rootNodes);
  }

}
@@ -0,0 +1,39 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite.dag;

import java.util.List;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;

/**
 * Workflow dag that encapsulates all the nodes to execute.
 */
public class WorkflowDag<O> {

  private List<DagNode<O>> nodeList;

  public WorkflowDag(List<DagNode<O>> nodeList) {
    this.nodeList = nodeList;
  }

  public List<DagNode<O>> getNodeList() {
    return nodeList;
  }

}
@@ -0,0 +1,31 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite.dag;

/**
 * An interface that represents a workflow dag generator.
 */
public interface WorkflowDagGenerator {

  /**
   * Builds a {@link WorkflowDag}.
   */
  WorkflowDag build();

}
@@ -0,0 +1,40 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite.dag.nodes;

import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.spark.api.java.JavaRDD;

public class BulkInsertNode extends InsertNode {

  public BulkInsertNode(Config config) {
    super(config);
  }

  @Override
  protected JavaRDD<WriteStatus> ingest(HoodieTestSuiteWriter hoodieTestSuiteWriter, Option<String> commitTime)
      throws Exception {
    log.info("Execute bulk ingest node {}", this.getName());
    return hoodieTestSuiteWriter.bulkInsert(commitTime);
  }

}
@@ -0,0 +1,34 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.integ.testsuite.dag.nodes;
|
||||
|
||||
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
|
||||
|
||||
public class CleanNode extends DagNode<Boolean> {
|
||||
|
||||
public CleanNode() {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void execute(ExecutionContext executionContext) throws Exception {
|
||||
log.info("Executing clean node {}", this.getName());
|
||||
executionContext.getHoodieTestSuiteWriter().getWriteClient(this).clean();
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,47 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.integ.testsuite.dag.nodes;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
|
||||
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
public class CompactNode extends DagNode<JavaRDD<WriteStatus>> {
|
||||
|
||||
public CompactNode(Config config) {
|
||||
this.config = config;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void execute(ExecutionContext executionContext) throws Exception {
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(executionContext.getHoodieTestSuiteWriter().getConfiguration(),
|
||||
executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath);
|
||||
Option<HoodieInstant> lastInstant = metaClient.getActiveTimeline()
|
||||
.getCommitsAndCompactionTimeline().filterPendingCompactionTimeline().lastInstant();
|
||||
if (lastInstant.isPresent()) {
|
||||
log.info("Compacting instant {}", lastInstant.get());
|
||||
this.result = executionContext.getHoodieTestSuiteWriter().compact(Option.of(lastInstant.get().getTimestamp()));
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,125 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.integ.testsuite.dag.nodes;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.UUID;
|
||||
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
|
||||
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* Represents a Node in the DAG of operations for a workflow.
|
||||
*/
|
||||
public abstract class DagNode<O> implements Comparable<DagNode<O>> {
|
||||
|
||||
protected static Logger log = LoggerFactory.getLogger(DagNode.class);
|
||||
|
||||
protected List<DagNode<O>> childNodes;
|
||||
protected List<DagNode<O>> parentNodes;
|
||||
protected O result;
|
||||
protected Config config;
|
||||
private boolean isCompleted;
|
||||
|
||||
public DagNode<O> addChildNode(DagNode childNode) {
|
||||
childNode.getParentNodes().add(this);
|
||||
getChildNodes().add(childNode);
|
||||
return this;
|
||||
}
|
||||
|
||||
public DagNode<O> addParentNode(DagNode parentNode) {
|
||||
if (!this.getParentNodes().contains(parentNode)) {
|
||||
this.getParentNodes().add(parentNode);
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
public O getResult() {
|
||||
return result;
|
||||
}
|
||||
|
||||
public List<DagNode<O>> getChildNodes() {
|
||||
if (childNodes == null) {
|
||||
childNodes = new LinkedList<>();
|
||||
}
|
||||
return childNodes;
|
||||
}
|
||||
|
||||
public List<DagNode<O>> getParentNodes() {
|
||||
if (parentNodes == null) {
|
||||
this.parentNodes = new ArrayList<>();
|
||||
}
|
||||
return this.parentNodes;
|
||||
}
|
||||
|
||||
public void setParentNodes(List<DagNode<O>> parentNodes) {
|
||||
this.parentNodes = parentNodes;
|
||||
}
|
||||
|
||||
public abstract void execute(ExecutionContext context) throws Exception;
|
||||
|
||||
public boolean isCompleted() {
|
||||
return isCompleted;
|
||||
}
|
||||
|
||||
public void setCompleted(boolean completed) {
|
||||
isCompleted = completed;
|
||||
}
|
||||
|
||||
public Config getConfig() {
|
||||
return config;
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
Object name = this.config.getOtherConfigs().get(Config.NODE_NAME);
|
||||
if (name == null) {
|
||||
String randomName = UUID.randomUUID().toString();
|
||||
this.config.getOtherConfigs().put(Config.NODE_NAME, randomName);
|
||||
return randomName;
|
||||
}
|
||||
return name.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object o) {
|
||||
if (this == o) {
|
||||
return true;
|
||||
}
|
||||
if (o == null || getClass() != o.getClass()) {
|
||||
return false;
|
||||
}
|
||||
DagNode<?> dagNode = (DagNode<?>) o;
|
||||
return getName().equals(dagNode.getName());
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(getName());
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(DagNode<O> thatNode) {
|
||||
return Integer.compare(this.hashCode(), thatNode.hashCode());
|
||||
}
|
||||
|
||||
}
|
||||
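A tiny wiring sketch, for illustration only (it is not part of this commit), using the CleanNode added below since it needs no config; it shows that addChildNode records the edge on both ends, which is what lets the scheduler walk the DAG level by level.

import org.apache.hudi.integ.testsuite.dag.nodes.CleanNode;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;

public class DagNodeWiringExample {

  public static void main(String[] args) {
    DagNode<Boolean> parent = new CleanNode();
    DagNode<Boolean> child = new CleanNode();
    // addChildNode registers the edge on both the parent and the child
    parent.addChildNode(child);
    System.out.println(parent.getChildNodes().contains(child));  // true
    System.out.println(child.getParentNodes().contains(parent)); // true
  }
}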
@@ -0,0 +1,87 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.integ.testsuite.dag.nodes;
|
||||
|
||||
import java.sql.Connection;
|
||||
import java.sql.DriverManager;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.sql.Statement;
|
||||
import org.apache.hudi.DataSourceUtils;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.hive.HiveSyncConfig;
|
||||
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig;
|
||||
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
|
||||
import org.apache.hudi.integ.testsuite.helpers.HiveServiceProvider;
|
||||
|
||||
public class HiveQueryNode extends DagNode<Boolean> {
|
||||
|
||||
private HiveServiceProvider hiveServiceProvider;
|
||||
|
||||
public HiveQueryNode(DeltaConfig.Config config) {
|
||||
this.config = config;
|
||||
this.hiveServiceProvider = new HiveServiceProvider(config);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void execute(ExecutionContext executionContext) throws Exception {
|
||||
log.info("Executing hive query node {}", this.getName());
|
||||
this.hiveServiceProvider.startLocalHiveServiceIfNeeded(executionContext.getHoodieTestSuiteWriter().getConfiguration());
|
||||
HiveSyncConfig hiveSyncConfig = DataSourceUtils
|
||||
.buildHiveSyncConfig(executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper()
|
||||
.getDeltaSyncService().getDeltaSync().getProps(),
|
||||
executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper()
|
||||
.getDeltaSyncService().getDeltaSync().getCfg().targetBasePath,
|
||||
executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper()
|
||||
.getDeltaSyncService().getDeltaSync().getCfg().baseFileFormat);
|
||||
this.hiveServiceProvider.syncToLocalHiveIfNeeded(executionContext.getHoodieTestSuiteWriter());
|
||||
Connection con = DriverManager.getConnection(hiveSyncConfig.jdbcUrl, hiveSyncConfig.hiveUser,
|
||||
hiveSyncConfig.hivePass);
|
||||
Statement stmt = con.createStatement();
|
||||
stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat");
|
||||
for (String hiveProperty : this.config.getHiveProperties()) {
|
||||
executeStatement(hiveProperty, stmt);
|
||||
}
|
||||
for (Pair<String, Integer> queryAndResult : this.config.getHiveQueries()) {
|
||||
log.info("Running {}", queryAndResult.getLeft());
|
||||
ResultSet res = stmt.executeQuery(queryAndResult.getLeft());
|
||||
if (!res.next()) {
|
||||
log.info("res.next() was False - typically this means the query returned no rows.");
|
||||
assert 0 == queryAndResult.getRight();
|
||||
} else {
|
||||
Integer result = res.getInt(1);
|
||||
if (!queryAndResult.getRight().equals(result)) {
|
||||
throw new AssertionError(
|
||||
"QUERY: " + queryAndResult.getLeft()
|
||||
+ " | EXPECTED RESULT = " + queryAndResult.getRight()
|
||||
+ " | ACTUAL RESULT = " + result
|
||||
);
|
||||
}
|
||||
}
|
||||
log.info("Successfully validated query!");
|
||||
}
|
||||
this.hiveServiceProvider.stopLocalHiveServiceIfNeeded();
|
||||
}
|
||||
|
||||
private void executeStatement(String query, Statement stmt) throws SQLException {
|
||||
log.info("Executing statement {}", stmt.toString());
|
||||
stmt.execute(query);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,46 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.integ.testsuite.dag.nodes;
|
||||
|
||||
import org.apache.hudi.integ.testsuite.helpers.HiveServiceProvider;
|
||||
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
|
||||
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
|
||||
|
||||
public class HiveSyncNode extends DagNode<Boolean> {
|
||||
|
||||
private HiveServiceProvider hiveServiceProvider;
|
||||
|
||||
public HiveSyncNode(Config config) {
|
||||
this.config = config;
|
||||
this.hiveServiceProvider = new HiveServiceProvider(config);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void execute(ExecutionContext executionContext) throws Exception {
|
||||
log.info("Executing hive sync node");
|
||||
this.hiveServiceProvider.startLocalHiveServiceIfNeeded(executionContext.getHoodieTestSuiteWriter().getConfiguration());
|
||||
this.hiveServiceProvider.syncToLocalHiveIfNeeded(executionContext.getHoodieTestSuiteWriter());
|
||||
executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper().getDeltaSyncService().getDeltaSync().syncHive();
|
||||
this.hiveServiceProvider.stopLocalHiveServiceIfNeeded();
|
||||
}
|
||||
|
||||
public HiveServiceProvider getHiveServiceProvider() {
|
||||
return hiveServiceProvider;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,60 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.integ.testsuite.dag.nodes;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter;
|
||||
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
|
||||
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
|
||||
import org.apache.hudi.integ.testsuite.generator.DeltaGenerator;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
public class InsertNode extends DagNode<JavaRDD<WriteStatus>> {
|
||||
|
||||
public InsertNode(Config config) {
|
||||
this.config = config;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void execute(ExecutionContext executionContext) throws Exception {
|
||||
generate(executionContext.getDeltaGenerator());
|
||||
log.info("Configs : {}", this.config);
|
||||
if (!config.isDisableIngest()) {
|
||||
log.info("Inserting input data {}", this.getName());
|
||||
Option<String> commitTime = executionContext.getHoodieTestSuiteWriter().startCommit();
|
||||
JavaRDD<WriteStatus> writeStatus = ingest(executionContext.getHoodieTestSuiteWriter(), commitTime);
|
||||
executionContext.getHoodieTestSuiteWriter().commit(writeStatus, commitTime);
|
||||
this.result = writeStatus;
|
||||
}
|
||||
}
|
||||
|
||||
protected void generate(DeltaGenerator deltaGenerator) throws Exception {
|
||||
if (!config.isDisableGenerate()) {
|
||||
log.info("Generating input data for node {}", this.getName());
|
||||
deltaGenerator.writeRecords(deltaGenerator.generateInserts(config)).count();
|
||||
}
|
||||
}
|
||||
|
||||
protected JavaRDD<WriteStatus> ingest(HoodieTestSuiteWriter hoodieTestSuiteWriter,
|
||||
Option<String> commitTime) throws Exception {
|
||||
return hoodieTestSuiteWriter.insert(commitTime);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,48 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.integ.testsuite.dag.nodes;
|
||||
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
|
||||
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
|
||||
|
||||
public class RollbackNode extends DagNode<Option<HoodieInstant>> {
|
||||
|
||||
public RollbackNode(Config config) {
|
||||
this.config = config;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void execute(ExecutionContext executionContext) throws Exception {
|
||||
log.info("Executing rollback node {}", this.getName());
|
||||
// This can only be done by instantiating a new WriteClient, hence it cannot be done during DeltaStreamer
|
||||
// testing for now
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(executionContext.getHoodieTestSuiteWriter().getConfiguration(),
|
||||
executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath);
|
||||
Option<HoodieInstant> lastInstant = metaClient.getActiveTimeline().getCommitsTimeline().lastInstant();
|
||||
if (lastInstant.isPresent()) {
|
||||
log.info("Rolling back last instant {}", lastInstant.get());
|
||||
executionContext.getHoodieTestSuiteWriter().getWriteClient(this).rollback(lastInstant.get().getTimestamp());
|
||||
this.result = lastInstant;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,56 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.integ.testsuite.dag.nodes;
|
||||
|
||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
|
||||
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
|
||||
|
||||
public class ScheduleCompactNode extends DagNode<Option<String>> {
|
||||
|
||||
public ScheduleCompactNode(Config config) {
|
||||
this.config = config;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void execute(ExecutionContext executionContext) throws Exception {
|
||||
log.info("Executing schedule compact node {}", this.getName());
|
||||
// This can only be done by instantiating a new WriteClient, hence it cannot be done during DeltaStreamer
|
||||
// testing for now
|
||||
// Find the last commit and extract its extra metadata to be passed to the compaction scheduling call. This is
|
||||
// done to ensure the CHECKPOINT is correctly passed from commit to commit
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(executionContext.getHoodieTestSuiteWriter().getConfiguration(),
|
||||
executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath);
|
||||
Option<HoodieInstant> lastInstant = metaClient.getActiveTimeline().getCommitsTimeline().lastInstant();
|
||||
if (lastInstant.isPresent()) {
|
||||
HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes(metaClient
|
||||
.getActiveTimeline().getInstantDetails(lastInstant.get()).get(), HoodieCommitMetadata.class);
|
||||
Option<String> scheduledInstant = executionContext.getHoodieTestSuiteWriter().scheduleCompaction(Option.of(metadata
|
||||
.getExtraMetadata()));
|
||||
if (scheduledInstant.isPresent()) {
|
||||
log.info("Scheduling compaction instant {}", scheduledInstant.get());
|
||||
}
|
||||
this.result = scheduledInstant;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,61 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.integ.testsuite.dag.nodes;
|
||||
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.integ.testsuite.helpers.HiveServiceProvider;
|
||||
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
|
||||
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
public class SparkSQLQueryNode extends DagNode<Boolean> {
|
||||
|
||||
HiveServiceProvider hiveServiceProvider;
|
||||
|
||||
public SparkSQLQueryNode(Config config) {
|
||||
this.config = config;
|
||||
this.hiveServiceProvider = new HiveServiceProvider(config);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void execute(ExecutionContext executionContext) throws Exception {
|
||||
log.info("Executing spark sql query node");
|
||||
this.hiveServiceProvider.startLocalHiveServiceIfNeeded(executionContext.getHoodieTestSuiteWriter().getConfiguration());
|
||||
this.hiveServiceProvider.syncToLocalHiveIfNeeded(executionContext.getHoodieTestSuiteWriter());
|
||||
SparkSession session = SparkSession.builder().sparkContext(executionContext.getJsc().sc()).getOrCreate();
|
||||
for (String hiveProperty : this.config.getHiveProperties()) {
|
||||
session.sql(hiveProperty).count();
|
||||
}
|
||||
for (Pair<String, Integer> queryAndResult : this.config.getHiveQueries()) {
|
||||
log.info("Running {}", queryAndResult.getLeft());
|
||||
Dataset<Row> res = session.sql(queryAndResult.getLeft());
|
||||
if (res.count() == 0) {
|
||||
assert 0 == queryAndResult.getRight();
|
||||
} else {
|
||||
assert ((Row[]) res.collect())[0].getInt(0) == queryAndResult.getRight();
|
||||
}
|
||||
log.info("Successfully validated query!");
|
||||
}
|
||||
this.hiveServiceProvider.stopLocalHiveServiceIfNeeded();
|
||||
this.result = true;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,52 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.integ.testsuite.dag.nodes;
|
||||
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter;
|
||||
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
|
||||
import org.apache.hudi.integ.testsuite.generator.DeltaGenerator;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
public class UpsertNode extends InsertNode {
|
||||
|
||||
public UpsertNode(Config config) {
|
||||
super(config);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void generate(DeltaGenerator deltaGenerator) throws Exception {
|
||||
if (!config.isDisableGenerate()) {
|
||||
log.info("Generating input data {}", this.getName());
|
||||
deltaGenerator.writeRecords(deltaGenerator.generateUpdates(config)).count();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected JavaRDD<WriteStatus> ingest(HoodieTestSuiteWriter hoodieTestSuiteWriter, Option<String> commitTime)
|
||||
throws Exception {
|
||||
if (!config.isDisableIngest()) {
|
||||
log.info("Upserting input data {}", this.getName());
|
||||
this.result = hoodieTestSuiteWriter.upsert(commitTime);
|
||||
}
|
||||
return this.result;
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,48 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.integ.testsuite.dag.nodes;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.function.Function;
|
||||
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
|
||||
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
|
||||
|
||||
public class ValidateNode<R> extends DagNode {
|
||||
|
||||
protected Function<List<DagNode>, R> function;
|
||||
|
||||
public ValidateNode(Config config, Function<List<DagNode>, R> function) {
|
||||
this.function = function;
|
||||
this.config = config;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void execute(ExecutionContext executionContext) {
|
||||
if (this.getParentNodes().size() > 0 && (Boolean) this.config.getOtherConfigs().getOrDefault("WAIT_FOR_PARENTS",
|
||||
true)) {
|
||||
for (DagNode node : (List<DagNode>) this.getParentNodes()) {
|
||||
if (!node.isCompleted()) {
|
||||
throw new RuntimeException("cannot validate before parent nodes are complete");
|
||||
}
|
||||
}
|
||||
}
|
||||
this.result = this.function.apply((List<DagNode>) this.getParentNodes());
|
||||
}
|
||||
|
||||
}
|
||||
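A hedged sketch, not part of this commit, of how a ValidateNode might be constructed; the factory method and the check (every parent produced a result) are illustrative, and the config is assumed to be built by the caller (optionally carrying the WAIT_FOR_PARENTS flag used above).

package org.apache.hudi.integ.testsuite.dag.nodes;

import java.util.List;
import java.util.function.Function;

import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;

public class ValidateNodeExample {

  // Illustrative only: a validation that passes when every parent node produced a result
  public static ValidateNode<Boolean> allParentsProducedResults(Config config) {
    Function<List<DagNode>, Boolean> check =
        parents -> parents.stream().allMatch(parent -> parent.getResult() != null);
    return new ValidateNode<>(config, check);
  }
}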
@@ -0,0 +1,100 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.integ.testsuite.dag.scheduler;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.PriorityQueue;
|
||||
import java.util.Queue;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.Future;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
|
||||
import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter;
|
||||
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
|
||||
import org.apache.hudi.integ.testsuite.dag.WorkflowDag;
|
||||
import org.apache.hudi.integ.testsuite.generator.DeltaGenerator;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
public class DagScheduler {
|
||||
|
||||
private static Logger log = LoggerFactory.getLogger(DagScheduler.class);
|
||||
private WorkflowDag workflowDag;
|
||||
private ExecutionContext executionContext;
|
||||
|
||||
public DagScheduler(WorkflowDag workflowDag, HoodieTestSuiteWriter hoodieTestSuiteWriter, DeltaGenerator deltaGenerator) {
|
||||
this.workflowDag = workflowDag;
|
||||
this.executionContext = new ExecutionContext(null, hoodieTestSuiteWriter, deltaGenerator);
|
||||
}
|
||||
|
||||
public void schedule() throws Exception {
|
||||
ExecutorService service = Executors.newFixedThreadPool(2);
|
||||
try {
|
||||
execute(service, workflowDag.getNodeList());
|
||||
service.shutdown();
|
||||
} finally {
|
||||
if (!service.isShutdown()) {
|
||||
log.info("Forcing shutdown of executor service, this might kill running tasks");
|
||||
service.shutdownNow();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void execute(ExecutorService service, List<DagNode> nodes) throws Exception {
|
||||
// Nodes at the same level are executed in parallel
|
||||
Queue<DagNode> queue = new PriorityQueue<>(nodes);
|
||||
log.info("Running workloads");
|
||||
do {
|
||||
List<Future> futures = new ArrayList<>();
|
||||
Set<DagNode> childNodes = new HashSet<>();
|
||||
while (queue.size() > 0) {
|
||||
DagNode nodeToExecute = queue.poll();
|
||||
futures.add(service.submit(() -> executeNode(nodeToExecute)));
|
||||
if (nodeToExecute.getChildNodes().size() > 0) {
|
||||
childNodes.addAll(nodeToExecute.getChildNodes());
|
||||
}
|
||||
}
|
||||
queue.addAll(childNodes);
|
||||
childNodes.clear();
|
||||
for (Future future : futures) {
|
||||
future.get(1, TimeUnit.HOURS);
|
||||
}
|
||||
} while (queue.size() > 0);
|
||||
log.info("Finished workloads");
|
||||
}
|
||||
|
||||
private void executeNode(DagNode node) {
|
||||
if (node.isCompleted()) {
|
||||
throw new RuntimeException("DagNode already completed! Cannot re-execute");
|
||||
}
|
||||
try {
|
||||
node.execute(executionContext);
|
||||
node.setCompleted(true);
|
||||
log.info("Finished executing {}", node.getName());
|
||||
} catch (Exception e) {
|
||||
log.error("Exception executing node");
|
||||
throw new HoodieException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
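A minimal usage sketch (not part of this commit); the dag, writer and delta generator are assumed to be built by the suite's setup, and the wrapper class and method are hypothetical. schedule() then runs nodes level by level on the scheduler's internal two-thread pool.

package org.apache.hudi.integ.testsuite.dag.scheduler;

import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter;
import org.apache.hudi.integ.testsuite.dag.WorkflowDag;
import org.apache.hudi.integ.testsuite.generator.DeltaGenerator;

public class DagSchedulerExample {

  // workflowDag, writer and deltaGenerator are assumed to be built by the caller
  public static void run(WorkflowDag workflowDag, HoodieTestSuiteWriter writer,
      DeltaGenerator deltaGenerator) throws Exception {
    DagScheduler scheduler = new DagScheduler(workflowDag, writer, deltaGenerator);
    // Executes nodes level by level; an exception in any node aborts the run
    scheduler.schedule();
  }
}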
@@ -0,0 +1,237 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.integ.testsuite.generator;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.io.UncheckedIOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.StreamSupport;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.integ.testsuite.converter.Converter;
|
||||
import org.apache.hudi.integ.testsuite.converter.UpdateConverter;
|
||||
import org.apache.hudi.integ.testsuite.reader.DFSAvroDeltaInputReader;
|
||||
import org.apache.hudi.integ.testsuite.reader.DFSHoodieDatasetInputReader;
|
||||
import org.apache.hudi.integ.testsuite.reader.DeltaInputReader;
|
||||
import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode;
|
||||
import org.apache.hudi.integ.testsuite.writer.DeltaWriteStats;
|
||||
import org.apache.hudi.integ.testsuite.writer.DeltaWriterAdapter;
|
||||
import org.apache.hudi.integ.testsuite.writer.DeltaWriterFactory;
|
||||
import org.apache.hudi.keygen.ComplexKeyGenerator;
|
||||
import org.apache.hudi.keygen.KeyGenerator;
|
||||
import org.apache.hudi.keygen.SimpleKeyGenerator;
|
||||
import org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig;
|
||||
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig;
|
||||
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.apache.spark.storage.StorageLevel;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* The delta generator generates all types of workloads (insert, update) for the given configs.
|
||||
*/
|
||||
public class DeltaGenerator implements Serializable {
|
||||
|
||||
private static Logger log = LoggerFactory.getLogger(DeltaGenerator.class);
|
||||
|
||||
private DeltaConfig deltaOutputConfig;
|
||||
private transient JavaSparkContext jsc;
|
||||
private transient SparkSession sparkSession;
|
||||
private String schemaStr;
|
||||
private List<String> recordRowKeyFieldNames;
|
||||
private List<String> partitionPathFieldNames;
|
||||
private int batchId;
|
||||
|
||||
public DeltaGenerator(DeltaConfig deltaOutputConfig, JavaSparkContext jsc, SparkSession sparkSession,
|
||||
String schemaStr,
|
||||
KeyGenerator keyGenerator) {
|
||||
this.deltaOutputConfig = deltaOutputConfig;
|
||||
this.jsc = jsc;
|
||||
this.sparkSession = sparkSession;
|
||||
this.schemaStr = schemaStr;
|
||||
this.recordRowKeyFieldNames = keyGenerator instanceof ComplexKeyGenerator ? ((ComplexKeyGenerator) keyGenerator)
|
||||
.getRecordKeyFields() : Arrays.asList(((SimpleKeyGenerator) keyGenerator).getRecordKeyField());
|
||||
this.partitionPathFieldNames = keyGenerator instanceof ComplexKeyGenerator ? ((ComplexKeyGenerator) keyGenerator)
|
||||
.getPartitionPathFields() : Arrays.asList(((SimpleKeyGenerator) keyGenerator).getPartitionPathField());
|
||||
}
|
||||
|
||||
public JavaRDD<DeltaWriteStats> writeRecords(JavaRDD<GenericRecord> records) {
|
||||
// The following creates a new anonymous function for iterator and hence results in serialization issues
|
||||
JavaRDD<DeltaWriteStats> ws = records.mapPartitions(itr -> {
|
||||
try {
|
||||
DeltaWriterAdapter<GenericRecord> deltaWriterAdapter = DeltaWriterFactory
|
||||
.getDeltaWriterAdapter(deltaOutputConfig, batchId);
|
||||
return Collections.singletonList(deltaWriterAdapter.write(itr)).iterator();
|
||||
} catch (IOException io) {
|
||||
throw new UncheckedIOException(io);
|
||||
}
|
||||
}).flatMap(List::iterator);
|
||||
batchId++;
|
||||
return ws;
|
||||
}
|
||||
|
||||
public JavaRDD<GenericRecord> generateInserts(Config operation) {
|
||||
long recordsPerPartition = operation.getNumRecordsInsert();
|
||||
int minPayloadSize = operation.getRecordSize();
|
||||
JavaRDD<GenericRecord> inputBatch = jsc.parallelize(Collections.EMPTY_LIST)
|
||||
.repartition(operation.getNumInsertPartitions()).mapPartitions(p -> {
|
||||
return new LazyRecordGeneratorIterator(new FlexibleSchemaRecordGenerationIterator(recordsPerPartition,
|
||||
minPayloadSize, schemaStr, partitionPathFieldNames));
|
||||
});
|
||||
return inputBatch;
|
||||
}
|
||||
|
||||
public JavaRDD<GenericRecord> generateUpdates(Config config) throws IOException {
|
||||
if (deltaOutputConfig.getDeltaOutputMode() == DeltaOutputMode.DFS) {
|
||||
JavaRDD<GenericRecord> inserts = null;
|
||||
if (config.getNumRecordsInsert() > 0) {
|
||||
inserts = generateInserts(config);
|
||||
}
|
||||
DeltaInputReader deltaInputReader = null;
|
||||
JavaRDD<GenericRecord> adjustedRDD = null;
|
||||
if (config.getNumUpsertPartitions() < 1) {
|
||||
// randomly generate updates for a given number of records without regard to partitions and files
|
||||
deltaInputReader = new DFSAvroDeltaInputReader(sparkSession, schemaStr,
|
||||
((DFSDeltaConfig) deltaOutputConfig).getDeltaBasePath(), Option.empty(), Option.empty());
|
||||
adjustedRDD = deltaInputReader.read(config.getNumRecordsUpsert());
|
||||
adjustedRDD = adjustRDDToGenerateExactNumUpdates(adjustedRDD, jsc, config.getNumRecordsUpsert());
|
||||
} else {
|
||||
deltaInputReader =
|
||||
new DFSHoodieDatasetInputReader(jsc, ((DFSDeltaConfig) deltaOutputConfig).getDatasetOutputPath(),
|
||||
schemaStr);
|
||||
if (config.getFractionUpsertPerFile() > 0) {
|
||||
adjustedRDD = deltaInputReader.read(config.getNumUpsertPartitions(), config.getNumUpsertFiles(),
|
||||
config.getFractionUpsertPerFile());
|
||||
} else {
|
||||
adjustedRDD = deltaInputReader.read(config.getNumUpsertPartitions(), config.getNumUpsertFiles(), config
|
||||
.getNumRecordsUpsert());
|
||||
}
|
||||
}
|
||||
log.info("Repartitioning records");
|
||||
// persist this since we will make multiple passes over this
|
||||
adjustedRDD = adjustedRDD.repartition(jsc.defaultParallelism());
|
||||
log.info("Repartitioning records done");
|
||||
Converter converter = new UpdateConverter(schemaStr, config.getRecordSize(),
|
||||
partitionPathFieldNames, recordRowKeyFieldNames);
|
||||
JavaRDD<GenericRecord> updates = converter.convert(adjustedRDD);
|
||||
log.info("Records converted");
|
||||
updates.persist(StorageLevel.DISK_ONLY());
|
||||
return inserts != null ? inserts.union(updates) : updates;
|
||||
// TODO : Generate updates for only N partitions.
|
||||
} else {
|
||||
throw new IllegalArgumentException("Other formats are not supported at the moment");
|
||||
}
|
||||
}
|
||||
|
||||
public Map<Integer, Long> getPartitionToCountMap(JavaRDD<GenericRecord> records) {
|
||||
// Requires us to keep the partitioner the same
|
||||
return records.mapPartitionsWithIndex((index, itr) -> {
|
||||
Iterable<GenericRecord> newIterable = () -> itr;
|
||||
// parallelize counting for speed
|
||||
long count = StreamSupport.stream(newIterable.spliterator(), true).count();
|
||||
return Arrays.asList(new Tuple2<>(index, count)).iterator();
|
||||
}, true).mapToPair(i -> i).collectAsMap();
|
||||
}
|
||||
|
||||
public Map<Integer, Long> getAdjustedPartitionsCount(Map<Integer, Long> partitionCountMap, long
|
||||
recordsToRemove) {
|
||||
long remainingRecordsToRemove = recordsToRemove;
|
||||
Iterator<Map.Entry<Integer, Long>> iterator = partitionCountMap.entrySet().iterator();
|
||||
Map<Integer, Long> adjustedPartitionCountMap = new HashMap<>();
|
||||
while (iterator.hasNext()) {
|
||||
Map.Entry<Integer, Long> entry = iterator.next();
|
||||
if (entry.getValue() < remainingRecordsToRemove) {
|
||||
remainingRecordsToRemove -= entry.getValue();
|
||||
adjustedPartitionCountMap.put(entry.getKey(), 0L);
|
||||
} else {
|
||||
long newValue = entry.getValue() - remainingRecordsToRemove;
|
||||
remainingRecordsToRemove = 0;
|
||||
adjustedPartitionCountMap.put(entry.getKey(), newValue);
|
||||
}
|
||||
if (remainingRecordsToRemove == 0) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return adjustedPartitionCountMap;
|
||||
}
|
||||
|
||||
public JavaRDD<GenericRecord> adjustRDDToGenerateExactNumUpdates(JavaRDD<GenericRecord> updates, JavaSparkContext
|
||||
jsc, long totalRecordsRequired) {
|
||||
Map<Integer, Long> actualPartitionCountMap = getPartitionToCountMap(updates);
|
||||
long totalRecordsGenerated = actualPartitionCountMap.values().stream().mapToLong(Long::longValue).sum();
|
||||
if (isSafeToTake(totalRecordsRequired, totalRecordsGenerated)) {
|
||||
// Generate totalRecordsRequired - totalRecordsGenerated new records and union the RDD's
|
||||
// NOTE : This performs poorly when totalRecordsRequired >> totalRecordsGenerated. Hence, always
|
||||
// ensure that enough inserts are created before hand (this needs to be noted during the WorkflowDag creation)
|
||||
long sizeOfUpdateRDD = totalRecordsGenerated;
|
||||
while (totalRecordsRequired != sizeOfUpdateRDD) {
|
||||
long recordsToTake = (totalRecordsRequired - sizeOfUpdateRDD) > sizeOfUpdateRDD
|
||||
? sizeOfUpdateRDD : (totalRecordsRequired - sizeOfUpdateRDD);
|
||||
if ((totalRecordsRequired - sizeOfUpdateRDD) > recordsToTake && recordsToTake <= sizeOfUpdateRDD) {
|
||||
updates = updates.union(updates);
|
||||
sizeOfUpdateRDD *= 2;
|
||||
} else {
|
||||
List<GenericRecord> remainingUpdates = updates.take((int) (recordsToTake));
|
||||
updates = updates.union(jsc.parallelize(remainingUpdates));
|
||||
sizeOfUpdateRDD = sizeOfUpdateRDD + recordsToTake;
|
||||
}
|
||||
}
|
||||
return updates;
|
||||
} else if (totalRecordsRequired < totalRecordsGenerated) {
|
||||
final Map<Integer, Long> adjustedPartitionCountMap = getAdjustedPartitionsCount(actualPartitionCountMap,
|
||||
totalRecordsGenerated - totalRecordsRequired);
|
||||
// limit counts across partitions to meet the exact number of updates required
|
||||
JavaRDD<GenericRecord> trimmedRecords = updates.mapPartitionsWithIndex((index, itr) -> {
|
||||
int counter = 1;
|
||||
List<GenericRecord> entriesToKeep = new ArrayList<>();
|
||||
if (!adjustedPartitionCountMap.containsKey(index)) {
|
||||
return itr;
|
||||
} else {
|
||||
long recordsToKeepForThisPartition = adjustedPartitionCountMap.get(index);
|
||||
while (counter <= recordsToKeepForThisPartition && itr.hasNext()) {
|
||||
entriesToKeep.add(itr.next());
|
||||
counter++;
|
||||
}
|
||||
return entriesToKeep.iterator();
|
||||
}
|
||||
}, true);
|
||||
return trimmedRecords;
|
||||
}
|
||||
return updates;
|
||||
}
|
||||
|
||||
private boolean isSafeToTake(long totalRecords, long totalRecordsGenerated) {
|
||||
// TODO : Ensure that the difference between totalRecords and totalRecordsGenerated is not too big, if yes,
|
||||
// then there are fewer number of records on disk, hence we need to find another way to generate updates when
|
||||
// requiredUpdates >> insertedRecords
|
||||
return totalRecords > totalRecordsGenerated;
|
||||
}
|
||||
|
||||
}
|
||||
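A hedged usage sketch, not part of this commit; every argument is assumed to come from the suite's setup, and the wrapper class and method are hypothetical. It generates one insert batch according to the workload config and forces the write through the configured delta writer.

package org.apache.hudi.integ.testsuite.generator;

import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.keygen.KeyGenerator;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;

public class DeltaGeneratorExample {

  // All arguments are assumed to be provided by the suite's setup code
  public static long generateAndWriteInserts(DeltaConfig deltaConfig, JavaSparkContext jsc,
      SparkSession spark, String schemaStr, KeyGenerator keyGenerator, Config insertConfig) {
    DeltaGenerator deltaGenerator = new DeltaGenerator(deltaConfig, jsc, spark, schemaStr, keyGenerator);
    JavaRDD<GenericRecord> inserts = deltaGenerator.generateInserts(insertConfig);
    // count() forces the lazy write through the configured DeltaWriterAdapter
    return deltaGenerator.writeRecords(inserts).count();
  }
}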
@@ -0,0 +1,70 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.integ.testsuite.generator;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
|
||||
/**
|
||||
* A lazy {@link GenericRecord} generation iterator for the custom schema of the workload. Implements {@link Iterator} to allow for
|
||||
* iteration semantics.
|
||||
*/
|
||||
public class FlexibleSchemaRecordGenerationIterator implements Iterator<GenericRecord> {
|
||||
|
||||
// Stores how many records to generate as part of this iterator. Ideally, one iterator is started per spark
|
||||
// partition.
|
||||
private long counter;
|
||||
// Use the full payload generator as default
|
||||
private GenericRecordFullPayloadGenerator generator;
|
||||
// Store last record for the partition path of the first payload to be used for all subsequent generated payloads
|
||||
private GenericRecord lastRecord;
|
||||
// Partition path field names
|
||||
private List<String> partitionPathFieldNames;
|
||||
|
||||
public FlexibleSchemaRecordGenerationIterator(long maxEntriesToProduce, String schema) {
|
||||
this(maxEntriesToProduce, GenericRecordFullPayloadGenerator.DEFAULT_PAYLOAD_SIZE, schema, null);
|
||||
}
|
||||
|
||||
public FlexibleSchemaRecordGenerationIterator(long maxEntriesToProduce, int minPayloadSize, String schemaStr,
|
||||
List<String> partitionPathFieldNames) {
|
||||
this.counter = maxEntriesToProduce;
|
||||
this.partitionPathFieldNames = partitionPathFieldNames;
|
||||
Schema schema = new Schema.Parser().parse(schemaStr);
|
||||
this.generator = new GenericRecordFullPayloadGenerator(schema, minPayloadSize);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return this.counter > 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public GenericRecord next() {
|
||||
this.counter--;
|
||||
if (lastRecord == null) {
|
||||
GenericRecord record = this.generator.getNewPayload();
|
||||
lastRecord = record;
|
||||
return record;
|
||||
} else {
|
||||
return this.generator.randomize(lastRecord, this.partitionPathFieldNames);
|
||||
}
|
||||
}
|
||||
}
|
||||
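A short sketch, not part of this commit, of driving the iterator directly; the schema string and the "timestamp" partition path field are stand-ins supplied by the caller. The first record fixes the partition path values that every later record reuses.

package org.apache.hudi.integ.testsuite.generator;

import java.util.Arrays;

import org.apache.avro.generic.GenericRecord;

public class RecordGenerationExample {

  public static void generate(String schemaStr) {
    // Produce 100 records of at least ~1 KB each; "timestamp" is a stand-in partition path field
    FlexibleSchemaRecordGenerationIterator itr =
        new FlexibleSchemaRecordGenerationIterator(100, 1024, schemaStr, Arrays.asList("timestamp"));
    while (itr.hasNext()) {
      GenericRecord record = itr.next();
      // Every record after the first reuses the partition path values of the first one
    }
  }
}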
@@ -0,0 +1,285 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.integ.testsuite.generator;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Random;
|
||||
import java.util.UUID;
|
||||
import org.apache.avro.LogicalType;
|
||||
import org.apache.avro.LogicalTypes;
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.Schema.Type;
|
||||
import org.apache.avro.generic.GenericData;
|
||||
import org.apache.avro.generic.GenericData.Fixed;
|
||||
import org.apache.avro.generic.GenericFixed;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* This is a GenericRecord payload generator that generates full generic records {@link GenericRecord}.
|
||||
* Every field of a generic record created using this generator contains a random value.
|
||||
*/
|
||||
public class GenericRecordFullPayloadGenerator implements Serializable {
|
||||
|
||||
public static final int DEFAULT_PAYLOAD_SIZE = 1024 * 10; // 10 KB
|
||||
private static Logger log = LoggerFactory.getLogger(GenericRecordFullPayloadGenerator.class);
|
||||
protected final Random random = new Random();
|
||||
// The source schema used to generate a payload
|
||||
private final transient Schema baseSchema;
|
||||
// Used to validate a generic record
|
||||
private final transient GenericData genericData = new GenericData();
|
||||
// Number of more bytes to add based on the estimated full record payload size and min payload size
|
||||
private int numberOfBytesToAdd;
|
||||
// If more elements should be packed to meet the minPayloadSize
|
||||
private boolean shouldAddMore;
|
||||
// How many complex fields have we visited that can help us pack more entries and increase the size of the record
|
||||
private int numberOfComplexFields;
|
||||
// The size of a full record where every field of a generic record created contains 1 random value
|
||||
private int estimatedFullPayloadSize;
|
||||
// LogicalTypes in Avro 1.8.2
|
||||
private static final String DECIMAL = "decimal";
|
||||
private static final String UUID_NAME = "uuid";
|
||||
private static final String DATE = "date";
|
||||
private static final String TIME_MILLIS = "time-millis";
|
||||
private static final String TIME_MICROS = "time-micros";
|
||||
private static final String TIMESTAMP_MILLIS = "timestamp-millis";
|
||||
private static final String TIMESTAMP_MICROS = "timestamp-micros";
|
||||
|
||||
public GenericRecordFullPayloadGenerator(Schema schema) {
|
||||
this(schema, DEFAULT_PAYLOAD_SIZE);
|
||||
}
|
||||
|
||||
public GenericRecordFullPayloadGenerator(Schema schema, int minPayloadSize) {
|
||||
Pair<Integer, Integer> sizeInfo = new GenericRecordFullPayloadSizeEstimator(schema)
|
||||
.typeEstimateAndNumComplexFields();
|
||||
this.estimatedFullPayloadSize = sizeInfo.getLeft();
|
||||
this.numberOfComplexFields = sizeInfo.getRight();
|
||||
this.baseSchema = schema;
|
||||
this.shouldAddMore = estimatedFullPayloadSize < minPayloadSize;
|
||||
if (this.shouldAddMore) {
|
||||
this.numberOfBytesToAdd = minPayloadSize - estimatedFullPayloadSize;
|
||||
if (numberOfComplexFields < 1) {
|
||||
log.warn("The schema does not have any collections/complex fields. Cannot achieve minPayloadSize : {}",
|
||||
minPayloadSize);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
protected static boolean isPrimitive(Schema localSchema) {
|
||||
if (localSchema.getType() != Type.ARRAY
|
||||
&& localSchema.getType() != Type.MAP
|
||||
&& localSchema.getType() != Type.RECORD
|
||||
&& localSchema.getType() != Type.UNION) {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
public GenericRecord getNewPayload() {
|
||||
return convert(baseSchema);
|
||||
}
|
||||
|
||||
public GenericRecord getUpdatePayload(GenericRecord record, List<String> blacklistFields) {
|
||||
return randomize(record, blacklistFields);
|
||||
}
|
||||
|
||||
protected GenericRecord convert(Schema schema) {
|
||||
GenericRecord result = new GenericData.Record(schema);
|
||||
for (Schema.Field f : schema.getFields()) {
|
||||
result.put(f.name(), typeConvert(f.schema()));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
protected GenericRecord convertPartial(Schema schema) {
|
||||
GenericRecord result = new GenericData.Record(schema);
|
||||
for (Schema.Field f : schema.getFields()) {
|
||||
boolean setNull = random.nextBoolean();
|
||||
if (!setNull) {
|
||||
result.put(f.name(), typeConvert(f.schema()));
|
||||
} else {
|
||||
result.put(f.name(), null);
|
||||
}
|
||||
}
|
||||
// TODO : pack remaining bytes into a complex field
|
||||
return result;
|
||||
}
|
||||
|
||||
protected GenericRecord randomize(GenericRecord record, List<String> blacklistFields) {
|
||||
for (Schema.Field f : record.getSchema().getFields()) {
|
||||
if (blacklistFields == null || !blacklistFields.contains(f.name())) {
|
||||
record.put(f.name(), typeConvert(f.schema()));
|
||||
}
|
||||
}
|
||||
return record;
|
||||
}
|
||||
|
||||
private Object typeConvert(Schema schema) {
|
||||
Schema localSchema = schema;
|
||||
if (isOption(schema)) {
|
||||
localSchema = getNonNull(schema);
|
||||
}
|
||||
switch (localSchema.getType()) {
|
||||
case BOOLEAN:
|
||||
return random.nextBoolean();
|
||||
case DOUBLE:
|
||||
return random.nextDouble();
|
||||
case FLOAT:
|
||||
return random.nextFloat();
|
||||
case INT:
|
||||
return random.nextInt();
|
||||
case LONG:
|
||||
return random.nextLong();
|
||||
case STRING:
|
||||
return UUID.randomUUID().toString();
|
||||
case ENUM:
|
||||
List<String> enumSymbols = localSchema.getEnumSymbols();
|
||||
return new GenericData.EnumSymbol(localSchema, enumSymbols.get(random.nextInt(enumSymbols.size())));
|
||||
case RECORD:
|
||||
return convert(localSchema);
|
||||
case ARRAY:
|
||||
Schema elementSchema = localSchema.getElementType();
|
||||
List listRes = new ArrayList();
|
||||
if (isPrimitive(elementSchema) && this.shouldAddMore) {
|
||||
int numEntriesToAdd = numEntriesToAdd(elementSchema);
|
||||
while (numEntriesToAdd > 0) {
|
||||
listRes.add(typeConvert(elementSchema));
|
||||
numEntriesToAdd--;
|
||||
}
|
||||
} else {
|
||||
listRes.add(typeConvert(elementSchema));
|
||||
}
|
||||
return listRes;
|
||||
case MAP:
|
||||
Schema valueSchema = localSchema.getValueType();
|
||||
Map<String, Object> mapRes = new HashMap<String, Object>();
|
||||
if (isPrimitive(valueSchema) && this.shouldAddMore) {
|
||||
int numEntriesToAdd = numEntriesToAdd(valueSchema);
|
||||
while (numEntriesToAdd > 0) {
|
||||
mapRes.put(UUID.randomUUID().toString(), typeConvert(valueSchema));
|
||||
numEntriesToAdd--;
|
||||
}
|
||||
} else {
|
||||
mapRes.put(UUID.randomUUID().toString(), typeConvert(valueSchema));
|
||||
}
|
||||
return mapRes;
|
||||
case BYTES:
|
||||
return ByteBuffer.wrap(UUID.randomUUID().toString().getBytes(Charset.defaultCharset()));
|
||||
case FIXED:
|
||||
return generateFixedType(localSchema);
|
||||
default:
|
||||
throw new IllegalArgumentException(
|
||||
"Cannot handle type: " + localSchema.getType());
|
||||
}
|
||||
}
|
||||
|
||||
private Object generateFixedType(Schema localSchema) {
|
||||
// TODO: Need to implement valid data generation for fixed type
|
||||
GenericFixed genericFixed = new GenericData.Fixed(localSchema);
|
||||
switch (localSchema.getLogicalType().getName()) {
|
||||
case UUID_NAME:
|
||||
((Fixed) genericFixed).bytes(UUID.randomUUID().toString().getBytes());
|
||||
return genericFixed;
|
||||
case DECIMAL:
|
||||
return genericFixed;
|
||||
case DATE:
|
||||
return genericFixed;
|
||||
case TIME_MILLIS:
|
||||
return genericFixed;
|
||||
default:
|
||||
throw new IllegalArgumentException(
|
||||
"Cannot handle type: " + localSchema.getLogicalType());
|
||||
}
|
||||
}
|
||||
|
||||
public boolean validate(GenericRecord record) {
|
||||
return genericData.validate(baseSchema, record);
|
||||
}
|
||||
|
||||
protected boolean isOption(Schema schema) {
|
||||
return schema.getType().equals(Schema.Type.UNION)
|
||||
&& schema.getTypes().size() == 2
|
||||
&& (schema.getTypes().get(0).getType().equals(Schema.Type.NULL)
|
||||
|| schema.getTypes().get(1).getType().equals(Schema.Type.NULL));
|
||||
}
|
||||
|
||||
protected Schema getNonNull(Schema schema) {
|
||||
List<Schema> types = schema.getTypes();
|
||||
return types.get(0).getType().equals(Schema.Type.NULL) ? types.get(1) : types.get(0);
|
||||
}
|
||||
|
||||
public int getEstimatedFullPayloadSize() {
|
||||
return estimatedFullPayloadSize;
|
||||
}
|
||||
|
||||
private int getSize(Schema elementSchema) {
|
||||
switch (elementSchema.getType()) {
|
||||
case BOOLEAN:
|
||||
return 1;
|
||||
case DOUBLE:
|
||||
return Double.BYTES;
|
||||
case FLOAT:
|
||||
return Float.BYTES;
|
||||
case INT:
|
||||
return Integer.BYTES;
|
||||
case LONG:
|
||||
return Long.BYTES;
|
||||
case STRING:
|
||||
return UUID.randomUUID().toString().length();
|
||||
case ENUM:
|
||||
return 1;
|
||||
case BYTES:
|
||||
return UUID.randomUUID().toString().length();
|
||||
case FIXED:
|
||||
return elementSchema.getFixedSize();
|
||||
default:
|
||||
throw new RuntimeException("Unknown type " + elementSchema.getType());
|
||||
}
|
||||
}
|
||||
|
||||
private int numEntriesToAdd(Schema elementSchema) {
|
||||
// Find the size of the primitive data type in bytes
|
||||
int primitiveDataTypeSize = getSize(elementSchema);
|
||||
int numEntriesToAdd = numberOfBytesToAdd / primitiveDataTypeSize;
|
||||
// If more than 10 entries are being added for this same complex field and there are still more complex fields to
|
||||
// be visited in the schema, reduce the number of entries to add by a factor of 10 to allow for other complex
|
||||
// fields to pack some entries
|
||||
if (numEntriesToAdd % 10 > 0 && this.numberOfComplexFields > 1) {
|
||||
numEntriesToAdd = numEntriesToAdd / 10;
|
||||
numberOfBytesToAdd -= numEntriesToAdd * primitiveDataTypeSize;
|
||||
this.shouldAddMore = true;
|
||||
} else {
|
||||
this.numberOfBytesToAdd = 0;
|
||||
this.shouldAddMore = false;
|
||||
}
|
||||
this.numberOfComplexFields -= 1;
|
||||
return numEntriesToAdd;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,121 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.integ.testsuite.generator;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
|
||||
/**
|
||||
* This is a GenericRecord payload estimator estimates the size of a full generic record {@link GenericRecord}.
|
||||
* A full record is defined as "Every field of a generic record created contains 1 random value"
|
||||
*/
|
||||
public class GenericRecordFullPayloadSizeEstimator implements Serializable {
|
||||
|
||||
private final transient Schema baseSchema;
|
||||
|
||||
// This variable is used to track the number of complex/collection fields with primitive data types at their leaf.
|
||||
// This is used to figure out how many entries can be packed in such a collection field to meet the min payload
|
||||
// size requested
|
||||
private final transient AtomicInteger counter = new AtomicInteger(0);
|
||||
|
||||
public GenericRecordFullPayloadSizeEstimator(Schema schema) {
|
||||
this.baseSchema = schema;
|
||||
}
|
||||
|
||||
public Pair<Integer, Integer> typeEstimateAndNumComplexFields() {
|
||||
int size = estimate(baseSchema);
|
||||
return Pair.of(size, counter.get());
|
||||
}
|
||||
|
||||
/**
|
||||
* This method estimates the size of the payload if all entries of this payload were populated with one value.
|
||||
* For eg. A primitive data type such as String will be populated with {@link UUID} so the length if 36 bytes
|
||||
* whereas a complex data type such as an Array of type Int, will be populated with exactly 1 Integer value.
|
||||
*/
|
||||
protected int estimate(Schema schema) {
|
||||
long size = 0;
|
||||
for (Schema.Field f : schema.getFields()) {
|
||||
size += typeEstimate(f.schema());
|
||||
}
|
||||
return (int) size;
|
||||
}
|
||||
|
||||
private long typeEstimate(Schema schema) {
|
||||
Schema localSchema = schema;
|
||||
if (isOption(schema)) {
|
||||
localSchema = getNonNull(schema);
|
||||
}
|
||||
switch (localSchema.getType()) {
|
||||
case BOOLEAN:
|
||||
return 1;
|
||||
case DOUBLE:
|
||||
return 8;
|
||||
case FLOAT:
|
||||
return 4;
|
||||
case INT:
|
||||
return 4;
|
||||
case LONG:
|
||||
return 8;
|
||||
case STRING:
|
||||
return UUID.randomUUID().toString().length();
|
||||
case ENUM:
|
||||
return 1;
|
||||
case RECORD:
|
||||
return estimate(localSchema);
|
||||
case ARRAY:
|
||||
if (GenericRecordFullPayloadGenerator.isPrimitive(localSchema.getElementType())) {
|
||||
counter.addAndGet(1);
|
||||
}
|
||||
Schema elementSchema = localSchema.getElementType();
|
||||
return typeEstimate(elementSchema);
|
||||
case MAP:
|
||||
if (GenericRecordFullPayloadGenerator.isPrimitive(localSchema.getValueType())) {
|
||||
counter.addAndGet(1);
|
||||
}
|
||||
Schema valueSchema = localSchema.getValueType();
|
||||
return UUID.randomUUID().toString().length() + typeEstimate(valueSchema);
|
||||
case BYTES:
|
||||
return UUID.randomUUID().toString().length();
|
||||
case FIXED:
|
||||
return localSchema.getFixedSize();
|
||||
default:
|
||||
throw new IllegalArgumentException(
|
||||
"Cannot handle type: " + localSchema.getType());
|
||||
}
|
||||
}
|
||||
|
||||
protected boolean isOption(Schema schema) {
|
||||
return schema.getType().equals(Schema.Type.UNION)
|
||||
&& schema.getTypes().size() == 2
|
||||
&& (schema.getTypes().get(0).getType().equals(Schema.Type.NULL)
|
||||
|| schema.getTypes().get(1).getType().equals(Schema.Type.NULL));
|
||||
}
|
||||
|
||||
protected Schema getNonNull(Schema schema) {
|
||||
List<Schema> types = schema.getTypes();
|
||||
return types.get(0).getType().equals(Schema.Type.NULL) ? types.get(1) : types.get(0);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -0,0 +1,82 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.integ.testsuite.generator;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.GenericData;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
|
||||
/**
|
||||
* This is a GenericRecord payload generator that generates partial generic records {@link GenericRecord}. A partial
|
||||
* records is one that has some fields of the schema NULL or NOT PRESENT. This payload enables us to simulate
|
||||
* creation of partial records which are possible in many cases, especially for database change logs.
|
||||
*/
|
||||
public class GenericRecordPartialPayloadGenerator extends GenericRecordFullPayloadGenerator {
|
||||
|
||||
public GenericRecordPartialPayloadGenerator(Schema schema) {
|
||||
super(schema);
|
||||
}
|
||||
|
||||
public GenericRecordPartialPayloadGenerator(Schema schema, int minPayloadSize) {
|
||||
super(schema, minPayloadSize);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected GenericRecord convert(Schema schema) {
|
||||
GenericRecord record = super.convertPartial(schema);
|
||||
return record;
|
||||
}
|
||||
|
||||
private void setNull(GenericRecord record) {
|
||||
for (Schema.Field field : record.getSchema().getFields()) {
|
||||
// A random boolean decides whether this field of the generic record should be present or absent.
|
||||
// Using this we can set only a handful of fields in the record and generate partial records
|
||||
boolean setNull = random.nextBoolean();
|
||||
if (setNull) { // TODO : DO NOT SET THE RECORD KEY FIELDS TO NULL
|
||||
record.put(field.name(), null);
|
||||
} else {
|
||||
if (record.get(field.name()) instanceof GenericData.Record) {
|
||||
setNull((GenericData.Record) record.get(field.name()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean validate(GenericRecord record) {
|
||||
return validate((Object) record);
|
||||
}
|
||||
|
||||
// Atleast 1 entry should be null
|
||||
private boolean validate(Object object) {
|
||||
if (object == null) {
|
||||
return true;
|
||||
} else if (object instanceof GenericRecord) {
|
||||
for (Schema.Field field : ((GenericRecord) object).getSchema().getFields()) {
|
||||
boolean ret = validate(((GenericRecord) object).get(field.name()));
|
||||
if (ret) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -0,0 +1,48 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.integ.testsuite.generator;
|
||||
|
||||
import java.util.Iterator;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.hudi.client.utils.LazyIterableIterator;
|
||||
|
||||
/**
|
||||
* A lazy record generator to generate {@link GenericRecord}s lazily and not hold a list of records in memory.
|
||||
*/
|
||||
public class LazyRecordGeneratorIterator extends
|
||||
LazyIterableIterator<GenericRecord, GenericRecord> {
|
||||
|
||||
public LazyRecordGeneratorIterator(Iterator<GenericRecord> inputItr) {
|
||||
super(inputItr);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void start() {
|
||||
}
|
||||
|
||||
@Override
|
||||
protected GenericRecord computeNext() {
|
||||
return inputItr.next();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void end() {
|
||||
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,56 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.integ.testsuite.generator;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
|
||||
public class UpdateGeneratorIterator implements Iterator<GenericRecord> {
|
||||
|
||||
// Use the full payload generator as default
|
||||
private GenericRecordFullPayloadGenerator generator;
|
||||
private List<String> blackListedFields;
|
||||
// iterator
|
||||
private Iterator<GenericRecord> itr;
|
||||
|
||||
public UpdateGeneratorIterator(Iterator<GenericRecord> itr, String schemaStr, List<String> partitionPathFieldNames,
|
||||
List<String> recordKeyFieldNames, int minPayloadSize) {
|
||||
this.itr = itr;
|
||||
this.blackListedFields = new ArrayList<>();
|
||||
this.blackListedFields.addAll(partitionPathFieldNames);
|
||||
this.blackListedFields.addAll(recordKeyFieldNames);
|
||||
Schema schema = new Schema.Parser().parse(schemaStr);
|
||||
this.generator = new GenericRecordFullPayloadGenerator(schema, minPayloadSize);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return itr.hasNext();
|
||||
}
|
||||
|
||||
@Override
|
||||
public GenericRecord next() {
|
||||
GenericRecord newRecord = itr.next();
|
||||
return this.generator.randomize(newRecord, this.blackListedFields);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,92 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.integ.testsuite.helpers;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.LocatedFileStatus;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.fs.RemoteIterator;
|
||||
import org.apache.hudi.common.config.TypedProperties;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.collection.ImmutablePair;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
import org.apache.hudi.utilities.sources.helpers.DFSPathSelector;
|
||||
|
||||
/**
|
||||
* A custom dfs path selector used only for the hudi test suite. To be used only if workload is not run inline.
|
||||
*/
|
||||
public class DFSTestSuitePathSelector extends DFSPathSelector {
|
||||
|
||||
public DFSTestSuitePathSelector(TypedProperties props, Configuration hadoopConf) {
|
||||
super(props, hadoopConf);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Pair<Option<String>, String> getNextFilePathsAndMaxModificationTime(
|
||||
Option<String> lastCheckpointStr, long sourceLimit) {
|
||||
|
||||
Integer lastBatchId;
|
||||
Integer nextBatchId;
|
||||
try {
|
||||
if (lastCheckpointStr.isPresent()) {
|
||||
lastBatchId = Integer.parseInt(lastCheckpointStr.get());
|
||||
nextBatchId = lastBatchId + 1;
|
||||
} else {
|
||||
lastBatchId = -1;
|
||||
nextBatchId = 0;
|
||||
}
|
||||
// obtain all eligible files for the batch
|
||||
List<FileStatus> eligibleFiles = new ArrayList<>();
|
||||
FileStatus[] fileStatuses = fs.globStatus(
|
||||
new Path(props.getString(Config.ROOT_INPUT_PATH_PROP), "*"));
|
||||
for (FileStatus fileStatus : fileStatuses) {
|
||||
if (!fileStatus.isDirectory() || IGNORE_FILEPREFIX_LIST.stream()
|
||||
.anyMatch(pfx -> fileStatus.getPath().getName().startsWith(pfx))) {
|
||||
continue;
|
||||
} else if (fileStatus.getPath().getName().compareTo(lastBatchId.toString()) > 0 && fileStatus.getPath()
|
||||
.getName().compareTo(nextBatchId.toString()) <= 0) {
|
||||
RemoteIterator<LocatedFileStatus> files = fs.listFiles(fileStatus.getPath(), true);
|
||||
while (files.hasNext()) {
|
||||
eligibleFiles.add(files.next());
|
||||
}
|
||||
}
|
||||
}
|
||||
// no data to readAvro
|
||||
if (eligibleFiles.size() == 0) {
|
||||
return new ImmutablePair<>(Option.empty(),
|
||||
lastCheckpointStr.orElseGet(() -> String.valueOf(Long.MIN_VALUE)));
|
||||
}
|
||||
// readAvro the files out.
|
||||
String pathStr = eligibleFiles.stream().map(f -> f.getPath().toString())
|
||||
.collect(Collectors.joining(","));
|
||||
|
||||
return new ImmutablePair<>(Option.ofNullable(pathStr), String.valueOf(nextBatchId));
|
||||
} catch (IOException ioe) {
|
||||
throw new HoodieIOException(
|
||||
"Unable to readAvro from source from checkpoint: " + lastCheckpointStr, ioe);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,74 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.integ.testsuite.helpers;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.BindException;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hive.service.server.HiveServer2;
|
||||
import org.apache.hudi.hive.testutils.HiveTestService;
|
||||
import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter;
|
||||
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.thrift.transport.TTransportException;
|
||||
|
||||
/**
|
||||
* Hive Service provider.
|
||||
*/
|
||||
public class HiveServiceProvider {
|
||||
|
||||
private HiveTestService hiveService;
|
||||
private HiveServer2 hiveServer;
|
||||
private Config config;
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(HiveServiceProvider.class);
|
||||
|
||||
public HiveServiceProvider(Config config) {
|
||||
this.config = config;
|
||||
}
|
||||
|
||||
public void startLocalHiveServiceIfNeeded(Configuration configuration) throws IOException {
|
||||
if (config.isHiveLocal()) {
|
||||
hiveService = new HiveTestService(configuration);
|
||||
hiveServer = hiveService.start();
|
||||
}
|
||||
}
|
||||
|
||||
public void syncToLocalHiveIfNeeded(HoodieTestSuiteWriter writer) {
|
||||
if (this.config.isHiveLocal()) {
|
||||
writer.getDeltaStreamerWrapper().getDeltaSyncService().getDeltaSync()
|
||||
.syncHive(getLocalHiveServer().getHiveConf());
|
||||
} else {
|
||||
writer.getDeltaStreamerWrapper().getDeltaSyncService().getDeltaSync().syncHive();
|
||||
}
|
||||
}
|
||||
|
||||
public void stopLocalHiveServiceIfNeeded() throws IOException {
|
||||
if (config.isHiveLocal()) {
|
||||
if (hiveService != null) {
|
||||
hiveService.stop();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public HiveServer2 getLocalHiveServer() {
|
||||
return hiveServer;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,89 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.integ.testsuite.reader;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.hadoop.fs.PathFilter;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.integ.testsuite.writer.AvroFileDeltaInputWriter;
|
||||
import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
/**
|
||||
* A reader of {@link DeltaOutputMode#DFS} and {@link DeltaInputType#AVRO}.
|
||||
*/
|
||||
public class DFSAvroDeltaInputReader extends DFSDeltaInputReader {
|
||||
|
||||
private final SparkSession sparkSession;
|
||||
private final String schemaStr;
|
||||
private final String basePath;
|
||||
private final Option<String> structName;
|
||||
private final Option<String> nameSpace;
|
||||
protected PathFilter filter = (path) -> {
|
||||
if (path.toUri().toString().contains(AvroFileDeltaInputWriter.AVRO_EXTENSION)) {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
public DFSAvroDeltaInputReader(SparkSession sparkSession, String schemaStr, String basePath,
|
||||
Option<String> structName,
|
||||
Option<String> nameSpace) {
|
||||
this.sparkSession = sparkSession;
|
||||
this.schemaStr = schemaStr;
|
||||
this.basePath = basePath;
|
||||
this.structName = structName;
|
||||
this.nameSpace = nameSpace;
|
||||
}
|
||||
|
||||
@Override
|
||||
public JavaRDD<GenericRecord> read(long totalRecordsToRead) throws IOException {
|
||||
return SparkBasedReader.readAvro(sparkSession, schemaStr, getFilePathsToRead(basePath, filter, totalRecordsToRead),
|
||||
structName, nameSpace);
|
||||
}
|
||||
|
||||
@Override
|
||||
public JavaRDD<GenericRecord> read(int numPartitions, long approxNumRecords) throws IOException {
|
||||
throw new UnsupportedOperationException("cannot generate updates");
|
||||
}
|
||||
|
||||
@Override
|
||||
public JavaRDD<GenericRecord> read(int numPartitions, int numFiles, long approxNumRecords) throws IOException {
|
||||
throw new UnsupportedOperationException("cannot generate updates");
|
||||
}
|
||||
|
||||
@Override
|
||||
public JavaRDD<GenericRecord> read(int numPartitions, int numFiles, double percentageRecordsPerFile)
|
||||
throws IOException {
|
||||
throw new UnsupportedOperationException("cannot generate updates");
|
||||
}
|
||||
|
||||
@Override
|
||||
protected long analyzeSingleFile(String filePath) {
|
||||
JavaRDD<GenericRecord> recordsFromOneFile = SparkBasedReader
|
||||
.readAvro(sparkSession, schemaStr, Arrays.asList(filePath),
|
||||
structName, nameSpace);
|
||||
return recordsFromOneFile.count();
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,98 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.integ.testsuite.reader;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.fs.PathFilter;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
|
||||
/**
|
||||
* This class helps to estimate the number of files to read a given number of total records.
|
||||
* Use this class for all DFS based implementations of {@link DeltaInputReader}
|
||||
*/
|
||||
public abstract class DFSDeltaInputReader implements DeltaInputReader<GenericRecord> {
|
||||
|
||||
protected List<String> getFilePathsToRead(String basePath, PathFilter filter, long totalRecordsToRead) throws
|
||||
IOException {
|
||||
FileSystem fs = FSUtils.getFs(basePath, new Configuration());
|
||||
// TODO : Sort list by file size and take the median file status to ensure fair calculation and change to remote
|
||||
// iterator
|
||||
List<FileStatus> fileStatuses = Arrays.asList(fs.globStatus(new Path(basePath, "*/*"), filter));
|
||||
if (fileStatuses.size() > 0) {
|
||||
FileStatus status = fileStatuses.get(0);
|
||||
long avgNumRecordsPerFile = analyzeSingleFile(status.getPath().toString());
|
||||
long numFilesToMatchExpectedRecords = (long) Math.ceil((double) totalRecordsToRead / (double)
|
||||
avgNumRecordsPerFile);
|
||||
long avgSizeOfEachFile = status.getLen();
|
||||
long totalSizeToRead = avgSizeOfEachFile * numFilesToMatchExpectedRecords;
|
||||
// choose N files with that length
|
||||
Pair<Integer, Integer> fileStatusIndexRange = getFileStatusIndexRange(fileStatuses, avgSizeOfEachFile,
|
||||
totalSizeToRead);
|
||||
int startIndex = fileStatusIndexRange.getLeft();
|
||||
List<String> filePaths = new ArrayList<>();
|
||||
while (startIndex == 0 || startIndex < fileStatusIndexRange.getRight()) {
|
||||
filePaths.add(fileStatuses.get(startIndex).getPath().toString());
|
||||
startIndex++;
|
||||
}
|
||||
return filePaths;
|
||||
}
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
protected Pair<Integer, Integer> getFileStatusIndexRange(List<FileStatus> fileStatuses, long averageFileSize, long
|
||||
totalSizeToRead) {
|
||||
long totalSizeOfFilesPresent = 0;
|
||||
int startOffset = 0;
|
||||
int endOffset = 0;
|
||||
for (FileStatus fileStatus : fileStatuses) {
|
||||
// If current file length is greater than averageFileSize, increment by averageFileSize since our
|
||||
// totalSizeToRead calculation is based on the averageRecordSize * numRecordsToRead.
|
||||
if (fileStatus.getLen() > averageFileSize) {
|
||||
totalSizeOfFilesPresent += averageFileSize;
|
||||
} else {
|
||||
totalSizeOfFilesPresent += fileStatus.getLen();
|
||||
}
|
||||
if (totalSizeOfFilesPresent <= totalSizeToRead) {
|
||||
endOffset++;
|
||||
continue;
|
||||
} else {
|
||||
return Pair.of(startOffset, endOffset);
|
||||
}
|
||||
}
|
||||
return Pair.of(startOffset, endOffset);
|
||||
}
|
||||
|
||||
/**
|
||||
* Implementation of {@link DeltaInputReader}s to provide a way to read a single file on DFS and provide an
|
||||
* average number of records across N files.
|
||||
*/
|
||||
protected long analyzeSingleFile(String filePath) {
|
||||
throw new UnsupportedOperationException("No implementation found");
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,338 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.integ.testsuite.reader;
|
||||
|
||||
import static java.util.Map.Entry.comparingByValue;
|
||||
import static java.util.stream.Collectors.toMap;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.UncheckedIOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.StreamSupport;
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.avro.generic.IndexedRecord;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hudi.avro.HoodieAvroUtils;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.FileSlice;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner;
|
||||
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
|
||||
import org.apache.hudi.common.table.view.TableFileSystemView;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.ParquetReaderIterator;
|
||||
import org.apache.hudi.common.util.ValidationUtils;
|
||||
import org.apache.hudi.config.HoodieMemoryConfig;
|
||||
import org.apache.parquet.avro.AvroParquetReader;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import scala.Tuple2;
|
||||
|
||||
/**
|
||||
* This class helps to generate updates from an already existing hoodie dataset. It supports generating updates in
|
||||
* across partitions, files and records.
|
||||
*/
|
||||
public class DFSHoodieDatasetInputReader extends DFSDeltaInputReader {
|
||||
|
||||
private static Logger log = LoggerFactory.getLogger(DFSHoodieDatasetInputReader.class);
|
||||
|
||||
private transient JavaSparkContext jsc;
|
||||
private String schemaStr;
|
||||
private HoodieTableMetaClient metaClient;
|
||||
|
||||
public DFSHoodieDatasetInputReader(JavaSparkContext jsc, String basePath, String schemaStr) {
|
||||
this.jsc = jsc;
|
||||
this.schemaStr = schemaStr;
|
||||
this.metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
||||
}
|
||||
|
||||
protected List<String> getPartitions(Option<Integer> partitionsLimit) throws IOException {
|
||||
List<String> partitionPaths = FSUtils
|
||||
.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(), false);
|
||||
// Sort partition so we can pick last N partitions by default
|
||||
Collections.sort(partitionPaths);
|
||||
if (!partitionPaths.isEmpty()) {
|
||||
ValidationUtils.checkArgument(partitionPaths.size() >= partitionsLimit.get(),
|
||||
"Cannot generate updates for more partitions " + "than present in the dataset, partitions "
|
||||
+ "requested " + partitionsLimit.get() + ", partitions present " + partitionPaths.size());
|
||||
return partitionPaths.subList(0, partitionsLimit.get());
|
||||
}
|
||||
return partitionPaths;
|
||||
|
||||
}
|
||||
|
||||
private JavaPairRDD<String, Iterator<FileSlice>> getPartitionToFileSlice(HoodieTableMetaClient metaClient,
|
||||
List<String> partitionPaths) {
|
||||
TableFileSystemView.SliceView fileSystemView = new HoodieTableFileSystemView(metaClient,
|
||||
metaClient.getCommitsAndCompactionTimeline().filterCompletedInstants());
|
||||
// pass num partitions to another method
|
||||
JavaPairRDD<String, Iterator<FileSlice>> partitionToFileSliceList = jsc.parallelize(partitionPaths).mapToPair(p -> {
|
||||
return new Tuple2<>(p, fileSystemView.getLatestFileSlices(p).iterator());
|
||||
});
|
||||
return partitionToFileSliceList;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected long analyzeSingleFile(String filePath) {
|
||||
return SparkBasedReader.readParquet(new SparkSession(jsc.sc()), Arrays.asList(filePath),
|
||||
Option.empty(), Option.empty()).count();
|
||||
}
|
||||
|
||||
private JavaRDD<GenericRecord> fetchAnyRecordsFromDataset(Option<Long> numRecordsToUpdate) throws IOException {
|
||||
return fetchRecordsFromDataset(Option.empty(), Option.empty(), numRecordsToUpdate, Option.empty());
|
||||
}
|
||||
|
||||
private JavaRDD<GenericRecord> fetchAnyRecordsFromDataset(Option<Long> numRecordsToUpdate, Option<Integer>
|
||||
numPartitions) throws IOException {
|
||||
return fetchRecordsFromDataset(numPartitions, Option.empty(), numRecordsToUpdate, Option.empty());
|
||||
}
|
||||
|
||||
private JavaRDD<GenericRecord> fetchPercentageRecordsFromDataset(Option<Integer> numPartitions, Option<Integer>
|
||||
numFiles, Option<Double> percentageRecordsPerFile) throws IOException {
|
||||
return fetchRecordsFromDataset(numPartitions, numFiles, Option.empty(), percentageRecordsPerFile);
|
||||
}
|
||||
|
||||
private JavaRDD<GenericRecord> fetchRecordsFromDataset(Option<Integer> numPartitions, Option<Integer>
|
||||
numFiles, Option<Long> numRecordsToUpdate) throws IOException {
|
||||
return fetchRecordsFromDataset(numPartitions, numFiles, numRecordsToUpdate, Option.empty());
|
||||
}
|
||||
|
||||
private JavaRDD<GenericRecord> fetchRecordsFromDataset(Option<Integer> numPartitions, Option<Integer> numFiles,
|
||||
Option<Long> numRecordsToUpdate, Option<Double> percentageRecordsPerFile) throws IOException {
|
||||
log.info("NumPartitions : {}, NumFiles : {}, numRecordsToUpdate : {}, percentageRecordsPerFile : {}",
|
||||
numPartitions, numFiles, numRecordsToUpdate, percentageRecordsPerFile);
|
||||
List<String> partitionPaths = getPartitions(numPartitions);
|
||||
// Read all file slices in the partition
|
||||
JavaPairRDD<String, Iterator<FileSlice>> partitionToFileSlice = getPartitionToFileSlice(metaClient,
|
||||
partitionPaths);
|
||||
// TODO : read record count from metadata
|
||||
// Read the records in a single file
|
||||
long recordsInSingleFile = iteratorSize(readParquetOrLogFiles(getSingleSliceFromRDD(partitionToFileSlice)));
|
||||
int numFilesToUpdate;
|
||||
long numRecordsToUpdatePerFile;
|
||||
if (!numFiles.isPresent() || numFiles.get() == 0) {
|
||||
// If num files are not passed, find the number of files to update based on total records to update and records
|
||||
// per file
|
||||
numFilesToUpdate = (int) (numRecordsToUpdate.get() / recordsInSingleFile);
|
||||
log.info("Files to update {}", numFilesToUpdate);
|
||||
numRecordsToUpdatePerFile = recordsInSingleFile;
|
||||
} else {
|
||||
// If num files is passed, find the number of records per file based on either percentage or total records to
|
||||
// update and num files passed
|
||||
numFilesToUpdate = numFiles.get();
|
||||
numRecordsToUpdatePerFile = percentageRecordsPerFile.isPresent() ? (long) (recordsInSingleFile
|
||||
* percentageRecordsPerFile.get()) : numRecordsToUpdate.get() / numFilesToUpdate;
|
||||
}
|
||||
// Adjust the number of files to read per partition based on the requested partition & file counts
|
||||
Map<String, Integer> adjustedPartitionToFileIdCountMap = getFilesToReadPerPartition(partitionToFileSlice,
|
||||
getPartitions(numPartitions).size(), numFilesToUpdate);
|
||||
JavaRDD<GenericRecord> updates = projectSchema(generateUpdates(adjustedPartitionToFileIdCountMap,
|
||||
partitionToFileSlice, numFilesToUpdate, (int) numRecordsToUpdatePerFile));
|
||||
if (numRecordsToUpdate.isPresent() && numFiles.isPresent() && numFiles.get() != 0 && numRecordsToUpdate.get()
|
||||
!= numRecordsToUpdatePerFile * numFiles.get()) {
|
||||
long remainingRecordsToAdd = (numRecordsToUpdate.get() - (numRecordsToUpdatePerFile * numFiles.get()));
|
||||
updates = updates.union(projectSchema(jsc.parallelize(generateUpdates(adjustedPartitionToFileIdCountMap,
|
||||
partitionToFileSlice, numFilesToUpdate, (int) remainingRecordsToAdd).take((int) remainingRecordsToAdd))));
|
||||
}
|
||||
log.info("Finished generating updates");
|
||||
return updates;
|
||||
}
|
||||
|
||||
private JavaRDD<GenericRecord> projectSchema(JavaRDD<GenericRecord> updates) {
|
||||
// The records read from the hoodie dataset have the hoodie record fields, rewrite the record to eliminate them
|
||||
return updates
|
||||
.map(r -> HoodieAvroUtils.rewriteRecordWithOnlyNewSchemaFields(r, new Schema.Parser().parse(schemaStr)));
|
||||
}
|
||||
|
||||
private JavaRDD<GenericRecord> generateUpdates(Map<String, Integer> adjustedPartitionToFileIdCountMap,
|
||||
JavaPairRDD<String, Iterator<FileSlice>> partitionToFileSlice, int numFiles, int numRecordsToReadPerFile) {
|
||||
return partitionToFileSlice.map(p -> {
|
||||
int maxFilesToRead = adjustedPartitionToFileIdCountMap.get(p._1);
|
||||
return iteratorLimit(p._2, maxFilesToRead);
|
||||
}).flatMap(p -> p).repartition(numFiles).map(fileSlice -> {
|
||||
if (numRecordsToReadPerFile > 0) {
|
||||
return iteratorLimit(readParquetOrLogFiles(fileSlice), numRecordsToReadPerFile);
|
||||
} else {
|
||||
return readParquetOrLogFiles(fileSlice);
|
||||
}
|
||||
}).flatMap(p -> p).map(i -> (GenericRecord) i);
|
||||
}
|
||||
|
||||
private Map<String, Integer> getFilesToReadPerPartition(JavaPairRDD<String, Iterator<FileSlice>>
|
||||
partitionToFileSlice, Integer numPartitions, Integer numFiles) {
|
||||
int numFilesPerPartition = (int) Math.ceil(numFiles / numPartitions);
|
||||
Map<String, Integer> partitionToFileIdCountMap = partitionToFileSlice
|
||||
.mapToPair(p -> new Tuple2<>(p._1, iteratorSize(p._2))).collectAsMap();
|
||||
long totalExistingFilesCount = partitionToFileIdCountMap.values().stream().reduce((a, b) -> a + b).get();
|
||||
ValidationUtils.checkArgument(totalExistingFilesCount >= numFiles, "Cannot generate updates "
|
||||
+ "for more files than present in the dataset, file requested " + numFiles + ", files present "
|
||||
+ totalExistingFilesCount);
|
||||
Map<String, Integer> partitionToFileIdCountSortedMap = partitionToFileIdCountMap
|
||||
.entrySet()
|
||||
.stream()
|
||||
.sorted(comparingByValue())
|
||||
.collect(toMap(Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e2,
|
||||
LinkedHashMap::new));
|
||||
// Limit files to be read per partition
|
||||
Map<String, Integer> adjustedPartitionToFileIdCountMap = new HashMap<>();
|
||||
partitionToFileIdCountSortedMap.entrySet().stream().forEach(e -> {
|
||||
if (e.getValue() <= numFilesPerPartition) {
|
||||
adjustedPartitionToFileIdCountMap.put(e.getKey(), e.getValue());
|
||||
} else {
|
||||
adjustedPartitionToFileIdCountMap.put(e.getKey(), numFilesPerPartition);
|
||||
}
|
||||
});
|
||||
return adjustedPartitionToFileIdCountMap;
|
||||
}
|
||||
|
||||
private FileSlice getSingleSliceFromRDD(JavaPairRDD<String, Iterator<FileSlice>> partitionToFileSlice) {
|
||||
return partitionToFileSlice.map(f -> {
|
||||
FileSlice slice = f._2.next();
|
||||
FileSlice newSlice = new FileSlice(slice.getFileGroupId(), slice.getBaseInstantTime());
|
||||
if (slice.getBaseFile().isPresent()) {
|
||||
newSlice.setBaseFile(slice.getBaseFile().get());
|
||||
} else {
|
||||
slice.getLogFiles().forEach(l -> {
|
||||
newSlice.addLogFile(l);
|
||||
});
|
||||
}
|
||||
return newSlice;
|
||||
}).take(1).get(0);
|
||||
}
|
||||
|
||||
private Iterator<IndexedRecord> readParquetOrLogFiles(FileSlice fileSlice) throws IOException {
|
||||
if (fileSlice.getBaseFile().isPresent()) {
|
||||
Iterator<IndexedRecord> itr =
|
||||
new ParquetReaderIterator<IndexedRecord>(AvroParquetReader.<IndexedRecord>builder(new
|
||||
Path(fileSlice.getBaseFile().get().getPath())).withConf(metaClient.getHadoopConf()).build());
|
||||
return itr;
|
||||
} else {
|
||||
// If there is no data file, fall back to reading log files
|
||||
HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(metaClient.getFs(),
|
||||
metaClient.getBasePath(),
|
||||
fileSlice.getLogFiles().map(l -> l.getPath().getName()).collect(Collectors.toList()),
|
||||
new Schema.Parser().parse(schemaStr), metaClient.getActiveTimeline().getCommitsTimeline()
|
||||
.filterCompletedInstants().lastInstant().get().getTimestamp(),
|
||||
HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES, true, false,
|
||||
HoodieMemoryConfig.DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE,
|
||||
HoodieMemoryConfig.DEFAULT_SPILLABLE_MAP_BASE_PATH);
|
||||
// readAvro log files
|
||||
Iterable<HoodieRecord<? extends HoodieRecordPayload>> iterable = () -> scanner.iterator();
|
||||
Schema schema = new Schema.Parser().parse(schemaStr);
|
||||
return StreamSupport.stream(iterable.spliterator(), false)
|
||||
.map(e -> {
|
||||
try {
|
||||
return (IndexedRecord) e.getData().getInsertValue(schema).get();
|
||||
} catch (IOException io) {
|
||||
throw new UncheckedIOException(io);
|
||||
}
|
||||
}).iterator();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of elements remaining in {@code iterator}. The iterator
|
||||
* will be left exhausted: its {@code hasNext()} method will return
|
||||
* {@code false}.
|
||||
*/
|
||||
private static int iteratorSize(Iterator<?> iterator) {
|
||||
int count = 0;
|
||||
while (iterator.hasNext()) {
|
||||
iterator.next();
|
||||
count++;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates an iterator returning the first {@code limitSize} elements of the
|
||||
* given iterator. If the original iterator does not contain that many
|
||||
* elements, the returned iterator will have the same behavior as the original
|
||||
* iterator. The returned iterator supports {@code remove()} if the original
|
||||
* iterator does.
|
||||
*
|
||||
* @param iterator the iterator to limit
|
||||
* @param limitSize the maximum number of elements in the returned iterator
|
||||
* @throws IllegalArgumentException if {@code limitSize} is negative
|
||||
*/
|
||||
private static <T> Iterator<T> iteratorLimit(
|
||||
final Iterator<T> iterator, final int limitSize) {
|
||||
ValidationUtils.checkArgument(iterator != null, "iterator is null");
|
||||
ValidationUtils.checkArgument(limitSize >= 0, "limit is negative");
|
||||
return new Iterator<T>() {
|
||||
private int count;
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return count < limitSize && iterator.hasNext();
|
||||
}
|
||||
|
||||
@Override
|
||||
public T next() {
|
||||
if (!hasNext()) {
|
||||
throw new NoSuchElementException();
|
||||
}
|
||||
count++;
|
||||
return iterator.next();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
iterator.remove();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
@Override
|
||||
public JavaRDD<GenericRecord> read(long numRecords) throws IOException {
|
||||
return fetchAnyRecordsFromDataset(Option.of(numRecords));
|
||||
}
|
||||
|
||||
@Override
|
||||
public JavaRDD<GenericRecord> read(int numPartitions, long approxNumRecords) throws IOException {
|
||||
return fetchAnyRecordsFromDataset(Option.of(approxNumRecords), Option.of(numPartitions));
|
||||
}
|
||||
|
||||
@Override
|
||||
public JavaRDD<GenericRecord> read(int numPartitions, int numFiles, long numRecords) throws IOException {
|
||||
return fetchRecordsFromDataset(Option.of(numPartitions), Option.of(numFiles), Option.of(numRecords));
|
||||
}
|
||||
|
||||
@Override
|
||||
public JavaRDD<GenericRecord> read(int numPartitions, int numFiles, double percentageRecordsPerFile)
|
||||
throws IOException {
|
||||
return fetchPercentageRecordsFromDataset(Option.of(numPartitions), Option.of(numFiles),
|
||||
Option.of(percentageRecordsPerFile));
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,62 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.integ.testsuite.reader;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
|
||||
/**
|
||||
* Implementations of {@link DeltaInputReader} will read the configured input type and provide an RDD of records to the
|
||||
* client.
|
||||
*
|
||||
* @param <O> Read result data type
|
||||
*/
|
||||
public interface DeltaInputReader<O> extends Serializable {
|
||||
|
||||
/**
|
||||
* Attempts to reads an approximate number of records close to approxNumRecords.
|
||||
* This highly depends on the number of records already present in the input.
|
||||
*/
|
||||
JavaRDD<O> read(long approxNumRecords) throws IOException;
|
||||
|
||||
/**
|
||||
* @throws IOException Attempts to read approx number of records (exact if equal or more records available)
|
||||
* across requested number of
|
||||
* partitions.
|
||||
*/
|
||||
JavaRDD<O> read(int numPartitions, long approxNumRecords) throws IOException;
|
||||
|
||||
/**
|
||||
* @throws IOException Attempts to read approx number of records (exact if equal or more records available)
|
||||
* across requested number of
|
||||
* partitions and number of files.
|
||||
* 1. Find numFiles across numPartitions
|
||||
* 2. numRecordsToReadPerFile = approxNumRecords / numFiles
|
||||
*/
|
||||
JavaRDD<O> read(int numPartitions, int numFiles, long approxNumRecords) throws IOException;
|
||||
|
||||
/**
|
||||
* @throws IOException Attempts to a % of records per file across requested number of partitions and number of files.
|
||||
* 1. Find numFiles across numPartitions
|
||||
* 2. numRecordsToReadPerFile = approxNumRecordsPerFile * percentageRecordsPerFile
|
||||
*/
|
||||
JavaRDD<O> read(int numPartitions, int numFiles, double percentageRecordsPerFile) throws IOException;
|
||||
|
||||
}
|
||||
@@ -0,0 +1,26 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.integ.testsuite.reader;
|
||||
|
||||
/**
|
||||
* Supported delta input data types.
|
||||
*/
|
||||
public enum DeltaInputType {
|
||||
AVRO, PARQUET
|
||||
}
|
||||
@@ -0,0 +1,70 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.integ.testsuite.reader;
|
||||
|
||||
import java.util.List;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.hudi.AvroConversionUtils;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.utilities.schema.RowBasedSchemaProvider;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import scala.collection.JavaConverters;
|
||||
|
||||
|
||||
/**
|
||||
* Helper class to read avro and/or parquet files and generate a RDD of {@link GenericRecord}.
|
||||
*/
|
||||
public class SparkBasedReader {
|
||||
|
||||
public static final String SPARK_AVRO_FORMAT = "avro";
|
||||
public static final String SPARK_PARQUET_FORMAT = "com.databricks.spark.parquet";
|
||||
private static final String AVRO_SCHEMA_OPTION_KEY = "avroSchema";
|
||||
|
||||
// Spark anyways globs the path and gets all the paths in memory so take the List<filePaths> as an argument.
|
||||
// https://github.com/apache/spark/.../org/apache/spark/sql/execution/datasources/DataSource.scala#L251
|
||||
public static JavaRDD<GenericRecord> readAvro(SparkSession sparkSession, String schemaStr, List<String> listOfPaths,
|
||||
Option<String> structName, Option<String> nameSpace) {
|
||||
|
||||
Dataset<Row> dataSet = sparkSession.read()
|
||||
.format(SPARK_AVRO_FORMAT)
|
||||
.option(AVRO_SCHEMA_OPTION_KEY, schemaStr)
|
||||
.load(JavaConverters.asScalaIteratorConverter(listOfPaths.iterator()).asScala().toSeq());
|
||||
|
||||
return AvroConversionUtils
|
||||
.createRdd(dataSet.toDF(), structName.orElse(RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME),
|
||||
nameSpace.orElse(RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE))
|
||||
.toJavaRDD();
|
||||
}
|
||||
|
||||
public static JavaRDD<GenericRecord> readParquet(SparkSession sparkSession, List<String>
|
||||
listOfPaths, Option<String> structName, Option<String> nameSpace) {
|
||||
|
||||
Dataset<Row> dataSet = sparkSession.read()
|
||||
.parquet((JavaConverters.asScalaIteratorConverter(listOfPaths.iterator()).asScala().toSeq()));
|
||||
|
||||
return AvroConversionUtils
|
||||
.createRdd(dataSet.toDF(), structName.orElse(RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME),
|
||||
RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE)
|
||||
.toJavaRDD();
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,117 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite.writer;

import java.io.IOException;
import java.io.OutputStream;
import java.util.UUID;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.avro.io.DatumWriter;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.fs.HoodieWrapperFileSystem;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Implementation of {@link DeltaInputWriter} that writes avro records to the result file.
 */
public class AvroFileDeltaInputWriter implements DeltaInputWriter<GenericRecord> {

  public static final String AVRO_EXTENSION = ".avro";
  private static Logger log = LoggerFactory.getLogger(AvroFileDeltaInputWriter.class);
  // The maximum size an avro file is allowed to reach before being rolled over to a new one
  private final Long maxFileSize;
  private final Configuration configuration;
  private HoodieWrapperFileSystem fs;
  // Path of the actual avro file
  private Path file;
  // Base input path to write avro files under
  // TODO : Make this bucketed so we don't have a large number of files in a single directory
  private String basePath;
  private DatumWriter<IndexedRecord> writer;
  private DataFileWriter<IndexedRecord> dataFileWriter;
  private OutputStream output;
  private Schema schema;
  private DeltaWriteStats deltaWriteStats;
  private long recordsWritten = 0;

  // TODO : Handle the failure case, which may leave behind a large number of small corrupt files
  public AvroFileDeltaInputWriter(Configuration configuration, String basePath, String schemaStr, Long maxFileSize)
      throws IOException {
    this.schema = new Schema.Parser().parse(schemaStr);
    this.maxFileSize = maxFileSize;
    this.configuration = configuration;
    this.basePath = basePath;
    Path path = new Path(basePath, new Path(UUID.randomUUID().toString() + AVRO_EXTENSION));
    this.file = HoodieWrapperFileSystem.convertToHoodiePath(path, configuration);
    this.fs = (HoodieWrapperFileSystem) this.file
        .getFileSystem(FSUtils.registerFileSystem(path, configuration));
    this.output = this.fs.create(this.file);
    this.writer = new GenericDatumWriter<>(schema);
    this.dataFileWriter = new DataFileWriter<>(writer).create(schema, output);
    this.deltaWriteStats = new DeltaWriteStats();
  }

  @Override
  public void writeData(GenericRecord iData) throws IOException {
    this.dataFileWriter.append(iData);
    recordsWritten++;
  }

  @Override
  public boolean canWrite() {
    return fs.getBytesWritten(file) < maxFileSize;
  }

  @Override
  public void close() throws IOException {
    this.deltaWriteStats.setBytesWritten(this.fs.getBytesWritten(this.file));
    this.deltaWriteStats.setRecordsWritten(this.recordsWritten);
    this.deltaWriteStats.setFilePath(this.file.toUri().getPath());
    this.dataFileWriter.close();
    log.info("New Avro File : {}", getPath());
  }

  @Override
  public DeltaInputWriter getNewWriter() throws IOException {
    AvroFileDeltaInputWriter avroFileDeltaInputWriter = new AvroFileDeltaInputWriter(this.configuration, this.basePath,
        this.schema.toString(), this.maxFileSize);
    return avroFileDeltaInputWriter;
  }

  public FileSystem getFs() {
    return fs;
  }

  public Path getPath() {
    return this.file;
  }

  @Override
  public DeltaWriteStats getDeltaWriteStats() {
    return this.deltaWriteStats;
  }
}
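A hedged sketch of this writer's life cycle, following the pattern used by the tests below; the Hadoop configuration, base path, schema string and record iterator are assumed to exist:

    // Append generated records to a single avro file, then close and inspect the stats.
    DeltaInputWriter<GenericRecord> writer =
        new AvroFileDeltaInputWriter(hadoopConf, basePath + "/input", schemaStr, 1024 * 1024L);
    while (records.hasNext() && writer.canWrite()) {
      writer.writeData(records.next());
    }
    writer.close();
    DeltaWriteStats stats = writer.getDeltaWriteStats();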
@@ -0,0 +1,62 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite.writer;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.avro.generic.GenericRecord;

/**
 * {@link org.apache.hadoop.hdfs.DistributedFileSystem} (or {@link org.apache.hadoop.fs.LocalFileSystem}) based delta
 * generator.
 */
public class DFSDeltaWriterAdapter implements DeltaWriterAdapter<GenericRecord> {

  private DeltaInputWriter deltaInputGenerator;
  private List<DeltaWriteStats> metrics = new ArrayList<>();

  public DFSDeltaWriterAdapter(DeltaInputWriter<GenericRecord> deltaInputGenerator) {
    this.deltaInputGenerator = deltaInputGenerator;
  }

  @Override
  public List<DeltaWriteStats> write(Iterator<GenericRecord> input) throws IOException {
    while (input.hasNext()) {
      if (this.deltaInputGenerator.canWrite()) {
        this.deltaInputGenerator.writeData(input.next());
      } else if (input.hasNext()) {
        rollOver();
      }
    }
    close();
    return this.metrics;
  }

  public void rollOver() throws IOException {
    close();
    this.deltaInputGenerator = this.deltaInputGenerator.getNewWriter();
  }

  private void close() throws IOException {
    this.deltaInputGenerator.close();
    this.metrics.add(this.deltaInputGenerator.getDeltaWriteStats());
  }
}
@@ -0,0 +1,42 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite.writer;

import java.io.IOException;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.integ.testsuite.writer.DeltaWriterAdapter.SparkBasedDeltaWriter;
import org.apache.spark.api.java.JavaRDD;

/**
 * Spark RDD based delta writer for DFS.
 * TODO : Implement a custom Spark partitioner to ensure we write large enough avro files.
 */
public class DFSSparkAvroDeltaWriter implements SparkBasedDeltaWriter<JavaRDD<GenericRecord>> {

  private DeltaInputWriter<JavaRDD<GenericRecord>> deltaInputWriter;

  public DFSSparkAvroDeltaWriter(DeltaInputWriter<JavaRDD<GenericRecord>> deltaInputWriter) {
    this.deltaInputWriter = deltaInputWriter;
  }

  @Override
  public JavaRDD<DeltaWriteStats> write(JavaRDD<GenericRecord> input) throws IOException {
    this.deltaInputWriter.writeData(input);
    return null;
  }
}
@@ -0,0 +1,53 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite.writer;

import java.io.Closeable;
import java.io.IOException;

/**
 * Implementations of {@link DeltaInputWriter} write the generated delta data to an output.
 *
 * @param <I> Type of data to be written.
 */
public interface DeltaInputWriter<I> extends Closeable {

  /**
   * Write the given data.
   */
  void writeData(I iData) throws IOException;

  /**
   * Check whether more data can/should be written.
   */
  boolean canWrite();

  /**
   * Return the statistics of data written.
   */
  DeltaWriteStats getDeltaWriteStats();

  /**
   * Return a new instance of this writer.
   */
  DeltaInputWriter getNewWriter() throws IOException;

}
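To make the roll-over contract concrete, a sketch of how a caller is expected to drive an implementation; it is illustrative only (DFSDeltaWriterAdapter above is the real consumer), and the writer and record iterator are assumed to exist:

    // Write while the current writer has capacity; otherwise close it and continue with a fresh one.
    DeltaInputWriter<GenericRecord> current = writer;
    while (records.hasNext()) {
      if (!current.canWrite()) {
        current.close();
        current = current.getNewWriter();
      }
      current.writeData(records.next());
    }
    current.close();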
@@ -0,0 +1,26 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite.writer;

/**
 * Supported output destination types for the generated delta workload.
 */
public enum DeltaOutputMode {
  KAFKA, DFS
}
@@ -0,0 +1,72 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite.writer;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import org.apache.hudi.common.util.collection.Pair;

/**
 * This class holds the write statistics for {@link DeltaInputWriter}.
 */
public class DeltaWriteStats implements Serializable {

  // The file path (if any) for the data written
  private String filePath;
  // Number of bytes written before being closed
  private long bytesWritten;
  // Number of records written before being closed
  private long recordsWritten;

  private List<Pair<String, String>> partitionPathRecordKey = new ArrayList<>();

  public String getFilePath() {
    return filePath;
  }

  public void setFilePath(String filePath) {
    this.filePath = filePath;
  }

  public long getBytesWritten() {
    return bytesWritten;
  }

  public void setBytesWritten(long bytesWritten) {
    this.bytesWritten = bytesWritten;
  }

  public List<Pair<String, String>> getPartitionPathRecordKey() {
    return partitionPathRecordKey;
  }

  public void setPartitionPathRecordKey(List<Pair<String, String>> partitionPathRecordKey) {
    this.partitionPathRecordKey = partitionPathRecordKey;
  }

  public long getRecordsWritten() {
    return recordsWritten;
  }

  public void setRecordsWritten(long recordsWritten) {
    this.recordsWritten = recordsWritten;
  }

}
@@ -0,0 +1,34 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite.writer;

import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import org.apache.spark.api.java.JavaRDD;

public interface DeltaWriterAdapter<I> {

  List<DeltaWriteStats> write(Iterator<I> input) throws IOException;

  interface SparkBasedDeltaWriter<J> {

    JavaRDD<DeltaWriteStats> write(J input) throws IOException;
  }
}
@@ -0,0 +1,57 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite.writer;

import java.io.IOException;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig;
import org.apache.hudi.integ.testsuite.reader.DeltaInputType;

/**
 * A factory to help instantiate different {@link DeltaWriterAdapter}s depending on the {@link DeltaOutputMode} and
 * {@link DeltaInputType}.
 */
public class DeltaWriterFactory {

  private DeltaWriterFactory() {
  }

  public static DeltaWriterAdapter getDeltaWriterAdapter(DeltaConfig config, Integer batchId) throws IOException {
    switch (config.getDeltaOutputMode()) {
      case DFS:
        switch (config.getDeltaInputType()) {
          case AVRO:
            DFSDeltaConfig dfsDeltaConfig = (DFSDeltaConfig) config;
            dfsDeltaConfig.setBatchId(batchId);
            DeltaInputWriter<GenericRecord> fileDeltaInputGenerator = new AvroFileDeltaInputWriter(
                dfsDeltaConfig.getConfiguration(),
                StringUtils
                    .join(new String[]{dfsDeltaConfig.getDeltaBasePath(), dfsDeltaConfig.getBatchId().toString()},
                        "/"), dfsDeltaConfig.getSchemaStr(), dfsDeltaConfig.getMaxFileSize());
            return new DFSDeltaWriterAdapter(fileDeltaInputGenerator);
          default:
            throw new IllegalArgumentException("Invalid delta input type " + config.getDeltaInputType());
        }
      default:
        throw new IllegalArgumentException("Invalid delta output mode " + config.getDeltaOutputMode());
    }
  }
}
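A hedged sketch of wiring the factory up, mirroring the functional test later in this change; the Hadoop configuration, DFS base path and schema string are assumed to exist:

    // Build a DFS/avro writer adapter for batch 1 and drain a generator of 1000 records through it.
    DeltaConfig config = new DFSDeltaConfig(DeltaOutputMode.DFS, DeltaInputType.AVRO,
        new SerializableConfiguration(hadoopConf), basePath, basePath, schemaStr, 10240L);
    DeltaWriterAdapter<GenericRecord> adapter = DeltaWriterFactory.getDeltaWriterAdapter(config, 1);
    List<DeltaWriteStats> stats = adapter.write(new FlexibleSchemaRecordGenerationIterator(1000, schemaStr));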
@@ -0,0 +1,69 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite.writer;

import java.io.IOException;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.AvroConversionUtils;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.SparkSession;

/**
 * Spark based avro delta input writer. We don't use this yet since we cannot control result file size.
 */
public class SparkAvroDeltaInputWriter implements DeltaInputWriter<JavaRDD<GenericRecord>> {

  private static final String AVRO_FORMAT_PACKAGE = "avro";
  public SparkSession sparkSession;
  private String schemaStr;
  // TODO : the base path has to be a new path every time for spark avro
  private String basePath;

  public SparkAvroDeltaInputWriter(SparkSession sparkSession, String schemaStr, String basePath) {
    this.sparkSession = sparkSession;
    this.schemaStr = schemaStr;
    this.basePath = basePath;
  }

  @Override
  public void writeData(JavaRDD<GenericRecord> iData) throws IOException {
    AvroConversionUtils.createDataFrame(iData.rdd(), schemaStr, sparkSession).write()
        .format(AVRO_FORMAT_PACKAGE).save(basePath);
  }

  @Override
  public boolean canWrite() {
    throw new UnsupportedOperationException("not applicable for spark based writer");
  }

  @Override
  public void close() throws IOException {
  }

  @Override
  public DeltaWriteStats getDeltaWriteStats() {
    throw new UnsupportedOperationException("not applicable for spark based writer");
  }

  @Override
  public DeltaInputWriter getNewWriter() throws IOException {
    throw new UnsupportedOperationException("not applicable for spark based writer");
  }

}
@@ -0,0 +1,138 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite;

import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.when;

import java.io.IOException;
import java.util.Iterator;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig;
import org.apache.hudi.integ.testsuite.generator.FlexibleSchemaRecordGenerationIterator;
import org.apache.hudi.integ.testsuite.reader.DeltaInputType;
import org.apache.hudi.integ.testsuite.utils.TestUtils;
import org.apache.hudi.integ.testsuite.writer.AvroFileDeltaInputWriter;
import org.apache.hudi.integ.testsuite.writer.DFSDeltaWriterAdapter;
import org.apache.hudi.integ.testsuite.writer.DeltaInputWriter;
import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode;
import org.apache.hudi.integ.testsuite.writer.DeltaWriteStats;
import org.apache.hudi.integ.testsuite.writer.DeltaWriterAdapter;
import org.apache.hudi.integ.testsuite.writer.DeltaWriterFactory;
import org.apache.hudi.utilities.schema.FilebasedSchemaProvider;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.apache.spark.api.java.JavaRDD;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;

public class TestDFSHoodieTestSuiteWriterAdapter extends UtilitiesTestBase {

  private FilebasedSchemaProvider schemaProvider;
  private static final String COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/";

  @BeforeAll
  public static void initClass() throws Exception {
    UtilitiesTestBase.initClass();
  }

  @AfterAll
  public static void cleanupClass() {
    UtilitiesTestBase.cleanupClass();
  }

  @BeforeEach
  public void setup() throws Exception {
    super.setup();
    schemaProvider = new FilebasedSchemaProvider(Helpers.setupSchemaOnDFSWithAbsoluteScope(
        System.getProperty("user.dir") + "/.." + COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH,
        "complex-source.avsc"), jsc);
  }

  @AfterEach
  public void teardown() throws Exception {
    super.teardown();
  }

  @Test
  public void testDFSOneFileWrite() throws IOException {

    DeltaInputWriter<GenericRecord> mockFileSinkWriter = Mockito.mock(AvroFileDeltaInputWriter.class);
    DeltaWriteStats mockDeltaWriteStats = Mockito.mock(DeltaWriteStats.class);
    when(mockFileSinkWriter.getNewWriter()).thenReturn(mockFileSinkWriter);
    when(mockFileSinkWriter.canWrite()).thenReturn(true);
    when(mockFileSinkWriter.getDeltaWriteStats()).thenReturn(mockDeltaWriteStats);

    DeltaWriterAdapter<GenericRecord> dfsDeltaWriterAdapter = new DFSDeltaWriterAdapter(mockFileSinkWriter);

    JavaRDD<GenericRecord> records = TestUtils.makeRDD(jsc, 10);

    dfsDeltaWriterAdapter.write(records.collect().iterator());
    Mockito.verify(mockFileSinkWriter, times(10)).canWrite();
    Mockito.verify(mockFileSinkWriter, times(1)).close();
  }

  @Test
  public void testDFSTwoFilesWriteWithRollover() throws IOException {

    DeltaInputWriter<GenericRecord> mockFileSinkWriter = Mockito.mock(AvroFileDeltaInputWriter.class);
    DeltaWriteStats mockDeltaWriteStats = Mockito.mock(DeltaWriteStats.class);
    when(mockFileSinkWriter.getNewWriter()).thenReturn(mockFileSinkWriter);
    when(mockFileSinkWriter.canWrite()).thenReturn(false, true);
    when(mockFileSinkWriter.getDeltaWriteStats()).thenReturn(mockDeltaWriteStats);

    DeltaWriterAdapter<GenericRecord> dfsDeltaWriterAdapter = new DFSDeltaWriterAdapter(mockFileSinkWriter);

    Iterator<GenericRecord> mockIterator = Mockito.mock(Iterator.class);
    when(mockIterator.hasNext()).thenReturn(true, true, true, false);

    dfsDeltaWriterAdapter.write(mockIterator);
    Mockito.verify(mockFileSinkWriter, times(2)).canWrite();
    Mockito.verify(mockFileSinkWriter, times(1)).getNewWriter();
    Mockito.verify(mockFileSinkWriter, times(2)).close();
  }

  @Test
  public void testDFSWorkloadSinkWithMultipleFilesFunctional() throws IOException {
    DeltaConfig dfsSinkConfig = new DFSDeltaConfig(DeltaOutputMode.DFS, DeltaInputType.AVRO,
        new SerializableConfiguration(jsc.hadoopConfiguration()), dfsBasePath, dfsBasePath,
        schemaProvider.getSourceSchema().toString(), 10240L);
    DeltaWriterAdapter<GenericRecord> dfsDeltaWriterAdapter = DeltaWriterFactory
        .getDeltaWriterAdapter(dfsSinkConfig, 1);
    FlexibleSchemaRecordGenerationIterator itr = new FlexibleSchemaRecordGenerationIterator(1000,
        schemaProvider.getSourceSchema().toString());
    dfsDeltaWriterAdapter.write(itr);
    FileSystem fs = FSUtils.getFs(dfsBasePath, jsc.hadoopConfiguration());
    FileStatus[] fileStatuses = fs.listStatus(new Path(dfsBasePath));
    // Since maxFileSize was 10240L and we produced 1K records each close to 1K size, we should produce more than
    // 1 file
    assertTrue(fileStatuses.length > 0);
  }

}
@@ -0,0 +1,147 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.generator.GenericRecordFullPayloadGenerator;
import org.apache.hudi.integ.testsuite.reader.SparkBasedReader;
import org.apache.hudi.integ.testsuite.writer.AvroFileDeltaInputWriter;
import org.apache.hudi.integ.testsuite.writer.DeltaInputWriter;
import org.apache.hudi.integ.testsuite.writer.DeltaWriteStats;
import org.apache.hudi.utilities.schema.FilebasedSchemaProvider;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.apache.spark.api.java.JavaRDD;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

public class TestFileDeltaInputWriter extends UtilitiesTestBase {

  private FilebasedSchemaProvider schemaProvider;
  private static final String COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/";

  @BeforeAll
  public static void initClass() throws Exception {
    UtilitiesTestBase.initClass();
  }

  @AfterAll
  public static void cleanupClass() {
    UtilitiesTestBase.cleanupClass();
  }

  @BeforeEach
  public void setup() throws Exception {
    super.setup();
    schemaProvider = new FilebasedSchemaProvider(Helpers.setupSchemaOnDFSWithAbsoluteScope(System.getProperty("user.dir") + "/.."
        + COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH, "complex-source.avsc"), jsc);
  }

  @AfterEach
  public void teardown() throws Exception {
    super.teardown();
  }

  @Test
  public void testAvroFileSinkWriter() throws IOException {
    // 1. Create an Avro File Sink Writer
    DeltaInputWriter<GenericRecord> fileSinkWriter =
        new AvroFileDeltaInputWriter(jsc.hadoopConfiguration(), dfsBasePath + "/input", schemaProvider.getSourceSchema()
            .toString(), 1024 * 1024L);
    GenericRecordFullPayloadGenerator payloadGenerator =
        new GenericRecordFullPayloadGenerator(schemaProvider.getSourceSchema());
    // 2. Generate 100 avro payloads and write them to an avro file
    IntStream.range(0, 100).forEach(a -> {
      try {
        fileSinkWriter.writeData(payloadGenerator.getNewPayload());
      } catch (IOException io) {
        throw new UncheckedIOException(io);
      }
    });
    fileSinkWriter.close();
    DeltaWriteStats deltaWriteStats = fileSinkWriter.getDeltaWriteStats();
    FileSystem fs = FSUtils.getFs(dfsBasePath, jsc.hadoopConfiguration());
    FileStatus[] fileStatuses = fs.listStatus(new Path(deltaWriteStats.getFilePath()));
    // At least 1 file was written
    assertEquals(1, fileStatuses.length);
    // File length should be greater than 0
    assertTrue(fileStatuses[0].getLen() > 0);
    // Bytes written as reported by the stats should be greater than 0
    assertTrue(deltaWriteStats.getBytesWritten() > 0);
    List<String> paths = Arrays.asList(fs.globStatus(new Path(dfsBasePath + "/*/*.avro")))
        .stream().map(f -> f.getPath().toString()).collect(Collectors.toList());
    JavaRDD<GenericRecord> writtenRecords =
        SparkBasedReader.readAvro(sparkSession, schemaProvider.getSourceSchema().toString(), paths, Option.empty(),
            Option.empty());
    // Number of records written should be 100
    assertEquals(writtenRecords.count(), 100);
    // Number of records in file should match with the stats
    assertEquals(writtenRecords.count(), deltaWriteStats.getRecordsWritten());
  }

  @Test
  public void testAvroFileSinkCreateNewWriter() throws IOException {
    // 1. Create an Avro File Sink Writer
    DeltaInputWriter<GenericRecord> fileSinkWriter =
        new AvroFileDeltaInputWriter(jsc.hadoopConfiguration(), dfsBasePath,
            schemaProvider.getSourceSchema().toString(),
            1024 * 1024L);
    GenericRecordFullPayloadGenerator payloadGenerator =
        new GenericRecordFullPayloadGenerator(schemaProvider.getSourceSchema());
    // 2. Generate 100 avro payloads and write them to an avro file
    IntStream.range(0, 100).forEach(a -> {
      try {
        fileSinkWriter.writeData(payloadGenerator.getNewPayload());
      } catch (IOException io) {
        throw new UncheckedIOException(io);
      }
    });
    fileSinkWriter.close();
    String oldFilePath = fileSinkWriter.getDeltaWriteStats().getFilePath();
    assertFalse(oldFilePath == null);
    DeltaInputWriter<GenericRecord> newFileSinkWriter = fileSinkWriter.getNewWriter();
    newFileSinkWriter.close();
    DeltaWriteStats newStats = newFileSinkWriter.getDeltaWriteStats();
    assertEquals(newStats.getBytesWritten(), 3674);
    assertEquals(newStats.getRecordsWritten(), 0);
    assertTrue(newStats.getFilePath() != null);
  }

}
@@ -0,0 +1,80 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite.configuration;

import static junit.framework.Assert.assertTrue;
import static junit.framework.TestCase.assertEquals;

import java.util.ArrayList;
import java.util.List;

import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.WorkflowDag;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode;
import org.apache.hudi.integ.testsuite.dag.nodes.UpsertNode;
import org.junit.jupiter.api.Test;

public class TestWorkflowBuilder {

  @Test
  public void testWorkloadOperationSequenceBuilder() {

    DagNode root = new InsertNode(DeltaConfig.Config.newBuilder()
        .withNumRecordsToInsert(10000)
        .withNumInsertPartitions(1)
        .withNumTimesToRepeat(2)
        .withRecordSize(1000).build());

    DagNode child1 = new UpsertNode(DeltaConfig.Config.newBuilder()
        .withNumRecordsToUpdate(10000)
        .withNumInsertPartitions(1)
        .withNumTimesToRepeat(2)
        .withRecordSize(1000).build());

    root.addChildNode(child1);
    child1.addParentNode(root);
    List<DagNode> rootNodes = new ArrayList<>();
    rootNodes.add(root);
    WorkflowDag workflowDag = new WorkflowDag(rootNodes);

    assertEquals(workflowDag.getNodeList().size(), 1);
    assertEquals(((DagNode) workflowDag.getNodeList().get(0)).getChildNodes().size(), 1);
    DagNode dagNode = (DagNode) workflowDag.getNodeList().get(0);
    assertTrue(dagNode instanceof InsertNode);
    DeltaConfig.Config config = dagNode.getConfig();
    assertEquals(config.getNumInsertPartitions(), 1);
    assertEquals(config.getRecordSize(), 1000);
    assertEquals(config.getRepeatCount(), 2);
    assertEquals(config.getNumRecordsInsert(), 10000);
    assertEquals(config.getNumRecordsUpsert(), 0);
    dagNode = (DagNode) ((DagNode) workflowDag.getNodeList().get(0)).getChildNodes().get(0);
    assertTrue(dagNode instanceof UpsertNode);
    config = dagNode.getConfig();
    assertEquals(config.getNumInsertPartitions(), 1);
    assertEquals(config.getRecordSize(), 1000);
    assertEquals(config.getRepeatCount(), 2);
    assertEquals(config.getNumRecordsInsert(), 0);
    assertEquals(config.getNumRecordsUpsert(), 10000);
  }

}
@@ -0,0 +1,93 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite.converter;

import static junit.framework.TestCase.assertTrue;

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import org.apache.avro.Schema.Field;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.integ.testsuite.utils.TestUtils;
import org.apache.hudi.utilities.UtilHelpers;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import scala.Tuple2;

public class TestUpdateConverter {

  private JavaSparkContext jsc;

  @BeforeEach
  public void setup() throws Exception {
    jsc = UtilHelpers.buildSparkContext(this.getClass().getName() + "-hoodie", "local[1]");
  }

  @AfterEach
  public void teardown() {
    jsc.stop();
  }

  @Test
  public void testGenerateUpdateRecordsFromInputRecords() throws Exception {
    JavaRDD<GenericRecord> inputRDD = TestUtils.makeRDD(jsc, 10);
    String schemaStr = inputRDD.take(1).get(0).getSchema().toString();
    int minPayloadSize = 1000;
    // The DFS converter reads existing records and generates random updates for the same row keys
    UpdateConverter updateConverter = new UpdateConverter(schemaStr, minPayloadSize,
        Arrays.asList("timestamp"), Arrays.asList("_row_key"));
    List<String> insertRowKeys = inputRDD.map(r -> r.get("_row_key").toString()).collect();
    assertTrue(inputRDD.count() == 10);
    JavaRDD<GenericRecord> outputRDD = updateConverter.convert(inputRDD);
    List<String> updateRowKeys = outputRDD.map(row -> row.get("_row_key").toString()).collect();
    // The insert row keys should be the same as update row keys
    assertTrue(insertRowKeys.containsAll(updateRowKeys));
    Map<String, GenericRecord> inputRecords = inputRDD.mapToPair(r -> new Tuple2<>(r.get("_row_key").toString(), r))
        .collectAsMap();
    List<GenericRecord> updateRecords = outputRDD.collect();
    updateRecords.stream().forEach(updateRecord -> {
      GenericRecord inputRecord = inputRecords.get(updateRecord.get("_row_key").toString());
      assertTrue(areRecordsDifferent(inputRecord, updateRecord));
    });
  }

  /**
   * Checks if even a single field in the 2 records is different (except the row key which is the same for an update).
   */
  private boolean areRecordsDifferent(GenericRecord in, GenericRecord up) {
    for (Field field : in.getSchema().getFields()) {
      if (field.name().equals("_row_key")) {
        continue;
      } else {
        // Just convert all types to string for now since all are primitive
        if (!in.get(field.name()).toString().equals(up.get(field.name()).toString())) {
          return true;
        }
      }
    }
    return false;
  }
}
@@ -0,0 +1,81 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite.dag;

import java.util.ArrayList;
import java.util.List;
import java.util.function.Function;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode;
import org.apache.hudi.integ.testsuite.dag.nodes.UpsertNode;
import org.apache.hudi.integ.testsuite.dag.nodes.ValidateNode;
import org.apache.spark.api.java.JavaRDD;

public class ComplexDagGenerator implements WorkflowDagGenerator {

  @Override
  public WorkflowDag build() {
    DagNode root = new InsertNode(Config.newBuilder()
        .withNumRecordsToInsert(1000)
        .withNumInsertPartitions(3)
        .withRecordSize(1000).build());

    DagNode child1 = new UpsertNode(Config.newBuilder()
        .withNumRecordsToUpdate(999)
        .withNumRecordsToInsert(1000)
        .withNumUpsertFiles(1)
        .withNumUpsertPartitions(1)
        .withNumInsertPartitions(1)
        .withRecordSize(10000).build());

    Function<List<DagNode<JavaRDD<WriteStatus>>>, Boolean> function = (dagNodes) -> {
      DagNode<JavaRDD<WriteStatus>> parent1 = dagNodes.get(0);
      List<WriteStatus> statuses = parent1.getResult().collect();
      long totalRecordsTouched = statuses.stream().map(st -> st.getStat().getNumUpdateWrites() + st.getStat()
          .getNumInserts()).reduce((a, b) -> a + b).get();
      boolean b1 = totalRecordsTouched == parent1.getConfig().getNumRecordsInsert()
          + parent1.getConfig().getNumRecordsUpsert();
      boolean b2 = statuses.size() > parent1.getConfig().getNumUpsertFiles();

      DagNode<JavaRDD<WriteStatus>> parent2 = parent1.getParentNodes().get(0);
      statuses = parent2.getResult().collect();
      totalRecordsTouched = statuses.stream().map(st -> st.getStat().getNumUpdateWrites() + st.getStat()
          .getNumInserts()).reduce((a, b) -> a + b).get();
      boolean b3 = totalRecordsTouched == parent2.getConfig().getNumRecordsInsert()
          * parent2.getConfig().getNumInsertPartitions() + parent2.getConfig().getNumRecordsUpsert();
      return b1 & b2 & b3;
    };
    DagNode child2 = new ValidateNode(Config.newBuilder().build(), function);

    root.addChildNode(child1);
    // child1.addParentNode(root);
    child1.addChildNode(child2);
    // child2.addParentNode(child1);
    List<DagNode> rootNodes = new ArrayList<>();
    rootNodes.add(root);
    return new WorkflowDag(rootNodes);
  }

}
@@ -0,0 +1,58 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite.dag;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.HiveQueryNode;
import org.apache.hudi.integ.testsuite.dag.nodes.HiveSyncNode;
import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode;

public class HiveSyncDagGenerator implements WorkflowDagGenerator {

  @Override
  public WorkflowDag build() {
    DagNode root = new InsertNode(Config.newBuilder()
        .withNumRecordsToInsert(100)
        .withNumInsertPartitions(1)
        .withNumTimesToRepeat(1)
        .withRecordSize(1000).build());

    DagNode child1 = new HiveSyncNode(Config.newBuilder().withHiveLocal(true).build());

    root.addChildNode(child1);

    DagNode child2 = new HiveQueryNode(Config.newBuilder().withHiveLocal(true).withHiveQueryAndResults(Arrays
        .asList(Pair.of("select " + "count(*) from testdb1.table1 group " + "by rider having count(*) < 1", 0)))
        .build());
    child1.addChildNode(child2);

    List<DagNode> rootNodes = new ArrayList<>();
    rootNodes.add(root);
    return new WorkflowDag(rootNodes);
  }

}
@@ -0,0 +1,58 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite.dag;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.HiveQueryNode;
import org.apache.hudi.integ.testsuite.dag.nodes.HiveSyncNode;
import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode;

public class HiveSyncDagGeneratorMOR implements WorkflowDagGenerator {

  @Override
  public WorkflowDag build() {
    DagNode root = new InsertNode(Config.newBuilder()
        .withNumRecordsToInsert(100)
        .withNumInsertPartitions(1)
        .withNumTimesToRepeat(1)
        .withRecordSize(1000).build());

    DagNode child1 = new HiveSyncNode(Config.newBuilder().withHiveLocal(true).build());

    root.addChildNode(child1);

    DagNode child2 = new HiveQueryNode(Config.newBuilder().withHiveLocal(true).withHiveQueryAndResults(Arrays
        .asList(Pair.of("select " + "count(*) from testdb1.table1_rt group " + "by rider having count(*) < 1", 0)))
        .build());
    child1.addChildNode(child2);

    List<DagNode> rootNodes = new ArrayList<>();
    rootNodes.add(root);
    return new WorkflowDag(rootNodes);
  }

}
@@ -0,0 +1,95 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite.dag;

import static org.junit.jupiter.api.Assertions.assertEquals;

import java.util.ArrayList;
import java.util.List;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;

public class TestDagUtils {

  private static final String COW_DAG_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/complex-dag-cow.yaml";

  @Test
  public void testConvertDagToYaml() throws Exception {
    ComplexDagGenerator dag = new ComplexDagGenerator();
    String yaml = DagUtils.convertDagToYaml(dag.build());
    System.out.println(yaml);
  }

  @Test
  public void testConvertYamlToDag() throws Exception {
    WorkflowDag dag = DagUtils.convertYamlToDag(UtilitiesTestBase.Helpers
        .readFileFromAbsolutePath((System.getProperty("user.dir") + "/.." + COW_DAG_DOCKER_DEMO_RELATIVE_PATH)));
    assertEquals(dag.getNodeList().size(), 1);
    Assertions.assertEquals(((DagNode) dag.getNodeList().get(0)).getParentNodes().size(), 0);
    assertEquals(((DagNode) dag.getNodeList().get(0)).getChildNodes().size(), 1);
    DagNode firstChild = (DagNode) ((DagNode) dag.getNodeList().get(0)).getChildNodes().get(0);
    assertEquals(firstChild.getParentNodes().size(), 1);
    assertEquals(firstChild.getChildNodes().size(), 1);
    assertEquals(((DagNode) firstChild.getChildNodes().get(0)).getChildNodes().size(), 1);
  }

  public static class ComplexDagGenerator implements WorkflowDagGenerator {

    @Override
    public WorkflowDag build() {
      DagNode root = new InsertNode(Config.newBuilder()
          .withNumRecordsToInsert(1000000)
          .withNumInsertPartitions(1)
          .withNumTimesToRepeat(2)
          .withRecordSize(1000).build());

      DagNode child1 = new InsertNode(Config.newBuilder()
          .withNumRecordsToInsert(1000000)
          .withNumInsertPartitions(1)
          .withNumTimesToRepeat(2)
          .withRecordSize(1000).build());

      DagNode child2 = new InsertNode(Config.newBuilder()
          .withNumRecordsToInsert(1000000)
          .withNumInsertPartitions(1)
          .withNumTimesToRepeat(2)
          .withRecordSize(1000).build());

      root.addChildNode(child1);
      root.addChildNode(child2);

      DagNode child3 = new InsertNode(Config.newBuilder()
          .withNumRecordsToInsert(1000000)
          .withNumInsertPartitions(1)
          .withNumTimesToRepeat(2)
          .withRecordSize(1000).build());

      child2.addChildNode(child3);
      List<DagNode> rootNodes = new ArrayList<>();
      rootNodes.add(root);

      return new WorkflowDag(rootNodes);
    }
  }

}
@@ -0,0 +1,56 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite.generator;

import static junit.framework.TestCase.assertEquals;

import org.apache.avro.Schema;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.junit.jupiter.api.Test;

public class TestGenericRecordPayloadEstimator {

  private static final String SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/source.avsc";
  private static final String COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH =
      "/docker/demo/config/test-suite/complex-source.avsc";

  @Test
  public void testSimpleSchemaSize() throws Exception {
    Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers
        .readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." + SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH));
    GenericRecordFullPayloadSizeEstimator estimator =
        new GenericRecordFullPayloadSizeEstimator(schema);
    Pair<Integer, Integer> estimateAndNumComplexFields = estimator.typeEstimateAndNumComplexFields();
    assertEquals(estimateAndNumComplexFields.getRight().intValue(), 0);
    assertEquals(estimateAndNumComplexFields.getLeft().intValue(), 156);
  }

  @Test
  public void testComplexSchemaSize() throws Exception {
    Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers.readFileFromAbsolutePath(
        System.getProperty("user.dir") + "/.." + COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH));
    GenericRecordFullPayloadSizeEstimator estimator =
        new GenericRecordFullPayloadSizeEstimator(schema);
    Pair<Integer, Integer> estimateAndNumComplexFields = estimator.typeEstimateAndNumComplexFields();
    assertEquals(estimateAndNumComplexFields.getRight().intValue(), 1);
    assertEquals(estimateAndNumComplexFields.getLeft().intValue(), 1278);
  }

}
@@ -0,0 +1,130 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite.generator;

import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.IntStream;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.junit.jupiter.api.Test;

public class TestGenericRecordPayloadGenerator {

  private static final String SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/source.avsc";
  private static final String COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH =
      "/docker/demo/config/test-suite/complex-source.avsc";

  @Test
  public void testSimplePayload() throws Exception {
    Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers
        .readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." + SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH));
    GenericRecordFullPayloadGenerator payloadGenerator = new GenericRecordFullPayloadGenerator(schema);
    GenericRecord record = payloadGenerator.getNewPayload();
    // The generated payload should validate with the provided schema
    assertTrue(payloadGenerator.validate(record));
  }

  @Test
  public void testComplexPayload() throws IOException {
    Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers
        .readFileFromAbsolutePath(System.getProperty("user.dir") + "/.."
            + COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH));
    GenericRecordFullPayloadGenerator payloadGenerator = new GenericRecordFullPayloadGenerator(schema);
    GenericRecord record = payloadGenerator.getNewPayload();
    // The generated payload should validate with the provided schema
    assertTrue(payloadGenerator.validate(record));
  }

  @Test
  public void testComplexPartialPayload() throws IOException {
    Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers
        .readFileFromAbsolutePath(System.getProperty("user.dir") + "/.."
            + COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH));
    GenericRecordPartialPayloadGenerator payloadGenerator = new GenericRecordPartialPayloadGenerator(schema);
    IntStream.range(0, 10).forEach(a -> {
      GenericRecord record = payloadGenerator.getNewPayload();
      // The generated payload should validate with the provided schema
      assertTrue(payloadGenerator.validate(record));
    });
  }

  @Test
  public void testUpdatePayloadGenerator() throws IOException {
    Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers
        .readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." + SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH));
    GenericRecordFullPayloadGenerator payloadGenerator = new GenericRecordFullPayloadGenerator(schema);
    List<String> insertRowKeys = new ArrayList<>();
    List<String> updateRowKeys = new ArrayList<>();
    List<Long> insertTimeStamps = new ArrayList<>();
    List<Long> updateTimeStamps = new ArrayList<>();
    List<GenericRecord> records = new ArrayList<>();
    // Generate 10 new records
    IntStream.range(0, 10).forEach(a -> {
      GenericRecord record = payloadGenerator.getNewPayload();
      records.add(record);
      insertRowKeys.add(record.get("_row_key").toString());
      insertTimeStamps.add((Long) record.get("timestamp"));
    });
    List<String> blacklistFields = Arrays.asList("_row_key");
    records.stream().forEach(a -> {
      // Generate 10 updated records
      GenericRecord record = payloadGenerator.getUpdatePayload(a, blacklistFields);
      updateRowKeys.add(record.get("_row_key").toString());
      updateTimeStamps.add((Long) record.get("timestamp"));
    });
    // The row keys from insert payloads should match all the row keys from the update payloads
    assertTrue(insertRowKeys.containsAll(updateRowKeys));
    // The timestamp field for the insert payloads should not all match with the update payloads
    assertFalse(insertTimeStamps.containsAll(updateTimeStamps));
  }

  @Test
  public void testSimplePayloadWithLargeMinSize() throws Exception {
    Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers
        .readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." + SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH));
    int minPayloadSize = 1000;
    GenericRecordFullPayloadGenerator payloadGenerator = new GenericRecordFullPayloadGenerator(schema,
        minPayloadSize);
    GenericRecord record = payloadGenerator.getNewPayload();
    // The payload generated is less than minPayloadSize due to no collections present
    assertTrue(HoodieAvroUtils.avroToBytes(record).length < minPayloadSize);
  }

  @Test
  public void testComplexPayloadWithLargeMinSize() throws Exception {
    Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers
        .readFileFromAbsolutePath(System.getProperty("user.dir") + "/.."
            + COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH));
    int minPayloadSize = 10000;
    GenericRecordFullPayloadGenerator payloadGenerator = new GenericRecordFullPayloadGenerator(
        schema, minPayloadSize);
    GenericRecord record = payloadGenerator.getNewPayload();
    // The payload generated should be within 10% extra of the minPayloadSize
    assertTrue(HoodieAvroUtils.avroToBytes(record).length < minPayloadSize + 0.1 * minPayloadSize);
  }

}
@@ -0,0 +1,224 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite.job;

import static org.junit.jupiter.api.Assertions.assertEquals;

import java.util.UUID;
import java.util.stream.Stream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.DataSourceWriteOptions;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteJob;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteJob.HoodieTestSuiteConfig;
import org.apache.hudi.integ.testsuite.dag.ComplexDagGenerator;
import org.apache.hudi.integ.testsuite.dag.HiveSyncDagGenerator;
import org.apache.hudi.integ.testsuite.dag.HiveSyncDagGeneratorMOR;
import org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator;
import org.apache.hudi.integ.testsuite.reader.DeltaInputType;
import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode;
import org.apache.hudi.keygen.TimestampBasedKeyGenerator;
import org.apache.hudi.utilities.schema.FilebasedSchemaProvider;
import org.apache.hudi.utilities.sources.AvroDFSSource;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;

public class TestHoodieTestSuiteJob extends UtilitiesTestBase {

  private static final String TEST_NAME_WITH_PARAMS = "[{index}] Test with useDeltaStreamer={0}, tableType={1}";
  private static final String BASE_PROPERTIES_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/base"
      + ".properties";
  private static final String SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/source.avsc";
  private static final String TARGET_SCHEMA_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/target.avsc";
  private static final String COW_DAG_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/complex-dag-cow.yaml";
  private static final String MOR_DAG_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/complex-dag-mor.yaml";

  // Parameter sets fed to the @MethodSource-driven tests below; currently only the
  // COPY_ON_WRITE table type without the delta-streamer path is exercised.
  public static Stream<Arguments> configParams() {
    Object[][] data =
        new Object[][] {{false, "COPY_ON_WRITE"}};
    return Stream.of(data).map(Arguments::of);
  }

  @BeforeAll
  public static void initClass() throws Exception {
    UtilitiesTestBase.initClass();
    // prepare the configs.
    UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(System.getProperty("user.dir") + "/.."
        + BASE_PROPERTIES_DOCKER_DEMO_RELATIVE_PATH, dfs, dfsBasePath + "/base.properties");
    UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(System.getProperty("user.dir") + "/.."
        + SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH, dfs, dfsBasePath + "/source.avsc");
    UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(System.getProperty("user.dir") + "/.."
        + TARGET_SCHEMA_DOCKER_DEMO_RELATIVE_PATH, dfs, dfsBasePath + "/target.avsc");

    UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(System.getProperty("user.dir") + "/.."
        + COW_DAG_DOCKER_DEMO_RELATIVE_PATH, dfs, dfsBasePath + "/complex-dag-cow.yaml");
    UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(System.getProperty("user.dir") + "/.."
        + MOR_DAG_DOCKER_DEMO_RELATIVE_PATH, dfs, dfsBasePath + "/complex-dag-mor.yaml");

    TypedProperties props = new TypedProperties();
    props.setProperty("hoodie.datasource.write.recordkey.field", "_row_key");
    props.setProperty("hoodie.datasource.write.partitionpath.field", "timestamp");
    props.setProperty("hoodie.deltastreamer.keygen.timebased.timestamp.type", "UNIX_TIMESTAMP");
    props.setProperty("hoodie.deltastreamer.keygen.timebased.output.dateformat", "yyyy/MM/dd");
    props.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/source.avsc");
    props.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/source.avsc");
    props.setProperty("hoodie.deltastreamer.source.dfs.root", dfsBasePath + "/input");
    props.setProperty("hoodie.datasource.hive_sync.assume_date_partitioning", "true");
    props.setProperty("hoodie.datasource.write.keytranslator.class", "org.apache.hudi"
        + ".DayBasedPartitionPathKeyTranslator");
    props.setProperty("hoodie.compact.inline.max.delta.commits", "3");
    props.setProperty("hoodie.parquet.max.file.size", "1024000");
    // overrides the value of 3 set above
    props.setProperty("hoodie.compact.inline.max.delta.commits", "0");
    // Hive Configs
    props.setProperty(DataSourceWriteOptions.HIVE_URL_OPT_KEY(), "jdbc:hive2://127.0.0.1:9999/");
    props.setProperty(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY(), "testdb1");
    props.setProperty(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY(), "table1");
    props.setProperty(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "datestr");
    props.setProperty(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(), TimestampBasedKeyGenerator.class.getName());
    UtilitiesTestBase.Helpers.savePropsToDFS(props, dfs, dfsBasePath + "/test-source"
        + ".properties");

    // Properties used for the delta-streamer which incrementally pulls from upstream DFS Avro source and
    // writes to downstream hudi table
    TypedProperties downstreamProps = new TypedProperties();
    downstreamProps.setProperty("include", "base.properties");
    downstreamProps.setProperty("hoodie.datasource.write.recordkey.field", "_row_key");
    downstreamProps.setProperty("hoodie.datasource.write.partitionpath.field", "timestamp");

    // Source schema is the target schema of upstream table
    downstreamProps.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/source.avsc");
    downstreamProps.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/source.avsc");
    UtilitiesTestBase.Helpers.savePropsToDFS(downstreamProps, dfs,
        dfsBasePath + "/test-downstream-source.properties");
    // these tests cause a lot of log verbosity from spark, turning it down
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN);
  }

  @AfterAll
  public static void cleanupClass() {
    UtilitiesTestBase.cleanupClass();
  }

  @BeforeEach
  public void setup() throws Exception {
    super.setup();
  }

  @AfterEach
  public void teardown() throws Exception {
    super.teardown();
  }

  // Tests in this class add to the test build time significantly. Since this is an integration test (end to end), we
  // would like to run it as a nightly build, which is a TODO.
  // TODO : Clean up input / result paths after each test
  @ParameterizedTest(name = TEST_NAME_WITH_PARAMS)
  @MethodSource("configParams")
  public void testDagWithInsertUpsertAndValidate(boolean useDeltaStreamer, String tableType) throws Exception {
    dfs.delete(new Path(dfsBasePath + "/input"), true);
    dfs.delete(new Path(dfsBasePath + "/result"), true);
    String inputBasePath = dfsBasePath + "/input/" + UUID.randomUUID().toString();
    String outputBasePath = dfsBasePath + "/result/" + UUID.randomUUID().toString();
    HoodieTestSuiteConfig cfg = makeConfig(inputBasePath, outputBasePath, useDeltaStreamer, tableType);
    cfg.workloadDagGenerator = ComplexDagGenerator.class.getName();
    HoodieTestSuiteJob hoodieTestSuiteJob = new HoodieTestSuiteJob(cfg, jsc);
    hoodieTestSuiteJob.runTestSuite();
    HoodieTableMetaClient metaClient = new HoodieTableMetaClient(new Configuration(), cfg.targetBasePath);
    assertEquals(metaClient.getActiveTimeline().getCommitsTimeline().getInstants().count(), 2);
  }

  @ParameterizedTest(name = TEST_NAME_WITH_PARAMS)
  @MethodSource("configParams")
  public void testHiveSync(boolean useDeltaStreamer, String tableType) throws Exception {
    dfs.delete(new Path(dfsBasePath + "/input"), true);
    dfs.delete(new Path(dfsBasePath + "/result"), true);
    String inputBasePath = dfsBasePath + "/input";
    String outputBasePath = dfsBasePath + "/result";
    HoodieTestSuiteConfig cfg = makeConfig(inputBasePath, outputBasePath, useDeltaStreamer, tableType);
    if (tableType.equals(HoodieTableType.COPY_ON_WRITE.name())) {
      cfg.workloadDagGenerator = HiveSyncDagGenerator.class.getName();
    } else {
      cfg.workloadDagGenerator = HiveSyncDagGeneratorMOR.class.getName();
    }
    HoodieTestSuiteJob hoodieTestSuiteJob = new HoodieTestSuiteJob(cfg, jsc);
    hoodieTestSuiteJob.runTestSuite();
    HoodieTableMetaClient metaClient = new HoodieTableMetaClient(new Configuration(), cfg.targetBasePath);
    assertEquals(metaClient.getActiveTimeline().getCommitsTimeline().getInstants().count(), 1);
  }

  @ParameterizedTest(name = TEST_NAME_WITH_PARAMS)
  @MethodSource("configParams")
  public void testCOWFullDagFromYaml(boolean useDeltaStreamer, String tableType) throws Exception {
    dfs.delete(new Path(dfsBasePath + "/input"), true);
    dfs.delete(new Path(dfsBasePath + "/result"), true);
    String inputBasePath = dfsBasePath + "/input";
    String outputBasePath = dfsBasePath + "/result";
    HoodieTestSuiteConfig cfg = makeConfig(inputBasePath, outputBasePath, useDeltaStreamer, HoodieTableType
        .COPY_ON_WRITE.name());
    cfg.workloadYamlPath = dfsBasePath + "/complex-dag-cow.yaml";
    HoodieTestSuiteJob hoodieTestSuiteJob = new HoodieTestSuiteJob(cfg, jsc);
    hoodieTestSuiteJob.runTestSuite();
    HoodieTableMetaClient metaClient = new HoodieTableMetaClient(new Configuration(), cfg.targetBasePath);
    assertEquals(metaClient.getActiveTimeline().getCommitsTimeline().getInstants().count(), 5);
  }

  @ParameterizedTest(name = TEST_NAME_WITH_PARAMS)
  @MethodSource("configParams")
  public void testMORFullDagFromYaml(boolean useDeltaStreamer, String tableType) throws Exception {
    dfs.delete(new Path(dfsBasePath + "/input"), true);
    dfs.delete(new Path(dfsBasePath + "/result"), true);
    String inputBasePath = dfsBasePath + "/input";
    String outputBasePath = dfsBasePath + "/result";
    HoodieTestSuiteConfig cfg = makeConfig(inputBasePath, outputBasePath, useDeltaStreamer, HoodieTableType
        .MERGE_ON_READ.name());
    cfg.workloadYamlPath = dfsBasePath + "/complex-dag-mor.yaml";
    HoodieTestSuiteJob hoodieTestSuiteJob = new HoodieTestSuiteJob(cfg, jsc);
    hoodieTestSuiteJob.runTestSuite();
    HoodieTableMetaClient metaClient = new HoodieTableMetaClient(new Configuration(), cfg.targetBasePath);
    assertEquals(metaClient.getActiveTimeline().getCommitsTimeline().getInstants().count(), 7);
  }

  protected HoodieTestSuiteConfig makeConfig(String inputBasePath, String outputBasePath, boolean useDeltaStream,
      String tableType) {
    HoodieTestSuiteConfig cfg = new HoodieTestSuiteConfig();
    cfg.targetBasePath = outputBasePath;
    cfg.inputBasePath = inputBasePath;
    cfg.targetTableName = "table1";
    cfg.tableType = tableType;
    cfg.sourceClassName = AvroDFSSource.class.getName();
    cfg.sourceOrderingField = "timestamp";
    cfg.propsFilePath = dfsBasePath + "/test-source.properties";
    cfg.outputTypeName = DeltaOutputMode.DFS.name();
    cfg.inputFormatName = DeltaInputType.AVRO.name();
    cfg.limitFileSize = 1024 * 1024L;
    cfg.sourceLimit = 20000000;
    cfg.workloadDagGenerator = WorkflowDagGenerator.class.getName();
    cfg.schemaProviderClassName = FilebasedSchemaProvider.class.getName();
    cfg.useDeltaStreamer = useDeltaStream;
    return cfg;
  }

}
@@ -0,0 +1,74 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite.reader;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.utils.TestUtils;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;

public class TestDFSAvroDeltaInputReader extends UtilitiesTestBase {

  @BeforeAll
  public static void initClass() throws Exception {
    UtilitiesTestBase.initClass();
  }

  @AfterAll
  public static void cleanupClass() {
    UtilitiesTestBase.cleanupClass();
  }

  @BeforeEach
  public void setup() throws Exception {
    super.setup();
  }

  @Test
  @Disabled
  public void testDFSSinkReader() throws IOException {
    FileSystem fs = FSUtils.getFs(dfsBasePath, new Configuration());
    // Create 10 avro files with 10 records each
    TestUtils.createAvroFiles(jsc, sparkSession, dfsBasePath, 10, 10);
    FileStatus[] statuses = fs.globStatus(new Path(dfsBasePath + "/*/*.avro"));
    DFSAvroDeltaInputReader reader =
        new DFSAvroDeltaInputReader(sparkSession, TestUtils.getSchema().toString(), dfsBasePath, Option.empty(),
            Option.empty());
    assertEquals(reader.analyzeSingleFile(statuses[0].getPath().toString()), 5);
    assertEquals(reader.read(100).count(), 100);
    assertEquals(reader.read(1000).count(), 100);
    assertEquals(reader.read(10).count(), 10);
    assertTrue(reader.read(11).count() > 11);
  }

}
@@ -0,0 +1,119 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite.reader;

import static junit.framework.TestCase.assertEquals;
import static junit.framework.TestCase.assertTrue;

import java.util.HashSet;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.client.HoodieWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
import org.apache.hudi.common.testutils.HoodieTestUtils;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.utilities.schema.FilebasedSchemaProvider;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.apache.spark.api.java.JavaRDD;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

public class TestDFSHoodieDatasetInputReader extends UtilitiesTestBase {

  @BeforeAll
  public static void initClass() throws Exception {
    UtilitiesTestBase.initClass();
  }

  @AfterAll
  public static void cleanupClass() {
    UtilitiesTestBase.cleanupClass();
  }

  @BeforeEach
  public void setup() throws Exception {
    super.setup();
    HoodieTestUtils.init(jsc.hadoopConfiguration(), dfsBasePath);
  }

  @AfterEach
  public void teardown() throws Exception {
    super.teardown();
  }

  @Test
  public void testSimpleHoodieDatasetReader() throws Exception {

    HoodieWriteConfig config = makeHoodieClientConfig();
    HoodieWriteClient client = new HoodieWriteClient(jsc, config);
    String commitTime = client.startCommit();
    HoodieTestDataGenerator generator = new HoodieTestDataGenerator();
    // Insert 100 records across 3 partitions
    List<HoodieRecord> inserts = generator.generateInserts(commitTime, 100);
    JavaRDD<WriteStatus> writeStatuses = client.upsert(jsc.parallelize(inserts), commitTime);
    writeStatuses.count();

    DFSHoodieDatasetInputReader reader = new DFSHoodieDatasetInputReader(jsc, config.getBasePath(),
        HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema())).toString());
    // Try to read 100 records for the same partition path and same file ID
    JavaRDD<GenericRecord> records = reader.read(1, 1, 100L);
    assertTrue(records.count() <= 100);
    assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).collect()).size(),
        1);
    assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.FILENAME_METADATA_FIELD)).collect()).size(),
        1);

    // Try to read 100 records for 3 partition paths and 3 different file ids
    records = reader.read(3, 3, 100L);
    assertTrue(records.count() <= 100);
    assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).collect()).size(),
        3);
    assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.FILENAME_METADATA_FIELD)).collect()).size(),
        3);

    // Try to read 100 records for 3 partition paths and 50% records from each file
    records = reader.read(3, 3, 0.5);
    assertTrue(records.count() <= 100);
    assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).collect()).size(),
        3);
    assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.FILENAME_METADATA_FIELD)).collect()).size(),
        3);
  }

  private HoodieWriteConfig makeHoodieClientConfig() throws Exception {
    return makeHoodieClientConfigBuilder().build();
  }

  private HoodieWriteConfig.Builder makeHoodieClientConfigBuilder() throws Exception {
    // Prepare the AvroParquetIO
    return HoodieWriteConfig.newBuilder().withPath(dfsBasePath)
        .withParallelism(2, 2)
        .withSchema(HoodieTestDataGenerator
            .TRIP_EXAMPLE_SCHEMA);
  }

}
@@ -0,0 +1,63 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hudi.integ.testsuite.utils;

import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.AvroConversionUtils;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
import org.apache.hudi.utilities.schema.RowBasedSchemaProvider;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;

public class TestUtils {

  /**
   * Create an RDD of generic records for testing purposes.
   */
  public static JavaRDD<GenericRecord> makeRDD(JavaSparkContext jsc, int numRecords) {
    return jsc.parallelize(generateGenericRecords(numRecords));
  }

  /**
   * Generate generic records.
   */
  public static List<GenericRecord> generateGenericRecords(int numRecords) {
    HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
    return dataGenerator.generateGenericRecords(numRecords);
  }

  public static void createAvroFiles(JavaSparkContext jsc, SparkSession sparkSession, String basePath, int numFiles,
      int numRecordsPerFile) {
    Schema schema = HoodieTestDataGenerator.AVRO_SCHEMA;
    for (int i = 0; i < numFiles; i++) {
      JavaRDD<GenericRecord> rdd = makeRDD(jsc, numRecordsPerFile);
      AvroConversionUtils.createDataFrame(rdd.rdd(), schema.toString(), sparkSession).write()
          .format("avro").option("recordName", RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME)
          .option("recordNamespace", RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE).save(basePath + "/" + i);
    }
  }

  public static Schema getSchema() {
    return HoodieTestDataGenerator.AVRO_SCHEMA;
  }

}
@@ -15,8 +15,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
###
log4j.rootLogger=WARN, CONSOLE
log4j.logger.org.apache.hudi=DEBUG
log4j.rootLogger=ERROR, CONSOLE
log4j.logger.org.apache.hudi=ERROR
log4j.category.org.apache.spark=ERROR

# CONSOLE is set to be a ConsoleAppender.
log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
@@ -25,5 +26,5 @@ log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout
log4j.appender.CONSOLE.layout.ConversionPattern=[%-5p] %d %c %x - %m%n
log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter
log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true
log4j.appender.CONSOLE.filter.a.LevelMin=WARN
log4j.appender.CONSOLE.filter.a.LevelMin=ERROR
log4j.appender.CONSOLE.filter.a.LevelMax=FATAL