
[HUDI-394] Provide a basic implementation of test suite

Nishith Agarwal
2019-11-01 12:40:09 -07:00
committed by n3nash
parent d5b593b7d9
commit 2fc2b01d86
102 changed files with 8633 additions and 64 deletions


@@ -0,0 +1,77 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer;
import org.apache.hudi.utilities.schema.SchemaProvider;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
/**
* Extends the {@link HoodieDeltaStreamer} to expose certain operations helpful in running the Test Suite.
* This is done to achieve two things: 1) leverage some components of {@link HoodieDeltaStreamer}, and 2)
* piggyback on the test suite to exercise {@link HoodieDeltaStreamer} itself.
*/
public class HoodieDeltaStreamerWrapper extends HoodieDeltaStreamer {
public HoodieDeltaStreamerWrapper(Config cfg, JavaSparkContext jssc) throws Exception {
super(cfg, jssc);
}
public HoodieDeltaStreamerWrapper(Config cfg, JavaSparkContext jssc, FileSystem fs, HiveConf conf) throws Exception {
super(cfg, jssc, fs, conf);
}
public JavaRDD<WriteStatus> upsert(Operation operation) throws
Exception {
cfg.operation = operation;
return deltaSyncService.getDeltaSync().syncOnce().getRight();
}
public JavaRDD<WriteStatus> insert() throws Exception {
return upsert(Operation.INSERT);
}
public JavaRDD<WriteStatus> bulkInsert() throws
Exception {
return upsert(Operation.BULK_INSERT);
}
public void scheduleCompact() throws Exception {
// Delta-streamer does not expose a dedicated scheduleCompact() operation, so issue an upsert without any new
// data, which will trigger scheduling of a compaction
upsert(Operation.UPSERT);
}
public JavaRDD<WriteStatus> compact() throws Exception {
// Delta-streamer does not expose a dedicated compact() operation, so issue an upsert without any new data,
// which will trigger inline compaction
return upsert(Operation.UPSERT);
}
public Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> fetchSource() throws Exception {
return deltaSyncService.getDeltaSync().readFromSource(deltaSyncService.getDeltaSync().getCommitTimelineOpt());
}
}
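
A minimal sketch of how this wrapper might be driven once a delta streamer Config has been populated; the Spark master, paths and table settings below are hypothetical and not part of this commit:

import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.integ.testsuite.HoodieDeltaStreamerWrapper;
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer;
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.Operation;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class DeltaStreamerWrapperExample {
  public static void main(String[] args) throws Exception {
    JavaSparkContext jsc = new JavaSparkContext("local[2]", "wrapper-example");
    HoodieDeltaStreamer.Config cfg = new HoodieDeltaStreamer.Config();
    cfg.targetBasePath = "/tmp/hudi/target";                  // hypothetical target dataset path
    cfg.targetTableName = "table1";                           // hypothetical table name
    cfg.tableType = "COPY_ON_WRITE";
    cfg.propsFilePath = "/tmp/hudi/test-source.properties";   // hypothetical source/schema props

    HoodieDeltaStreamerWrapper wrapper = new HoodieDeltaStreamerWrapper(cfg, jsc);
    // Each call below runs one syncOnce() round of the underlying DeltaSync.
    JavaRDD<WriteStatus> inserted = wrapper.insert();
    JavaRDD<WriteStatus> upserted = wrapper.upsert(Operation.UPSERT);
    wrapper.scheduleCompact(); // empty-handed upsert that piggybacks compaction scheduling
    wrapper.compact();         // empty-handed upsert that piggybacks inline compaction
    jsc.stop();
  }
}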


@@ -0,0 +1,186 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite;
import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hudi.DataSourceUtils;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig;
import org.apache.hudi.integ.testsuite.dag.DagUtils;
import org.apache.hudi.integ.testsuite.dag.WorkflowDag;
import org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator;
import org.apache.hudi.integ.testsuite.dag.scheduler.DagScheduler;
import org.apache.hudi.integ.testsuite.generator.DeltaGenerator;
import org.apache.hudi.integ.testsuite.reader.DeltaInputType;
import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode;
import org.apache.hudi.keygen.KeyGenerator;
import org.apache.hudi.utilities.UtilHelpers;
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer;
import org.apache.hudi.utilities.schema.SchemaProvider;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* This is the entry point for running a Hudi Test Suite. Although this class has similarities with
* {@link HoodieDeltaStreamer}, it does not extend it, since we do not want to create a dependency on changes in
* DeltaStreamer.
*/
public class HoodieTestSuiteJob {
private static volatile Logger log = LoggerFactory.getLogger(HoodieTestSuiteJob.class);
private final HoodieTestSuiteConfig cfg;
/**
* Bag of properties with source, hoodie client, key generator etc.
*/
TypedProperties props;
/**
* Schema provider that supplies the schema used for generating and writing out the payloads.
*/
private transient SchemaProvider schemaProvider;
/**
* Filesystem used.
*/
private transient FileSystem fs;
/**
* Spark context.
*/
private transient JavaSparkContext jsc;
/**
* Spark Session.
*/
private transient SparkSession sparkSession;
/**
* Hive Config.
*/
private transient HiveConf hiveConf;
private KeyGenerator keyGenerator;
public HoodieTestSuiteJob(HoodieTestSuiteConfig cfg, JavaSparkContext jsc) throws IOException {
this.cfg = cfg;
this.jsc = jsc;
this.sparkSession = SparkSession.builder().config(jsc.getConf()).getOrCreate();
this.fs = FSUtils.getFs(cfg.inputBasePath, jsc.hadoopConfiguration());
this.props = UtilHelpers.readConfig(fs, new Path(cfg.propsFilePath), cfg.configs).getConfig();
log.info("Creating workload generator with configs : {}", props.toString());
this.schemaProvider = UtilHelpers.createSchemaProvider(cfg.schemaProviderClassName, props, jsc);
this.hiveConf = getDefaultHiveConf(jsc.hadoopConfiguration());
this.keyGenerator = DataSourceUtils.createKeyGenerator(props);
if (!fs.exists(new Path(cfg.targetBasePath))) {
HoodieTableMetaClient.initTableType(jsc.hadoopConfiguration(), cfg.targetBasePath,
HoodieTableType.valueOf(cfg.tableType), cfg.targetTableName, "archived");
}
}
private static HiveConf getDefaultHiveConf(Configuration cfg) {
HiveConf hiveConf = new HiveConf();
hiveConf.addResource(cfg);
return hiveConf;
}
public static void main(String[] args) throws Exception {
final HoodieTestSuiteConfig cfg = new HoodieTestSuiteConfig();
JCommander cmd = new JCommander(cfg, args);
if (cfg.help || args.length == 0) {
cmd.usage();
System.exit(1);
}
JavaSparkContext jssc = UtilHelpers.buildSparkContext("workload-generator-" + cfg.outputTypeName
+ "-" + cfg.inputFormatName, cfg.sparkMaster);
new HoodieTestSuiteJob(cfg, jssc).runTestSuite();
}
public void runTestSuite() {
try {
WorkflowDag workflowDag = this.cfg.workloadYamlPath == null ? ((WorkflowDagGenerator) ReflectionUtils
.loadClass((this.cfg).workloadDagGenerator)).build()
: DagUtils.convertYamlPathToDag(this.fs, this.cfg.workloadYamlPath);
log.info("Workflow Dag => " + DagUtils.convertDagToYaml(workflowDag));
long startTime = System.currentTimeMillis();
String schemaStr = schemaProvider.getSourceSchema().toString();
final HoodieTestSuiteWriter writer = new HoodieTestSuiteWriter(jsc, props, cfg, schemaStr);
final DeltaGenerator deltaGenerator = new DeltaGenerator(
new DFSDeltaConfig(DeltaOutputMode.valueOf(cfg.outputTypeName), DeltaInputType.valueOf(cfg.inputFormatName),
new SerializableConfiguration(jsc.hadoopConfiguration()), cfg.inputBasePath, cfg.targetBasePath,
schemaStr, cfg.limitFileSize), jsc, sparkSession, schemaStr, keyGenerator);
DagScheduler dagScheduler = new DagScheduler(workflowDag, writer, deltaGenerator);
dagScheduler.schedule();
log.info("Finished scheduling all tasks, Time taken {}", System.currentTimeMillis() - startTime);
} catch (Exception e) {
log.error("Failed to run Test Suite ", e);
throw new HoodieException("Failed to run Test Suite ", e);
} finally {
jsc.stop();
}
}
/**
* The Hudi test suite uses {@link HoodieDeltaStreamer} to run some operations, hence this config extends the delta streamer config.
*/
public static class HoodieTestSuiteConfig extends HoodieDeltaStreamer.Config {
@Parameter(names = {"--input-base-path"}, description = "base path for input data"
+ "(Will be created if did not exist first time around. If exists, more data will be added to that path)",
required = true)
public String inputBasePath;
@Parameter(names = {
"--workload-generator-classname"}, description = "WorkflowDag of operations to generate the workload",
required = true)
public String workloadDagGenerator = WorkflowDagGenerator.class.getName();
@Parameter(names = {
"--workload-yaml-path"}, description = "Workflow Dag yaml path to generate the workload")
public String workloadYamlPath;
@Parameter(names = {"--delta-output-type"}, description = "Subclass of "
+ "org.apache.hudi.testsuite.workload.DeltaOutputMode to readAvro data.")
public String outputTypeName = DeltaOutputMode.DFS.name();
@Parameter(names = {"--delta-input-format"}, description = "Subclass of "
+ "org.apache.hudi.testsuite.workload.DeltaOutputMode to read avro data.")
public String inputFormatName = DeltaInputType.AVRO.name();
@Parameter(names = {"--input-file-size"}, description = "The min/max size of the input files to generate",
required = true)
public Long limitFileSize = 1024 * 1024 * 120L;
@Parameter(names = {"--use-deltastreamer"}, description = "Choose whether to use HoodieDeltaStreamer to "
+ "perform"
+ " ingestion. If set to false, HoodieWriteClient will be used")
public Boolean useDeltaStreamer = false;
}
}
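
As a rough sketch, the job could be launched by handing the flags above to main(). The test-suite flags come from HoodieTestSuiteConfig; the remaining flags are assumed to be inherited from HoodieDeltaStreamer.Config, and all paths are hypothetical:

import org.apache.hudi.integ.testsuite.HoodieTestSuiteJob;
import org.apache.hudi.integ.testsuite.dag.SimpleWorkflowDagGenerator;

public class TestSuiteJobLauncherExample {
  public static void main(String[] args) throws Exception {
    HoodieTestSuiteJob.main(new String[] {
        // test suite specific flags defined in HoodieTestSuiteConfig above
        "--input-base-path", "/tmp/hudi/input",
        "--workload-generator-classname", SimpleWorkflowDagGenerator.class.getName(),
        "--workload-yaml-path", "/tmp/hudi/complex-dag.yaml", // optional; overrides the generator
        "--input-file-size", String.valueOf(120 * 1024 * 1024L),
        // flags assumed to be inherited from HoodieDeltaStreamer.Config
        "--target-base-path", "/tmp/hudi/output",
        "--target-table", "table1",
        "--table-type", "COPY_ON_WRITE",
        "--props", "/tmp/hudi/test-source.properties",
        "--schemaprovider-class", "org.apache.hudi.utilities.schema.FilebasedSchemaProvider"
    });
  }
}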


@@ -0,0 +1,219 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.avro.model.HoodieCompactionPlan;
import org.apache.hudi.client.HoodieReadClient;
import org.apache.hudi.client.HoodieWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieIndexConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.index.HoodieIndex;
import org.apache.hudi.integ.testsuite.dag.nodes.CleanNode;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.RollbackNode;
import org.apache.hudi.integ.testsuite.dag.nodes.ScheduleCompactNode;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteJob.HoodieTestSuiteConfig;
import org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer.Operation;
import org.apache.hudi.utilities.schema.SchemaProvider;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
/**
* A writer abstraction for the Hudi test suite. This class wraps different implementations of writers used to perform
* write operations into the target hudi dataset. Currently supported writers are {@link HoodieDeltaStreamerWrapper}
* and {@link HoodieWriteClient}.
*/
public class HoodieTestSuiteWriter {
private HoodieDeltaStreamerWrapper deltaStreamerWrapper;
private HoodieWriteClient writeClient;
protected HoodieTestSuiteConfig cfg;
private Option<String> lastCheckpoint;
private HoodieReadClient hoodieReadClient;
private Properties props;
private String schema;
private transient Configuration configuration;
private transient JavaSparkContext sparkContext;
private static Set<String> VALID_DAG_NODES_TO_ALLOW_WRITE_CLIENT_IN_DELTASTREAMER_MODE = new HashSet<>(
Arrays.asList(RollbackNode.class.getName(), CleanNode.class.getName(), ScheduleCompactNode.class.getName()));
public HoodieTestSuiteWriter(JavaSparkContext jsc, Properties props, HoodieTestSuiteConfig cfg, String schema) throws
Exception {
this(jsc, props, cfg, schema, true);
}
public HoodieTestSuiteWriter(JavaSparkContext jsc, Properties props, HoodieTestSuiteConfig cfg, String schema,
boolean rollbackInflight) throws Exception {
// Ensure that only one instance of HoodieWriteClient is instantiated for a HoodieTestSuiteWriter.
// A HoodieWriteClient is not instantiated until
// {@link HoodieDeltaStreamer#commit(HoodieWriteClient, JavaRDD, Option)} is invoked.
this.deltaStreamerWrapper = new HoodieDeltaStreamerWrapper(cfg, jsc);
this.hoodieReadClient = new HoodieReadClient(jsc, cfg.targetBasePath);
if (!cfg.useDeltaStreamer) {
this.writeClient = new HoodieWriteClient(jsc, getHoodieClientConfig(cfg, props, schema), rollbackInflight);
}
this.cfg = cfg;
this.configuration = jsc.hadoopConfiguration();
this.sparkContext = jsc;
this.props = props;
this.schema = schema;
}
private HoodieWriteConfig getHoodieClientConfig(HoodieTestSuiteConfig cfg, Properties props, String schema) {
HoodieWriteConfig.Builder builder =
HoodieWriteConfig.newBuilder().combineInput(true, true).withPath(cfg.targetBasePath)
.withAutoCommit(false)
.withCompactionConfig(HoodieCompactionConfig.newBuilder().withPayloadClass(cfg.payloadClassName).build())
.forTable(cfg.targetTableName)
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
.withProps(props);
builder = builder.withSchema(schema);
return builder.build();
}
private boolean allowWriteClientAccess(DagNode dagNode) {
return VALID_DAG_NODES_TO_ALLOW_WRITE_CLIENT_IN_DELTASTREAMER_MODE.contains(dagNode.getClass().getName());
}
public Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> fetchSource() throws Exception {
return this.deltaStreamerWrapper.fetchSource();
}
public Option<String> startCommit() {
if (cfg.useDeltaStreamer) {
return Option.of(HoodieActiveTimeline.createNewInstantTime());
} else {
return Option.of(writeClient.startCommit());
}
}
public JavaRDD<WriteStatus> upsert(Option<String> instantTime) throws Exception {
if (cfg.useDeltaStreamer) {
return deltaStreamerWrapper.upsert(Operation.UPSERT);
} else {
Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> nextBatch = fetchSource();
lastCheckpoint = Option.of(nextBatch.getValue().getLeft());
return writeClient.upsert(nextBatch.getRight().getRight(), instantTime.get());
}
}
public JavaRDD<WriteStatus> insert(Option<String> instantTime) throws Exception {
if (cfg.useDeltaStreamer) {
return deltaStreamerWrapper.insert();
} else {
Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> nextBatch = fetchSource();
lastCheckpoint = Option.of(nextBatch.getValue().getLeft());
return writeClient.insert(nextBatch.getRight().getRight(), instantTime.get());
}
}
public JavaRDD<WriteStatus> bulkInsert(Option<String> instantTime) throws Exception {
if (cfg.useDeltaStreamer) {
return deltaStreamerWrapper.bulkInsert();
} else {
Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> nextBatch = fetchSource();
lastCheckpoint = Option.of(nextBatch.getValue().getLeft());
return writeClient.bulkInsert(nextBatch.getRight().getRight(), instantTime.get());
}
}
public JavaRDD<WriteStatus> compact(Option<String> instantTime) throws Exception {
if (cfg.useDeltaStreamer) {
return deltaStreamerWrapper.compact();
} else {
if (!instantTime.isPresent()) {
Option<Pair<String, HoodieCompactionPlan>> compactionPlanPair = Option
.fromJavaOptional(hoodieReadClient.getPendingCompactions()
.stream().findFirst());
if (compactionPlanPair.isPresent()) {
instantTime = Option.of(compactionPlanPair.get().getLeft());
}
}
if (instantTime.isPresent()) {
return writeClient.compact(instantTime.get());
} else {
return null;
}
}
}
public Option<String> scheduleCompaction(Option<Map<String, String>> previousCommitExtraMetadata) throws
Exception {
if (cfg.useDeltaStreamer) {
deltaStreamerWrapper.scheduleCompact();
return Option.empty();
} else {
return writeClient.scheduleCompaction(previousCommitExtraMetadata);
}
}
public void commit(JavaRDD<WriteStatus> records, Option<String> instantTime) {
if (!cfg.useDeltaStreamer) {
Map<String, String> extraMetadata = new HashMap<>();
/** Store the checkpoint in the commit metadata just like
* {@link HoodieDeltaStreamer#commit(HoodieWriteClient, JavaRDD, Option)} **/
extraMetadata.put(HoodieDeltaStreamerWrapper.CHECKPOINT_KEY, lastCheckpoint.get());
writeClient.commit(instantTime.get(), records, Option.of(extraMetadata));
}
}
public HoodieWriteClient getWriteClient(DagNode dagNode) throws IllegalAccessException {
if (cfg.useDeltaStreamer & !allowWriteClientAccess(dagNode)) {
throw new IllegalAccessException("cannot access write client when testing in deltastreamer mode");
}
synchronized (this) {
if (writeClient == null) {
this.writeClient = new HoodieWriteClient(this.sparkContext, getHoodieClientConfig(cfg, props, schema), false);
}
}
return writeClient;
}
public HoodieDeltaStreamerWrapper getDeltaStreamerWrapper() {
return deltaStreamerWrapper;
}
public HoodieTestSuiteConfig getCfg() {
return cfg;
}
public Configuration getConfiguration() {
return configuration;
}
public JavaSparkContext getSparkContext() {
return sparkContext;
}
}
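
A small sketch of the write path this class exposes, as it might be invoked from a DAG node, assuming a writer has already been constructed by the test suite job:

import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter;
import org.apache.spark.api.java.JavaRDD;

public class TestSuiteWriterUsage {
  // Runs a single insert round: open an instant, write the next generated batch, commit.
  public static JavaRDD<WriteStatus> insertOnce(HoodieTestSuiteWriter writer) throws Exception {
    Option<String> instantTime = writer.startCommit();          // new instant (or a write client commit)
    JavaRDD<WriteStatus> statuses = writer.insert(instantTime); // in write client mode this pulls the next batch via fetchSource()
    writer.commit(statuses, instantTime);                       // no-op when --use-deltastreamer is set
    return statuses;
  }
}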


@@ -0,0 +1,73 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.configuration;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.integ.testsuite.reader.DeltaInputType;
import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode;
/**
* Configuration to hold details about a DFS-based output type; extends {@link DeltaConfig}.
*/
public class DFSDeltaConfig extends DeltaConfig {
// The base path where the generated data should be written to. This data will in turn be used to write into a hudi
// dataset
private final String deltaBasePath;
private final String datasetOutputPath;
private final String schemaStr;
// Maximum file size for the files generated
private final Long maxFileSize;
// The current batch id
private Integer batchId;
public DFSDeltaConfig(DeltaOutputMode deltaOutputMode, DeltaInputType deltaInputType,
SerializableConfiguration configuration,
String deltaBasePath, String targetBasePath, String schemaStr, Long maxFileSize) {
super(deltaOutputMode, deltaInputType, configuration);
this.deltaBasePath = deltaBasePath;
this.schemaStr = schemaStr;
this.maxFileSize = maxFileSize;
this.datasetOutputPath = targetBasePath;
}
public String getDeltaBasePath() {
return deltaBasePath;
}
public String getDatasetOutputPath() {
return datasetOutputPath;
}
public String getSchemaStr() {
return schemaStr;
}
public Long getMaxFileSize() {
return maxFileSize;
}
public Integer getBatchId() {
return batchId;
}
public void setBatchId(Integer batchId) {
this.batchId = batchId;
}
}
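
For illustration, a DFSDeltaConfig could be assembled the same way HoodieTestSuiteJob does it; the paths below are hypothetical and the schema string is assumed to come from the schema provider:

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig;
import org.apache.hudi.integ.testsuite.reader.DeltaInputType;
import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode;

public class DeltaConfigExample {
  public static DFSDeltaConfig forLocalTest(String schemaStr) {
    return new DFSDeltaConfig(
        DeltaOutputMode.DFS, DeltaInputType.AVRO,
        new SerializableConfiguration(new Configuration()),
        "/tmp/hudi/input",          // hypothetical path for generated avro deltas
        "/tmp/hudi/output",         // hypothetical target dataset path
        schemaStr,                  // avro schema supplied by the SchemaProvider
        1024 * 1024 * 120L);        // max generated file size, mirroring the job default
  }
}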


@@ -0,0 +1,256 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.configuration;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.integ.testsuite.reader.DeltaInputType;
import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode;
/**
* Configuration to hold the delta output type and delta input format.
*/
public class DeltaConfig implements Serializable {
private final DeltaOutputMode deltaOutputMode;
private final DeltaInputType deltaInputType;
private final SerializableConfiguration configuration;
public DeltaConfig(DeltaOutputMode deltaOutputMode, DeltaInputType deltaInputType,
SerializableConfiguration configuration) {
this.deltaOutputMode = deltaOutputMode;
this.deltaInputType = deltaInputType;
this.configuration = configuration;
}
public DeltaOutputMode getDeltaOutputMode() {
return deltaOutputMode;
}
public DeltaInputType getDeltaInputType() {
return deltaInputType;
}
public Configuration getConfiguration() {
return configuration.get();
}
/**
* Represents the configuration for any kind of workload operation on new data. Each workload can also carry an
* optional set of actions that can be executed in parallel.
*/
public static class Config {
public static final String CONFIG_NAME = "config";
public static final String TYPE = "type";
public static final String NODE_NAME = "name";
public static final String DEPENDENCIES = "deps";
public static final String CHILDREN = "children";
public static final String HIVE_QUERIES = "hive_queries";
public static final String HIVE_PROPERTIES = "hive_props";
private static final String NUM_RECORDS_INSERT = "num_records_insert";
private static final String NUM_RECORDS_UPSERT = "num_records_upsert";
private static final String REPEAT_COUNT = "repeat_count";
private static final String RECORD_SIZE = "record_size";
private static final String NUM_PARTITIONS_INSERT = "num_partitions_insert";
private static final String NUM_PARTITIONS_UPSERT = "num_partitions_upsert";
private static final String NUM_FILES_UPSERT = "num_files_upsert";
private static final String FRACTION_UPSERT_PER_FILE = "fraction_upsert_per_file";
private static final String DISABLE_GENERATE = "disable_generate";
private static final String DISABLE_INGEST = "disable_ingest";
private static final String HIVE_LOCAL = "hive_local";
private Map<String, Object> configsMap;
public Config(Map<String, Object> configsMap) {
this.configsMap = configsMap;
}
public static Builder newBuilder() {
return new Builder();
}
public long getNumRecordsInsert() {
return Long.valueOf(configsMap.getOrDefault(NUM_RECORDS_INSERT, 0).toString());
}
public long getNumRecordsUpsert() {
return Long.valueOf(configsMap.getOrDefault(NUM_RECORDS_UPSERT, 0).toString());
}
public int getRecordSize() {
return Integer.valueOf(configsMap.getOrDefault(RECORD_SIZE, 1024).toString());
}
public int getNumInsertPartitions() {
return Integer.valueOf(configsMap.getOrDefault(NUM_PARTITIONS_INSERT, 1).toString());
}
public int getRepeatCount() {
return Integer.valueOf(configsMap.getOrDefault(REPEAT_COUNT, 1).toString());
}
public int getNumUpsertPartitions() {
return Integer.valueOf(configsMap.getOrDefault(NUM_PARTITIONS_UPSERT, 0).toString());
}
public int getNumUpsertFiles() {
return Integer.valueOf(configsMap.getOrDefault(NUM_FILES_UPSERT, 1).toString());
}
public double getFractionUpsertPerFile() {
return Double.valueOf(configsMap.getOrDefault(FRACTION_UPSERT_PER_FILE, 0.0).toString());
}
public boolean isDisableGenerate() {
return Boolean.valueOf(configsMap.getOrDefault(DISABLE_GENERATE, false).toString());
}
public boolean isDisableIngest() {
return Boolean.valueOf(configsMap.getOrDefault(DISABLE_INGEST, false).toString());
}
public Map<String, Object> getOtherConfigs() {
if (configsMap == null) {
return new HashMap<>();
}
return configsMap;
}
public List<Pair<String, Integer>> getHiveQueries() {
try {
return (List<Pair<String, Integer>>) this.configsMap.getOrDefault("hive_queries", new ArrayList<>());
} catch (Exception e) {
throw new RuntimeException("unable to get hive queries from configs");
}
}
public boolean isHiveLocal() {
return Boolean.valueOf(configsMap.getOrDefault(HIVE_LOCAL, false).toString());
}
public List<String> getHiveProperties() {
return (List<String>) this.configsMap.getOrDefault(HIVE_PROPERTIES, new ArrayList<>());
}
@Override
public String toString() {
try {
return new ObjectMapper().writeValueAsString(this.configsMap);
} catch (Exception e) {
throw new RuntimeException("unable to generate string representation of config");
}
}
public static class Builder {
private Map<String, Object> configsMap = new HashMap<>();
public Builder() {
}
public Builder withNumRecordsToInsert(long numRecordsInsert) {
this.configsMap.put(NUM_RECORDS_INSERT, numRecordsInsert);
return this;
}
public Builder withNumRecordsToUpdate(long numRecordsUpsert) {
this.configsMap.put(NUM_RECORDS_UPSERT, numRecordsUpsert);
return this;
}
public Builder withNumInsertPartitions(int numInsertPartitions) {
this.configsMap.put(NUM_PARTITIONS_INSERT, numInsertPartitions);
return this;
}
public Builder withNumUpsertPartitions(int numUpsertPartitions) {
this.configsMap.put(NUM_PARTITIONS_UPSERT, numUpsertPartitions);
return this;
}
public Builder withNumUpsertFiles(int numUpsertFiles) {
this.configsMap.put(NUM_FILES_UPSERT, numUpsertFiles);
return this;
}
public Builder withFractionUpsertPerFile(double fractionUpsertPerFile) {
this.configsMap.put(FRACTION_UPSERT_PER_FILE, fractionUpsertPerFile);
return this;
}
public Builder withNumTimesToRepeat(int repeatCount) {
this.configsMap.put(REPEAT_COUNT, repeatCount);
return this;
}
public Builder withRecordSize(int recordSize) {
this.configsMap.put(RECORD_SIZE, recordSize);
return this;
}
public Builder disableGenerate(boolean generate) {
this.configsMap.put(DISABLE_GENERATE, generate);
return this;
}
public Builder disableIngest(boolean ingest) {
this.configsMap.put(DISABLE_INGEST, ingest);
return this;
}
public Builder withConfig(String name, Object value) {
this.configsMap.put(name, value);
return this;
}
public Builder withHiveQueryAndResults(List<Pair<String, Integer>> hiveQueries) {
this.configsMap.put(HIVE_QUERIES, hiveQueries);
return this;
}
public Builder withHiveLocal(boolean startHiveLocal) {
this.configsMap.put(HIVE_LOCAL, startHiveLocal);
return this;
}
public Builder withHiveProperties(List<String> hiveProperties) {
this.configsMap.put(HIVE_PROPERTIES, hiveProperties);
return this;
}
public Builder withConfigsMap(Map<String, Object> configsMap) {
this.configsMap = configsMap;
return this;
}
public Config build() {
return new Config(configsMap);
}
}
}
}
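
A short sketch of building a workload Config with the builder above, for a node that upserts records across two partitions; all values and the node name are hypothetical:

import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;

public class WorkloadConfigExample {
  public static Config smallUpsertWorkload() {
    return Config.newBuilder()
        .withNumRecordsToUpdate(500)     // num_records_upsert
        .withNumUpsertPartitions(2)      // num_partitions_upsert
        .withNumUpsertFiles(2)           // num_files_upsert
        .withFractionUpsertPerFile(0.5)  // fraction_upsert_per_file
        .withRecordSize(1024)            // record_size in bytes
        .withNumTimesToRepeat(1)         // repeat_count
        .withConfig(Config.NODE_NAME, "small_upsert") // name used when the dag is serialized
        .build();
  }
}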


@@ -0,0 +1,33 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.converter;
import java.io.Serializable;
import org.apache.spark.api.java.JavaRDD;
/**
* Implementations of {@link Converter} will convert data from one format to another.
*
* @param <I> Input Data Type
* @param <O> Output Data Type
*/
public interface Converter<I, O> extends Serializable {
JavaRDD<O> convert(JavaRDD<I> inputRDD);
}


@@ -0,0 +1,53 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.converter;
import java.util.List;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.integ.testsuite.generator.LazyRecordGeneratorIterator;
import org.apache.hudi.integ.testsuite.generator.UpdateGeneratorIterator;
import org.apache.spark.api.java.JavaRDD;
/**
* This converter creates an update {@link GenericRecord} from an existing {@link GenericRecord}.
*/
public class UpdateConverter implements Converter<GenericRecord, GenericRecord> {
private final String schemaStr;
// Fields that should not be mutated when converting an insert record to an update record, typically the
// partition path and record key fields
private final List<String> partitionPathFields;
private final List<String> recordKeyFields;
private final int minPayloadSize;
public UpdateConverter(String schemaStr, int minPayloadSize, List<String> partitionPathFields,
List<String> recordKeyFields) {
this.schemaStr = schemaStr;
this.partitionPathFields = partitionPathFields;
this.recordKeyFields = recordKeyFields;
this.minPayloadSize = minPayloadSize;
}
@Override
public JavaRDD<GenericRecord> convert(JavaRDD<GenericRecord> inputRDD) {
return inputRDD.mapPartitions(recordItr -> new LazyRecordGeneratorIterator(new UpdateGeneratorIterator(recordItr,
schemaStr, partitionPathFields, recordKeyFields, minPayloadSize)));
}
}
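
A usage sketch, assuming an RDD of previously inserted records and a schema whose partition path and record key fields are named as below (hypothetical field names):

import java.util.Arrays;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.integ.testsuite.converter.UpdateConverter;
import org.apache.spark.api.java.JavaRDD;

public class UpdateConverterExample {
  public static JavaRDD<GenericRecord> toUpdates(JavaRDD<GenericRecord> inserts, String schemaStr) {
    UpdateConverter converter = new UpdateConverter(
        schemaStr,
        1024,                              // minimum payload size in bytes
        Arrays.asList("timestamp"),        // hypothetical partition path field
        Arrays.asList("_row_key"));        // hypothetical record key field
    return converter.convert(inserts);     // keeps the key and partition fields stable
  }
}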


@@ -0,0 +1,217 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.fasterxml.jackson.dataformat.yaml.YAMLFactory;
import com.fasterxml.jackson.dataformat.yaml.YAMLGenerator.Feature;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.util.ReflectionUtils;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
/**
* Utility class to serialize and deserialize the workflow dag.
*/
public class DagUtils {
static final ObjectMapper MAPPER = new ObjectMapper();
/**
* Converts a YAML path to {@link WorkflowDag}.
*/
public static WorkflowDag convertYamlPathToDag(FileSystem fs, String path) throws IOException {
InputStream is = fs.open(new Path(path));
return convertYamlToDag(toString(is));
}
/**
* Converts a YAML representation to {@link WorkflowDag}.
*/
public static WorkflowDag convertYamlToDag(String yaml) throws IOException {
Map<String, DagNode> allNodes = new HashMap<>();
final ObjectMapper yamlReader = new ObjectMapper(new YAMLFactory());
final JsonNode jsonNode = yamlReader.readTree(yaml);
Iterator<Entry<String, JsonNode>> itr = jsonNode.fields();
while (itr.hasNext()) {
Entry<String, JsonNode> dagNode = itr.next();
allNodes.put(dagNode.getKey(), convertJsonToDagNode(allNodes, dagNode.getValue()));
}
return new WorkflowDag(findRootNodes(allNodes));
}
/**
* Converts {@link WorkflowDag} to a YAML representation.
*/
public static String convertDagToYaml(WorkflowDag dag) throws IOException {
final ObjectMapper yamlWriter = new ObjectMapper(new YAMLFactory().disable(Feature.WRITE_DOC_START_MARKER)
.enable(Feature.MINIMIZE_QUOTES).enable(JsonParser.Feature.ALLOW_UNQUOTED_FIELD_NAMES));
JsonNode yamlNode = MAPPER.createObjectNode();
convertDagToYaml(yamlNode, dag.getNodeList());
return yamlWriter.writerWithDefaultPrettyPrinter().writeValueAsString(yamlNode);
}
private static void convertDagToYaml(JsonNode yamlNode, List<DagNode> dagNodes) throws IOException {
for (DagNode dagNode : dagNodes) {
String name = dagNode.getConfig().getOtherConfigs().getOrDefault(DeltaConfig.Config.NODE_NAME, dagNode.getName()).toString();
((ObjectNode) yamlNode).put(name, convertDagNodeToJsonNode(dagNode));
if (dagNode.getChildNodes().size() > 0) {
convertDagToYaml(yamlNode, dagNode.getChildNodes());
}
}
}
private static DagNode convertJsonToDagNode(Map<String, DagNode> allNodes, JsonNode node) throws IOException {
String type = node.get(DeltaConfig.Config.TYPE).asText();
final DagNode retNode = convertJsonToDagNode(node, type);
Arrays.asList(node.get(DeltaConfig.Config.DEPENDENCIES).textValue().split(",")).stream().forEach(dep -> {
DagNode parentNode = allNodes.get(dep);
if (parentNode != null) {
parentNode.addChildNode(retNode);
}
});
return retNode;
}
private static List<DagNode> findRootNodes(Map<String, DagNode> allNodes) {
final List<DagNode> rootNodes = new ArrayList<>();
allNodes.entrySet().stream().forEach(entry -> {
if (entry.getValue().getParentNodes().size() < 1) {
rootNodes.add(entry.getValue());
}
});
return rootNodes;
}
private static DagNode convertJsonToDagNode(JsonNode node, String type) {
try {
DeltaConfig.Config config = DeltaConfig.Config.newBuilder().withConfigsMap(convertJsonNodeToMap(node)).build();
return (DagNode) ReflectionUtils.loadClass(generateFQN(type), config);
} catch (ClassNotFoundException e) {
throw new RuntimeException(e);
}
}
private static String generateFQN(String name) throws ClassNotFoundException {
return Class.forName(StringUtils.joinUsingDelim(".",
DagNode.class.getName().substring(0, DagNode.class.getName().lastIndexOf(".")), name)).getName();
}
private static JsonNode convertDagNodeToJsonNode(DagNode node) throws IOException {
return createJsonNode(node, node.getClass().getSimpleName());
}
private static Map<String, Object> convertJsonNodeToMap(JsonNode node) {
Map<String, Object> configsMap = new HashMap<>();
Iterator<Entry<String, JsonNode>> itr = node.get(DeltaConfig.Config.CONFIG_NAME).fields();
while (itr.hasNext()) {
Entry<String, JsonNode> entry = itr.next();
switch (entry.getKey()) {
case DeltaConfig.Config.HIVE_QUERIES:
configsMap.put(DeltaConfig.Config.HIVE_QUERIES, getHiveQueries(entry));
break;
case DeltaConfig.Config.HIVE_PROPERTIES:
configsMap.put(DeltaConfig.Config.HIVE_PROPERTIES, getProperties(entry));
break;
default:
configsMap.put(entry.getKey(), getValue(entry.getValue()));
break;
}
}
return configsMap;
}
private static List<Pair<String, Integer>> getHiveQueries(Entry<String, JsonNode> entry) {
List<Pair<String, Integer>> queries = new ArrayList<>();
Iterator<Entry<String, JsonNode>> queriesItr = entry.getValue().fields();
while (queriesItr.hasNext()) {
queries.add(Pair.of(queriesItr.next().getValue().textValue(), queriesItr.next().getValue().asInt()));
}
return queries;
}
private static List<String> getProperties(Entry<String, JsonNode> entry) {
List<String> properties = new ArrayList<>();
Iterator<Entry<String, JsonNode>> queriesItr = entry.getValue().fields();
while (queriesItr.hasNext()) {
properties.add(queriesItr.next().getValue().textValue());
}
return properties;
}
private static Object getValue(JsonNode node) {
if (node.isInt()) {
return node.asInt();
} else if (node.isLong()) {
return node.asLong();
} else if (node.isShort()) {
return node.asInt();
} else if (node.isBoolean()) {
return node.asBoolean();
} else if (node.isDouble()) {
return node.asDouble();
} else if (node.isFloat()) {
return node.asDouble();
}
return node.textValue();
}
private static JsonNode createJsonNode(DagNode node, String type) throws IOException {
JsonNode configNode = MAPPER.readTree(node.getConfig().toString());
JsonNode jsonNode = MAPPER.createObjectNode();
((ObjectNode) jsonNode).put(DeltaConfig.Config.CONFIG_NAME, configNode);
((ObjectNode) jsonNode).put(DeltaConfig.Config.TYPE, type);
((ObjectNode) jsonNode).put(DeltaConfig.Config.DEPENDENCIES, getDependencyNames(node));
return jsonNode;
}
private static String getDependencyNames(DagNode node) {
return node.getParentNodes().stream()
.map(e -> ((DagNode) e).getName())
.collect(Collectors.joining(","));
}
public static String toString(InputStream inputStream) throws IOException {
ByteArrayOutputStream result = new ByteArrayOutputStream();
byte[] buffer = new byte[1024];
int length;
while ((length = inputStream.read(buffer)) != -1) {
result.write(buffer, 0, length);
}
return result.toString("utf-8");
}
}
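
To illustrate the SerDe format, a tiny dag can be built programmatically and round-tripped through YAML; each node is emitted under its name with "type", "deps" and "config" keys. Node names and sizes below are hypothetical:

import java.util.Collections;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.DagUtils;
import org.apache.hudi.integ.testsuite.dag.WorkflowDag;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode;
import org.apache.hudi.integ.testsuite.dag.nodes.UpsertNode;

public class DagYamlRoundTripExample {
  public static void main(String[] args) throws Exception {
    DagNode insert = new InsertNode(Config.newBuilder()
        .withNumRecordsToInsert(1000).withNumInsertPartitions(1).withRecordSize(1000)
        .withConfig(Config.NODE_NAME, "first_insert").build());
    insert.addChildNode(new UpsertNode(Config.newBuilder()
        .withNumRecordsToUpdate(100).withNumUpsertPartitions(1)
        .withConfig(Config.NODE_NAME, "first_upsert").build()));
    WorkflowDag dag = new WorkflowDag(Collections.singletonList(insert));

    String yaml = DagUtils.convertDagToYaml(dag);          // one YAML entry per node, keyed by node name
    System.out.println(yaml);
    WorkflowDag parsed = DagUtils.convertYamlToDag(yaml);  // rebuilds nodes and parent/child links
  }
}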


@@ -0,0 +1,54 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag;
import java.io.Serializable;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.generator.DeltaGenerator;
import org.apache.spark.api.java.JavaSparkContext;
/**
* This wraps the context needed for an execution of
* a {@link DagNode#execute(ExecutionContext)}.
*/
public class ExecutionContext implements Serializable {
private HoodieTestSuiteWriter hoodieTestSuiteWriter;
private DeltaGenerator deltaGenerator;
private transient JavaSparkContext jsc;
public ExecutionContext(JavaSparkContext jsc, HoodieTestSuiteWriter hoodieTestSuiteWriter, DeltaGenerator deltaGenerator) {
this.hoodieTestSuiteWriter = hoodieTestSuiteWriter;
this.deltaGenerator = deltaGenerator;
this.jsc = jsc;
}
public HoodieTestSuiteWriter getHoodieTestSuiteWriter() {
return hoodieTestSuiteWriter;
}
public DeltaGenerator getDeltaGenerator() {
return deltaGenerator;
}
public JavaSparkContext getJsc() {
return jsc;
}
}


@@ -0,0 +1,75 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag;
import java.util.ArrayList;
import java.util.List;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.HiveQueryNode;
import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode;
import org.apache.hudi.integ.testsuite.dag.nodes.UpsertNode;
/**
* An example of how to generate a workflow dag programmatically. This is also used as the default workflow dag if
* none is provided.
*/
public class SimpleWorkflowDagGenerator implements WorkflowDagGenerator {
@Override
public WorkflowDag build() {
DagNode root = new InsertNode(DeltaConfig.Config.newBuilder()
.withNumRecordsToInsert(100)
.withNumInsertPartitions(1)
.withNumTimesToRepeat(2)
.withRecordSize(1000).build());
DagNode child1 = new InsertNode(DeltaConfig.Config.newBuilder()
.withNumRecordsToInsert(100)
.withNumInsertPartitions(1)
.withNumTimesToRepeat(2)
.withRecordSize(1000).build());
root.addChildNode(child1);
DagNode child1OfChild1 = new UpsertNode(DeltaConfig.Config.newBuilder()
.withNumRecordsToUpdate(100)
.withNumUpsertPartitions(2)
.withNumTimesToRepeat(1)
.withRecordSize(1000).build());
// Tests running 2 nodes in parallel
child1.addChildNode(child1OfChild1);
List<Pair<String, Integer>> queryAndResult = new ArrayList<>();
queryAndResult.add(Pair.of("select " + "count(*) from testdb1.table1 group "
+ "by rider having count(*) < 1", 0));
DagNode child2OfChild1 = new HiveQueryNode(DeltaConfig.Config.newBuilder()
.withHiveQueryAndResults(queryAndResult).withHiveLocal(true).build());
child1.addChildNode(child2OfChild1);
List<DagNode> rootNodes = new ArrayList<>();
rootNodes.add(root);
return new WorkflowDag(rootNodes);
}
}


@@ -0,0 +1,39 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag;
import java.util.List;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
/**
* Workflow dag that encapsulates all the nodes to be executed.
*/
public class WorkflowDag<O> {
private List<DagNode<O>> nodeList;
public WorkflowDag(List<DagNode<O>> nodeList) {
this.nodeList = nodeList;
}
public List<DagNode<O>> getNodeList() {
return nodeList;
}
}


@@ -0,0 +1,31 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag;
/**
* An interface that represents a workflow dag generator.
*/
public interface WorkflowDagGenerator {
/**
* Builds a {@link WorkflowDag}.
*/
WorkflowDag build();
}


@@ -0,0 +1,40 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.spark.api.java.JavaRDD;
public class BulkInsertNode extends InsertNode {
public BulkInsertNode(Config config) {
super(config);
}
@Override
protected JavaRDD<WriteStatus> ingest(HoodieTestSuiteWriter hoodieTestSuiteWriter, Option<String> commitTime)
throws Exception {
log.info("Execute bulk ingest node {}", this.getName());
return hoodieTestSuiteWriter.bulkInsert(commitTime);
}
}


@@ -0,0 +1,34 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
public class CleanNode extends DagNode<Boolean> {
public CleanNode() {
}
@Override
public void execute(ExecutionContext executionContext) throws Exception {
log.info("Executing clean node {}", this.getName());
executionContext.getHoodieTestSuiteWriter().getWriteClient(this).clean();
}
}


@@ -0,0 +1,47 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
import org.apache.spark.api.java.JavaRDD;
public class CompactNode extends DagNode<JavaRDD<WriteStatus>> {
public CompactNode(Config config) {
this.config = config;
}
@Override
public void execute(ExecutionContext executionContext) throws Exception {
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(executionContext.getHoodieTestSuiteWriter().getConfiguration(),
executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath);
Option<HoodieInstant> lastInstant = metaClient.getActiveTimeline()
.getCommitsAndCompactionTimeline().filterPendingCompactionTimeline().lastInstant();
if (lastInstant.isPresent()) {
log.info("Compacting instant {}", lastInstant.get());
this.result = executionContext.getHoodieTestSuiteWriter().compact(Option.of(lastInstant.get().getTimestamp()));
}
}
}


@@ -0,0 +1,125 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Objects;
import java.util.UUID;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Represents a Node in the DAG of operations for a workflow.
*/
public abstract class DagNode<O> implements Comparable<DagNode<O>> {
protected static Logger log = LoggerFactory.getLogger(DagNode.class);
protected List<DagNode<O>> childNodes;
protected List<DagNode<O>> parentNodes;
protected O result;
protected Config config;
private boolean isCompleted;
public DagNode<O> addChildNode(DagNode childNode) {
childNode.getParentNodes().add(this);
getChildNodes().add(childNode);
return this;
}
public DagNode<O> addParentNode(DagNode parentNode) {
if (!this.getParentNodes().contains(parentNode)) {
this.getParentNodes().add(parentNode);
}
return this;
}
public O getResult() {
return result;
}
public List<DagNode<O>> getChildNodes() {
if (childNodes == null) {
childNodes = new LinkedList<>();
}
return childNodes;
}
public List<DagNode<O>> getParentNodes() {
if (parentNodes == null) {
this.parentNodes = new ArrayList<>();
}
return this.parentNodes;
}
public void setParentNodes(List<DagNode<O>> parentNodes) {
this.parentNodes = parentNodes;
}
public abstract void execute(ExecutionContext context) throws Exception;
public boolean isCompleted() {
return isCompleted;
}
public void setCompleted(boolean completed) {
isCompleted = completed;
}
public Config getConfig() {
return config;
}
public String getName() {
Object name = this.config.getOtherConfigs().get(Config.NODE_NAME);
if (name == null) {
String randomName = UUID.randomUUID().toString();
this.config.getOtherConfigs().put(Config.NODE_NAME, randomName);
return randomName;
}
return name.toString();
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
DagNode<?> dagNode = (DagNode<?>) o;
return getName().equals(dagNode.getName());
}
@Override
public int hashCode() {
return Objects.hash(getName());
}
@Override
public int compareTo(DagNode<O> thatNode) {
return Integer.compare(this.hashCode(), thatNode.hashCode());
}
}
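
The abstraction above can be extended with custom nodes. A hypothetical example (not part of this commit) that only logs its workload configuration; to be loadable from YAML it would need to live in the dag.nodes package, since DagUtils resolves node types by simple class name:

import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;

// Hypothetical node: records how many inserts the workload asked for, then marks itself done.
public class LogConfigNode extends DagNode<Boolean> {

  public LogConfigNode(Config config) {
    this.config = config;
  }

  @Override
  public void execute(ExecutionContext context) throws Exception {
    log.info("Node {} is configured to insert {} records", getName(), config.getNumRecordsInsert());
    this.result = true;
  }
}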


@@ -0,0 +1,87 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import org.apache.hudi.DataSourceUtils;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
import org.apache.hudi.integ.testsuite.helpers.HiveServiceProvider;
public class HiveQueryNode extends DagNode<Boolean> {
private HiveServiceProvider hiveServiceProvider;
public HiveQueryNode(DeltaConfig.Config config) {
this.config = config;
this.hiveServiceProvider = new HiveServiceProvider(config);
}
@Override
public void execute(ExecutionContext executionContext) throws Exception {
log.info("Executing hive query node {}", this.getName());
this.hiveServiceProvider.startLocalHiveServiceIfNeeded(executionContext.getHoodieTestSuiteWriter().getConfiguration());
HiveSyncConfig hiveSyncConfig = DataSourceUtils
.buildHiveSyncConfig(executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper()
.getDeltaSyncService().getDeltaSync().getProps(),
executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper()
.getDeltaSyncService().getDeltaSync().getCfg().targetBasePath,
executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper()
.getDeltaSyncService().getDeltaSync().getCfg().baseFileFormat);
this.hiveServiceProvider.syncToLocalHiveIfNeeded(executionContext.getHoodieTestSuiteWriter());
Connection con = DriverManager.getConnection(hiveSyncConfig.jdbcUrl, hiveSyncConfig.hiveUser,
hiveSyncConfig.hivePass);
Statement stmt = con.createStatement();
stmt.execute("set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat");
for (String hiveProperty : this.config.getHiveProperties()) {
executeStatement(hiveProperty, stmt);
}
for (Pair<String, Integer> queryAndResult : this.config.getHiveQueries()) {
log.info("Running {}", queryAndResult.getLeft());
ResultSet res = stmt.executeQuery(queryAndResult.getLeft());
if (!res.next()) {
log.info("res.next() was false - the query returned no rows.");
assert 0 == queryAndResult.getRight();
} else {
Integer result = res.getInt(1);
if (!queryAndResult.getRight().equals(result)) {
throw new AssertionError(
"QUERY: " + queryAndResult.getLeft()
+ " | EXPECTED RESULT = " + queryAndResult.getRight()
+ " | ACTUAL RESULT = " + result
);
}
}
log.info("Successfully validated query!");
}
this.hiveServiceProvider.stopLocalHiveServiceIfNeeded();
}
private void executeStatement(String query, Statement stmt) throws SQLException {
log.info("Executing statement {}", query);
stmt.execute(query);
}
}
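
HiveQueryNode validates each configured Hive query against an expected scalar result (typically a row count). A hedged sketch of the query/expected-result pairs it consumes; the builder that puts these pairs into the Config is not shown in this diff, so the values below are purely illustrative:

List<Pair<String, Integer>> hiveQueries = Arrays.asList(
    Pair.of("select count(*) from testdb.test_table", 1000),
    Pair.of("select count(*) from testdb.test_table where _hoodie_record_key is not null", 1000));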

View File

@@ -0,0 +1,46 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import org.apache.hudi.integ.testsuite.helpers.HiveServiceProvider;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
public class HiveSyncNode extends DagNode<Boolean> {
private HiveServiceProvider hiveServiceProvider;
public HiveSyncNode(Config config) {
this.config = config;
this.hiveServiceProvider = new HiveServiceProvider(config);
}
@Override
public void execute(ExecutionContext executionContext) throws Exception {
log.info("Executing hive sync node");
this.hiveServiceProvider.startLocalHiveServiceIfNeeded(executionContext.getHoodieTestSuiteWriter().getConfiguration());
this.hiveServiceProvider.syncToLocalHiveIfNeeded(executionContext.getHoodieTestSuiteWriter());
executionContext.getHoodieTestSuiteWriter().getDeltaStreamerWrapper().getDeltaSyncService().getDeltaSync().syncHive();
this.hiveServiceProvider.stopLocalHiveServiceIfNeeded();
}
public HiveServiceProvider getHiveServiceProvider() {
return hiveServiceProvider;
}
}

View File

@@ -0,0 +1,60 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
import org.apache.hudi.integ.testsuite.generator.DeltaGenerator;
import org.apache.spark.api.java.JavaRDD;
public class InsertNode extends DagNode<JavaRDD<WriteStatus>> {
public InsertNode(Config config) {
this.config = config;
}
@Override
public void execute(ExecutionContext executionContext) throws Exception {
generate(executionContext.getDeltaGenerator());
log.info("Configs : {}", this.config);
if (!config.isDisableIngest()) {
log.info("Inserting input data {}", this.getName());
Option<String> commitTime = executionContext.getHoodieTestSuiteWriter().startCommit();
JavaRDD<WriteStatus> writeStatus = ingest(executionContext.getHoodieTestSuiteWriter(), commitTime);
executionContext.getHoodieTestSuiteWriter().commit(writeStatus, commitTime);
this.result = writeStatus;
}
}
protected void generate(DeltaGenerator deltaGenerator) throws Exception {
if (!config.isDisableGenerate()) {
log.info("Generating input data for node {}", this.getName());
deltaGenerator.writeRecords(deltaGenerator.generateInserts(config)).count();
}
}
protected JavaRDD<WriteStatus> ingest(HoodieTestSuiteWriter hoodieTestSuiteWriter,
Option<String> commitTime) throws Exception {
return hoodieTestSuiteWriter.insert(commitTime);
}
}
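
Subclasses customize the node by overriding generate(...) and/or ingest(...). A hypothetical bulk-insert variant (not part of this commit, and assuming HoodieTestSuiteWriter exposes a bulkInsert(Option<String>) counterpart to insert/upsert) might look like:

public class BulkInsertNode extends InsertNode {
  public BulkInsertNode(Config config) {
    super(config);
  }
  @Override
  protected JavaRDD<WriteStatus> ingest(HoodieTestSuiteWriter hoodieTestSuiteWriter, Option<String> commitTime) throws Exception {
    // assumption: bulkInsert(...) exists on the writer alongside insert(...) and upsert(...)
    return hoodieTestSuiteWriter.bulkInsert(commitTime);
  }
}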

View File

@@ -0,0 +1,48 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
public class RollbackNode extends DagNode<Option<HoodieInstant>> {
public RollbackNode(Config config) {
this.config = config;
}
@Override
public void execute(ExecutionContext executionContext) throws Exception {
log.info("Executing rollback node {}", this.getName());
// Rollback can only be performed by instantiating a new WriteClient, hence it cannot be exercised via the
// DeltaStreamer path for now
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(executionContext.getHoodieTestSuiteWriter().getConfiguration(),
executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath);
Option<HoodieInstant> lastInstant = metaClient.getActiveTimeline().getCommitsTimeline().lastInstant();
if (lastInstant.isPresent()) {
log.info("Rolling back last instant {}", lastInstant.get());
executionContext.getHoodieTestSuiteWriter().getWriteClient(this).rollback(lastInstant.get().getTimestamp());
this.result = lastInstant;
}
}
}

View File

@@ -0,0 +1,56 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import org.apache.hudi.common.model.HoodieCommitMetadata;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
public class ScheduleCompactNode extends DagNode<Option<String>> {
public ScheduleCompactNode(Config config) {
this.config = config;
}
@Override
public void execute(ExecutionContext executionContext) throws Exception {
log.info("Executing schedule compact node {}", this.getName());
// Scheduling compaction can only be performed by instantiating a new WriteClient, hence it cannot be exercised
// via the DeltaStreamer path for now
// Find the last commit and extract its extra metadata to pass on to the scheduled compaction. This is
// done to ensure the CHECKPOINT is correctly carried forward from commit to commit
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(executionContext.getHoodieTestSuiteWriter().getConfiguration(),
executionContext.getHoodieTestSuiteWriter().getCfg().targetBasePath);
Option<HoodieInstant> lastInstant = metaClient.getActiveTimeline().getCommitsTimeline().lastInstant();
if (lastInstant.isPresent()) {
HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes(metaClient
.getActiveTimeline().getInstantDetails(lastInstant.get()).get(), HoodieCommitMetadata.class);
Option<String> scheduledInstant = executionContext.getHoodieTestSuiteWriter().scheduleCompaction(Option.of(metadata
.getExtraMetadata()));
if (scheduledInstant.isPresent()) {
log.info("Scheduling compaction instant {}", scheduledInstant.get());
}
this.result = scheduledInstant;
}
}
}

View File

@@ -0,0 +1,61 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.integ.testsuite.helpers.HiveServiceProvider;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
public class SparkSQLQueryNode extends DagNode<Boolean> {
HiveServiceProvider hiveServiceProvider;
public SparkSQLQueryNode(Config config) {
this.config = config;
this.hiveServiceProvider = new HiveServiceProvider(config);
}
@Override
public void execute(ExecutionContext executionContext) throws Exception {
log.info("Executing spark sql query node");
this.hiveServiceProvider.startLocalHiveServiceIfNeeded(executionContext.getHoodieTestSuiteWriter().getConfiguration());
this.hiveServiceProvider.syncToLocalHiveIfNeeded(executionContext.getHoodieTestSuiteWriter());
SparkSession session = SparkSession.builder().sparkContext(executionContext.getJsc().sc()).getOrCreate();
for (String hiveProperty : this.config.getHiveProperties()) {
session.sql(hiveProperty).count();
}
for (Pair<String, Integer> queryAndResult : this.config.getHiveQueries()) {
log.info("Running {}", queryAndResult.getLeft());
Dataset<Row> res = session.sql(queryAndResult.getLeft());
if (res.count() == 0) {
assert 0 == queryAndResult.getRight();
} else {
assert ((Row[]) res.collect())[0].getInt(0) == queryAndResult.getRight();
}
log.info("Successfully validated query!");
}
this.hiveServiceProvider.stopLocalHiveServiceIfNeeded();
this.result = true;
}
}

View File

@@ -0,0 +1,52 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.generator.DeltaGenerator;
import org.apache.spark.api.java.JavaRDD;
public class UpsertNode extends InsertNode {
public UpsertNode(Config config) {
super(config);
}
@Override
protected void generate(DeltaGenerator deltaGenerator) throws Exception {
if (!config.isDisableGenerate()) {
log.info("Generating input data {}", this.getName());
deltaGenerator.writeRecords(deltaGenerator.generateUpdates(config)).count();
}
}
@Override
protected JavaRDD<WriteStatus> ingest(HoodieTestSuiteWriter hoodieTestSuiteWriter, Option<String> commitTime)
throws Exception {
if (!config.isDisableIngest()) {
log.info("Upserting input data {}", this.getName());
this.result = hoodieTestSuiteWriter.upsert(commitTime);
}
return this.result;
}
}

View File

@@ -0,0 +1,48 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.nodes;
import java.util.List;
import java.util.function.Function;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
public class ValidateNode<R> extends DagNode {
protected Function<List<DagNode>, R> function;
public ValidateNode(Config config, Function<List<DagNode>, R> function) {
this.function = function;
this.config = config;
}
@Override
public void execute(ExecutionContext executionContext) {
if (this.getParentNodes().size() > 0 && (Boolean) this.config.getOtherConfigs().getOrDefault("WAIT_FOR_PARENTS",
true)) {
for (DagNode node : (List<DagNode>) this.getParentNodes()) {
if (!node.isCompleted()) {
throw new RuntimeException("cannot validate before parent nodes are complete");
}
}
}
this.result = this.function.apply((List<DagNode>) this.getParentNodes());
}
}
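
A hedged sketch of plugging in a validation function, assuming validateConfig is a DeltaConfig.Config built elsewhere; the lambda simply checks that every parent node produced a non-null result:

ValidateNode<Boolean> sanityCheck = new ValidateNode<>(validateConfig,
    parentNodes -> parentNodes.stream().allMatch(parent -> parent.getResult() != null));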

View File

@@ -0,0 +1,100 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag.scheduler;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter;
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
import org.apache.hudi.integ.testsuite.dag.WorkflowDag;
import org.apache.hudi.integ.testsuite.generator.DeltaGenerator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class DagScheduler {
private static Logger log = LoggerFactory.getLogger(DagScheduler.class);
private WorkflowDag workflowDag;
private ExecutionContext executionContext;
public DagScheduler(WorkflowDag workflowDag, HoodieTestSuiteWriter hoodieTestSuiteWriter, DeltaGenerator deltaGenerator) {
this.workflowDag = workflowDag;
this.executionContext = new ExecutionContext(null, hoodieTestSuiteWriter, deltaGenerator);
}
public void schedule() throws Exception {
ExecutorService service = Executors.newFixedThreadPool(2);
try {
execute(service, workflowDag.getNodeList());
service.shutdown();
} finally {
if (!service.isShutdown()) {
log.info("Forcing shutdown of executor service, this might kill running tasks");
service.shutdownNow();
}
}
}
private void execute(ExecutorService service, List<DagNode> nodes) throws Exception {
// Nodes at the same level are executed in parallel
Queue<DagNode> queue = new PriorityQueue<>(nodes);
log.info("Running workloads");
do {
List<Future> futures = new ArrayList<>();
Set<DagNode> childNodes = new HashSet<>();
while (queue.size() > 0) {
DagNode nodeToExecute = queue.poll();
futures.add(service.submit(() -> executeNode(nodeToExecute)));
if (nodeToExecute.getChildNodes().size() > 0) {
childNodes.addAll(nodeToExecute.getChildNodes());
}
}
queue.addAll(childNodes);
childNodes.clear();
for (Future future : futures) {
future.get(1, TimeUnit.HOURS);
}
} while (queue.size() > 0);
log.info("Finished workloads");
}
private void executeNode(DagNode node) {
if (node.isCompleted()) {
throw new RuntimeException("DagNode already completed! Cannot re-execute");
}
try {
node.execute(executionContext);
node.setCompleted(true);
log.info("Finished executing {}", node.getName());
} catch (Exception e) {
log.error("Exception executing node {}", node.getName(), e);
throw new HoodieException(e);
}
}
}
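
A hedged end-to-end sketch of driving a dag through the scheduler; the WorkflowDag constructor shown here is an assumption (it is defined elsewhere in this commit), and rootInsertNode, testSuiteWriter and deltaGenerator are assumed to be built by the caller:

WorkflowDag dag = new WorkflowDag(Arrays.asList(rootInsertNode)); // assumed to accept the root-level nodes
new DagScheduler(dag, testSuiteWriter, deltaGenerator).schedule(); // executes the dag level by level, two nodes at a time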

View File

@@ -0,0 +1,237 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.generator;
import java.io.IOException;
import java.io.Serializable;
import java.io.UncheckedIOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.stream.StreamSupport;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.converter.Converter;
import org.apache.hudi.integ.testsuite.converter.UpdateConverter;
import org.apache.hudi.integ.testsuite.reader.DFSAvroDeltaInputReader;
import org.apache.hudi.integ.testsuite.reader.DFSHoodieDatasetInputReader;
import org.apache.hudi.integ.testsuite.reader.DeltaInputReader;
import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode;
import org.apache.hudi.integ.testsuite.writer.DeltaWriteStats;
import org.apache.hudi.integ.testsuite.writer.DeltaWriterAdapter;
import org.apache.hudi.integ.testsuite.writer.DeltaWriterFactory;
import org.apache.hudi.keygen.ComplexKeyGenerator;
import org.apache.hudi.keygen.KeyGenerator;
import org.apache.hudi.keygen.SimpleKeyGenerator;
import org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.storage.StorageLevel;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
/**
* The delta generator generates all types of workloads (insert, update) for the given configs.
*/
public class DeltaGenerator implements Serializable {
private static Logger log = LoggerFactory.getLogger(DeltaGenerator.class);
private DeltaConfig deltaOutputConfig;
private transient JavaSparkContext jsc;
private transient SparkSession sparkSession;
private String schemaStr;
private List<String> recordRowKeyFieldNames;
private List<String> partitionPathFieldNames;
private int batchId;
public DeltaGenerator(DeltaConfig deltaOutputConfig, JavaSparkContext jsc, SparkSession sparkSession,
String schemaStr,
KeyGenerator keyGenerator) {
this.deltaOutputConfig = deltaOutputConfig;
this.jsc = jsc;
this.sparkSession = sparkSession;
this.schemaStr = schemaStr;
this.recordRowKeyFieldNames = keyGenerator instanceof ComplexKeyGenerator ? ((ComplexKeyGenerator) keyGenerator)
.getRecordKeyFields() : Arrays.asList(((SimpleKeyGenerator) keyGenerator).getRecordKeyField());
this.partitionPathFieldNames = keyGenerator instanceof ComplexKeyGenerator ? ((ComplexKeyGenerator) keyGenerator)
.getPartitionPathFields() : Arrays.asList(((SimpleKeyGenerator) keyGenerator).getPartitionPathField());
}
public JavaRDD<DeltaWriteStats> writeRecords(JavaRDD<GenericRecord> records) {
// Create the writer adapter inside mapPartitions so that non-serializable writer state is instantiated
// on the executors instead of being captured in the closure (which would cause serialization issues)
JavaRDD<DeltaWriteStats> ws = records.mapPartitions(itr -> {
try {
DeltaWriterAdapter<GenericRecord> deltaWriterAdapter = DeltaWriterFactory
.getDeltaWriterAdapter(deltaOutputConfig, batchId);
return Collections.singletonList(deltaWriterAdapter.write(itr)).iterator();
} catch (IOException io) {
throw new UncheckedIOException(io);
}
}).flatMap(List::iterator);
batchId++;
return ws;
}
public JavaRDD<GenericRecord> generateInserts(Config operation) {
long recordsPerPartition = operation.getNumRecordsInsert();
int minPayloadSize = operation.getRecordSize();
JavaRDD<GenericRecord> inputBatch = jsc.parallelize(Collections.EMPTY_LIST)
.repartition(operation.getNumInsertPartitions()).mapPartitions(p -> {
return new LazyRecordGeneratorIterator(new FlexibleSchemaRecordGenerationIterator(recordsPerPartition,
minPayloadSize, schemaStr, partitionPathFieldNames));
});
return inputBatch;
}
public JavaRDD<GenericRecord> generateUpdates(Config config) throws IOException {
if (deltaOutputConfig.getDeltaOutputMode() == DeltaOutputMode.DFS) {
JavaRDD<GenericRecord> inserts = null;
if (config.getNumRecordsInsert() > 0) {
inserts = generateInserts(config);
}
DeltaInputReader deltaInputReader = null;
JavaRDD<GenericRecord> adjustedRDD = null;
if (config.getNumUpsertPartitions() < 1) {
// randomly generate updates for a given number of records without regard to partitions and files
deltaInputReader = new DFSAvroDeltaInputReader(sparkSession, schemaStr,
((DFSDeltaConfig) deltaOutputConfig).getDeltaBasePath(), Option.empty(), Option.empty());
adjustedRDD = deltaInputReader.read(config.getNumRecordsUpsert());
adjustedRDD = adjustRDDToGenerateExactNumUpdates(adjustedRDD, jsc, config.getNumRecordsUpsert());
} else {
deltaInputReader =
new DFSHoodieDatasetInputReader(jsc, ((DFSDeltaConfig) deltaOutputConfig).getDatasetOutputPath(),
schemaStr);
if (config.getFractionUpsertPerFile() > 0) {
adjustedRDD = deltaInputReader.read(config.getNumUpsertPartitions(), config.getNumUpsertFiles(),
config.getFractionUpsertPerFile());
} else {
adjustedRDD = deltaInputReader.read(config.getNumUpsertPartitions(), config.getNumUpsertFiles(), config
.getNumRecordsUpsert());
}
}
log.info("Repartitioning records");
// persist this since we will make multiple passes over this
adjustedRDD = adjustedRDD.repartition(jsc.defaultParallelism());
log.info("Repartitioning records done");
Converter converter = new UpdateConverter(schemaStr, config.getRecordSize(),
partitionPathFieldNames, recordRowKeyFieldNames);
JavaRDD<GenericRecord> updates = converter.convert(adjustedRDD);
log.info("Records converted");
updates.persist(StorageLevel.DISK_ONLY());
return inserts != null ? inserts.union(updates) : updates;
// TODO : Generate updates for only N partitions.
} else {
throw new IllegalArgumentException("Other formats are not supported at the moment");
}
}
public Map<Integer, Long> getPartitionToCountMap(JavaRDD<GenericRecord> records) {
// Requires us to keep the partitioner the same
return records.mapPartitionsWithIndex((index, itr) -> {
Iterable<GenericRecord> newIterable = () -> itr;
// parallelize counting for speed
long count = StreamSupport.stream(newIterable.spliterator(), true).count();
return Arrays.asList(new Tuple2<>(index, count)).iterator();
}, true).mapToPair(i -> i).collectAsMap();
}
public Map<Integer, Long> getAdjustedPartitionsCount(Map<Integer, Long> partitionCountMap, long
recordsToRemove) {
long remainingRecordsToRemove = recordsToRemove;
Iterator<Map.Entry<Integer, Long>> iterator = partitionCountMap.entrySet().iterator();
Map<Integer, Long> adjustedPartitionCountMap = new HashMap<>();
while (iterator.hasNext()) {
Map.Entry<Integer, Long> entry = iterator.next();
if (entry.getValue() < remainingRecordsToRemove) {
remainingRecordsToRemove -= entry.getValue();
adjustedPartitionCountMap.put(entry.getKey(), 0L);
} else {
long newValue = entry.getValue() - remainingRecordsToRemove;
remainingRecordsToRemove = 0;
adjustedPartitionCountMap.put(entry.getKey(), newValue);
}
if (remainingRecordsToRemove == 0) {
break;
}
}
return adjustedPartitionCountMap;
}
public JavaRDD<GenericRecord> adjustRDDToGenerateExactNumUpdates(JavaRDD<GenericRecord> updates, JavaSparkContext
jsc, long totalRecordsRequired) {
Map<Integer, Long> actualPartitionCountMap = getPartitionToCountMap(updates);
long totalRecordsGenerated = actualPartitionCountMap.values().stream().mapToLong(Long::longValue).sum();
if (isSafeToTake(totalRecordsRequired, totalRecordsGenerated)) {
// Generate totalRecordsRequired - totalRecordsGenerated new records and union the RDD's
// NOTE : This performs poorly when totalRecordsRequired >> totalRecordsGenerated. Hence, always
// ensure that enough inserts are created beforehand (this needs to be noted during the WorkflowDag creation)
long sizeOfUpdateRDD = totalRecordsGenerated;
while (totalRecordsRequired != sizeOfUpdateRDD) {
long recordsToTake = (totalRecordsRequired - sizeOfUpdateRDD) > sizeOfUpdateRDD
? sizeOfUpdateRDD : (totalRecordsRequired - sizeOfUpdateRDD);
if ((totalRecordsRequired - sizeOfUpdateRDD) > recordsToTake && recordsToTake <= sizeOfUpdateRDD) {
updates = updates.union(updates);
sizeOfUpdateRDD *= 2;
} else {
List<GenericRecord> remainingUpdates = updates.take((int) (recordsToTake));
updates = updates.union(jsc.parallelize(remainingUpdates));
sizeOfUpdateRDD = sizeOfUpdateRDD + recordsToTake;
}
}
return updates;
} else if (totalRecordsRequired < totalRecordsGenerated) {
final Map<Integer, Long> adjustedPartitionCountMap = getAdjustedPartitionsCount(actualPartitionCountMap,
totalRecordsGenerated - totalRecordsRequired);
// limit counts across partitions to meet the exact number of updates required
JavaRDD<GenericRecord> trimmedRecords = updates.mapPartitionsWithIndex((index, itr) -> {
int counter = 1;
List<GenericRecord> entriesToKeep = new ArrayList<>();
if (!adjustedPartitionCountMap.containsKey(index)) {
return itr;
} else {
long recordsToKeepForThisPartition = adjustedPartitionCountMap.get(index);
while (counter <= recordsToKeepForThisPartition && itr.hasNext()) {
entriesToKeep.add(itr.next());
counter++;
}
return entriesToKeep.iterator();
}
}, true);
return trimmedRecords;
}
return updates;
}
private boolean isSafeToTake(long totalRecords, long totalRecordsGenerated) {
// TODO : Ensure that the difference between totalRecords and totalRecordsGenerated is not too big; if it is,
// there are too few records on disk, and we need to find another way to generate updates when
// requiredUpdates >> insertedRecords
return totalRecords > totalRecordsGenerated;
}
}
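
A short usage sketch mirroring what InsertNode and UpsertNode do: generate a batch and force the lazy DFS write by counting the write stats. Here config is an assumed DeltaConfig.Config describing the batch:

JavaRDD<GenericRecord> inserts = deltaGenerator.generateInserts(config);
deltaGenerator.writeRecords(inserts).count(); // materializes the batch on DFS and bumps the internal batchId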

View File

@@ -0,0 +1,70 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.generator;
import java.util.Iterator;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
/**
* A record generation iterator for the custom schema of the workload. Implements {@link Iterator} to allow for
* lazy, one-record-at-a-time generation semantics.
*/
public class FlexibleSchemaRecordGenerationIterator implements Iterator<GenericRecord> {
// Stores how many records to generate as part of this iterator. Ideally, one iterator is started per spark
// partition.
private long counter;
// Use the full payload generator as default
private GenericRecordFullPayloadGenerator generator;
// Keep the first generated record so that its partition path values can be reused for all subsequent payloads
private GenericRecord lastRecord;
// Partition path field names
private List<String> partitionPathFieldNames;
public FlexibleSchemaRecordGenerationIterator(long maxEntriesToProduce, String schema) {
this(maxEntriesToProduce, GenericRecordFullPayloadGenerator.DEFAULT_PAYLOAD_SIZE, schema, null);
}
public FlexibleSchemaRecordGenerationIterator(long maxEntriesToProduce, int minPayloadSize, String schemaStr,
List<String> partitionPathFieldNames) {
this.counter = maxEntriesToProduce;
this.partitionPathFieldNames = partitionPathFieldNames;
Schema schema = new Schema.Parser().parse(schemaStr);
this.generator = new GenericRecordFullPayloadGenerator(schema, minPayloadSize);
}
@Override
public boolean hasNext() {
return this.counter > 0;
}
@Override
public GenericRecord next() {
this.counter--;
if (lastRecord == null) {
GenericRecord record = this.generator.getNewPayload();
lastRecord = record;
return record;
} else {
return this.generator.randomize(lastRecord, this.partitionPathFieldNames);
}
}
}
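
A hedged sketch of using the iterator directly, outside of Spark; schemaStr is an assumed Avro record schema string and partitionFields its partition path field names:

Iterator<GenericRecord> itr =
    new FlexibleSchemaRecordGenerationIterator(100, 10 * 1024, schemaStr, partitionFields);
while (itr.hasNext()) {
  GenericRecord record = itr.next(); // later records reuse the partition path values of the first record
  // hand 'record' off to a writer here; note the iterator may reuse the same underlying record instance
}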

View File

@@ -0,0 +1,285 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.generator;
import java.io.Serializable;
import java.nio.ByteBuffer;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.UUID;
import org.apache.avro.LogicalType;
import org.apache.avro.LogicalTypes;
import org.apache.avro.Schema;
import org.apache.avro.Schema.Type;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericData.Fixed;
import org.apache.avro.generic.GenericFixed;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.util.collection.Pair;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* This is a GenericRecord payload generator that generates full generic records {@link GenericRecord}.
* Every field of a generic record created using this generator contains a random value.
*/
public class GenericRecordFullPayloadGenerator implements Serializable {
public static final int DEFAULT_PAYLOAD_SIZE = 1024 * 10; // 10 KB
private static Logger log = LoggerFactory.getLogger(GenericRecordFullPayloadGenerator.class);
protected final Random random = new Random();
// The source schema used to generate a payload
private final transient Schema baseSchema;
// Used to validate a generic record
private final transient GenericData genericData = new GenericData();
// Number of more bytes to add based on the estimated full record payload size and min payload size
private int numberOfBytesToAdd;
// If more elements should be packed to meet the minPayloadSize
private boolean shouldAddMore;
// How many complex fields have we visited that can help us pack more entries and increase the size of the record
private int numberOfComplexFields;
// The size of a full record where every field of a generic record created contains 1 random value
private int estimatedFullPayloadSize;
// LogicalTypes in Avro 1.8.2
private static final String DECIMAL = "decimal";
private static final String UUID_NAME = "uuid";
private static final String DATE = "date";
private static final String TIME_MILLIS = "time-millis";
private static final String TIME_MICROS = "time-micros";
private static final String TIMESTAMP_MILLIS = "timestamp-millis";
private static final String TIMESTAMP_MICROS = "timestamp-micros";
public GenericRecordFullPayloadGenerator(Schema schema) {
this(schema, DEFAULT_PAYLOAD_SIZE);
}
public GenericRecordFullPayloadGenerator(Schema schema, int minPayloadSize) {
Pair<Integer, Integer> sizeInfo = new GenericRecordFullPayloadSizeEstimator(schema)
.typeEstimateAndNumComplexFields();
this.estimatedFullPayloadSize = sizeInfo.getLeft();
this.numberOfComplexFields = sizeInfo.getRight();
this.baseSchema = schema;
this.shouldAddMore = estimatedFullPayloadSize < minPayloadSize;
if (this.shouldAddMore) {
this.numberOfBytesToAdd = minPayloadSize - estimatedFullPayloadSize;
if (numberOfComplexFields < 1) {
log.warn("The schema does not have any collections/complex fields. Cannot achieve minPayloadSize : {}",
minPayloadSize);
}
}
}
protected static boolean isPrimitive(Schema localSchema) {
return localSchema.getType() != Type.ARRAY
&& localSchema.getType() != Type.MAP
&& localSchema.getType() != Type.RECORD
&& localSchema.getType() != Type.UNION;
}
public GenericRecord getNewPayload() {
return convert(baseSchema);
}
public GenericRecord getUpdatePayload(GenericRecord record, List<String> blacklistFields) {
return randomize(record, blacklistFields);
}
protected GenericRecord convert(Schema schema) {
GenericRecord result = new GenericData.Record(schema);
for (Schema.Field f : schema.getFields()) {
result.put(f.name(), typeConvert(f.schema()));
}
return result;
}
protected GenericRecord convertPartial(Schema schema) {
GenericRecord result = new GenericData.Record(schema);
for (Schema.Field f : schema.getFields()) {
boolean setNull = random.nextBoolean();
if (!setNull) {
result.put(f.name(), typeConvert(f.schema()));
} else {
result.put(f.name(), null);
}
}
// TODO : pack remaining bytes into a complex field
return result;
}
protected GenericRecord randomize(GenericRecord record, List<String> blacklistFields) {
for (Schema.Field f : record.getSchema().getFields()) {
if (blacklistFields == null || !blacklistFields.contains(f.name())) {
record.put(f.name(), typeConvert(f.schema()));
}
}
return record;
}
private Object typeConvert(Schema schema) {
Schema localSchema = schema;
if (isOption(schema)) {
localSchema = getNonNull(schema);
}
switch (localSchema.getType()) {
case BOOLEAN:
return random.nextBoolean();
case DOUBLE:
return random.nextDouble();
case FLOAT:
return random.nextFloat();
case INT:
return random.nextInt();
case LONG:
return random.nextLong();
case STRING:
return UUID.randomUUID().toString();
case ENUM:
List<String> enumSymbols = localSchema.getEnumSymbols();
return new GenericData.EnumSymbol(localSchema, enumSymbols.get(random.nextInt(enumSymbols.size())));
case RECORD:
return convert(localSchema);
case ARRAY:
Schema elementSchema = localSchema.getElementType();
List listRes = new ArrayList();
if (isPrimitive(elementSchema) && this.shouldAddMore) {
int numEntriesToAdd = numEntriesToAdd(elementSchema);
while (numEntriesToAdd > 0) {
listRes.add(typeConvert(elementSchema));
numEntriesToAdd--;
}
} else {
listRes.add(typeConvert(elementSchema));
}
return listRes;
case MAP:
Schema valueSchema = localSchema.getValueType();
Map<String, Object> mapRes = new HashMap<String, Object>();
if (isPrimitive(valueSchema) && this.shouldAddMore) {
int numEntriesToAdd = numEntriesToAdd(valueSchema);
while (numEntriesToAdd > 0) {
mapRes.put(UUID.randomUUID().toString(), typeConvert(valueSchema));
numEntriesToAdd--;
}
} else {
mapRes.put(UUID.randomUUID().toString(), typeConvert(valueSchema));
}
return mapRes;
case BYTES:
return ByteBuffer.wrap(UUID.randomUUID().toString().getBytes(Charset.defaultCharset()));
case FIXED:
return generateFixedType(localSchema);
default:
throw new IllegalArgumentException(
"Cannot handle type: " + localSchema.getType());
}
}
private Object generateFixedType(Schema localSchema) {
// TODO: Need to implement valid data generation for fixed type
GenericFixed genericFixed = new GenericData.Fixed(localSchema);
switch (localSchema.getLogicalType().getName()) {
case UUID_NAME:
((Fixed) genericFixed).bytes(UUID.randomUUID().toString().getBytes());
return genericFixed;
case DECIMAL:
return genericFixed;
case DATE:
return genericFixed;
case TIME_MILLIS:
return genericFixed;
default:
throw new IllegalArgumentException(
"Cannot handle type: " + localSchema.getLogicalType());
}
}
public boolean validate(GenericRecord record) {
return genericData.validate(baseSchema, record);
}
protected boolean isOption(Schema schema) {
return schema.getType().equals(Schema.Type.UNION)
&& schema.getTypes().size() == 2
&& (schema.getTypes().get(0).getType().equals(Schema.Type.NULL)
|| schema.getTypes().get(1).getType().equals(Schema.Type.NULL));
}
protected Schema getNonNull(Schema schema) {
List<Schema> types = schema.getTypes();
return types.get(0).getType().equals(Schema.Type.NULL) ? types.get(1) : types.get(0);
}
public int getEstimatedFullPayloadSize() {
return estimatedFullPayloadSize;
}
private int getSize(Schema elementSchema) {
switch (elementSchema.getType()) {
case BOOLEAN:
return 1;
case DOUBLE:
return Double.BYTES;
case FLOAT:
return Float.BYTES;
case INT:
return Integer.BYTES;
case LONG:
return Long.BYTES;
case STRING:
return UUID.randomUUID().toString().length();
case ENUM:
return 1;
case BYTES:
return UUID.randomUUID().toString().length();
case FIXED:
return elementSchema.getFixedSize();
default:
throw new RuntimeException("Unknown type " + elementSchema.getType());
}
}
private int numEntriesToAdd(Schema elementSchema) {
// Find the size of the primitive data type in bytes
int primitiveDataTypeSize = getSize(elementSchema);
int numEntriesToAdd = numberOfBytesToAdd / primitiveDataTypeSize;
// If more than 10 entries are being added for this same complex field and there are still more complex fields to
// be visited in the schema, reduce the number of entries to add by a factor of 10 to allow for other complex
// fields to pack some entries
if (numEntriesToAdd > 10 && this.numberOfComplexFields > 1) {
numEntriesToAdd = numEntriesToAdd / 10;
numberOfBytesToAdd -= numEntriesToAdd * primitiveDataTypeSize;
this.shouldAddMore = true;
} else {
this.numberOfBytesToAdd = 0;
this.shouldAddMore = false;
}
this.numberOfComplexFields -= 1;
return numEntriesToAdd;
}
}
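
A short sketch of the generator API, assuming avroSchemaStr holds an Avro record schema and _row_key is its record key field:

Schema schema = new Schema.Parser().parse(avroSchemaStr);
GenericRecordFullPayloadGenerator generator = new GenericRecordFullPayloadGenerator(schema, 10 * 1024);
GenericRecord payload = generator.getNewPayload(); // every field populated with a random value
GenericRecord update = generator.getUpdatePayload(payload, Collections.singletonList("_row_key")); // re-randomize all non-key fields
assert generator.validate(update);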

View File

@@ -0,0 +1,121 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.generator;
import java.io.Serializable;
import java.util.List;
import java.util.UUID;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.util.collection.Pair;
/**
* This is a GenericRecord payload size estimator that estimates the size of a full generic record {@link GenericRecord}.
* A full record is defined as "every field of the generic record contains exactly one random value".
*/
public class GenericRecordFullPayloadSizeEstimator implements Serializable {
private final transient Schema baseSchema;
// This variable is used to track the number of complex/collection fields with primitive data types at their leaf.
// This is used to figure out how many entries can be packed in such a collection field to meet the min payload
// size requested
private final transient AtomicInteger counter = new AtomicInteger(0);
public GenericRecordFullPayloadSizeEstimator(Schema schema) {
this.baseSchema = schema;
}
public Pair<Integer, Integer> typeEstimateAndNumComplexFields() {
int size = estimate(baseSchema);
return Pair.of(size, counter.get());
}
/**
* This method estimates the size of the payload if all entries of this payload were populated with one value.
* For example, a primitive data type such as String will be populated with a {@link UUID}, so the length is 36 bytes,
* whereas a complex data type such as an array of type int will be populated with exactly one Integer value.
*/
protected int estimate(Schema schema) {
long size = 0;
for (Schema.Field f : schema.getFields()) {
size += typeEstimate(f.schema());
}
return (int) size;
}
private long typeEstimate(Schema schema) {
Schema localSchema = schema;
if (isOption(schema)) {
localSchema = getNonNull(schema);
}
switch (localSchema.getType()) {
case BOOLEAN:
return 1;
case DOUBLE:
return 8;
case FLOAT:
return 4;
case INT:
return 4;
case LONG:
return 8;
case STRING:
return UUID.randomUUID().toString().length();
case ENUM:
return 1;
case RECORD:
return estimate(localSchema);
case ARRAY:
if (GenericRecordFullPayloadGenerator.isPrimitive(localSchema.getElementType())) {
counter.addAndGet(1);
}
Schema elementSchema = localSchema.getElementType();
return typeEstimate(elementSchema);
case MAP:
if (GenericRecordFullPayloadGenerator.isPrimitive(localSchema.getValueType())) {
counter.addAndGet(1);
}
Schema valueSchema = localSchema.getValueType();
return UUID.randomUUID().toString().length() + typeEstimate(valueSchema);
case BYTES:
return UUID.randomUUID().toString().length();
case FIXED:
return localSchema.getFixedSize();
default:
throw new IllegalArgumentException(
"Cannot handle type: " + localSchema.getType());
}
}
protected boolean isOption(Schema schema) {
return schema.getType().equals(Schema.Type.UNION)
&& schema.getTypes().size() == 2
&& (schema.getTypes().get(0).getType().equals(Schema.Type.NULL)
|| schema.getTypes().get(1).getType().equals(Schema.Type.NULL));
}
protected Schema getNonNull(Schema schema) {
List<Schema> types = schema.getTypes();
return types.get(0).getType().equals(Schema.Type.NULL) ? types.get(1) : types.get(0);
}
}
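
A minimal sketch of how the estimator is consumed (this mirrors the constructor of GenericRecordFullPayloadGenerator above); schema is an assumed Avro record schema:

Pair<Integer, Integer> sizeInfo =
    new GenericRecordFullPayloadSizeEstimator(schema).typeEstimateAndNumComplexFields();
int estimatedFullPayloadBytes = sizeInfo.getLeft();  // size of a record with one value per field
int packableComplexFields = sizeInfo.getRight();     // collection fields that can absorb extra entries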

View File

@@ -0,0 +1,82 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.generator;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
/**
* This is a GenericRecord payload generator that generates partial generic records {@link GenericRecord}. A partial
* record is one in which some fields of the schema are NULL or not present. This generator lets us simulate the
* creation of partial records, which occur in many cases, especially in database change logs.
*/
public class GenericRecordPartialPayloadGenerator extends GenericRecordFullPayloadGenerator {
public GenericRecordPartialPayloadGenerator(Schema schema) {
super(schema);
}
public GenericRecordPartialPayloadGenerator(Schema schema, int minPayloadSize) {
super(schema, minPayloadSize);
}
@Override
protected GenericRecord convert(Schema schema) {
GenericRecord record = super.convertPartial(schema);
return record;
}
private void setNull(GenericRecord record) {
for (Schema.Field field : record.getSchema().getFields()) {
// A random boolean decides whether this field of the generic record should be present or absent.
// Using this we can set only a handful of fields in the record and generate partial records
boolean setNull = random.nextBoolean();
if (setNull) { // TODO : DO NOT SET THE RECORD KEY FIELDS TO NULL
record.put(field.name(), null);
} else {
if (record.get(field.name()) instanceof GenericData.Record) {
setNull((GenericData.Record) record.get(field.name()));
}
}
}
}
@Override
public boolean validate(GenericRecord record) {
return validate((Object) record);
}
// At least one entry should be null
private boolean validate(Object object) {
if (object == null) {
return true;
} else if (object instanceof GenericRecord) {
for (Schema.Field field : ((GenericRecord) object).getSchema().getFields()) {
boolean ret = validate(((GenericRecord) object).get(field.name()));
if (ret) {
return ret;
}
}
}
return false;
}
}
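
A brief sketch of the partial generator; roughly half of the fields of each generated record are left null:

GenericRecordPartialPayloadGenerator partialGenerator = new GenericRecordPartialPayloadGenerator(schema);
GenericRecord partial = partialGenerator.getNewPayload(); // some fields null, some populated
assert partialGenerator.validate(partial); // passes only when at least one field is null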

View File

@@ -0,0 +1,48 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.generator;
import java.util.Iterator;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.client.utils.LazyIterableIterator;
/**
* A lazy record generator that produces {@link GenericRecord}s on demand instead of holding a list of records in memory.
*/
public class LazyRecordGeneratorIterator extends
LazyIterableIterator<GenericRecord, GenericRecord> {
public LazyRecordGeneratorIterator(Iterator<GenericRecord> inputItr) {
super(inputItr);
}
@Override
protected void start() {
}
@Override
protected GenericRecord computeNext() {
return inputItr.next();
}
@Override
protected void end() {
}
}

View File

@@ -0,0 +1,56 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.generator;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
public class UpdateGeneratorIterator implements Iterator<GenericRecord> {
// Use the full payload generator as default
private GenericRecordFullPayloadGenerator generator;
private List<String> blackListedFields;
// iterator
private Iterator<GenericRecord> itr;
public UpdateGeneratorIterator(Iterator<GenericRecord> itr, String schemaStr, List<String> partitionPathFieldNames,
List<String> recordKeyFieldNames, int minPayloadSize) {
this.itr = itr;
this.blackListedFields = new ArrayList<>();
this.blackListedFields.addAll(partitionPathFieldNames);
this.blackListedFields.addAll(recordKeyFieldNames);
Schema schema = new Schema.Parser().parse(schemaStr);
this.generator = new GenericRecordFullPayloadGenerator(schema, minPayloadSize);
}
@Override
public boolean hasNext() {
return itr.hasNext();
}
@Override
public GenericRecord next() {
GenericRecord newRecord = itr.next();
return this.generator.randomize(newRecord, this.blackListedFields);
}
}

View File

@@ -0,0 +1,92 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.helpers;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.ImmutablePair;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.exception.HoodieIOException;
import org.apache.hudi.utilities.sources.helpers.DFSPathSelector;
/**
* A custom DFS path selector used only for the Hudi test suite. To be used only if the workload is not run inline.
*/
public class DFSTestSuitePathSelector extends DFSPathSelector {
public DFSTestSuitePathSelector(TypedProperties props, Configuration hadoopConf) {
super(props, hadoopConf);
}
@Override
public Pair<Option<String>, String> getNextFilePathsAndMaxModificationTime(
Option<String> lastCheckpointStr, long sourceLimit) {
Integer lastBatchId;
Integer nextBatchId;
try {
if (lastCheckpointStr.isPresent()) {
lastBatchId = Integer.parseInt(lastCheckpointStr.get());
nextBatchId = lastBatchId + 1;
} else {
lastBatchId = -1;
nextBatchId = 0;
}
// obtain all eligible files for the batch
List<FileStatus> eligibleFiles = new ArrayList<>();
FileStatus[] fileStatuses = fs.globStatus(
new Path(props.getString(Config.ROOT_INPUT_PATH_PROP), "*"));
for (FileStatus fileStatus : fileStatuses) {
if (!fileStatus.isDirectory() || IGNORE_FILEPREFIX_LIST.stream()
.anyMatch(pfx -> fileStatus.getPath().getName().startsWith(pfx))) {
continue;
} else if (fileStatus.getPath().getName().compareTo(lastBatchId.toString()) > 0 && fileStatus.getPath()
.getName().compareTo(nextBatchId.toString()) <= 0) {
RemoteIterator<LocatedFileStatus> files = fs.listFiles(fileStatus.getPath(), true);
while (files.hasNext()) {
eligibleFiles.add(files.next());
}
}
}
// no data to read
if (eligibleFiles.size() == 0) {
return new ImmutablePair<>(Option.empty(),
lastCheckpointStr.orElseGet(() -> String.valueOf(Long.MIN_VALUE)));
}
// read the files out.
String pathStr = eligibleFiles.stream().map(f -> f.getPath().toString())
.collect(Collectors.joining(","));
return new ImmutablePair<>(Option.ofNullable(pathStr), String.valueOf(nextBatchId));
} catch (IOException ioe) {
throw new HoodieIOException(
"Unable to read source data from checkpoint: " + lastCheckpointStr, ioe);
}
}
}
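
A hedged sketch of how the checkpoint advances, assuming the batch directories under the input root are named with increasing integer ids (0, 1, 2, ...):

Pair<Option<String>, String> batch =
    selector.getNextFilePathsAndMaxModificationTime(Option.of("3"), Long.MAX_VALUE);
// if batch directory 4 contains files, batch.getRight() is "4" and batch.getLeft() holds their comma-joined paths;
// otherwise the previous checkpoint "3" is returned together with Option.empty()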

View File

@@ -0,0 +1,74 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.helpers;
import java.io.IOException;
import java.net.BindException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hive.service.server.HiveServer2;
import org.apache.hudi.hive.testutils.HiveTestService;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.thrift.transport.TTransportException;
/**
* Hive Service provider.
*/
public class HiveServiceProvider {
private HiveTestService hiveService;
private HiveServer2 hiveServer;
private Config config;
private static final Logger LOG = LogManager.getLogger(HiveServiceProvider.class);
public HiveServiceProvider(Config config) {
this.config = config;
}
public void startLocalHiveServiceIfNeeded(Configuration configuration) throws IOException {
if (config.isHiveLocal()) {
hiveService = new HiveTestService(configuration);
hiveServer = hiveService.start();
}
}
public void syncToLocalHiveIfNeeded(HoodieTestSuiteWriter writer) {
if (this.config.isHiveLocal()) {
writer.getDeltaStreamerWrapper().getDeltaSyncService().getDeltaSync()
.syncHive(getLocalHiveServer().getHiveConf());
} else {
writer.getDeltaStreamerWrapper().getDeltaSyncService().getDeltaSync().syncHive();
}
}
public void stopLocalHiveServiceIfNeeded() throws IOException {
if (config.isHiveLocal()) {
if (hiveService != null) {
hiveService.stop();
}
}
}
public HiveServer2 getLocalHiveServer() {
return hiveServer;
}
}
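A sketch of the intended lifecycle, assuming the surrounding suite supplies a DeltaConfig.Config and a HoodieTestSuiteWriter (both are passed in as parameters here and are not constructed in this snippet):
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteWriter;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.helpers.HiveServiceProvider;

public class HiveSyncLifecycleSketch {
  // Start a local Hive service if configured, sync the written table, then shut the service down.
  static void runHiveSync(Config config, HoodieTestSuiteWriter writer) throws IOException {
    HiveServiceProvider hiveProvider = new HiveServiceProvider(config);
    hiveProvider.startLocalHiveServiceIfNeeded(new Configuration());
    try {
      hiveProvider.syncToLocalHiveIfNeeded(writer);
    } finally {
      hiveProvider.stopLocalHiveServiceIfNeeded();
    }
  }
}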

View File

@@ -0,0 +1,89 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.reader;
import java.io.IOException;
import java.util.Arrays;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.writer.AvroFileDeltaInputWriter;
import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.SparkSession;
/**
* A reader of {@link DeltaOutputMode#DFS} and {@link DeltaInputType#AVRO}.
*/
public class DFSAvroDeltaInputReader extends DFSDeltaInputReader {
private final SparkSession sparkSession;
private final String schemaStr;
private final String basePath;
private final Option<String> structName;
private final Option<String> nameSpace;
protected PathFilter filter = path -> path.toUri().toString().contains(AvroFileDeltaInputWriter.AVRO_EXTENSION);
public DFSAvroDeltaInputReader(SparkSession sparkSession, String schemaStr, String basePath,
Option<String> structName,
Option<String> nameSpace) {
this.sparkSession = sparkSession;
this.schemaStr = schemaStr;
this.basePath = basePath;
this.structName = structName;
this.nameSpace = nameSpace;
}
@Override
public JavaRDD<GenericRecord> read(long totalRecordsToRead) throws IOException {
return SparkBasedReader.readAvro(sparkSession, schemaStr, getFilePathsToRead(basePath, filter, totalRecordsToRead),
structName, nameSpace);
}
@Override
public JavaRDD<GenericRecord> read(int numPartitions, long approxNumRecords) throws IOException {
throw new UnsupportedOperationException("cannot generate updates");
}
@Override
public JavaRDD<GenericRecord> read(int numPartitions, int numFiles, long approxNumRecords) throws IOException {
throw new UnsupportedOperationException("cannot generate updates");
}
@Override
public JavaRDD<GenericRecord> read(int numPartitions, int numFiles, double percentageRecordsPerFile)
throws IOException {
throw new UnsupportedOperationException("cannot generate updates");
}
@Override
protected long analyzeSingleFile(String filePath) {
JavaRDD<GenericRecord> recordsFromOneFile = SparkBasedReader
.readAvro(sparkSession, schemaStr, Arrays.asList(filePath),
structName, nameSpace);
return recordsFromOneFile.count();
}
}
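A usage sketch, assuming a local SparkSession with the spark-avro data source on the classpath; the schema string and base path are placeholders for values the suite would normally provide:
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.reader.DFSAvroDeltaInputReader;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.SparkSession;

public class AvroDeltaReadSketch {
  public static void main(String[] args) throws Exception {
    SparkSession spark = SparkSession.builder().master("local[2]").appName("avro-delta-read").getOrCreate();
    String schemaStr = "...";                  // Avro schema of the generated input (placeholder)
    String basePath = "/tmp/test-suite/input"; // hypothetical directory written by AvroFileDeltaInputWriter
    DFSAvroDeltaInputReader reader = new DFSAvroDeltaInputReader(spark, schemaStr, basePath, Option.empty(), Option.empty());
    JavaRDD<GenericRecord> records = reader.read(1000L); // read roughly 1000 records
    System.out.println("Read " + records.count() + " records");
    spark.stop();
  }
}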

View File

@@ -0,0 +1,98 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.reader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.util.collection.Pair;
/**
* This class helps estimate the number of files that need to be read to fetch a given number of total records.
* Use this class for all DFS based implementations of {@link DeltaInputReader}
*/
public abstract class DFSDeltaInputReader implements DeltaInputReader<GenericRecord> {
protected List<String> getFilePathsToRead(String basePath, PathFilter filter, long totalRecordsToRead) throws
IOException {
FileSystem fs = FSUtils.getFs(basePath, new Configuration());
// TODO : Sort list by file size and take the median file status to ensure fair calculation and change to remote
// iterator
List<FileStatus> fileStatuses = Arrays.asList(fs.globStatus(new Path(basePath, "*/*"), filter));
if (fileStatuses.size() > 0) {
FileStatus status = fileStatuses.get(0);
long avgNumRecordsPerFile = analyzeSingleFile(status.getPath().toString());
long numFilesToMatchExpectedRecords = (long) Math.ceil((double) totalRecordsToRead / (double)
avgNumRecordsPerFile);
long avgSizeOfEachFile = status.getLen();
long totalSizeToRead = avgSizeOfEachFile * numFilesToMatchExpectedRecords;
// choose N files with that length
Pair<Integer, Integer> fileStatusIndexRange = getFileStatusIndexRange(fileStatuses, avgSizeOfEachFile,
totalSizeToRead);
int startIndex = fileStatusIndexRange.getLeft();
List<String> filePaths = new ArrayList<>();
// always pick at least one file, then keep adding files until the computed end index is reached
while (startIndex == 0 || startIndex < fileStatusIndexRange.getRight()) {
filePaths.add(fileStatuses.get(startIndex).getPath().toString());
startIndex++;
}
return filePaths;
}
return Collections.emptyList();
}
protected Pair<Integer, Integer> getFileStatusIndexRange(List<FileStatus> fileStatuses, long averageFileSize, long
totalSizeToRead) {
long totalSizeOfFilesPresent = 0;
int startOffset = 0;
int endOffset = 0;
for (FileStatus fileStatus : fileStatuses) {
// If current file length is greater than averageFileSize, increment by averageFileSize since our
// totalSizeToRead calculation is based on the averageRecordSize * numRecordsToRead.
if (fileStatus.getLen() > averageFileSize) {
totalSizeOfFilesPresent += averageFileSize;
} else {
totalSizeOfFilesPresent += fileStatus.getLen();
}
if (totalSizeOfFilesPresent <= totalSizeToRead) {
endOffset++;
continue;
} else {
return Pair.of(startOffset, endOffset);
}
}
return Pair.of(startOffset, endOffset);
}
/**
* Implementations of {@link DeltaInputReader} can override this to read a single file on DFS and return its
* record count, which is then used as the average number of records per file across N files.
*/
protected long analyzeSingleFile(String filePath) {
throw new UnsupportedOperationException("No implementation found");
}
}
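A small numeric sketch of the estimation above (sample numbers are made up): if one analyzed file holds ~500 records in ~2 MB, reading 5,000 records requires ceil(5000 / 500) = 10 files, or roughly 20 MB in total.
public class FileCountEstimationSketch {
  public static void main(String[] args) {
    long totalRecordsToRead = 5000L;            // requested records (example value)
    long avgNumRecordsPerFile = 500L;           // from analyzeSingleFile() on one sampled file
    long avgSizeOfEachFile = 2L * 1024 * 1024;  // length of the sampled file in bytes
    long numFilesToMatchExpectedRecords =
        (long) Math.ceil((double) totalRecordsToRead / (double) avgNumRecordsPerFile);
    long totalSizeToRead = avgSizeOfEachFile * numFilesToMatchExpectedRecords;
    System.out.println(numFilesToMatchExpectedRecords + " files, ~" + totalSizeToRead + " bytes to read");
  }
}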

View File

@@ -0,0 +1,338 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.reader;
import static java.util.Map.Entry.comparingByValue;
import static java.util.stream.Collectors.toMap;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieRecordPayload;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
import org.apache.hudi.common.table.view.TableFileSystemView;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.ParquetReaderIterator;
import org.apache.hudi.common.util.ValidationUtils;
import org.apache.hudi.config.HoodieMemoryConfig;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
/**
* This class helps generate updates from an already existing hoodie dataset. It supports generating updates
* across partitions, files and records.
*/
public class DFSHoodieDatasetInputReader extends DFSDeltaInputReader {
private static Logger log = LoggerFactory.getLogger(DFSHoodieDatasetInputReader.class);
private transient JavaSparkContext jsc;
private String schemaStr;
private HoodieTableMetaClient metaClient;
public DFSHoodieDatasetInputReader(JavaSparkContext jsc, String basePath, String schemaStr) {
this.jsc = jsc;
this.schemaStr = schemaStr;
this.metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
}
protected List<String> getPartitions(Option<Integer> partitionsLimit) throws IOException {
List<String> partitionPaths = FSUtils
.getAllPartitionPaths(metaClient.getFs(), metaClient.getBasePath(), false);
// Sort partition so we can pick last N partitions by default
Collections.sort(partitionPaths);
if (!partitionPaths.isEmpty()) {
ValidationUtils.checkArgument(partitionPaths.size() >= partitionsLimit.get(),
"Cannot generate updates for more partitions " + "than present in the dataset, partitions "
+ "requested " + partitionsLimit.get() + ", partitions present " + partitionPaths.size());
return partitionPaths.subList(0, partitionsLimit.get());
}
return partitionPaths;
}
private JavaPairRDD<String, Iterator<FileSlice>> getPartitionToFileSlice(HoodieTableMetaClient metaClient,
List<String> partitionPaths) {
TableFileSystemView.SliceView fileSystemView = new HoodieTableFileSystemView(metaClient,
metaClient.getCommitsAndCompactionTimeline().filterCompletedInstants());
// pass num partitions to another method
JavaPairRDD<String, Iterator<FileSlice>> partitionToFileSliceList = jsc.parallelize(partitionPaths).mapToPair(p -> {
return new Tuple2<>(p, fileSystemView.getLatestFileSlices(p).iterator());
});
return partitionToFileSliceList;
}
@Override
protected long analyzeSingleFile(String filePath) {
return SparkBasedReader.readParquet(new SparkSession(jsc.sc()), Arrays.asList(filePath),
Option.empty(), Option.empty()).count();
}
private JavaRDD<GenericRecord> fetchAnyRecordsFromDataset(Option<Long> numRecordsToUpdate) throws IOException {
return fetchRecordsFromDataset(Option.empty(), Option.empty(), numRecordsToUpdate, Option.empty());
}
private JavaRDD<GenericRecord> fetchAnyRecordsFromDataset(Option<Long> numRecordsToUpdate, Option<Integer>
numPartitions) throws IOException {
return fetchRecordsFromDataset(numPartitions, Option.empty(), numRecordsToUpdate, Option.empty());
}
private JavaRDD<GenericRecord> fetchPercentageRecordsFromDataset(Option<Integer> numPartitions, Option<Integer>
numFiles, Option<Double> percentageRecordsPerFile) throws IOException {
return fetchRecordsFromDataset(numPartitions, numFiles, Option.empty(), percentageRecordsPerFile);
}
private JavaRDD<GenericRecord> fetchRecordsFromDataset(Option<Integer> numPartitions, Option<Integer>
numFiles, Option<Long> numRecordsToUpdate) throws IOException {
return fetchRecordsFromDataset(numPartitions, numFiles, numRecordsToUpdate, Option.empty());
}
private JavaRDD<GenericRecord> fetchRecordsFromDataset(Option<Integer> numPartitions, Option<Integer> numFiles,
Option<Long> numRecordsToUpdate, Option<Double> percentageRecordsPerFile) throws IOException {
log.info("NumPartitions : {}, NumFiles : {}, numRecordsToUpdate : {}, percentageRecordsPerFile : {}",
numPartitions, numFiles, numRecordsToUpdate, percentageRecordsPerFile);
List<String> partitionPaths = getPartitions(numPartitions);
// Read all file slices in the partition
JavaPairRDD<String, Iterator<FileSlice>> partitionToFileSlice = getPartitionToFileSlice(metaClient,
partitionPaths);
// TODO : read record count from metadata
// Read the records in a single file
long recordsInSingleFile = iteratorSize(readParquetOrLogFiles(getSingleSliceFromRDD(partitionToFileSlice)));
int numFilesToUpdate;
long numRecordsToUpdatePerFile;
if (!numFiles.isPresent() || numFiles.get() == 0) {
// If num files are not passed, find the number of files to update based on total records to update and records
// per file
numFilesToUpdate = (int) (numRecordsToUpdate.get() / recordsInSingleFile);
log.info("Files to update {}", numFilesToUpdate);
numRecordsToUpdatePerFile = recordsInSingleFile;
} else {
// If num files is passed, find the number of records per file based on either percentage or total records to
// update and num files passed
numFilesToUpdate = numFiles.get();
numRecordsToUpdatePerFile = percentageRecordsPerFile.isPresent() ? (long) (recordsInSingleFile
* percentageRecordsPerFile.get()) : numRecordsToUpdate.get() / numFilesToUpdate;
}
// Adjust the number of files to read per partition based on the requested partition & file counts
Map<String, Integer> adjustedPartitionToFileIdCountMap = getFilesToReadPerPartition(partitionToFileSlice,
getPartitions(numPartitions).size(), numFilesToUpdate);
JavaRDD<GenericRecord> updates = projectSchema(generateUpdates(adjustedPartitionToFileIdCountMap,
partitionToFileSlice, numFilesToUpdate, (int) numRecordsToUpdatePerFile));
if (numRecordsToUpdate.isPresent() && numFiles.isPresent() && numFiles.get() != 0 && numRecordsToUpdate.get()
!= numRecordsToUpdatePerFile * numFiles.get()) {
long remainingRecordsToAdd = (numRecordsToUpdate.get() - (numRecordsToUpdatePerFile * numFiles.get()));
updates = updates.union(projectSchema(jsc.parallelize(generateUpdates(adjustedPartitionToFileIdCountMap,
partitionToFileSlice, numFilesToUpdate, (int) remainingRecordsToAdd).take((int) remainingRecordsToAdd))));
}
log.info("Finished generating updates");
return updates;
}
private JavaRDD<GenericRecord> projectSchema(JavaRDD<GenericRecord> updates) {
// The records read from the hoodie dataset have the hoodie record fields, rewrite the record to eliminate them
return updates
.map(r -> HoodieAvroUtils.rewriteRecordWithOnlyNewSchemaFields(r, new Schema.Parser().parse(schemaStr)));
}
private JavaRDD<GenericRecord> generateUpdates(Map<String, Integer> adjustedPartitionToFileIdCountMap,
JavaPairRDD<String, Iterator<FileSlice>> partitionToFileSlice, int numFiles, int numRecordsToReadPerFile) {
return partitionToFileSlice.map(p -> {
int maxFilesToRead = adjustedPartitionToFileIdCountMap.get(p._1);
return iteratorLimit(p._2, maxFilesToRead);
}).flatMap(p -> p).repartition(numFiles).map(fileSlice -> {
if (numRecordsToReadPerFile > 0) {
return iteratorLimit(readParquetOrLogFiles(fileSlice), numRecordsToReadPerFile);
} else {
return readParquetOrLogFiles(fileSlice);
}
}).flatMap(p -> p).map(i -> (GenericRecord) i);
}
private Map<String, Integer> getFilesToReadPerPartition(JavaPairRDD<String, Iterator<FileSlice>>
partitionToFileSlice, Integer numPartitions, Integer numFiles) {
int numFilesPerPartition = (int) Math.ceil((double) numFiles / numPartitions);
Map<String, Integer> partitionToFileIdCountMap = partitionToFileSlice
.mapToPair(p -> new Tuple2<>(p._1, iteratorSize(p._2))).collectAsMap();
long totalExistingFilesCount = partitionToFileIdCountMap.values().stream().reduce((a, b) -> a + b).get();
ValidationUtils.checkArgument(totalExistingFilesCount >= numFiles, "Cannot generate updates "
+ "for more files than present in the dataset, file requested " + numFiles + ", files present "
+ totalExistingFilesCount);
Map<String, Integer> partitionToFileIdCountSortedMap = partitionToFileIdCountMap
.entrySet()
.stream()
.sorted(comparingByValue())
.collect(toMap(Map.Entry::getKey, Map.Entry::getValue, (e1, e2) -> e2,
LinkedHashMap::new));
// Limit files to be read per partition
Map<String, Integer> adjustedPartitionToFileIdCountMap = new HashMap<>();
partitionToFileIdCountSortedMap.entrySet().stream().forEach(e -> {
if (e.getValue() <= numFilesPerPartition) {
adjustedPartitionToFileIdCountMap.put(e.getKey(), e.getValue());
} else {
adjustedPartitionToFileIdCountMap.put(e.getKey(), numFilesPerPartition);
}
});
return adjustedPartitionToFileIdCountMap;
}
private FileSlice getSingleSliceFromRDD(JavaPairRDD<String, Iterator<FileSlice>> partitionToFileSlice) {
return partitionToFileSlice.map(f -> {
FileSlice slice = f._2.next();
FileSlice newSlice = new FileSlice(slice.getFileGroupId(), slice.getBaseInstantTime());
if (slice.getBaseFile().isPresent()) {
newSlice.setBaseFile(slice.getBaseFile().get());
} else {
slice.getLogFiles().forEach(l -> {
newSlice.addLogFile(l);
});
}
return newSlice;
}).take(1).get(0);
}
private Iterator<IndexedRecord> readParquetOrLogFiles(FileSlice fileSlice) throws IOException {
if (fileSlice.getBaseFile().isPresent()) {
Iterator<IndexedRecord> itr =
new ParquetReaderIterator<IndexedRecord>(AvroParquetReader.<IndexedRecord>builder(new
Path(fileSlice.getBaseFile().get().getPath())).withConf(metaClient.getHadoopConf()).build());
return itr;
} else {
// If there is no data file, fall back to reading log files
HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(metaClient.getFs(),
metaClient.getBasePath(),
fileSlice.getLogFiles().map(l -> l.getPath().getName()).collect(Collectors.toList()),
new Schema.Parser().parse(schemaStr), metaClient.getActiveTimeline().getCommitsTimeline()
.filterCompletedInstants().lastInstant().get().getTimestamp(),
HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES, true, false,
HoodieMemoryConfig.DEFAULT_MAX_DFS_STREAM_BUFFER_SIZE,
HoodieMemoryConfig.DEFAULT_SPILLABLE_MAP_BASE_PATH);
// read the log files
Iterable<HoodieRecord<? extends HoodieRecordPayload>> iterable = () -> scanner.iterator();
Schema schema = new Schema.Parser().parse(schemaStr);
return StreamSupport.stream(iterable.spliterator(), false)
.map(e -> {
try {
return (IndexedRecord) e.getData().getInsertValue(schema).get();
} catch (IOException io) {
throw new UncheckedIOException(io);
}
}).iterator();
}
}
/**
* Returns the number of elements remaining in {@code iterator}. The iterator
* will be left exhausted: its {@code hasNext()} method will return
* {@code false}.
*/
private static int iteratorSize(Iterator<?> iterator) {
int count = 0;
while (iterator.hasNext()) {
iterator.next();
count++;
}
return count;
}
/**
* Creates an iterator returning the first {@code limitSize} elements of the
* given iterator. If the original iterator does not contain that many
* elements, the returned iterator will have the same behavior as the original
* iterator. The returned iterator supports {@code remove()} if the original
* iterator does.
*
* @param iterator the iterator to limit
* @param limitSize the maximum number of elements in the returned iterator
* @throws IllegalArgumentException if {@code limitSize} is negative
*/
private static <T> Iterator<T> iteratorLimit(
final Iterator<T> iterator, final int limitSize) {
ValidationUtils.checkArgument(iterator != null, "iterator is null");
ValidationUtils.checkArgument(limitSize >= 0, "limit is negative");
return new Iterator<T>() {
private int count;
@Override
public boolean hasNext() {
return count < limitSize && iterator.hasNext();
}
@Override
public T next() {
if (!hasNext()) {
throw new NoSuchElementException();
}
count++;
return iterator.next();
}
@Override
public void remove() {
iterator.remove();
}
};
}
@Override
public JavaRDD<GenericRecord> read(long numRecords) throws IOException {
return fetchAnyRecordsFromDataset(Option.of(numRecords));
}
@Override
public JavaRDD<GenericRecord> read(int numPartitions, long approxNumRecords) throws IOException {
return fetchAnyRecordsFromDataset(Option.of(approxNumRecords), Option.of(numPartitions));
}
@Override
public JavaRDD<GenericRecord> read(int numPartitions, int numFiles, long numRecords) throws IOException {
return fetchRecordsFromDataset(Option.of(numPartitions), Option.of(numFiles), Option.of(numRecords));
}
@Override
public JavaRDD<GenericRecord> read(int numPartitions, int numFiles, double percentageRecordsPerFile)
throws IOException {
return fetchPercentageRecordsFromDataset(Option.of(numPartitions), Option.of(numFiles),
Option.of(percentageRecordsPerFile));
}
}
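A usage sketch, assuming a running Spark context and an existing Hudi dataset; the base path and schema string are placeholders:
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.integ.testsuite.reader.DFSHoodieDatasetInputReader;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class DatasetUpdateGenerationSketch {
  public static void main(String[] args) throws Exception {
    JavaSparkContext jsc = new JavaSparkContext("local[2]", "dataset-input-reader-sketch");
    String basePath = "/tmp/hudi/dataset"; // hypothetical path of an existing Hudi dataset
    String schemaStr = "...";              // Avro schema of that dataset (placeholder)
    DFSHoodieDatasetInputReader reader = new DFSHoodieDatasetInputReader(jsc, basePath, schemaStr);
    // Generate updates for ~1000 existing records spread over 2 partitions and 4 files.
    JavaRDD<GenericRecord> updates = reader.read(2, 4, 1000L);
    System.out.println("Generated " + updates.count() + " update records");
    jsc.stop();
  }
}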

View File

@@ -0,0 +1,62 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.reader;
import java.io.IOException;
import java.io.Serializable;
import org.apache.spark.api.java.JavaRDD;
/**
* Implementations of {@link DeltaInputReader} will read the configured input type and provide an RDD of records to the
* client.
*
* @param <O> Read result data type
*/
public interface DeltaInputReader<O> extends Serializable {
/**
* Attempts to read an approximate number of records close to approxNumRecords.
* The actual count depends heavily on the number of records already present in the input.
*/
JavaRDD<O> read(long approxNumRecords) throws IOException;
/**
* Attempts to read an approximate number of records (exact if equal or more records are available)
* across the requested number of partitions.
*/
JavaRDD<O> read(int numPartitions, long approxNumRecords) throws IOException;
/**
* Attempts to read an approximate number of records (exact if equal or more records are available)
* across the requested number of partitions and number of files.
* 1. Find numFiles across numPartitions
* 2. numRecordsToReadPerFile = approxNumRecords / numFiles
*/
JavaRDD<O> read(int numPartitions, int numFiles, long approxNumRecords) throws IOException;
/**
* Attempts to read a percentage of records per file across the requested number of partitions and number of files.
* 1. Find numFiles across numPartitions
* 2. numRecordsToReadPerFile = approxNumRecordsPerFile * percentageRecordsPerFile
*/
JavaRDD<O> read(int numPartitions, int numFiles, double percentageRecordsPerFile) throws IOException;
}
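A quick illustration of the per-file math described in the contract above (the numbers are arbitrary): read(2, 4, 1000) targets 1000 / 4 = 250 records from each of 4 files spread over 2 partitions, while read(2, 4, 0.5) targets half the records of each chosen file.
public class ReadOverloadSketch {
  public static void main(String[] args) {
    int numFiles = 4;                       // example values only
    long approxNumRecords = 1000L;
    double percentageRecordsPerFile = 0.5;
    long numRecordsToReadPerFile = approxNumRecords / numFiles; // 250, per the contract above
    System.out.println("Per-file target for the record-count overload: " + numRecordsToReadPerFile);
    System.out.println("Per-file fraction for the percentage overload: " + percentageRecordsPerFile);
  }
}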

View File

@@ -0,0 +1,26 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.reader;
/**
* Supported delta input data types.
*/
public enum DeltaInputType {
AVRO, PARQUET
}

View File

@@ -0,0 +1,70 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.reader;
import java.util.List;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.AvroConversionUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.utilities.schema.RowBasedSchemaProvider;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import scala.collection.JavaConverters;
/**
* Helper class to read avro and/or parquet files and generate an RDD of {@link GenericRecord}.
*/
public class SparkBasedReader {
public static final String SPARK_AVRO_FORMAT = "avro";
public static final String SPARK_PARQUET_FORMAT = "com.databricks.spark.parquet";
private static final String AVRO_SCHEMA_OPTION_KEY = "avroSchema";
// Spark anyways globs the path and gets all the paths in memory so take the List<filePaths> as an argument.
// https://github.com/apache/spark/.../org/apache/spark/sql/execution/datasources/DataSource.scala#L251
public static JavaRDD<GenericRecord> readAvro(SparkSession sparkSession, String schemaStr, List<String> listOfPaths,
Option<String> structName, Option<String> nameSpace) {
Dataset<Row> dataSet = sparkSession.read()
.format(SPARK_AVRO_FORMAT)
.option(AVRO_SCHEMA_OPTION_KEY, schemaStr)
.load(JavaConverters.asScalaIteratorConverter(listOfPaths.iterator()).asScala().toSeq());
return AvroConversionUtils
.createRdd(dataSet.toDF(), structName.orElse(RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME),
nameSpace.orElse(RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE))
.toJavaRDD();
}
public static JavaRDD<GenericRecord> readParquet(SparkSession sparkSession, List<String>
listOfPaths, Option<String> structName, Option<String> nameSpace) {
Dataset<Row> dataSet = sparkSession.read()
.parquet((JavaConverters.asScalaIteratorConverter(listOfPaths.iterator()).asScala().toSeq()));
return AvroConversionUtils
.createRdd(dataSet.toDF(), structName.orElse(RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME),
nameSpace.orElse(RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE))
.toJavaRDD();
}
}
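A usage sketch, assuming a local SparkSession with the spark-avro data source available; the schema string and input path are placeholders:
import java.util.Arrays;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.reader.SparkBasedReader;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.SparkSession;

public class SparkBasedReaderSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().master("local[2]").appName("spark-based-reader").getOrCreate();
    String schemaStr = "..."; // Avro schema of the input files (placeholder)
    JavaRDD<GenericRecord> avroRecords = SparkBasedReader.readAvro(
        spark, schemaStr, Arrays.asList("/tmp/test-suite/input/0"), Option.empty(), Option.empty());
    System.out.println("Avro records read: " + avroRecords.count());
    spark.stop();
  }
}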

View File

@@ -0,0 +1,117 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.writer;
import java.io.IOException;
import java.io.OutputStream;
import java.util.UUID;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileWriter;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.avro.io.DatumWriter;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.fs.HoodieWrapperFileSystem;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Implementation of {@link DeltaInputWriter} that writes avro records to the result file.
*/
public class AvroFileDeltaInputWriter implements DeltaInputWriter<GenericRecord> {
public static final String AVRO_EXTENSION = ".avro";
private static Logger log = LoggerFactory.getLogger(AvroFileDeltaInputWriter.class);
// The maximum file size for an avro file before being rolled over to a new one
private final Long maxFileSize;
private final Configuration configuration;
private HoodieWrapperFileSystem fs;
// Path of the actual avro file
private Path file;
// Base input path to write avro files under
// TODO : Make this bucketed so don't have a large number of files in a single directory
private String basePath;
private DatumWriter<IndexedRecord> writer;
private DataFileWriter<IndexedRecord> dataFileWriter;
private OutputStream output;
private Schema schema;
private DeltaWriteStats deltaWriteStats;
private long recordsWritten = 0;
// TODO : Handle failure case which may leave behind tons of small corrupt files
public AvroFileDeltaInputWriter(Configuration configuration, String basePath, String schemaStr, Long maxFileSize)
throws IOException {
this.schema = new Schema.Parser().parse(schemaStr);
this.maxFileSize = maxFileSize;
this.configuration = configuration;
this.basePath = basePath;
Path path = new Path(basePath, new Path(UUID.randomUUID().toString() + AVRO_EXTENSION));
this.file = HoodieWrapperFileSystem.convertToHoodiePath(path, configuration);
this.fs = (HoodieWrapperFileSystem) this.file
.getFileSystem(FSUtils.registerFileSystem(path, configuration));
this.output = this.fs.create(this.file);
this.writer = new GenericDatumWriter<>(schema);
this.dataFileWriter = new DataFileWriter<>(writer).create(schema, output);
this.deltaWriteStats = new DeltaWriteStats();
}
@Override
public void writeData(GenericRecord iData) throws IOException {
this.dataFileWriter.append(iData);
recordsWritten++;
}
@Override
public boolean canWrite() {
return fs.getBytesWritten(file) < maxFileSize;
}
@Override
public void close() throws IOException {
this.deltaWriteStats.setBytesWritten(this.fs.getBytesWritten(this.file));
this.deltaWriteStats.setRecordsWritten(this.recordsWritten);
this.deltaWriteStats.setFilePath(this.file.toUri().getPath());
this.dataFileWriter.close();
log.info("New Avro File : {}", getPath());
}
@Override
public DeltaInputWriter getNewWriter() throws IOException {
AvroFileDeltaInputWriter avroFileDeltaInputWriter = new AvroFileDeltaInputWriter(this.configuration, this.basePath, this
.schema.toString(), this.maxFileSize);
return avroFileDeltaInputWriter;
}
public FileSystem getFs() {
return fs;
}
public Path getPath() {
return this.file;
}
@Override
public DeltaWriteStats getDeltaWriteStats() {
return this.deltaWriteStats;
}
}
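A sketch of direct usage, mirroring the tests further below; the schema string and base path are placeholders, and records would normally come from the suite's payload generator:
import java.io.IOException;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.integ.testsuite.writer.AvroFileDeltaInputWriter;
import org.apache.hudi.integ.testsuite.writer.DeltaInputWriter;
import org.apache.hudi.integ.testsuite.writer.DeltaWriteStats;

public class AvroDeltaWriterSketch {
  // Write records into a single avro file until the size limit is hit, then return its stats.
  static DeltaWriteStats writeBatch(Iterable<GenericRecord> records) throws IOException {
    DeltaInputWriter<GenericRecord> writer = new AvroFileDeltaInputWriter(
        new Configuration(), "/tmp/test-suite/input", "..." /* avro schema placeholder */, 1024 * 1024L);
    for (GenericRecord record : records) {
      if (!writer.canWrite()) {
        break; // a real caller would roll over via getNewWriter(), as DFSDeltaWriterAdapter does
      }
      writer.writeData(record);
    }
    writer.close();
    return writer.getDeltaWriteStats();
  }
}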

View File

@@ -0,0 +1,62 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.writer;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.avro.generic.GenericRecord;
/**
* {@link org.apache.hadoop.hdfs.DistributedFileSystem} (or {@link org.apache.hadoop.fs.LocalFileSystem}) based delta
* generator.
*/
public class DFSDeltaWriterAdapter implements DeltaWriterAdapter<GenericRecord> {
private DeltaInputWriter deltaInputGenerator;
private List<DeltaWriteStats> metrics = new ArrayList<>();
public DFSDeltaWriterAdapter(DeltaInputWriter<GenericRecord> deltaInputGenerator) {
this.deltaInputGenerator = deltaInputGenerator;
}
@Override
public List<DeltaWriteStats> write(Iterator<GenericRecord> input) throws IOException {
while (input.hasNext()) {
if (this.deltaInputGenerator.canWrite()) {
this.deltaInputGenerator.writeData(input.next());
} else if (input.hasNext()) {
rollOver();
}
}
close();
return this.metrics;
}
public void rollOver() throws IOException {
close();
this.deltaInputGenerator = this.deltaInputGenerator.getNewWriter();
}
private void close() throws IOException {
this.deltaInputGenerator.close();
this.metrics.add(this.deltaInputGenerator.getDeltaWriteStats());
}
}

View File

@@ -0,0 +1,42 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.writer;
import java.io.IOException;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.integ.testsuite.writer.DeltaWriterAdapter.SparkBasedDeltaWriter;
import org.apache.spark.api.java.JavaRDD;
/**
* TODO : Implement a custom Spark partitioner to ensure we write large enough avro files.
*/
public class DFSSparkAvroDeltaWriter implements SparkBasedDeltaWriter<JavaRDD<GenericRecord>> {
private DeltaInputWriter<JavaRDD<GenericRecord>> deltaInputWriter;
public DFSSparkAvroDeltaWriter(DeltaInputWriter<JavaRDD<GenericRecord>> deltaInputWriter) {
this.deltaInputWriter = deltaInputWriter;
}
@Override
public JavaRDD<DeltaWriteStats> write(JavaRDD<GenericRecord> input) throws IOException {
this.deltaInputWriter.writeData(input);
return null;
}
}

View File

@@ -0,0 +1,53 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.writer;
import java.io.Closeable;
import java.io.IOException;
/**
* Implementations of {@link DeltaInputWriter} write generated data of the given type to a target output.
*
* @param <I> Type of the data to be written.
*/
public interface DeltaInputWriter<I> extends Closeable {
/**
* Write a single generated data record.
*/
void writeData(I iData) throws IOException;
/**
* Check whether more data can/should be written.
*/
boolean canWrite();
/**
* Return the statistics of data written.
*/
DeltaWriteStats getDeltaWriteStats();
/**
* Return a new instance of this writer, rolling over to a fresh output target.
*/
DeltaInputWriter getNewWriter() throws IOException;
}

View File

@@ -0,0 +1,26 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.writer;
/**
* Supported output destination types for the generated delta workload.
*/
public enum DeltaOutputMode {
KAFKA, DFS
}

View File

@@ -0,0 +1,72 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.writer;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import org.apache.hudi.common.util.collection.Pair;
/**
* This class holds the write statistics for {@link DeltaInputWriter}.
*/
public class DeltaWriteStats implements Serializable {
// The file path (if any) for the data written
private String filePath;
// Number of bytes written before being closed
private long bytesWritten;
// Number of records written before being closed
private long recordsWritten;
private List<Pair<String, String>> partitionPathRecordKey = new ArrayList<>();
public String getFilePath() {
return filePath;
}
public void setFilePath(String filePath) {
this.filePath = filePath;
}
public long getBytesWritten() {
return bytesWritten;
}
public void setBytesWritten(long bytesWritten) {
this.bytesWritten = bytesWritten;
}
public List<Pair<String, String>> getPartitionPathRecordKey() {
return partitionPathRecordKey;
}
public void setPartitionPathRecordKey(List<Pair<String, String>> partitionPathRecordKey) {
this.partitionPathRecordKey = partitionPathRecordKey;
}
public long getRecordsWritten() {
return recordsWritten;
}
public void setRecordsWritten(long recordsWritten) {
this.recordsWritten = recordsWritten;
}
}

View File

@@ -0,0 +1,34 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.writer;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import org.apache.spark.api.java.JavaRDD;
public interface DeltaWriterAdapter<I> {
List<DeltaWriteStats> write(Iterator<I> input) throws IOException;
interface SparkBasedDeltaWriter<J> {
JavaRDD<DeltaWriteStats> write(J input) throws IOException;
}
}

View File

@@ -0,0 +1,57 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.writer;
import java.io.IOException;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig;
import org.apache.hudi.integ.testsuite.reader.DeltaInputType;
/**
* A factory to help instantiate different {@link DeltaWriterAdapter}s depending on the {@link DeltaOutputMode} and
* {@link DeltaInputType}.
*/
public class DeltaWriterFactory {
private DeltaWriterFactory() {
}
public static DeltaWriterAdapter getDeltaWriterAdapter(DeltaConfig config, Integer batchId) throws IOException {
switch (config.getDeltaOutputMode()) {
case DFS:
switch (config.getDeltaInputType()) {
case AVRO:
DFSDeltaConfig dfsDeltaConfig = (DFSDeltaConfig) config;
dfsDeltaConfig.setBatchId(batchId);
DeltaInputWriter<GenericRecord> fileDeltaInputGenerator = new AvroFileDeltaInputWriter(
dfsDeltaConfig.getConfiguration(),
StringUtils
.join(new String[]{dfsDeltaConfig.getDeltaBasePath(), dfsDeltaConfig.getBatchId().toString()},
"/"), dfsDeltaConfig.getSchemaStr(), dfsDeltaConfig.getMaxFileSize());
return new DFSDeltaWriterAdapter(fileDeltaInputGenerator);
default:
throw new IllegalArgumentException("Invalid delta input format " + config.getDeltaInputType());
}
default:
throw new IllegalArgumentException("Invalid delta input type " + config.getDeltaOutputMode());
}
}
}
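A sketch of obtaining a writer adapter through the factory, modeled on the functional test further below; the path, schema string and max file size are placeholders:
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig;
import org.apache.hudi.integ.testsuite.reader.DeltaInputType;
import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode;
import org.apache.hudi.integ.testsuite.writer.DeltaWriterAdapter;
import org.apache.hudi.integ.testsuite.writer.DeltaWriterFactory;

public class DeltaWriterFactorySketch {
  public static void main(String[] args) throws Exception {
    String basePath = "/tmp/test-suite/input"; // hypothetical DFS input directory
    DeltaConfig dfsSinkConfig = new DFSDeltaConfig(DeltaOutputMode.DFS, DeltaInputType.AVRO,
        new SerializableConfiguration(new Configuration()), basePath, basePath,
        "..." /* source avro schema placeholder */, 10240L /* max file size in bytes */);
    // Batch id 1: generated records will land under <basePath>/1/*.avro
    DeltaWriterAdapter<GenericRecord> adapter = DeltaWriterFactory.getDeltaWriterAdapter(dfsSinkConfig, 1);
    System.out.println("Created adapter: " + adapter.getClass().getSimpleName());
  }
}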

View File

@@ -0,0 +1,69 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.writer;
import java.io.IOException;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.AvroConversionUtils;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.SparkSession;
/**
* Spark based avro delta input writer. We don't use this yet since we cannot control result file size.
*/
public class SparkAvroDeltaInputWriter implements DeltaInputWriter<JavaRDD<GenericRecord>> {
private static final String AVRO_FORMAT_PACKAGE = "avro";
public SparkSession sparkSession;
private String schemaStr;
// TODO : the base path has to be a new path every time for spark avro
private String basePath;
public SparkAvroDeltaInputWriter(SparkSession sparkSession, String schemaStr, String basePath) {
this.sparkSession = sparkSession;
this.schemaStr = schemaStr;
this.basePath = basePath;
}
@Override
public void writeData(JavaRDD<GenericRecord> iData) throws IOException {
AvroConversionUtils.createDataFrame(iData.rdd(), schemaStr, sparkSession).write()
.format(AVRO_FORMAT_PACKAGE).save(basePath);
}
@Override
public boolean canWrite() {
throw new UnsupportedOperationException("not applicable for spark based writer");
}
@Override
public void close() throws IOException {
}
@Override
public DeltaWriteStats getDeltaWriteStats() {
throw new UnsupportedOperationException("not applicable for spark based writer");
}
@Override
public DeltaInputWriter getNewWriter() throws IOException {
throw new UnsupportedOperationException("not applicable for spark based writer");
}
}

View File

@@ -0,0 +1,138 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.when;
import java.io.IOException;
import java.util.Iterator;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.integ.testsuite.configuration.DFSDeltaConfig;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig;
import org.apache.hudi.integ.testsuite.generator.FlexibleSchemaRecordGenerationIterator;
import org.apache.hudi.integ.testsuite.reader.DeltaInputType;
import org.apache.hudi.integ.testsuite.utils.TestUtils;
import org.apache.hudi.integ.testsuite.writer.AvroFileDeltaInputWriter;
import org.apache.hudi.integ.testsuite.writer.DFSDeltaWriterAdapter;
import org.apache.hudi.integ.testsuite.writer.DeltaInputWriter;
import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode;
import org.apache.hudi.integ.testsuite.writer.DeltaWriteStats;
import org.apache.hudi.integ.testsuite.writer.DeltaWriterAdapter;
import org.apache.hudi.integ.testsuite.writer.DeltaWriterFactory;
import org.apache.hudi.utilities.schema.FilebasedSchemaProvider;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.apache.spark.api.java.JavaRDD;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
public class TestDFSHoodieTestSuiteWriterAdapter extends UtilitiesTestBase {
private FilebasedSchemaProvider schemaProvider;
private static final String COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/";
@BeforeAll
public static void initClass() throws Exception {
UtilitiesTestBase.initClass();
}
@AfterAll
public static void cleanupClass() {
UtilitiesTestBase.cleanupClass();
}
@BeforeEach
public void setup() throws Exception {
super.setup();
schemaProvider = new FilebasedSchemaProvider(Helpers.setupSchemaOnDFSWithAbsoluteScope(
System.getProperty("user.dir") + "/.." + COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH,
"complex-source.avsc"), jsc);
}
@AfterEach
public void teardown() throws Exception {
super.teardown();
}
@Test
public void testDFSOneFileWrite() throws IOException {
DeltaInputWriter<GenericRecord> mockFileSinkWriter = Mockito.mock(AvroFileDeltaInputWriter.class);
DeltaWriteStats mockDeltaWriteStats = Mockito.mock(DeltaWriteStats.class);
when(mockFileSinkWriter.getNewWriter()).thenReturn(mockFileSinkWriter);
when(mockFileSinkWriter.canWrite()).thenReturn(true);
when(mockFileSinkWriter.getDeltaWriteStats()).thenReturn(mockDeltaWriteStats);
DeltaWriterAdapter<GenericRecord> dfsDeltaWriterAdapter = new DFSDeltaWriterAdapter(mockFileSinkWriter);
JavaRDD<GenericRecord> records = TestUtils.makeRDD(jsc, 10);
dfsDeltaWriterAdapter.write(records.collect().iterator());
Mockito.verify(mockFileSinkWriter, times(10)).canWrite();
Mockito.verify(mockFileSinkWriter, times(1)).close();
}
@Test
public void testDFSTwoFilesWriteWithRollover() throws IOException {
DeltaInputWriter<GenericRecord> mockFileSinkWriter = Mockito.mock(AvroFileDeltaInputWriter.class);
DeltaWriteStats mockDeltaWriteStats = Mockito.mock(DeltaWriteStats.class);
when(mockFileSinkWriter.getNewWriter()).thenReturn(mockFileSinkWriter);
when(mockFileSinkWriter.canWrite()).thenReturn(false, true);
when(mockFileSinkWriter.getDeltaWriteStats()).thenReturn(mockDeltaWriteStats);
DeltaWriterAdapter<GenericRecord> dfsDeltaWriterAdapter = new DFSDeltaWriterAdapter(mockFileSinkWriter);
Iterator<GenericRecord> mockIterator = Mockito.mock(Iterator.class);
when(mockIterator.hasNext()).thenReturn(true, true, true, false);
dfsDeltaWriterAdapter.write(mockIterator);
Mockito.verify(mockFileSinkWriter, times(2)).canWrite();
Mockito.verify(mockFileSinkWriter, times(1)).getNewWriter();
Mockito.verify(mockFileSinkWriter, times(2)).close();
}
@Test
public void testDFSWorkloadSinkWithMultipleFilesFunctional() throws IOException {
DeltaConfig dfsSinkConfig = new DFSDeltaConfig(DeltaOutputMode.DFS, DeltaInputType.AVRO,
new SerializableConfiguration(jsc.hadoopConfiguration()), dfsBasePath, dfsBasePath,
schemaProvider.getSourceSchema().toString(), 10240L);
DeltaWriterAdapter<GenericRecord> dfsDeltaWriterAdapter = DeltaWriterFactory
.getDeltaWriterAdapter(dfsSinkConfig, 1);
FlexibleSchemaRecordGenerationIterator itr = new FlexibleSchemaRecordGenerationIterator(1000,
schemaProvider.getSourceSchema().toString());
dfsDeltaWriterAdapter.write(itr);
FileSystem fs = FSUtils.getFs(dfsBasePath, jsc.hadoopConfiguration());
FileStatus[] fileStatuses = fs.listStatus(new Path(dfsBasePath));
// Since maxFileSize was 10240L and we produced 1000 records each close to 1K in size, at least one file
// (and typically more than one) should be produced
assertTrue(fileStatuses.length > 0);
}
}

View File

@@ -0,0 +1,147 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.reader.SparkBasedReader;
import org.apache.hudi.integ.testsuite.writer.AvroFileDeltaInputWriter;
import org.apache.hudi.integ.testsuite.writer.DeltaInputWriter;
import org.apache.hudi.integ.testsuite.writer.DeltaWriteStats;
import org.apache.hudi.integ.testsuite.generator.GenericRecordFullPayloadGenerator;
import org.apache.hudi.utilities.schema.FilebasedSchemaProvider;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.apache.spark.api.java.JavaRDD;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
public class TestFileDeltaInputWriter extends UtilitiesTestBase {
private FilebasedSchemaProvider schemaProvider;
private static final String COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/";
@BeforeAll
public static void initClass() throws Exception {
UtilitiesTestBase.initClass();
}
@AfterAll
public static void cleanupClass() {
UtilitiesTestBase.cleanupClass();
}
@BeforeEach
public void setup() throws Exception {
super.setup();
schemaProvider = new FilebasedSchemaProvider(Helpers.setupSchemaOnDFSWithAbsoluteScope(System.getProperty("user.dir") + "/.."
+ COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH, "complex-source.avsc"), jsc);
}
@AfterEach
public void teardown() throws Exception {
super.teardown();
}
@Test
public void testAvroFileSinkWriter() throws IOException {
// 1. Create a Avro File Sink Writer
DeltaInputWriter<GenericRecord> fileSinkWriter =
new AvroFileDeltaInputWriter(jsc.hadoopConfiguration(), dfsBasePath + "/input", schemaProvider.getSourceSchema()
.toString(), 1024 * 1024L);
GenericRecordFullPayloadGenerator payloadGenerator =
new GenericRecordFullPayloadGenerator(schemaProvider.getSourceSchema());
// 2. Generate 100 avro payloads and write them to an avro file
IntStream.range(0, 100).forEach(a -> {
try {
fileSinkWriter.writeData(payloadGenerator.getNewPayload());
} catch (IOException io) {
throw new UncheckedIOException(io);
}
});
fileSinkWriter.close();
DeltaWriteStats deltaWriteStats = fileSinkWriter.getDeltaWriteStats();
FileSystem fs = FSUtils.getFs(dfsBasePath, jsc.hadoopConfiguration());
FileStatus[] fileStatuses = fs.listStatus(new Path(deltaWriteStats.getFilePath()));
// At least 1 file was written
assertEquals(1, fileStatuses.length);
// File length should be greater than 0
assertTrue(fileStatuses[0].getLen() > 0);
// The number of bytes written should be greater than 0
assertTrue(deltaWriteStats.getBytesWritten() > 0);
List<String> paths = Arrays.asList(fs.globStatus(new Path(dfsBasePath + "/*/*.avro")))
.stream().map(f -> f.getPath().toString()).collect(Collectors.toList());
JavaRDD<GenericRecord> writtenRecords =
SparkBasedReader.readAvro(sparkSession, schemaProvider.getSourceSchema().toString(), paths, Option.empty(),
Option.empty());
// Number of records written should be 100
assertEquals(writtenRecords.count(), 100);
// Number of records in file should match with the stats
assertEquals(writtenRecords.count(), deltaWriteStats.getRecordsWritten());
}
@Test
public void testAvroFileSinkCreateNewWriter() throws IOException {
// 1. Create an Avro file sink writer
DeltaInputWriter<GenericRecord> fileSinkWriter =
new AvroFileDeltaInputWriter(jsc.hadoopConfiguration(), dfsBasePath,
schemaProvider.getSourceSchema().toString(),
1024 * 1024L);
GenericRecordFullPayloadGenerator payloadGenerator =
new GenericRecordFullPayloadGenerator(schemaProvider.getSourceSchema());
// 2. Generate 100 avro payloads and write them to an avro file
IntStream.range(0, 100).forEach(a -> {
try {
fileSinkWriter.writeData(payloadGenerator.getNewPayload());
} catch (IOException io) {
throw new UncheckedIOException(io);
}
});
fileSinkWriter.close();
String oldFilePath = fileSinkWriter.getDeltaWriteStats().getFilePath();
assertFalse(oldFilePath == null);
DeltaInputWriter<GenericRecord> newFileSinkWriter = fileSinkWriter.getNewWriter();
newFileSinkWriter.close();
DeltaWriteStats newStats = newFileSinkWriter.getDeltaWriteStats();
assertEquals(newStats.getBytesWritten(), 3674);
assertEquals(newStats.getRecordsWritten(), 0);
assertTrue(newStats.getFilePath() != null);
}
}

View File

@@ -0,0 +1,80 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.configuration;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.ArrayList;
import java.util.List;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode;
import org.apache.hudi.integ.testsuite.dag.nodes.UpsertNode;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.WorkflowDag;
import org.junit.jupiter.api.Test;
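/**
 * Unit test that builds a {@link WorkflowDag} of insert and upsert nodes by hand and verifies the resulting node
 * structure and configs.
 */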
public class TestWorkflowBuilder {
@Test
public void testWorkloadOperationSequenceBuilder() {
DagNode root = new InsertNode(DeltaConfig.Config.newBuilder()
.withNumRecordsToInsert(10000)
.withNumInsertPartitions(1)
.withNumTimesToRepeat(2)
.withRecordSize(1000).build());
DagNode child1 = new UpsertNode(DeltaConfig.Config.newBuilder()
.withNumRecordsToUpdate(10000)
.withNumInsertPartitions(1)
.withNumTimesToRepeat(2)
.withRecordSize(1000).build());
root.addChildNode(child1);
child1.addParentNode(root);
List<DagNode> rootNodes = new ArrayList<>();
rootNodes.add(root);
WorkflowDag workflowDag = new WorkflowDag(rootNodes);
assertEquals(workflowDag.getNodeList().size(), 1);
assertEquals(((DagNode) workflowDag.getNodeList().get(0)).getChildNodes().size(), 1);
DagNode dagNode = (DagNode) workflowDag.getNodeList().get(0);
assertTrue(dagNode instanceof InsertNode);
DeltaConfig.Config config = dagNode.getConfig();
assertEquals(config.getNumInsertPartitions(), 1);
assertEquals(config.getRecordSize(), 1000);
assertEquals(config.getRepeatCount(), 2);
assertEquals(config.getNumRecordsInsert(), 10000);
assertEquals(config.getNumRecordsUpsert(), 0);
dagNode = (DagNode) ((DagNode) workflowDag.getNodeList().get(0)).getChildNodes().get(0);
assertTrue(dagNode instanceof UpsertNode);
config = dagNode.getConfig();
assertEquals(config.getNumInsertPartitions(), 1);
assertEquals(config.getRecordSize(), 1000);
assertEquals(config.getRepeatCount(), 2);
assertEquals(config.getNumRecordsInsert(), 0);
assertEquals(config.getNumRecordsUpsert(), 10000);
}
}

View File

@@ -0,0 +1,93 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.converter;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import org.apache.avro.Schema.Field;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.integ.testsuite.utils.TestUtils;
import org.apache.hudi.utilities.UtilHelpers;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import scala.Tuple2;
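/**
 * Unit test for the update converter, which generates random update records for the row keys of existing input
 * records.
 */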
public class TestUpdateConverter {
private JavaSparkContext jsc;
@BeforeEach
public void setup() throws Exception {
jsc = UtilHelpers.buildSparkContext(this.getClass().getName() + "-hoodie", "local[1]");
}
@AfterEach
public void teardown() {
jsc.stop();
}
@Test
public void testGenerateUpdateRecordsFromInputRecords() throws Exception {
JavaRDD<GenericRecord> inputRDD = TestUtils.makeRDD(jsc, 10);
String schemaStr = inputRDD.take(1).get(0).getSchema().toString();
int minPayloadSize = 1000;
// The update converter reads the input records and generates random updates for the same row keys
UpdateConverter updateConverter = new UpdateConverter(schemaStr, minPayloadSize,
Arrays.asList("timestamp"), Arrays.asList("_row_key"));
List<String> insertRowKeys = inputRDD.map(r -> r.get("_row_key").toString()).collect();
assertTrue(inputRDD.count() == 10);
JavaRDD<GenericRecord> outputRDD = updateConverter.convert(inputRDD);
List<String> updateRowKeys = outputRDD.map(row -> row.get("_row_key").toString()).collect();
// The insert row keys should be the same as update row keys
assertTrue(insertRowKeys.containsAll(updateRowKeys));
Map<String, GenericRecord> inputRecords = inputRDD.mapToPair(r -> new Tuple2<>(r.get("_row_key").toString(), r))
.collectAsMap();
List<GenericRecord> updateRecords = outputRDD.collect();
updateRecords.stream().forEach(updateRecord -> {
GenericRecord inputRecord = inputRecords.get(updateRecord.get("_row_key").toString());
assertTrue(areRecordsDifferent(inputRecord, updateRecord));
});
}
/**
* Checks whether any field in the two records differs (except the row key, which stays the same for an update).
*/
private boolean areRecordsDifferent(GenericRecord in, GenericRecord up) {
for (Field field : in.getSchema().getFields()) {
if (field.name() == "_row_key") {
continue;
} else {
// Just convert all types to string for now since all are primitive
if (in.get(field.name()).toString() != up.get(field.name()).toString()) {
return true;
}
}
}
return false;
}
}

View File

@@ -0,0 +1,81 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Function;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode;
import org.apache.hudi.integ.testsuite.dag.nodes.UpsertNode;
import org.apache.hudi.integ.testsuite.dag.nodes.ValidateNode;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.spark.api.java.JavaRDD;
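/**
 * A workflow DAG used in tests: an insert followed by an upsert, then a validate node that cross-checks the write
 * statuses of both parent nodes against their configured record counts.
 */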
public class ComplexDagGenerator implements WorkflowDagGenerator {
@Override
public WorkflowDag build() {
DagNode root = new InsertNode(Config.newBuilder()
.withNumRecordsToInsert(1000)
.withNumInsertPartitions(3)
.withRecordSize(1000).build());
DagNode child1 = new UpsertNode(Config.newBuilder()
.withNumRecordsToUpdate(999)
.withNumRecordsToInsert(1000)
.withNumUpsertFiles(1)
.withNumUpsertPartitions(1)
.withNumInsertPartitions(1)
.withRecordSize(10000).build());
Function<List<DagNode<JavaRDD<WriteStatus>>>, Boolean> function = (dagNodes) -> {
DagNode<JavaRDD<WriteStatus>> parent1 = dagNodes.get(0);
List<WriteStatus> statuses = parent1.getResult().collect();
long totalRecordsTouched = statuses.stream().map(st -> st.getStat().getNumUpdateWrites() + st.getStat()
.getNumInserts()).reduce((a, b) -> a + b).get();
boolean b1 = totalRecordsTouched == parent1.getConfig().getNumRecordsInsert()
+ parent1.getConfig().getNumRecordsUpsert();
boolean b2 = statuses.size() > parent1.getConfig().getNumUpsertFiles();
DagNode<JavaRDD<WriteStatus>> parent2 = parent1.getParentNodes().get(0);
statuses = parent2.getResult().collect();
totalRecordsTouched = statuses.stream().map(st -> st.getStat().getNumUpdateWrites() + st.getStat()
.getNumInserts()).reduce((a, b) -> a + b).get();
boolean b3 = totalRecordsTouched == parent2.getConfig().getNumRecordsInsert()
* parent2.getConfig().getNumInsertPartitions() + parent2.getConfig().getNumRecordsUpsert();
return b1 && b2 && b3;
};
DagNode child2 = new ValidateNode(Config.newBuilder().build(), function);
root.addChildNode(child1);
// child1.addParentNode(root);
child1.addChildNode(child2);
// child2.addParentNode(child1);
List<DagNode> rootNodes = new ArrayList<>();
rootNodes.add(root);
return new WorkflowDag(rootNodes);
}
}

View File

@@ -0,0 +1,58 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.HiveSyncNode;
import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.nodes.HiveQueryNode;
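/**
 * A workflow DAG used in tests: insert, sync to a local Hive instance, then run a Hive query and validate its result.
 */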
public class HiveSyncDagGenerator implements WorkflowDagGenerator {
@Override
public WorkflowDag build() {
DagNode root = new InsertNode(Config.newBuilder()
.withNumRecordsToInsert(100)
.withNumInsertPartitions(1)
.withNumTimesToRepeat(1)
.withRecordSize(1000).build());
DagNode child1 = new HiveSyncNode(Config.newBuilder().withHiveLocal(true).build());
root.addChildNode(child1);
DagNode child2 = new HiveQueryNode(Config.newBuilder().withHiveLocal(true).withHiveQueryAndResults(Arrays
.asList(Pair.of("select " + "count(*) from testdb1.table1 group " + "by rider having count(*) < 1", 0)))
.build());
child1.addChildNode(child2);
List<DagNode> rootNodes = new ArrayList<>();
rootNodes.add(root);
return new WorkflowDag(rootNodes);
}
}

View File

@@ -0,0 +1,58 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.HiveSyncNode;
import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.nodes.HiveQueryNode;
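/**
 * Same as {@link HiveSyncDagGenerator}, but the Hive query node reads the realtime (_rt) table of a MERGE_ON_READ
 * dataset.
 */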
public class HiveSyncDagGeneratorMOR implements WorkflowDagGenerator {
@Override
public WorkflowDag build() {
DagNode root = new InsertNode(Config.newBuilder()
.withNumRecordsToInsert(100)
.withNumInsertPartitions(1)
.withNumTimesToRepeat(1)
.withRecordSize(1000).build());
DagNode child1 = new HiveSyncNode(Config.newBuilder().withHiveLocal(true).build());
root.addChildNode(child1);
DagNode child2 = new HiveQueryNode(Config.newBuilder().withHiveLocal(true).withHiveQueryAndResults(Arrays
.asList(Pair.of("select " + "count(*) from testdb1.table1_rt group " + "by rider having count(*) < 1", 0)))
.build());
child1.addChildNode(child2);
List<DagNode> rootNodes = new ArrayList<>();
rootNodes.add(root);
return new WorkflowDag(rootNodes);
}
}

View File

@@ -0,0 +1,95 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.dag;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.util.ArrayList;
import java.util.List;
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
import org.apache.hudi.integ.testsuite.dag.nodes.DagNode;
import org.apache.hudi.integ.testsuite.dag.nodes.InsertNode;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
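/**
 * Unit tests for converting a workflow DAG to YAML and building a DAG back from a YAML definition.
 */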
public class TestDagUtils {
private static final String COW_DAG_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/complex-dag-cow.yaml";
@Test
public void testConvertDagToYaml() throws Exception {
ComplexDagGenerator dag = new ComplexDagGenerator();
String yaml = DagUtils.convertDagToYaml(dag.build());
System.out.println(yaml);
}
@Test
public void testConvertYamlToDag() throws Exception {
WorkflowDag dag = DagUtils.convertYamlToDag(UtilitiesTestBase.Helpers
.readFileFromAbsolutePath((System.getProperty("user.dir") + "/.." + COW_DAG_DOCKER_DEMO_RELATIVE_PATH)));
assertEquals(dag.getNodeList().size(), 1);
Assertions.assertEquals(((DagNode) dag.getNodeList().get(0)).getParentNodes().size(), 0);
assertEquals(((DagNode) dag.getNodeList().get(0)).getChildNodes().size(), 1);
DagNode firstChild = (DagNode) ((DagNode) dag.getNodeList().get(0)).getChildNodes().get(0);
assertEquals(firstChild.getParentNodes().size(), 1);
assertEquals(firstChild.getChildNodes().size(), 1);
assertEquals(((DagNode) firstChild.getChildNodes().get(0)).getChildNodes().size(), 1);
}
public static class ComplexDagGenerator implements WorkflowDagGenerator {
@Override
public WorkflowDag build() {
DagNode root = new InsertNode(Config.newBuilder()
.withNumRecordsToInsert(1000000)
.withNumInsertPartitions(1)
.withNumTimesToRepeat(2)
.withRecordSize(1000).build());
DagNode child1 = new InsertNode(Config.newBuilder()
.withNumRecordsToInsert(1000000)
.withNumInsertPartitions(1)
.withNumTimesToRepeat(2)
.withRecordSize(1000).build());
DagNode child2 = new InsertNode(Config.newBuilder()
.withNumRecordsToInsert(1000000)
.withNumInsertPartitions(1)
.withNumTimesToRepeat(2)
.withRecordSize(1000).build());
root.addChildNode(child1);
root.addChildNode(child2);
DagNode child3 = new InsertNode(Config.newBuilder()
.withNumRecordsToInsert(1000000)
.withNumInsertPartitions(1)
.withNumTimesToRepeat(2)
.withRecordSize(1000).build());
child2.addChildNode(child3);
List<DagNode> rootNodes = new ArrayList<>();
rootNodes.add(root);
return new WorkflowDag(rootNodes);
}
}
}

View File

@@ -0,0 +1,56 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.generator;
import static org.junit.jupiter.api.Assertions.assertEquals;
import org.apache.avro.Schema;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.junit.jupiter.api.Test;
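/**
 * Unit tests for {@link GenericRecordFullPayloadSizeEstimator}, checking the estimated payload size and the number of
 * complex fields for a simple and a complex source schema.
 */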
public class TestGenericRecordPayloadEstimator {
private static final String SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/source.avsc";
private static final String COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH =
"/docker/demo/config/test-suite/complex-source.avsc";
@Test
public void testSimpleSchemaSize() throws Exception {
Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers
.readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." + SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH));
GenericRecordFullPayloadSizeEstimator estimator =
new GenericRecordFullPayloadSizeEstimator(schema);
Pair<Integer, Integer> estimateAndNumComplexFields = estimator.typeEstimateAndNumComplexFields();
assertEquals(estimateAndNumComplexFields.getRight().intValue(), 0);
assertEquals(estimateAndNumComplexFields.getLeft().intValue(), 156);
}
@Test
public void testComplexSchemaSize() throws Exception {
Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers.readFileFromAbsolutePath(
System.getProperty("user.dir") + "/.." + COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH));
GenericRecordFullPayloadSizeEstimator estimator =
new GenericRecordFullPayloadSizeEstimator(schema);
Pair<Integer, Integer> estimateAndNumComplexFields = estimator.typeEstimateAndNumComplexFields();
assertEquals(estimateAndNumComplexFields.getRight().intValue(), 1);
assertEquals(estimateAndNumComplexFields.getLeft().intValue(), 1278);
}
}

View File

@@ -0,0 +1,130 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.generator;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.IntStream;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.junit.jupiter.api.Test;
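/**
 * Unit tests for the generic record payload generators: full and partial payload generation, update generation, and
 * minimum payload size handling.
 */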
public class TestGenericRecordPayloadGenerator {
private static final String SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/source.avsc";
private static final String COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH =
"/docker/demo/config/test-suite/complex-source.avsc";
@Test
public void testSimplePayload() throws Exception {
Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers
.readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." + SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH));
GenericRecordFullPayloadGenerator payloadGenerator = new GenericRecordFullPayloadGenerator(schema);
GenericRecord record = payloadGenerator.getNewPayload();
// The generated payload should validate with the provided schema
payloadGenerator.validate(record);
}
@Test
public void testComplexPayload() throws IOException {
Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers
.readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." +
COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH));
GenericRecordFullPayloadGenerator payloadGenerator = new GenericRecordFullPayloadGenerator(schema);
GenericRecord record = payloadGenerator.getNewPayload();
// The generated payload should validate with the provided schema
assertTrue(payloadGenerator.validate(record));
}
@Test
public void testComplexPartialPayload() throws IOException {
Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers
.readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." +
COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH));
GenericRecordPartialPayloadGenerator payloadGenerator = new GenericRecordPartialPayloadGenerator(schema);
IntStream.range(0, 10).forEach(a -> {
GenericRecord record = payloadGenerator.getNewPayload();
// The generated payload should validate with the provided schema
assertTrue(payloadGenerator.validate(record));
});
}
@Test
public void testUpdatePayloadGenerator() throws IOException {
Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers
.readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." + SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH));
GenericRecordFullPayloadGenerator payloadGenerator = new GenericRecordFullPayloadGenerator(schema);
List<String> insertRowKeys = new ArrayList<>();
List<String> updateRowKeys = new ArrayList<>();
List<Long> insertTimeStamps = new ArrayList<>();
List<Long> updateTimeStamps = new ArrayList<>();
List<GenericRecord> records = new ArrayList<>();
// Generate 10 new records
IntStream.range(0, 10).forEach(a -> {
GenericRecord record = payloadGenerator.getNewPayload();
records.add(record);
insertRowKeys.add(record.get("_row_key").toString());
insertTimeStamps.add((Long) record.get("timestamp"));
});
List<String> blacklistFields = Arrays.asList("_row_key");
records.stream().forEach(a -> {
// Generate 10 updated records
GenericRecord record = payloadGenerator.getUpdatePayload(a, blacklistFields);
updateRowKeys.add(record.get("_row_key").toString());
updateTimeStamps.add((Long) record.get("timestamp"));
});
// The row keys from insert payloads should match all the row keys from the update payloads
assertTrue(insertRowKeys.containsAll(updateRowKeys));
// The timestamp field for the insert payloads should not all match with the update payloads
assertFalse(insertTimeStamps.containsAll(updateTimeStamps));
}
@Test
public void testSimplePayloadWithLargeMinSize() throws Exception {
Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers
.readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." + SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH));
int minPayloadSize = 1000;
GenericRecordFullPayloadGenerator payloadGenerator = new GenericRecordFullPayloadGenerator(schema,
minPayloadSize);
GenericRecord record = payloadGenerator.getNewPayload();
// The generated payload stays below minPayloadSize since the schema has no collection fields to pad
assertTrue(HoodieAvroUtils.avroToBytes(record).length < minPayloadSize);
}
@Test
public void testComplexPayloadWithLargeMinSize() throws Exception {
Schema schema = new Schema.Parser().parse(UtilitiesTestBase.Helpers
.readFileFromAbsolutePath(System.getProperty("user.dir") + "/.." +
COMPLEX_SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH));
int minPayloadSize = 10000;
GenericRecordFullPayloadGenerator payloadGenerator = new GenericRecordFullPayloadGenerator(
schema, minPayloadSize);
GenericRecord record = payloadGenerator.getNewPayload();
// The generated payload should be at most 10% larger than minPayloadSize
assertTrue(HoodieAvroUtils.avroToBytes(record).length < minPayloadSize + 0.1 * minPayloadSize);
}
}

View File

@@ -0,0 +1,224 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.job;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.util.UUID;
import java.util.stream.Stream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.DataSourceWriteOptions;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteJob;
import org.apache.hudi.integ.testsuite.HoodieTestSuiteJob.HoodieTestSuiteConfig;
import org.apache.hudi.integ.testsuite.dag.ComplexDagGenerator;
import org.apache.hudi.integ.testsuite.dag.HiveSyncDagGenerator;
import org.apache.hudi.integ.testsuite.dag.HiveSyncDagGeneratorMOR;
import org.apache.hudi.integ.testsuite.dag.WorkflowDagGenerator;
import org.apache.hudi.integ.testsuite.reader.DeltaInputType;
import org.apache.hudi.integ.testsuite.writer.DeltaOutputMode;
import org.apache.hudi.keygen.TimestampBasedKeyGenerator;
import org.apache.hudi.utilities.schema.FilebasedSchemaProvider;
import org.apache.hudi.utilities.sources.AvroDFSSource;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
import org.junit.jupiter.params.provider.MethodSource;
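/**
 * End-to-end tests for {@link HoodieTestSuiteJob}, running insert/upsert/validate and hive-sync DAGs defined either
 * programmatically or via YAML against the configured table types.
 */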
public class TestHoodieTestSuiteJob extends UtilitiesTestBase {
private static final String TEST_NAME_WITH_PARAMS = "[{index}] Test with useDeltaStreamer={0}, tableType={1}";
private static final String BASE_PROPERTIES_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/base"
+ ".properties";
private static final String SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/source.avsc";
private static final String TARGET_SCHEMA_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/target.avsc";
private static final String COW_DAG_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/complex-dag-cow.yaml";
private static final String MOR_DAG_DOCKER_DEMO_RELATIVE_PATH = "/docker/demo/config/test-suite/complex-dag-mor.yaml";
public static Stream<Arguments> configParams() {
Object[][] data =
new Object[][] {{false, "COPY_ON_WRITE"}};
return Stream.of(data).map(Arguments::of);
}
@BeforeAll
public static void initClass() throws Exception {
UtilitiesTestBase.initClass();
// prepare the configs.
UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(System.getProperty("user.dir") + "/.."
+ BASE_PROPERTIES_DOCKER_DEMO_RELATIVE_PATH, dfs, dfsBasePath + "/base.properties");
UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(System.getProperty("user.dir") + "/.."
+ SOURCE_SCHEMA_DOCKER_DEMO_RELATIVE_PATH, dfs, dfsBasePath + "/source.avsc");
UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(System.getProperty("user.dir") + "/.."
+ TARGET_SCHEMA_DOCKER_DEMO_RELATIVE_PATH, dfs, dfsBasePath + "/target.avsc");
UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(System.getProperty("user.dir") + "/.."
+ COW_DAG_DOCKER_DEMO_RELATIVE_PATH, dfs, dfsBasePath + "/complex-dag-cow.yaml");
UtilitiesTestBase.Helpers.copyToDFSFromAbsolutePath(System.getProperty("user.dir") + "/.."
+ MOR_DAG_DOCKER_DEMO_RELATIVE_PATH, dfs, dfsBasePath + "/complex-dag-mor.yaml");
TypedProperties props = new TypedProperties();
props.setProperty("hoodie.datasource.write.recordkey.field", "_row_key");
props.setProperty("hoodie.datasource.write.partitionpath.field", "timestamp");
props.setProperty("hoodie.deltastreamer.keygen.timebased.timestamp.type", "UNIX_TIMESTAMP");
props.setProperty("hoodie.deltastreamer.keygen.timebased.output.dateformat", "yyyy/MM/dd");
props.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/source.avsc");
props.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/source.avsc");
props.setProperty("hoodie.deltastreamer.source.dfs.root", dfsBasePath + "/input");
props.setProperty("hoodie.datasource.hive_sync.assume_date_partitioning", "true");
props.setProperty("hoodie.datasource.write.keytranslator.class", "org.apache.hudi"
+ ".DayBasedPartitionPathKeyTranslator");
props.setProperty("hoodie.compact.inline.max.delta.commits", "3");
props.setProperty("hoodie.parquet.max.file.size", "1024000");
props.setProperty("hoodie.compact.inline.max.delta.commits", "0");
// Hive Configs
props.setProperty(DataSourceWriteOptions.HIVE_URL_OPT_KEY(), "jdbc:hive2://127.0.0.1:9999/");
props.setProperty(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY(), "testdb1");
props.setProperty(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY(), "table1");
props.setProperty(DataSourceWriteOptions.HIVE_PARTITION_FIELDS_OPT_KEY(), "datestr");
props.setProperty(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(), TimestampBasedKeyGenerator.class.getName());
UtilitiesTestBase.Helpers.savePropsToDFS(props, dfs, dfsBasePath + "/test-source"
+ ".properties");
// Properties used for the delta-streamer which incrementally pulls from upstream DFS Avro source and
// writes to downstream hudi table
TypedProperties downstreamProps = new TypedProperties();
downstreamProps.setProperty("include", "base.properties");
downstreamProps.setProperty("hoodie.datasource.write.recordkey.field", "_row_key");
downstreamProps.setProperty("hoodie.datasource.write.partitionpath.field", "timestamp");
// Source schema is the target schema of upstream table
downstreamProps.setProperty("hoodie.deltastreamer.schemaprovider.source.schema.file", dfsBasePath + "/source.avsc");
downstreamProps.setProperty("hoodie.deltastreamer.schemaprovider.target.schema.file", dfsBasePath + "/source.avsc");
UtilitiesTestBase.Helpers.savePropsToDFS(downstreamProps, dfs,
dfsBasePath + "/test-downstream-source.properties");
// these tests cause a lot of log verbosity from spark, turning it down
Logger.getLogger("org.apache.spark").setLevel(Level.WARN);
}
@AfterAll
public static void cleanupClass() {
UtilitiesTestBase.cleanupClass();
}
@BeforeEach
public void setup() throws Exception {
super.setup();
}
@AfterEach
public void teardown() throws Exception {
super.teardown();
}
// Tests in this class add to the test build time significantly. Since this is an integration test (end to end), we
// would like to run it as a nightly build, which is a TODO.
// TODO : Clean up input / result paths after each test
@ParameterizedTest(name = TEST_NAME_WITH_PARAMS)
@MethodSource("configParams")
public void testDagWithInsertUpsertAndValidate(boolean useDeltaStreamer, String tableType) throws Exception {
dfs.delete(new Path(dfsBasePath + "/input"), true);
dfs.delete(new Path(dfsBasePath + "/result"), true);
String inputBasePath = dfsBasePath + "/input/" + UUID.randomUUID().toString();
String outputBasePath = dfsBasePath + "/result/" + UUID.randomUUID().toString();
HoodieTestSuiteConfig cfg = makeConfig(inputBasePath, outputBasePath, useDeltaStreamer, tableType);
cfg.workloadDagGenerator = ComplexDagGenerator.class.getName();
HoodieTestSuiteJob hoodieTestSuiteJob = new HoodieTestSuiteJob(cfg, jsc);
hoodieTestSuiteJob.runTestSuite();
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(new Configuration(), cfg.targetBasePath);
assertEquals(metaClient.getActiveTimeline().getCommitsTimeline().getInstants().count(), 2);
}
@MethodSource("configParams")
public void testHiveSync(boolean useDeltaStreamer, String tableType) throws Exception {
dfs.delete(new Path(dfsBasePath + "/input"), true);
dfs.delete(new Path(dfsBasePath + "/result"), true);
String inputBasePath = dfsBasePath + "/input";
String outputBasePath = dfsBasePath + "/result";
HoodieTestSuiteConfig cfg = makeConfig(inputBasePath, outputBasePath, useDeltaStreamer, tableType);
if (HoodieTableType.COPY_ON_WRITE.name().equals(tableType)) {
cfg.workloadDagGenerator = HiveSyncDagGenerator.class.getName();
} else {
cfg.workloadDagGenerator = HiveSyncDagGeneratorMOR.class.getName();
}
HoodieTestSuiteJob hoodieTestSuiteJob = new HoodieTestSuiteJob(cfg, jsc);
hoodieTestSuiteJob.runTestSuite();
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(new Configuration(), cfg.targetBasePath);
assertEquals(metaClient.getActiveTimeline().getCommitsTimeline().getInstants().count(), 1);
}
@MethodSource("configParams")
public void testCOWFullDagFromYaml(boolean useDeltaStreamer, String tableType) throws Exception {
dfs.delete(new Path(dfsBasePath + "/input"), true);
dfs.delete(new Path(dfsBasePath + "/result"), true);
String inputBasePath = dfsBasePath + "/input";
String outputBasePath = dfsBasePath + "/result";
HoodieTestSuiteConfig cfg = makeConfig(inputBasePath, outputBasePath, useDeltaStreamer, HoodieTableType
.COPY_ON_WRITE.name());
cfg.workloadYamlPath = dfsBasePath + "/complex-dag-cow.yaml";
HoodieTestSuiteJob hoodieTestSuiteJob = new HoodieTestSuiteJob(cfg, jsc);
hoodieTestSuiteJob.runTestSuite();
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(new Configuration(), cfg.targetBasePath);
assertEquals(metaClient.getActiveTimeline().getCommitsTimeline().getInstants().count(), 5);
}
@MethodSource("configParams")
public void testMORFullDagFromYaml(boolean useDeltaStreamer, String tableType) throws Exception {
dfs.delete(new Path(dfsBasePath + "/input"), true);
dfs.delete(new Path(dfsBasePath + "/result"), true);
String inputBasePath = dfsBasePath + "/input";
String outputBasePath = dfsBasePath + "/result";
HoodieTestSuiteConfig cfg = makeConfig(inputBasePath, outputBasePath, useDeltaStreamer, HoodieTableType
.MERGE_ON_READ.name());
cfg.workloadYamlPath = dfsBasePath + "/complex-dag-mor.yaml";
HoodieTestSuiteJob hoodieTestSuiteJob = new HoodieTestSuiteJob(cfg, jsc);
hoodieTestSuiteJob.runTestSuite();
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(new Configuration(), cfg.targetBasePath);
assertEquals(metaClient.getActiveTimeline().getCommitsTimeline().getInstants().count(), 7);
}
protected HoodieTestSuiteConfig makeConfig(String inputBasePath, String outputBasePath, boolean useDeltaStream,
String tableType) {
HoodieTestSuiteConfig cfg = new HoodieTestSuiteConfig();
cfg.targetBasePath = outputBasePath;
cfg.inputBasePath = inputBasePath;
cfg.targetTableName = "table1";
cfg.tableType = tableType;
cfg.sourceClassName = AvroDFSSource.class.getName();
cfg.sourceOrderingField = "timestamp";
cfg.propsFilePath = dfsBasePath + "/test-source.properties";
cfg.outputTypeName = DeltaOutputMode.DFS.name();
cfg.inputFormatName = DeltaInputType.AVRO.name();
cfg.limitFileSize = 1024 * 1024L;
cfg.sourceLimit = 20000000;
cfg.workloadDagGenerator = WorkflowDagGenerator.class.getName();
cfg.schemaProviderClassName = FilebasedSchemaProvider.class.getName();
cfg.useDeltaStreamer = useDeltaStream;
return cfg;
}
}

View File

@@ -0,0 +1,74 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.reader;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.integ.testsuite.utils.TestUtils;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test;
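/**
 * Unit test for {@link DFSAvroDeltaInputReader}, reading back avro files written to DFS with an approximate record
 * limit.
 */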
public class TestDFSAvroDeltaInputReader extends UtilitiesTestBase {
@BeforeAll
public static void initClass() throws Exception {
UtilitiesTestBase.initClass();
}
@AfterAll
public static void cleanupClass() {
UtilitiesTestBase.cleanupClass();
}
@BeforeEach
public void setup() throws Exception {
super.setup();
}
@Test
@Disabled
public void testDFSSinkReader() throws IOException {
FileSystem fs = FSUtils.getFs(dfsBasePath, new Configuration());
// Create 10 avro files with 10 records each
TestUtils.createAvroFiles(jsc, sparkSession, dfsBasePath, 10, 10);
FileStatus[] statuses = fs.globStatus(new Path(dfsBasePath + "/*/*.avro"));
DFSAvroDeltaInputReader reader =
new DFSAvroDeltaInputReader(sparkSession, TestUtils.getSchema().toString(), dfsBasePath, Option.empty(),
Option.empty());
assertEquals(reader.analyzeSingleFile(statuses[0].getPath().toString()), 5);
assertEquals(reader.read(100).count(), 100);
assertEquals(reader.read(1000).count(), 100);
assertEquals(reader.read(10).count(), 10);
assertTrue(reader.read(11).count() > 11);
}
}

View File

@@ -0,0 +1,119 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.reader;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.util.HashSet;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.client.HoodieWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
import org.apache.hudi.common.testutils.HoodieTestUtils;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.utilities.schema.FilebasedSchemaProvider;
import org.apache.hudi.utilities.testutils.UtilitiesTestBase;
import org.apache.spark.api.java.JavaRDD;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
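/**
 * Unit test for {@link DFSHoodieDatasetInputReader}, reading records back from an existing hudi dataset with limits
 * on the number of partitions, files and records.
 */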
public class TestDFSHoodieDatasetInputReader extends UtilitiesTestBase {
@BeforeAll
public static void initClass() throws Exception {
UtilitiesTestBase.initClass();
}
@AfterAll
public static void cleanupClass() {
UtilitiesTestBase.cleanupClass();
}
@BeforeEach
public void setup() throws Exception {
super.setup();
HoodieTestUtils.init(jsc.hadoopConfiguration(), dfsBasePath);
}
@AfterEach
public void teardown() throws Exception {
super.teardown();
}
@Test
public void testSimpleHoodieDatasetReader() throws Exception {
HoodieWriteConfig config = makeHoodieClientConfig();
HoodieWriteClient client = new HoodieWriteClient(jsc, config);
String commitTime = client.startCommit();
HoodieTestDataGenerator generator = new HoodieTestDataGenerator();
// Insert 100 records across 3 partitions
List<HoodieRecord> inserts = generator.generateInserts(commitTime, 100);
JavaRDD<WriteStatus> writeStatuses = client.upsert(jsc.parallelize(inserts), commitTime);
writeStatuses.count();
DFSHoodieDatasetInputReader reader = new DFSHoodieDatasetInputReader(jsc, config.getBasePath(),
HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema())).toString());
// Try to read 100 records for the same partition path and same file ID
JavaRDD<GenericRecord> records = reader.read(1, 1, 100L);
assertTrue(records.count() <= 100);
assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).collect()).size(),
1);
assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.FILENAME_METADATA_FIELD)).collect()).size(),
1);
// Try to read 100 records for 3 partition paths and 3 different file ids
records = reader.read(3, 3, 100L);
assertTrue(records.count() <= 100);
assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).collect()).size(),
3);
assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.FILENAME_METADATA_FIELD)).collect()).size(),
3);
// Try to read 100 records for 3 partition paths and 50% records from each file
records = reader.read(3, 3, 0.5);
assertTrue(records.count() <= 100);
assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD)).collect()).size(),
3);
assertEquals(new HashSet<>(records.map(p -> p.get(HoodieRecord.FILENAME_METADATA_FIELD)).collect()).size(),
3);
}
private HoodieWriteConfig makeHoodieClientConfig() throws Exception {
return makeHoodieClientConfigBuilder().build();
}
private HoodieWriteConfig.Builder makeHoodieClientConfigBuilder() throws Exception {
// Prepare the AvroParquetIO
return HoodieWriteConfig.newBuilder().withPath(dfsBasePath)
.withParallelism(2, 2)
.withSchema(HoodieTestDataGenerator
.TRIP_EXAMPLE_SCHEMA);
}
}

View File

@@ -0,0 +1,63 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hudi.integ.testsuite.utils;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hudi.AvroConversionUtils;
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
import org.apache.hudi.utilities.schema.RowBasedSchemaProvider;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
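/**
 * Helper methods for the test suite tests: generating generic records and writing them out as avro files on DFS.
 */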
public class TestUtils {
/**
* Create a RDD of generic records for testing purposes.
*/
public static JavaRDD<GenericRecord> makeRDD(JavaSparkContext jsc, int numRecords) {
return jsc.parallelize(generateGenericRecords(numRecords));
}
/**
* Generate generic records.
*/
public static List<GenericRecord> generateGenericRecords(int numRecords) {
HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator();
return dataGenerator.generateGenericRecords(numRecords);
}
public static void createAvroFiles(JavaSparkContext jsc, SparkSession sparkSession, String basePath, int numFiles,
int numRecordsPerFile) {
Schema schema = HoodieTestDataGenerator.AVRO_SCHEMA;
for (int i = 0; i < numFiles; i++) {
JavaRDD<GenericRecord> rdd = makeRDD(jsc, numRecordsPerFile);
AvroConversionUtils.createDataFrame(rdd.rdd(), schema.toString(), sparkSession).write()
.format("avro").option("recordName", RowBasedSchemaProvider.HOODIE_RECORD_STRUCT_NAME)
.option("recordNamespace", RowBasedSchemaProvider.HOODIE_RECORD_NAMESPACE).save(basePath + "/" + i);
}
}
public static Schema getSchema() {
return HoodieTestDataGenerator.AVRO_SCHEMA;
}
}

View File

@@ -15,8 +15,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
###
log4j.rootLogger=WARN, CONSOLE
log4j.logger.org.apache.hudi=DEBUG
log4j.rootLogger=ERROR, CONSOLE
log4j.logger.org.apache.hudi=ERROR
log4j.category.org.apache.spark=ERROR
# CONSOLE is set to be a ConsoleAppender.
log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
@@ -25,5 +26,5 @@ log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout
log4j.appender.CONSOLE.layout.ConversionPattern=[%-5p] %d %c %x - %m%n
log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter
log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true
log4j.appender.CONSOLE.filter.a.LevelMin=WARN
log4j.appender.CONSOLE.filter.a.LevelMin=ERROR
log4j.appender.CONSOLE.filter.a.LevelMax=FATAL