[HUDI-1303] Some improvements for the HUDI Test Suite. (#2128)
1. Use the DAG Node's label from the yaml as its name instead of UUID names which are not descriptive when debugging issues from logs. 2. Fix CleanNode constructor which is not correctly implemented 3. When generating upserts, allow more granular control over the number of inserts and upserts - zero or more inserts and upserts can be specified instead of always requiring both inserts and upserts. 4. Fixed generation of records of specific size - The current code was using a class variable "shouldAddMore" which was reset to false after the first record generation causing subsequent records to be of minimum size. - In this change, we pre-calculate the extra size of the complex fields. When generating records, for complex fields we read the field size from this map. 5. Refresh the timeline of the DeltaSync service before calling readFromSource. This ensures that only the newest generated data is read and data generated in the older Dag Nodes is ignored (as their AVRO files will have an older timestamp). 6. Making --workload-generator-classname an optional parameter as most probably the default will be used
This commit is contained in:
@@ -21,12 +21,13 @@ HIVE_SITE_CONF_javax_jdo_option_ConnectionUserName=hive
|
|||||||
HIVE_SITE_CONF_javax_jdo_option_ConnectionPassword=hive
|
HIVE_SITE_CONF_javax_jdo_option_ConnectionPassword=hive
|
||||||
HIVE_SITE_CONF_datanucleus_autoCreateSchema=false
|
HIVE_SITE_CONF_datanucleus_autoCreateSchema=false
|
||||||
HIVE_SITE_CONF_hive_metastore_uris=thrift://hivemetastore:9083
|
HIVE_SITE_CONF_hive_metastore_uris=thrift://hivemetastore:9083
|
||||||
HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false
|
|
||||||
|
|
||||||
|
HDFS_CONF_dfs_namenode_datanode_registration_ip___hostname___check=false
|
||||||
HDFS_CONF_dfs_webhdfs_enabled=true
|
HDFS_CONF_dfs_webhdfs_enabled=true
|
||||||
HDFS_CONF_dfs_permissions_enabled=false
|
HDFS_CONF_dfs_permissions_enabled=false
|
||||||
#HDFS_CONF_dfs_client_use_datanode_hostname=true
|
#HDFS_CONF_dfs_client_use_datanode_hostname=true
|
||||||
#HDFS_CONF_dfs_namenode_use_datanode_hostname=true
|
#HDFS_CONF_dfs_namenode_use_datanode_hostname=true
|
||||||
|
HDFS_CONF_dfs_replication=1
|
||||||
|
|
||||||
CORE_CONF_fs_defaultFS=hdfs://namenode:8020
|
CORE_CONF_fs_defaultFS=hdfs://namenode:8020
|
||||||
CORE_CONF_hadoop_http_staticuser_user=root
|
CORE_CONF_hadoop_http_staticuser_user=root
|
||||||
|
|||||||
@@ -66,6 +66,7 @@ public class HoodieDeltaStreamerWrapper extends HoodieDeltaStreamer {
|
|||||||
|
|
||||||
public Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> fetchSource() throws Exception {
|
public Pair<SchemaProvider, Pair<String, JavaRDD<HoodieRecord>>> fetchSource() throws Exception {
|
||||||
DeltaSync service = deltaSyncService.get().getDeltaSync();
|
DeltaSync service = deltaSyncService.get().getDeltaSync();
|
||||||
|
service.refreshTimeline();
|
||||||
return service.readFromSource(service.getCommitTimelineOpt());
|
return service.readFromSource(service.getCommitTimelineOpt());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -156,8 +156,7 @@ public class HoodieTestSuiteJob {
|
|||||||
public String inputBasePath;
|
public String inputBasePath;
|
||||||
|
|
||||||
@Parameter(names = {
|
@Parameter(names = {
|
||||||
"--workload-generator-classname"}, description = "WorkflowDag of operations to generate the workload",
|
"--workload-generator-classname"}, description = "WorkflowDag of operations to generate the workload")
|
||||||
required = true)
|
|
||||||
public String workloadDagGenerator = WorkflowDagGenerator.class.getName();
|
public String workloadDagGenerator = WorkflowDagGenerator.class.getName();
|
||||||
|
|
||||||
@Parameter(names = {
|
@Parameter(names = {
|
||||||
@@ -177,8 +176,7 @@ public class HoodieTestSuiteJob {
|
|||||||
public Long limitFileSize = 1024 * 1024 * 120L;
|
public Long limitFileSize = 1024 * 1024 * 120L;
|
||||||
|
|
||||||
@Parameter(names = {"--use-deltastreamer"}, description = "Choose whether to use HoodieDeltaStreamer to "
|
@Parameter(names = {"--use-deltastreamer"}, description = "Choose whether to use HoodieDeltaStreamer to "
|
||||||
+ "perform"
|
+ "perform ingestion. If set to false, HoodieWriteClient will be used")
|
||||||
+ " ingestion. If set to false, HoodieWriteClient will be used")
|
|
||||||
public Boolean useDeltaStreamer = false;
|
public Boolean useDeltaStreamer = false;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -257,6 +257,11 @@ public class DeltaConfig implements Serializable {
|
|||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Builder withName(String name) {
|
||||||
|
this.configsMap.put(CONFIG_NAME, name);
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
public Config build() {
|
public Config build() {
|
||||||
return new Config(configsMap);
|
return new Config(configsMap);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -68,7 +68,7 @@ public class DagUtils {
|
|||||||
Iterator<Entry<String, JsonNode>> itr = jsonNode.fields();
|
Iterator<Entry<String, JsonNode>> itr = jsonNode.fields();
|
||||||
while (itr.hasNext()) {
|
while (itr.hasNext()) {
|
||||||
Entry<String, JsonNode> dagNode = itr.next();
|
Entry<String, JsonNode> dagNode = itr.next();
|
||||||
allNodes.put(dagNode.getKey(), convertJsonToDagNode(allNodes, dagNode.getValue()));
|
allNodes.put(dagNode.getKey(), convertJsonToDagNode(allNodes, dagNode.getKey(), dagNode.getValue()));
|
||||||
}
|
}
|
||||||
return new WorkflowDag(findRootNodes(allNodes));
|
return new WorkflowDag(findRootNodes(allNodes));
|
||||||
}
|
}
|
||||||
@@ -94,9 +94,10 @@ public class DagUtils {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private static DagNode convertJsonToDagNode(Map<String, DagNode> allNodes, JsonNode node) throws IOException {
|
private static DagNode convertJsonToDagNode(Map<String, DagNode> allNodes, String name, JsonNode node)
|
||||||
|
throws IOException {
|
||||||
String type = node.get(DeltaConfig.Config.TYPE).asText();
|
String type = node.get(DeltaConfig.Config.TYPE).asText();
|
||||||
final DagNode retNode = convertJsonToDagNode(node, type);
|
final DagNode retNode = convertJsonToDagNode(node, type, name);
|
||||||
Arrays.asList(node.get(DeltaConfig.Config.DEPENDENCIES).textValue().split(",")).stream().forEach(dep -> {
|
Arrays.asList(node.get(DeltaConfig.Config.DEPENDENCIES).textValue().split(",")).stream().forEach(dep -> {
|
||||||
DagNode parentNode = allNodes.get(dep);
|
DagNode parentNode = allNodes.get(dep);
|
||||||
if (parentNode != null) {
|
if (parentNode != null) {
|
||||||
@@ -116,9 +117,10 @@ public class DagUtils {
|
|||||||
return rootNodes;
|
return rootNodes;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static DagNode convertJsonToDagNode(JsonNode node, String type) {
|
private static DagNode convertJsonToDagNode(JsonNode node, String type, String name) {
|
||||||
try {
|
try {
|
||||||
DeltaConfig.Config config = DeltaConfig.Config.newBuilder().withConfigsMap(convertJsonNodeToMap(node)).build();
|
DeltaConfig.Config config = DeltaConfig.Config.newBuilder().withConfigsMap(convertJsonNodeToMap(node))
|
||||||
|
.withName(name).build();
|
||||||
return (DagNode) ReflectionUtils.loadClass(generateFQN(type), config);
|
return (DagNode) ReflectionUtils.loadClass(generateFQN(type), config);
|
||||||
} catch (ClassNotFoundException e) {
|
} catch (ClassNotFoundException e) {
|
||||||
throw new RuntimeException(e);
|
throw new RuntimeException(e);
|
||||||
|
|||||||
@@ -18,6 +18,7 @@
|
|||||||
|
|
||||||
package org.apache.hudi.integ.testsuite.dag.nodes;
|
package org.apache.hudi.integ.testsuite.dag.nodes;
|
||||||
|
|
||||||
|
import org.apache.hudi.integ.testsuite.configuration.DeltaConfig.Config;
|
||||||
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
|
import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -26,7 +27,8 @@ import org.apache.hudi.integ.testsuite.dag.ExecutionContext;
|
|||||||
*/
|
*/
|
||||||
public class CleanNode extends DagNode<Boolean> {
|
public class CleanNode extends DagNode<Boolean> {
|
||||||
|
|
||||||
public CleanNode() {
|
public CleanNode(Config config) {
|
||||||
|
this.config = config;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
|||||||
@@ -31,7 +31,6 @@ import java.util.Map;
|
|||||||
import java.util.stream.StreamSupport;
|
import java.util.stream.StreamSupport;
|
||||||
import org.apache.avro.generic.GenericRecord;
|
import org.apache.avro.generic.GenericRecord;
|
||||||
import org.apache.hudi.common.util.Option;
|
import org.apache.hudi.common.util.Option;
|
||||||
import org.apache.hudi.integ.testsuite.converter.Converter;
|
|
||||||
import org.apache.hudi.integ.testsuite.converter.UpdateConverter;
|
import org.apache.hudi.integ.testsuite.converter.UpdateConverter;
|
||||||
import org.apache.hudi.integ.testsuite.reader.DFSAvroDeltaInputReader;
|
import org.apache.hudi.integ.testsuite.reader.DFSAvroDeltaInputReader;
|
||||||
import org.apache.hudi.integ.testsuite.reader.DFSHoodieDatasetInputReader;
|
import org.apache.hudi.integ.testsuite.reader.DFSHoodieDatasetInputReader;
|
||||||
@@ -93,11 +92,11 @@ public class DeltaGenerator implements Serializable {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public JavaRDD<GenericRecord> generateInserts(Config operation) {
|
public JavaRDD<GenericRecord> generateInserts(Config operation) {
|
||||||
long recordsPerPartition = operation.getNumRecordsInsert();
|
|
||||||
int numPartitions = operation.getNumInsertPartitions();
|
int numPartitions = operation.getNumInsertPartitions();
|
||||||
|
long recordsPerPartition = operation.getNumRecordsInsert() / numPartitions;
|
||||||
int minPayloadSize = operation.getRecordSize();
|
int minPayloadSize = operation.getRecordSize();
|
||||||
JavaRDD<GenericRecord> inputBatch = jsc.parallelize(Collections.EMPTY_LIST)
|
JavaRDD<GenericRecord> inputBatch = jsc.parallelize(Collections.EMPTY_LIST)
|
||||||
.repartition(operation.getNumInsertPartitions()).mapPartitions(p -> {
|
.repartition(numPartitions).mapPartitions(p -> {
|
||||||
return new LazyRecordGeneratorIterator(new FlexibleSchemaRecordGenerationIterator(recordsPerPartition,
|
return new LazyRecordGeneratorIterator(new FlexibleSchemaRecordGenerationIterator(recordsPerPartition,
|
||||||
minPayloadSize, schemaStr, partitionPathFieldNames, numPartitions));
|
minPayloadSize, schemaStr, partitionPathFieldNames, numPartitions));
|
||||||
});
|
});
|
||||||
@@ -112,7 +111,8 @@ public class DeltaGenerator implements Serializable {
|
|||||||
}
|
}
|
||||||
DeltaInputReader deltaInputReader = null;
|
DeltaInputReader deltaInputReader = null;
|
||||||
JavaRDD<GenericRecord> adjustedRDD = null;
|
JavaRDD<GenericRecord> adjustedRDD = null;
|
||||||
if (config.getNumUpsertPartitions() < 1) {
|
if (config.getNumUpsertPartitions() != 0) {
|
||||||
|
if (config.getNumUpsertPartitions() < 0) {
|
||||||
// randomly generate updates for a given number of records without regard to partitions and files
|
// randomly generate updates for a given number of records without regard to partitions and files
|
||||||
deltaInputReader = new DFSAvroDeltaInputReader(sparkSession, schemaStr,
|
deltaInputReader = new DFSAvroDeltaInputReader(sparkSession, schemaStr,
|
||||||
((DFSDeltaConfig) deltaOutputConfig).getDeltaBasePath(), Option.empty(), Option.empty());
|
((DFSDeltaConfig) deltaOutputConfig).getDeltaBasePath(), Option.empty(), Option.empty());
|
||||||
@@ -130,16 +130,25 @@ public class DeltaGenerator implements Serializable {
|
|||||||
.getNumRecordsUpsert());
|
.getNumRecordsUpsert());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
log.info("Repartitioning records");
|
log.info("Repartitioning records");
|
||||||
// persist this since we will make multiple passes over this
|
// persist this since we will make multiple passes over this
|
||||||
adjustedRDD = adjustedRDD.repartition(jsc.defaultParallelism());
|
adjustedRDD = adjustedRDD.repartition(jsc.defaultParallelism());
|
||||||
log.info("Repartitioning records done");
|
log.info("Repartitioning records done");
|
||||||
Converter converter = new UpdateConverter(schemaStr, config.getRecordSize(),
|
UpdateConverter converter = new UpdateConverter(schemaStr, config.getRecordSize(),
|
||||||
partitionPathFieldNames, recordRowKeyFieldNames);
|
partitionPathFieldNames, recordRowKeyFieldNames);
|
||||||
JavaRDD<GenericRecord> updates = converter.convert(adjustedRDD);
|
JavaRDD<GenericRecord> updates = converter.convert(adjustedRDD);
|
||||||
|
|
||||||
log.info("Records converted");
|
log.info("Records converted");
|
||||||
updates.persist(StorageLevel.DISK_ONLY());
|
updates.persist(StorageLevel.DISK_ONLY());
|
||||||
return inserts != null ? inserts.union(updates) : updates;
|
|
||||||
|
if (inserts == null) {
|
||||||
|
inserts = updates;
|
||||||
|
} else {
|
||||||
|
inserts = inserts.union(updates);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return inserts;
|
||||||
// TODO : Generate updates for only N partitions.
|
// TODO : Generate updates for only N partitions.
|
||||||
} else {
|
} else {
|
||||||
throw new IllegalArgumentException("Other formats are not supported at the moment");
|
throw new IllegalArgumentException("Other formats are not supported at the moment");
|
||||||
|
|||||||
@@ -44,25 +44,22 @@ import org.slf4j.LoggerFactory;
|
|||||||
* Every field of a generic record created using this generator contains a random value.
|
* Every field of a generic record created using this generator contains a random value.
|
||||||
*/
|
*/
|
||||||
public class GenericRecordFullPayloadGenerator implements Serializable {
|
public class GenericRecordFullPayloadGenerator implements Serializable {
|
||||||
|
private static Logger LOG = LoggerFactory.getLogger(GenericRecordFullPayloadGenerator.class);
|
||||||
|
|
||||||
public static final int DEFAULT_PAYLOAD_SIZE = 1024 * 10; // 10 KB
|
public static final int DEFAULT_PAYLOAD_SIZE = 1024 * 10; // 10 KB
|
||||||
public static final int DEFAULT_NUM_DATE_PARTITIONS = 50;
|
public static final int DEFAULT_NUM_DATE_PARTITIONS = 50;
|
||||||
private static Logger log = LoggerFactory.getLogger(GenericRecordFullPayloadGenerator.class);
|
|
||||||
protected final Random random = new Random();
|
protected final Random random = new Random();
|
||||||
// The source schema used to generate a payload
|
// The source schema used to generate a payload
|
||||||
private final transient Schema baseSchema;
|
private final transient Schema baseSchema;
|
||||||
// Used to validate a generic record
|
// Used to validate a generic record
|
||||||
private final transient GenericData genericData = new GenericData();
|
private final transient GenericData genericData = new GenericData();
|
||||||
// Number of more bytes to add based on the estimated full record payload size and min payload size
|
|
||||||
private int numberOfBytesToAdd;
|
|
||||||
// If more elements should be packed to meet the minPayloadSize
|
|
||||||
private boolean shouldAddMore;
|
|
||||||
// How many complex fields have we visited that can help us pack more entries and increase the size of the record
|
|
||||||
private int numberOfComplexFields;
|
|
||||||
// The size of a full record where every field of a generic record created contains 1 random value
|
|
||||||
private int estimatedFullPayloadSize;
|
|
||||||
// The number of unique dates to create
|
// The number of unique dates to create
|
||||||
private int numDatePartitions = DEFAULT_NUM_DATE_PARTITIONS;
|
private int numDatePartitions = DEFAULT_NUM_DATE_PARTITIONS;
|
||||||
|
// The size of a full record where every field of a generic record created contains 1 random value
|
||||||
|
private final int estimatedFullPayloadSize;
|
||||||
|
// Number of extra entries to add in a complex/collection field to achieve the desired record size
|
||||||
|
Map<String, Integer> extraEntriesMap = new HashMap<>();
|
||||||
|
|
||||||
// LogicalTypes in Avro 1.8.2
|
// LogicalTypes in Avro 1.8.2
|
||||||
private static final String DECIMAL = "decimal";
|
private static final String DECIMAL = "decimal";
|
||||||
private static final String UUID_NAME = "uuid";
|
private static final String UUID_NAME = "uuid";
|
||||||
@@ -80,17 +77,18 @@ public class GenericRecordFullPayloadGenerator implements Serializable {
|
|||||||
Pair<Integer, Integer> sizeInfo = new GenericRecordFullPayloadSizeEstimator(schema)
|
Pair<Integer, Integer> sizeInfo = new GenericRecordFullPayloadSizeEstimator(schema)
|
||||||
.typeEstimateAndNumComplexFields();
|
.typeEstimateAndNumComplexFields();
|
||||||
this.estimatedFullPayloadSize = sizeInfo.getLeft();
|
this.estimatedFullPayloadSize = sizeInfo.getLeft();
|
||||||
this.numberOfComplexFields = sizeInfo.getRight();
|
|
||||||
this.baseSchema = schema;
|
this.baseSchema = schema;
|
||||||
this.shouldAddMore = estimatedFullPayloadSize < minPayloadSize;
|
if (estimatedFullPayloadSize < minPayloadSize) {
|
||||||
if (this.shouldAddMore) {
|
int numberOfComplexFields = sizeInfo.getRight();
|
||||||
this.numberOfBytesToAdd = minPayloadSize - estimatedFullPayloadSize;
|
|
||||||
if (numberOfComplexFields < 1) {
|
if (numberOfComplexFields < 1) {
|
||||||
log.warn("The schema does not have any collections/complex fields. Cannot achieve minPayloadSize : {}",
|
LOG.warn("The schema does not have any collections/complex fields. "
|
||||||
minPayloadSize);
|
+ "Cannot achieve minPayloadSize => " + minPayloadSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
determineExtraEntriesRequired(numberOfComplexFields, minPayloadSize - estimatedFullPayloadSize);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public GenericRecordFullPayloadGenerator(Schema schema, int minPayloadSize, int numDatePartitions) {
|
public GenericRecordFullPayloadGenerator(Schema schema, int minPayloadSize, int numDatePartitions) {
|
||||||
this(schema, minPayloadSize);
|
this(schema, minPayloadSize);
|
||||||
this.numDatePartitions = numDatePartitions;
|
this.numDatePartitions = numDatePartitions;
|
||||||
@@ -113,7 +111,11 @@ public class GenericRecordFullPayloadGenerator implements Serializable {
|
|||||||
* @return {@link GenericRecord} with random value
|
* @return {@link GenericRecord} with random value
|
||||||
*/
|
*/
|
||||||
public GenericRecord getNewPayload() {
|
public GenericRecord getNewPayload() {
|
||||||
return convert(baseSchema);
|
return getNewPayload(baseSchema);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected GenericRecord getNewPayload(Schema schema) {
|
||||||
|
return randomize(new GenericData.Record(schema), null);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -127,20 +129,6 @@ public class GenericRecordFullPayloadGenerator implements Serializable {
|
|||||||
return randomize(record, blacklistFields);
|
return randomize(record, blacklistFields);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Create a {@link GenericRecord} with random value according to given schema.
|
|
||||||
*
|
|
||||||
* @param schema Schema to create record with
|
|
||||||
* @return {@link GenericRecord} with random value
|
|
||||||
*/
|
|
||||||
protected GenericRecord convert(Schema schema) {
|
|
||||||
GenericRecord result = new GenericData.Record(schema);
|
|
||||||
for (Schema.Field f : schema.getFields()) {
|
|
||||||
result.put(f.name(), typeConvert(f.schema()));
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create a new {@link GenericRecord} with random values. Not all the fields have value, it is random, and its value
|
* Create a new {@link GenericRecord} with random values. Not all the fields have value, it is random, and its value
|
||||||
* is random too.
|
* is random too.
|
||||||
@@ -153,7 +141,7 @@ public class GenericRecordFullPayloadGenerator implements Serializable {
|
|||||||
for (Schema.Field f : schema.getFields()) {
|
for (Schema.Field f : schema.getFields()) {
|
||||||
boolean setNull = random.nextBoolean();
|
boolean setNull = random.nextBoolean();
|
||||||
if (!setNull) {
|
if (!setNull) {
|
||||||
result.put(f.name(), typeConvert(f.schema()));
|
result.put(f.name(), typeConvert(f));
|
||||||
} else {
|
} else {
|
||||||
result.put(f.name(), null);
|
result.put(f.name(), null);
|
||||||
}
|
}
|
||||||
@@ -173,7 +161,7 @@ public class GenericRecordFullPayloadGenerator implements Serializable {
|
|||||||
protected GenericRecord randomize(GenericRecord record, List<String> blacklistFields) {
|
protected GenericRecord randomize(GenericRecord record, List<String> blacklistFields) {
|
||||||
for (Schema.Field f : record.getSchema().getFields()) {
|
for (Schema.Field f : record.getSchema().getFields()) {
|
||||||
if (blacklistFields == null || !blacklistFields.contains(f.name())) {
|
if (blacklistFields == null || !blacklistFields.contains(f.name())) {
|
||||||
record.put(f.name(), typeConvert(f.schema()));
|
record.put(f.name(), typeConvert(f));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return record;
|
return record;
|
||||||
@@ -188,12 +176,12 @@ public class GenericRecordFullPayloadGenerator implements Serializable {
|
|||||||
/**
|
/**
|
||||||
* Generate random value according to their type.
|
* Generate random value according to their type.
|
||||||
*/
|
*/
|
||||||
private Object typeConvert(Schema schema) {
|
private Object typeConvert(Schema.Field field) {
|
||||||
Schema localSchema = schema;
|
Schema fieldSchema = field.schema();
|
||||||
if (isOption(schema)) {
|
if (isOption(fieldSchema)) {
|
||||||
localSchema = getNonNull(schema);
|
fieldSchema = getNonNull(fieldSchema);
|
||||||
}
|
}
|
||||||
switch (localSchema.getType()) {
|
switch (fieldSchema.getType()) {
|
||||||
case BOOLEAN:
|
case BOOLEAN:
|
||||||
return random.nextBoolean();
|
return random.nextBoolean();
|
||||||
case DOUBLE:
|
case DOUBLE:
|
||||||
@@ -207,43 +195,33 @@ public class GenericRecordFullPayloadGenerator implements Serializable {
|
|||||||
case STRING:
|
case STRING:
|
||||||
return UUID.randomUUID().toString();
|
return UUID.randomUUID().toString();
|
||||||
case ENUM:
|
case ENUM:
|
||||||
List<String> enumSymbols = localSchema.getEnumSymbols();
|
List<String> enumSymbols = fieldSchema.getEnumSymbols();
|
||||||
return new GenericData.EnumSymbol(localSchema, enumSymbols.get(random.nextInt(enumSymbols.size() - 1)));
|
return new GenericData.EnumSymbol(fieldSchema, enumSymbols.get(random.nextInt(enumSymbols.size() - 1)));
|
||||||
case RECORD:
|
case RECORD:
|
||||||
return convert(localSchema);
|
return getNewPayload(fieldSchema);
|
||||||
case ARRAY:
|
case ARRAY:
|
||||||
Schema elementSchema = localSchema.getElementType();
|
Schema.Field elementField = new Schema.Field(field.name(), fieldSchema.getElementType(), "", null);
|
||||||
List listRes = new ArrayList();
|
List listRes = new ArrayList();
|
||||||
if (isPrimitive(elementSchema) && this.shouldAddMore) {
|
int numEntriesToAdd = extraEntriesMap.getOrDefault(field.name(), 1);
|
||||||
int numEntriesToAdd = numEntriesToAdd(elementSchema);
|
while (numEntriesToAdd-- > 0) {
|
||||||
while (numEntriesToAdd > 0) {
|
listRes.add(typeConvert(elementField));
|
||||||
listRes.add(typeConvert(elementSchema));
|
|
||||||
numEntriesToAdd--;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
listRes.add(typeConvert(elementSchema));
|
|
||||||
}
|
}
|
||||||
return listRes;
|
return listRes;
|
||||||
case MAP:
|
case MAP:
|
||||||
Schema valueSchema = localSchema.getValueType();
|
Schema.Field valueField = new Schema.Field(field.name(), fieldSchema.getValueType(), "", null);
|
||||||
Map<String, Object> mapRes = new HashMap<String, Object>();
|
Map<String, Object> mapRes = new HashMap<String, Object>();
|
||||||
if (isPrimitive(valueSchema) && this.shouldAddMore) {
|
numEntriesToAdd = extraEntriesMap.getOrDefault(field.name(), 1);
|
||||||
int numEntriesToAdd = numEntriesToAdd(valueSchema);
|
|
||||||
while (numEntriesToAdd > 0) {
|
while (numEntriesToAdd > 0) {
|
||||||
mapRes.put(UUID.randomUUID().toString(), typeConvert(valueSchema));
|
mapRes.put(UUID.randomUUID().toString(), typeConvert(valueField));
|
||||||
numEntriesToAdd--;
|
numEntriesToAdd--;
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
mapRes.put(UUID.randomUUID().toString(), typeConvert(valueSchema));
|
|
||||||
}
|
|
||||||
return mapRes;
|
return mapRes;
|
||||||
case BYTES:
|
case BYTES:
|
||||||
return ByteBuffer.wrap(UUID.randomUUID().toString().getBytes(Charset.defaultCharset()));
|
return ByteBuffer.wrap(UUID.randomUUID().toString().getBytes(Charset.defaultCharset()));
|
||||||
case FIXED:
|
case FIXED:
|
||||||
return generateFixedType(localSchema);
|
return generateFixedType(fieldSchema);
|
||||||
default:
|
default:
|
||||||
throw new IllegalArgumentException(
|
throw new IllegalArgumentException("Cannot handle type: " + fieldSchema.getType());
|
||||||
"Cannot handle type: " + localSchema.getType());
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -333,23 +311,37 @@ public class GenericRecordFullPayloadGenerator implements Serializable {
|
|||||||
* @param elementSchema
|
* @param elementSchema
|
||||||
* @return Number of entries to add
|
* @return Number of entries to add
|
||||||
*/
|
*/
|
||||||
private int numEntriesToAdd(Schema elementSchema) {
|
private void determineExtraEntriesRequired(int numberOfComplexFields, int numberOfBytesToAdd) {
|
||||||
|
for (Schema.Field f : baseSchema.getFields()) {
|
||||||
|
Schema elementSchema = f.schema();
|
||||||
// Find the size of the primitive data type in bytes
|
// Find the size of the primitive data type in bytes
|
||||||
int primitiveDataTypeSize = getSize(elementSchema);
|
int primitiveDataTypeSize = 0;
|
||||||
|
if (elementSchema.getType() == Type.ARRAY && isPrimitive(elementSchema.getElementType())) {
|
||||||
|
primitiveDataTypeSize = getSize(elementSchema.getElementType());
|
||||||
|
} else if (elementSchema.getType() == Type.MAP && isPrimitive(elementSchema.getValueType())) {
|
||||||
|
primitiveDataTypeSize = getSize(elementSchema.getValueType());
|
||||||
|
} else {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
int numEntriesToAdd = numberOfBytesToAdd / primitiveDataTypeSize;
|
int numEntriesToAdd = numberOfBytesToAdd / primitiveDataTypeSize;
|
||||||
// If more than 10 entries are being added for this same complex field and there are still more complex fields to
|
// If more than 10 entries are being added for this same complex field and there are still more complex fields to
|
||||||
// be visited in the schema, reduce the number of entries to add by a factor of 10 to allow for other complex
|
// be visited in the schema, reduce the number of entries to add by a factor of 10 to allow for other complex
|
||||||
// fields to pack some entries
|
// fields to pack some entries
|
||||||
if (numEntriesToAdd % 10 > 0 && this.numberOfComplexFields > 1) {
|
if (numEntriesToAdd > 10 && numberOfComplexFields > 1) {
|
||||||
numEntriesToAdd = numEntriesToAdd / 10;
|
numEntriesToAdd = 10;
|
||||||
numberOfBytesToAdd -= numEntriesToAdd * primitiveDataTypeSize;
|
numberOfBytesToAdd -= numEntriesToAdd * primitiveDataTypeSize;
|
||||||
this.shouldAddMore = true;
|
|
||||||
} else {
|
} else {
|
||||||
this.numberOfBytesToAdd = 0;
|
numberOfBytesToAdd = 0;
|
||||||
this.shouldAddMore = false;
|
}
|
||||||
|
|
||||||
|
extraEntriesMap.put(f.name(), numEntriesToAdd);
|
||||||
|
|
||||||
|
numberOfComplexFields -= 1;
|
||||||
|
if (numberOfBytesToAdd <= 0) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
this.numberOfComplexFields -= 1;
|
|
||||||
return numEntriesToAdd;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -38,7 +38,7 @@ public class GenericRecordPartialPayloadGenerator extends GenericRecordFullPaylo
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected GenericRecord convert(Schema schema) {
|
protected GenericRecord getNewPayload(Schema schema) {
|
||||||
GenericRecord record = super.convertPartial(schema);
|
GenericRecord record = super.convertPartial(schema);
|
||||||
return record;
|
return record;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -132,7 +132,7 @@ public class DFSHoodieDatasetInputReader extends DFSDeltaInputReader {
|
|||||||
Option<Long> numRecordsToUpdate, Option<Double> percentageRecordsPerFile) throws IOException {
|
Option<Long> numRecordsToUpdate, Option<Double> percentageRecordsPerFile) throws IOException {
|
||||||
log.info("NumPartitions : {}, NumFiles : {}, numRecordsToUpdate : {}, percentageRecordsPerFile : {}",
|
log.info("NumPartitions : {}, NumFiles : {}, numRecordsToUpdate : {}, percentageRecordsPerFile : {}",
|
||||||
numPartitions, numFiles, numRecordsToUpdate, percentageRecordsPerFile);
|
numPartitions, numFiles, numRecordsToUpdate, percentageRecordsPerFile);
|
||||||
List<String> partitionPaths = getPartitions(numPartitions);
|
final List<String> partitionPaths = getPartitions(numPartitions);
|
||||||
// Read all file slices in the partition
|
// Read all file slices in the partition
|
||||||
JavaPairRDD<String, Iterator<FileSlice>> partitionToFileSlice = getPartitionToFileSlice(metaClient,
|
JavaPairRDD<String, Iterator<FileSlice>> partitionToFileSlice = getPartitionToFileSlice(metaClient,
|
||||||
partitionPaths);
|
partitionPaths);
|
||||||
@@ -156,7 +156,7 @@ public class DFSHoodieDatasetInputReader extends DFSDeltaInputReader {
|
|||||||
}
|
}
|
||||||
// Adjust the number of files to read per partition based on the requested partition & file counts
|
// Adjust the number of files to read per partition based on the requested partition & file counts
|
||||||
Map<String, Integer> adjustedPartitionToFileIdCountMap = getFilesToReadPerPartition(partitionToFileSlice,
|
Map<String, Integer> adjustedPartitionToFileIdCountMap = getFilesToReadPerPartition(partitionToFileSlice,
|
||||||
getPartitions(numPartitions).size(), numFilesToUpdate);
|
partitionPaths.size(), numFilesToUpdate);
|
||||||
JavaRDD<GenericRecord> updates = projectSchema(generateUpdates(adjustedPartitionToFileIdCountMap,
|
JavaRDD<GenericRecord> updates = projectSchema(generateUpdates(adjustedPartitionToFileIdCountMap,
|
||||||
partitionToFileSlice, numFilesToUpdate, (int) numRecordsToUpdatePerFile));
|
partitionToFileSlice, numFilesToUpdate, (int) numRecordsToUpdatePerFile));
|
||||||
if (numRecordsToUpdate.isPresent() && numFiles.isPresent() && numFiles.get() != 0 && numRecordsToUpdate.get()
|
if (numRecordsToUpdate.isPresent() && numFiles.isPresent() && numFiles.get() != 0 && numRecordsToUpdate.get()
|
||||||
|
|||||||
@@ -219,7 +219,7 @@ public class DeltaSync implements Serializable {
|
|||||||
*
|
*
|
||||||
* @throws IOException in case of any IOException
|
* @throws IOException in case of any IOException
|
||||||
*/
|
*/
|
||||||
private void refreshTimeline() throws IOException {
|
public void refreshTimeline() throws IOException {
|
||||||
if (fs.exists(new Path(cfg.targetBasePath))) {
|
if (fs.exists(new Path(cfg.targetBasePath))) {
|
||||||
HoodieTableMetaClient meta = new HoodieTableMetaClient(new Configuration(fs.getConf()), cfg.targetBasePath,
|
HoodieTableMetaClient meta = new HoodieTableMetaClient(new Configuration(fs.getConf()), cfg.targetBasePath,
|
||||||
cfg.payloadClassName);
|
cfg.payloadClassName);
|
||||||
|
|||||||
@@ -78,6 +78,7 @@ public class DFSPathSelector {
|
|||||||
while (fitr.hasNext()) {
|
while (fitr.hasNext()) {
|
||||||
LocatedFileStatus fileStatus = fitr.next();
|
LocatedFileStatus fileStatus = fitr.next();
|
||||||
if (fileStatus.isDirectory()
|
if (fileStatus.isDirectory()
|
||||||
|
|| fileStatus.getLen() == 0
|
||||||
|| IGNORE_FILEPREFIX_LIST.stream().anyMatch(pfx -> fileStatus.getPath().getName().startsWith(pfx))) {
|
|| IGNORE_FILEPREFIX_LIST.stream().anyMatch(pfx -> fileStatus.getPath().getName().startsWith(pfx))) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user