[HUDI-2872][HUDI-2646] Refactoring layout optimization (clustering) flow to support linear ordering (#4606)
Refactoring layout optimization (clustering) flow to:
- Enable support for linear (lexicographic) ordering as one of the ordering strategies (along w/ Z-order, Hilbert)
- Reconcile Layout Optimization and Clustering configuration to be more congruent
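For illustration, a minimal sketch of how a writer could opt into the reworked layout-optimization configs after this change. The "hoodie.layout.optimize.*" and sort-column keys are taken from this diff; the table name, base path, and the inline-clustering trigger are assumptions for the example, not part of this commit:

    import org.apache.spark.sql.SaveMode

    // Hypothetical usage sketch: cluster inline and lay records out along a Hilbert curve.
    // Omitting "hoodie.layout.optimize.strategy" now falls back to the new "linear"
    // (lexicographic) default instead of requiring a space-filling curve.
    inputDF.write.format("hudi")
      .option("hoodie.table.name", "hudi_trips")                           // assumed table name
      .option("hoodie.clustering.inline", "true")                          // assumed inline-clustering trigger
      .option("hoodie.layout.optimize.strategy", "hilbert")                // "linear", "z-order" or "hilbert"
      .option("hoodie.layout.optimize.curve.build.method", "sample")       // consulted only for z-order/hilbert
      .option("hoodie.layout.optimize.build.curve.sample.size", "200000")  // sample size for the "sample" method
      .option("hoodie.clustering.plan.strategy.sort.columns", "begin_lat,begin_lon")
      .mode(SaveMode.Append)
      .save("/tmp/hudi_trips")                                             // assumed base path

Per MultipleSparkJobExecutionStrategy#getPartitioner below, "z-order"/"hilbert" route records through RDDSpatialCurveSortPartitioner, while "linear" routes them through the plain RDDCustomColumnsSortPartitioner.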
@@ -59,7 +59,7 @@ public class HoodieClusteringConfig extends HoodieConfig {
       "hoodie.clustering.plan.partition.filter.mode";
 
   // Any Space-filling curves optimize(z-order/hilbert) params can be saved with this prefix
-  public static final String LAYOUT_OPTIMIZE_PARAM_PREFIX = "hoodie.layout.optimize.";
+  private static final String LAYOUT_OPTIMIZE_PARAM_PREFIX = "hoodie.layout.optimize.";
 
   public static final ConfigProperty<String> DAYBASED_LOOKBACK_PARTITIONS = ConfigProperty
       .key(CLUSTERING_STRATEGY_PARAM_PREFIX + "daybased.lookback.partitions")
@@ -190,63 +190,88 @@ public class HoodieClusteringConfig extends HoodieConfig {
       .withDocumentation("When rewriting data, preserves existing hoodie_commit_time");
 
   /**
-   * Using space-filling curves to optimize the layout of table to boost query performance.
-   * The table data which sorted by space-filling curve has better aggregation;
-   * combine with min-max filtering, it can achieve good performance improvement.
-   *
-   * Notice:
-   * when we use this feature, we need specify the sort columns.
-   * The more columns involved in sorting, the worse the aggregation, and the smaller the query performance improvement.
-   * Choose the filter columns which commonly used in query sql as sort columns.
-   * It is recommend that 2 ~ 4 columns participate in sorting.
+   * @deprecated this setting has no effect. Please refer to clustering configuration, as well as
+   * {@link #LAYOUT_OPTIMIZE_STRATEGY} config to enable advanced record layout optimization strategies
    */
   public static final ConfigProperty LAYOUT_OPTIMIZE_ENABLE = ConfigProperty
       .key(LAYOUT_OPTIMIZE_PARAM_PREFIX + "enable")
       .defaultValue(false)
       .sinceVersion("0.10.0")
-      .withDocumentation("Enable use z-ordering/space-filling curves to optimize the layout of table to boost query performance. "
-          + "This parameter takes precedence over clustering strategy set using " + EXECUTION_STRATEGY_CLASS_NAME.key());
-
-  public static final ConfigProperty LAYOUT_OPTIMIZE_STRATEGY = ConfigProperty
-      .key(LAYOUT_OPTIMIZE_PARAM_PREFIX + "strategy")
-      .defaultValue("z-order")
-      .sinceVersion("0.10.0")
-      .withDocumentation("Type of layout optimization to be applied, current only supports `z-order` and `hilbert` curves.");
+      .deprecatedAfter("0.11.0")
+      .withDocumentation("This setting has no effect. Please refer to clustering configuration, as well as "
+          + "LAYOUT_OPTIMIZE_STRATEGY config to enable advanced record layout optimization strategies");
 
   /**
-   * There exists two method to build z-curve.
-   * one is directly mapping sort cols to z-value to build z-curve;
-   * we can find this method in Amazon DynamoDB https://aws.amazon.com/cn/blogs/database/tag/z-order/
-   * the other one is Boundary-based Interleaved Index method which we proposed. simply call it sample method.
-   * Refer to rfc-28 for specific algorithm flow.
-   * Boundary-based Interleaved Index method has better generalization, but the build speed is slower than direct method.
+   * Determines ordering strategy for records layout optimization.
+   * Currently, the following strategies are supported
+   * <ul>
+   *   <li>Linear: simply orders records lexicographically</li>
+   *   <li>Z-order: orders records along Z-order spatial-curve</li>
+   *   <li>Hilbert: orders records along Hilbert's spatial-curve</li>
+   * </ul>
+   *
+   * NOTE: "z-order" and "hilbert" strategies may consume considerably more compute than "linear".
+   * Make sure to perform small-scale local testing for your dataset before applying globally.
    */
-  public static final ConfigProperty LAYOUT_OPTIMIZE_CURVE_BUILD_METHOD = ConfigProperty
+  public static final ConfigProperty<String> LAYOUT_OPTIMIZE_STRATEGY = ConfigProperty
+      .key(LAYOUT_OPTIMIZE_PARAM_PREFIX + "strategy")
+      .defaultValue("linear")
+      .sinceVersion("0.10.0")
+      .withDocumentation("Determines ordering strategy used in records layout optimization. "
+          + "Currently supported strategies are \"linear\", \"z-order\" and \"hilbert\".");
+
+  /**
+   * NOTE: This setting only has effect if {@link #LAYOUT_OPTIMIZE_STRATEGY} value is set to
+   * either "z-order" or "hilbert" (i.e. leveraging space-filling curves)
+   *
+   * Currently, two methods to order records along the curve are supported, "direct" and "sample":
+   *
+   * <ul>
+   *   <li>Direct: entails that spatial curve will be built in full, "filling in" all of the individual
+   *   points corresponding to each individual record</li>
+   *   <li>Sample: leverages boundary-based interleaved index method (described in more detail in
+   *   Amazon DynamoDB blog [1])</li>
+   * </ul>
+   *
+   * NOTE: Boundary-based interleaved index method has better generalization,
+   * but is slower than the direct method.
+   *
+   * Please refer to RFC-28 for specific elaboration on both flows.
+   *
+   * [1] https://aws.amazon.com/cn/blogs/database/tag/z-order/
+   */
+  public static final ConfigProperty<String> LAYOUT_OPTIMIZE_SPATIAL_CURVE_BUILD_METHOD = ConfigProperty
       .key(LAYOUT_OPTIMIZE_PARAM_PREFIX + "curve.build.method")
       .defaultValue("direct")
       .sinceVersion("0.10.0")
-      .withDocumentation("Controls how data is sampled to build the space filling curves. two methods: `direct`,`sample`."
-          + "The direct method is faster than the sampling, however sample method would produce a better data layout.");
+      .withDocumentation("Controls how data is sampled to build the space-filling curves. "
          + "Two methods: \"direct\", \"sample\". The direct method is faster than sampling, "
          + "however the sample method would produce a better data layout.");
 
   /**
-   * Doing sample for table data is the first step in Boundary-based Interleaved Index method.
-   * larger sample number means better optimize result, but more memory consumption
+   * NOTE: This setting only has effect if {@link #LAYOUT_OPTIMIZE_SPATIAL_CURVE_BUILD_METHOD} value
+   * is set to "sample"
+   *
+   * Determines target sample size used by the Boundary-based Interleaved Index method.
+   * Larger sample size entails better layout optimization outcomes, at the expense of higher memory
+   * footprint.
    */
-  public static final ConfigProperty LAYOUT_OPTIMIZE_BUILD_CURVE_SAMPLE_SIZE = ConfigProperty
+  public static final ConfigProperty<String> LAYOUT_OPTIMIZE_BUILD_CURVE_SAMPLE_SIZE = ConfigProperty
       .key(LAYOUT_OPTIMIZE_PARAM_PREFIX + "build.curve.sample.size")
       .defaultValue("200000")
       .sinceVersion("0.10.0")
-      .withDocumentation("when setting" + LAYOUT_OPTIMIZE_CURVE_BUILD_METHOD.key() + " to `sample`, the amount of sampling to be done."
-          + "Large sample size leads to better results, at the expense of more memory usage.");
+      .withDocumentation("Determines target sample size used by the Boundary-based Interleaved Index method "
+          + "of building space-filling curve. Larger sample size entails better layout optimization outcomes, "
+          + "at the expense of higher memory footprint.");
 
   /**
-   * The best way to use Z-order/Space-filling curves is to cooperate with Data-Skipping
-   * with data-skipping query engine can greatly reduce the number of table files to be read.
-   * otherwise query engine can only do row-group skipping for files (parquet/orc)
+   * @deprecated this setting has no effect
    */
   public static final ConfigProperty LAYOUT_OPTIMIZE_DATA_SKIPPING_ENABLE = ConfigProperty
       .key(LAYOUT_OPTIMIZE_PARAM_PREFIX + "data.skipping.enable")
       .defaultValue(true)
       .sinceVersion("0.10.0")
+      .deprecatedAfter("0.11.0")
       .withDocumentation("Enable data skipping by collecting statistics once layout optimization is complete.");
 
   public static final ConfigProperty<Boolean> ROLLBACK_PENDING_CLUSTERING_ON_CONFLICT = ConfigProperty
@@ -516,18 +541,13 @@ public class HoodieClusteringConfig extends HoodieConfig {
       return this;
     }
 
-    public Builder withSpaceFillingCurveDataOptimizeEnable(Boolean enable) {
-      clusteringConfig.setValue(LAYOUT_OPTIMIZE_ENABLE, String.valueOf(enable));
-      return this;
-    }
-
     public Builder withDataOptimizeStrategy(String strategy) {
       clusteringConfig.setValue(LAYOUT_OPTIMIZE_STRATEGY, strategy);
       return this;
     }
 
     public Builder withDataOptimizeBuildCurveStrategy(String method) {
-      clusteringConfig.setValue(LAYOUT_OPTIMIZE_CURVE_BUILD_METHOD, method);
+      clusteringConfig.setValue(LAYOUT_OPTIMIZE_SPATIAL_CURVE_BUILD_METHOD, method);
       return this;
     }
 
@@ -536,11 +556,6 @@ public class HoodieClusteringConfig extends HoodieConfig {
       return this;
     }
 
-    public Builder withDataOptimizeDataSkippingEnable(boolean dataSkipping) {
-      clusteringConfig.setValue(LAYOUT_OPTIMIZE_DATA_SKIPPING_ENABLE, String.valueOf(dataSkipping));
-      return this;
-    }
-
     public HoodieClusteringConfig build() {
       clusteringConfig.setDefaultValue(
           PLAN_STRATEGY_CLASS_NAME, getDefaultPlanStrategyClassName(engineType));
@@ -578,21 +593,21 @@ public class HoodieClusteringConfig extends HoodieConfig {
   /**
    * Type of a strategy for building Z-order/Hilbert space-filling curves.
    */
-  public enum BuildCurveStrategyType {
+  public enum SpatialCurveCompositionStrategyType {
     DIRECT("direct"),
     SAMPLE("sample");
 
-    private static final Map<String, BuildCurveStrategyType> VALUE_TO_ENUM_MAP =
-        TypeUtils.getValueToEnumMap(BuildCurveStrategyType.class, e -> e.value);
+    private static final Map<String, SpatialCurveCompositionStrategyType> VALUE_TO_ENUM_MAP =
+        TypeUtils.getValueToEnumMap(SpatialCurveCompositionStrategyType.class, e -> e.value);
 
     private final String value;
 
-    BuildCurveStrategyType(String value) {
+    SpatialCurveCompositionStrategyType(String value) {
       this.value = value;
     }
 
-    public static BuildCurveStrategyType fromValue(String value) {
-      BuildCurveStrategyType enumValue = VALUE_TO_ENUM_MAP.get(value);
+    public static SpatialCurveCompositionStrategyType fromValue(String value) {
+      SpatialCurveCompositionStrategyType enumValue = VALUE_TO_ENUM_MAP.get(value);
       if (enumValue == null) {
         throw new HoodieException(String.format("Invalid value (%s)", value));
       }
@@ -605,6 +620,7 @@ public class HoodieClusteringConfig extends HoodieConfig {
    * Layout optimization strategies such as Z-order/Hilbert space-curves, etc
    */
   public enum LayoutOptimizationStrategy {
+    LINEAR("linear"),
     ZORDER("z-order"),
     HILBERT("hilbert");
 
@@ -1288,30 +1288,21 @@ public class HoodieWriteConfig extends HoodieConfig {
     return getString(HoodieClusteringConfig.PLAN_STRATEGY_SORT_COLUMNS);
   }
 
-  /**
-   * Data layout optimize properties.
-   */
-  public boolean isLayoutOptimizationEnabled() {
-    return getBoolean(HoodieClusteringConfig.LAYOUT_OPTIMIZE_ENABLE);
+  public HoodieClusteringConfig.LayoutOptimizationStrategy getLayoutOptimizationStrategy() {
+    return HoodieClusteringConfig.LayoutOptimizationStrategy.fromValue(
+        getStringOrDefault(HoodieClusteringConfig.LAYOUT_OPTIMIZE_STRATEGY)
+    );
   }
 
-  public String getLayoutOptimizationStrategy() {
-    return getString(HoodieClusteringConfig.LAYOUT_OPTIMIZE_STRATEGY);
-  }
-
-  public HoodieClusteringConfig.BuildCurveStrategyType getLayoutOptimizationCurveBuildMethod() {
-    return HoodieClusteringConfig.BuildCurveStrategyType.fromValue(
-        getString(HoodieClusteringConfig.LAYOUT_OPTIMIZE_CURVE_BUILD_METHOD));
+  public HoodieClusteringConfig.SpatialCurveCompositionStrategyType getLayoutOptimizationCurveBuildMethod() {
+    return HoodieClusteringConfig.SpatialCurveCompositionStrategyType.fromValue(
+        getString(HoodieClusteringConfig.LAYOUT_OPTIMIZE_SPATIAL_CURVE_BUILD_METHOD));
   }
 
   public int getLayoutOptimizationSampleSize() {
     return getInt(HoodieClusteringConfig.LAYOUT_OPTIMIZE_BUILD_CURVE_SAMPLE_SIZE);
   }
 
-  public boolean isDataSkippingEnabled() {
-    return getBoolean(HoodieClusteringConfig.LAYOUT_OPTIMIZE_DATA_SKIPPING_ENABLE);
-  }
-
   /**
    * index properties.
    */
@@ -305,7 +305,7 @@ public class SparkRDDWriteClient<T extends HoodieRecordPayload> extends
       this.txnManager.beginTransaction(Option.of(compactionInstant), Option.empty());
       finalizeWrite(table, compactionCommitTime, writeStats);
       // commit to data table after committing to metadata table.
-      writeTableMetadataForTableServices(table, metadata, compactionInstant);
+      updateTableMetadata(table, metadata, compactionInstant);
       LOG.info("Committing Compaction " + compactionCommitTime + ". Finished with result " + metadata);
       CompactHelpers.getInstance().completeInflightCompaction(table, compactionCommitTime, metadata);
     } finally {
@@ -378,17 +378,20 @@ public class SparkRDDWriteClient<T extends HoodieRecordPayload> extends
       throw new HoodieClusteringException("Clustering failed to write to files:"
           + writeStats.stream().filter(s -> s.getTotalWriteErrors() > 0L).map(s -> s.getFileId()).collect(Collectors.joining(",")));
     }
 
     final HoodieInstant clusteringInstant = new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.REPLACE_COMMIT_ACTION, clusteringCommitTime);
     try {
       this.txnManager.beginTransaction(Option.of(clusteringInstant), Option.empty());
 
       finalizeWrite(table, clusteringCommitTime, writeStats);
-      writeTableMetadataForTableServices(table, metadata,clusteringInstant);
-      // Update outstanding metadata indexes
-      if (config.isLayoutOptimizationEnabled()
-          && !config.getClusteringSortColumns().isEmpty()) {
-        table.updateMetadataIndexes(context, writeStats, clusteringCommitTime);
-      }
+      // Update table's metadata (table)
+      updateTableMetadata(table, metadata, clusteringInstant);
+      // Update tables' metadata indexes
+      // NOTE: This overlaps w/ metadata table (above) and will be reconciled in the future
+      table.updateMetadataIndexes(context, writeStats, clusteringCommitTime);
       LOG.info("Committing Clustering " + clusteringCommitTime + ". Finished with result " + metadata);
 
       table.getActiveTimeline().transitionReplaceInflightToComplete(
           HoodieTimeline.getReplaceCommitInflightInstant(clusteringCommitTime),
           Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
@@ -412,13 +415,13 @@ public class SparkRDDWriteClient<T extends HoodieRecordPayload> extends
     LOG.info("Clustering successfully on commit " + clusteringCommitTime);
   }
 
-  private void writeTableMetadataForTableServices(HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> table, HoodieCommitMetadata commitMetadata,
+  private void updateTableMetadata(HoodieTable<T, JavaRDD<HoodieRecord<T>>, JavaRDD<HoodieKey>, JavaRDD<WriteStatus>> table, HoodieCommitMetadata commitMetadata,
                                    HoodieInstant hoodieInstant) {
     boolean isTableServiceAction = table.isTableServiceAction(hoodieInstant.getAction());
     // Do not do any conflict resolution here as we do with regular writes. We take the lock here to ensure all writes to metadata table happens within a
     // single lock (single writer). Because more than one write to metadata table will result in conflicts since all of them updates the same partition.
-    table.getMetadataWriter(hoodieInstant.getTimestamp()).ifPresent(
-        w -> w.update(commitMetadata, hoodieInstant.getTimestamp(), isTableServiceAction));
+    table.getMetadataWriter(hoodieInstant.getTimestamp())
+        .ifPresent(writer -> writer.update(commitMetadata, hoodieInstant.getTimestamp(), isTableServiceAction));
   }
 
   @Override
@@ -18,6 +18,10 @@
 
 package org.apache.hudi.client.clustering.run.strategy;
 
+import org.apache.avro.Schema;
+import org.apache.avro.generic.GenericRecord;
+import org.apache.avro.generic.IndexedRecord;
+import org.apache.hadoop.fs.Path;
 import org.apache.hudi.avro.HoodieAvroUtils;
 import org.apache.hudi.avro.model.HoodieClusteringGroup;
 import org.apache.hudi.avro.model.HoodieClusteringPlan;
@@ -39,11 +43,12 @@ import org.apache.hudi.common.util.FutureUtils;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.common.util.StringUtils;
 import org.apache.hudi.common.util.collection.Pair;
+import org.apache.hudi.config.HoodieClusteringConfig;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.exception.HoodieClusteringException;
 import org.apache.hudi.exception.HoodieIOException;
 import org.apache.hudi.execution.bulkinsert.RDDCustomColumnsSortPartitioner;
-import org.apache.hudi.execution.bulkinsert.RDDSpatialCurveOptimizationSortPartitioner;
+import org.apache.hudi.execution.bulkinsert.RDDSpatialCurveSortPartitioner;
 import org.apache.hudi.io.IOUtils;
 import org.apache.hudi.io.storage.HoodieFileReader;
 import org.apache.hudi.io.storage.HoodieFileReaderFactory;
@@ -54,11 +59,6 @@ import org.apache.hudi.table.BulkInsertPartitioner;
 import org.apache.hudi.table.HoodieTable;
 import org.apache.hudi.table.action.HoodieWriteMetadata;
 import org.apache.hudi.table.action.cluster.strategy.ClusteringExecutionStrategy;
-
-import org.apache.avro.Schema;
-import org.apache.avro.generic.GenericRecord;
-import org.apache.avro.generic.IndexedRecord;
-import org.apache.hadoop.fs.Path;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
 import org.apache.spark.api.java.JavaRDD;
@@ -134,16 +134,28 @@ public abstract class MultipleSparkJobExecutionStrategy<T extends HoodieRecordPayload>
    * @return {@link RDDCustomColumnsSortPartitioner} if sort columns are provided, otherwise empty.
    */
   protected Option<BulkInsertPartitioner<T>> getPartitioner(Map<String, String> strategyParams, Schema schema) {
-    if (getWriteConfig().isLayoutOptimizationEnabled()) {
-      // sort input records by z-order/hilbert
-      return Option.of(new RDDSpatialCurveOptimizationSortPartitioner((HoodieSparkEngineContext) getEngineContext(),
-          getWriteConfig(), HoodieAvroUtils.addMetadataFields(schema)));
-    } else if (strategyParams.containsKey(PLAN_STRATEGY_SORT_COLUMNS.key())) {
-      return Option.of(new RDDCustomColumnsSortPartitioner(strategyParams.get(PLAN_STRATEGY_SORT_COLUMNS.key()).split(","),
-          HoodieAvroUtils.addMetadataFields(schema), getWriteConfig().isConsistentLogicalTimestampEnabled()));
-    } else {
-      return Option.empty();
+    Option<String[]> orderByColumnsOpt =
+        Option.ofNullable(strategyParams.get(PLAN_STRATEGY_SORT_COLUMNS.key()))
+            .map(listStr -> listStr.split(","));
+
+    return orderByColumnsOpt.map(orderByColumns -> {
+      HoodieClusteringConfig.LayoutOptimizationStrategy layoutOptStrategy = getWriteConfig().getLayoutOptimizationStrategy();
+      switch (layoutOptStrategy) {
+        case ZORDER:
+        case HILBERT:
+          return new RDDSpatialCurveSortPartitioner(
+              (HoodieSparkEngineContext) getEngineContext(),
+              orderByColumns,
+              layoutOptStrategy,
+              getWriteConfig().getLayoutOptimizationCurveBuildMethod(),
+              HoodieAvroUtils.addMetadataFields(schema));
+        case LINEAR:
+          return new RDDCustomColumnsSortPartitioner(orderByColumns, HoodieAvroUtils.addMetadataFields(schema),
+              getWriteConfig().isConsistentLogicalTimestampEnabled());
+        default:
+          throw new UnsupportedOperationException(String.format("Layout optimization strategy '%s' is not supported", layoutOptStrategy));
       }
+    });
   }
 
   /**
@@ -124,8 +124,8 @@ public abstract class SingleSparkJobExecutionStrategy<T extends HoodieRecordPayload>
     Iterator<List<WriteStatus>> writeStatuses = performClusteringWithRecordsIterator(inputRecords, clusteringOps.getNumOutputGroups(), instantTime,
         strategyParams, schema.get(), inputFileIds, preserveHoodieMetadata, taskContextSupplier);
 
-    Iterable<List<WriteStatus>> writestatusIterable = () -> writeStatuses;
-    return StreamSupport.stream(writestatusIterable.spliterator(), false)
+    Iterable<List<WriteStatus>> writeStatusIterable = () -> writeStatuses;
+    return StreamSupport.stream(writeStatusIterable.spliterator(), false)
         .flatMap(writeStatusList -> writeStatusList.stream());
   }
 
@@ -28,8 +28,8 @@ import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.model.HoodieRecordPayload;
 import org.apache.hudi.common.model.RewriteAvroPayload;
+import org.apache.hudi.common.util.Option;
 import org.apache.hudi.config.HoodieClusteringConfig;
-import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.sort.SpaceCurveSortingHelper;
 import org.apache.hudi.table.BulkInsertPartitioner;
 import org.apache.spark.api.java.JavaRDD;
@@ -38,31 +38,51 @@ import org.apache.spark.sql.Row;
 
 import java.util.Arrays;
 import java.util.List;
-import java.util.stream.Collectors;
-
-import static org.apache.hudi.common.util.StringUtils.isNullOrEmpty;
 
 /**
  * A partitioner that does spatial curve optimization sorting based on specified column values for each RDD partition.
  * support z-curve optimization, hilbert will come soon.
  * @param <T> HoodieRecordPayload type
  */
-public class RDDSpatialCurveOptimizationSortPartitioner<T extends HoodieRecordPayload>
+public class RDDSpatialCurveSortPartitioner<T extends HoodieRecordPayload>
     implements BulkInsertPartitioner<JavaRDD<HoodieRecord<T>>> {
 
-  private final HoodieSparkEngineContext sparkEngineContext;
-  private final SerializableSchema serializableSchema;
-  private final HoodieWriteConfig config;
-
-  public RDDSpatialCurveOptimizationSortPartitioner(HoodieSparkEngineContext sparkEngineContext, HoodieWriteConfig config, Schema schema) {
+  private final HoodieSparkEngineContext sparkEngineContext;
+  private final String[] orderByColumns;
+  private final Schema schema;
+  private final HoodieClusteringConfig.LayoutOptimizationStrategy layoutOptStrategy;
+  private final HoodieClusteringConfig.SpatialCurveCompositionStrategyType curveCompositionStrategyType;
+
+  public RDDSpatialCurveSortPartitioner(HoodieSparkEngineContext sparkEngineContext,
+                                        String[] orderByColumns,
+                                        HoodieClusteringConfig.LayoutOptimizationStrategy layoutOptStrategy,
+                                        HoodieClusteringConfig.SpatialCurveCompositionStrategyType curveCompositionStrategyType,
+                                        Schema schema) {
     this.sparkEngineContext = sparkEngineContext;
-    this.config = config;
-    this.serializableSchema = new SerializableSchema(schema);
+    this.orderByColumns = orderByColumns;
+    this.layoutOptStrategy = layoutOptStrategy;
+    this.curveCompositionStrategyType = curveCompositionStrategyType;
+    this.schema = schema;
   }
 
   @Override
   public JavaRDD<HoodieRecord<T>> repartitionRecords(JavaRDD<HoodieRecord<T>> records, int outputSparkPartitions) {
-    JavaRDD<GenericRecord> preparedRecord = prepareGenericRecord(records, outputSparkPartitions, serializableSchema.get());
-    return preparedRecord.map(record -> {
+    SerializableSchema serializableSchema = new SerializableSchema(schema);
+    JavaRDD<GenericRecord> genericRecordsRDD =
+        records.map(f -> (GenericRecord) f.getData().getInsertValue(serializableSchema.get()).get());
+
+    Dataset<Row> sourceDataset =
+        AvroConversionUtils.createDataFrame(
+            genericRecordsRDD.rdd(),
+            schema.toString(),
+            sparkEngineContext.getSqlContext().sparkSession()
+        );
+
+    Dataset<Row> sortedDataset = reorder(sourceDataset, outputSparkPartitions);
+
+    return HoodieSparkUtils.createRdd(sortedDataset, schema.getName(), schema.getNamespace(), false, Option.empty())
+        .toJavaRDD()
+        .map(record -> {
           String key = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
           String partition = record.get(HoodieRecord.PARTITION_PATH_METADATA_FIELD).toString();
           HoodieKey hoodieKey = new HoodieKey(key, partition);
@@ -71,47 +91,21 @@ public class RDDSpatialCurveOptimizationSortPartitioner<T extends HoodieRecordPayload>
         });
   }
 
-  private JavaRDD<GenericRecord> prepareGenericRecord(JavaRDD<HoodieRecord<T>> inputRecords, final int numOutputGroups, final Schema schema) {
-    SerializableSchema serializableSchema = new SerializableSchema(schema);
-    JavaRDD<GenericRecord> genericRecordJavaRDD = inputRecords.map(f -> (GenericRecord) f.getData().getInsertValue(serializableSchema.get()).get());
-    Dataset<Row> originDF =
-        AvroConversionUtils.createDataFrame(
-            genericRecordJavaRDD.rdd(),
-            schema.toString(),
-            sparkEngineContext.getSqlContext().sparkSession()
-        );
-
-    Dataset<Row> sortedDF = reorder(originDF, numOutputGroups);
-
-    return HoodieSparkUtils.createRdd(sortedDF, schema.getName(),
-        schema.getNamespace(), false, org.apache.hudi.common.util.Option.empty()).toJavaRDD();
-  }
-
-  private Dataset<Row> reorder(Dataset<Row> originDF, int numOutputGroups) {
-    String orderedColumnsListConfig = config.getClusteringSortColumns();
-
-    if (isNullOrEmpty(orderedColumnsListConfig) || numOutputGroups <= 0) {
+  private Dataset<Row> reorder(Dataset<Row> dataset, int numOutputGroups) {
+    if (orderByColumns.length == 0) {
       // No-op
-      return originDF;
+      return dataset;
     }
 
-    List<String> orderedCols =
-        Arrays.stream(orderedColumnsListConfig.split(","))
-            .map(String::trim)
-            .collect(Collectors.toList());
-
-    HoodieClusteringConfig.LayoutOptimizationStrategy layoutOptStrategy =
-        HoodieClusteringConfig.LayoutOptimizationStrategy.fromValue(config.getLayoutOptimizationStrategy());
-
-    HoodieClusteringConfig.BuildCurveStrategyType curveBuildStrategyType = config.getLayoutOptimizationCurveBuildMethod();
-
-    switch (curveBuildStrategyType) {
+    List<String> orderedCols = Arrays.asList(orderByColumns);
+
+    switch (curveCompositionStrategyType) {
       case DIRECT:
-        return SpaceCurveSortingHelper.orderDataFrameByMappingValues(originDF, layoutOptStrategy, orderedCols, numOutputGroups);
+        return SpaceCurveSortingHelper.orderDataFrameByMappingValues(dataset, layoutOptStrategy, orderedCols, numOutputGroups);
       case SAMPLE:
-        return SpaceCurveSortingHelper.orderDataFrameBySamplingValues(originDF, layoutOptStrategy, orderedCols, numOutputGroups);
+        return SpaceCurveSortingHelper.orderDataFrameBySamplingValues(dataset, layoutOptStrategy, orderedCols, numOutputGroups);
       default:
-        throw new UnsupportedOperationException(String.format("Unsupported space-curve curve building strategy (%s)", curveBuildStrategyType));
+        throw new UnsupportedOperationException(String.format("Unsupported space-curve curve building strategy (%s)", curveCompositionStrategyType));
     }
   }
 
@@ -262,10 +262,10 @@ public class ColumnStatsIndexHelper {
     //  │ │ ├── <part-...>.parquet
     //  │ │ └── ...
     //
-    // If index is currently empty (no persisted tables), we simply create one
-    // using clustering operation's commit instance as it's name
     Path newIndexTablePath = new Path(indexFolderPath, commitTime);
 
+    // If index is currently empty (no persisted tables), we simply create one
+    // using clustering operation's commit instance as it's name
     if (!fs.exists(new Path(indexFolderPath))) {
       newColStatsIndexDf.repartition(1)
           .write()
@@ -326,6 +326,9 @@ public class ColumnStatsIndexHelper {
           .repartition(1)
           .write()
           .format("parquet")
+          // NOTE: We intend to potentially overwrite index-table from the previous Clustering
+          //       operation that has failed to commit
+          .mode("overwrite")
           .save(newIndexTablePath.toString());
 
       // Clean up residual col-stats-index tables that have might have been dangling since
@@ -184,13 +184,6 @@ public class HoodieSparkCopyOnWriteTable<T extends HoodieRecordPayload>
     String basePath = metaClient.getBasePath();
     String indexPath = metaClient.getColumnStatsIndexPath();
 
-    List<String> completedCommits =
-        metaClient.getCommitsTimeline()
-            .filterCompletedInstants()
-            .getInstants()
-            .map(HoodieInstant::getTimestamp)
-            .collect(Collectors.toList());
-
     List<String> touchedFiles =
         updatedFilesStats.stream()
             .map(s -> new Path(basePath, s.getPath()).toString())
@@ -214,6 +207,13 @@ public class HoodieSparkCopyOnWriteTable<T extends HoodieRecordPayload>
         new TableSchemaResolver(metaClient).getTableAvroSchemaWithoutMetadataFields()
     );
 
+    List<String> completedCommits =
+        metaClient.getCommitsTimeline()
+            .filterCompletedInstants()
+            .getInstants()
+            .map(HoodieInstant::getTimestamp)
+            .collect(Collectors.toList());
+
     ColumnStatsIndexHelper.updateColumnStatsIndexFor(
         sparkEngineContext.getSqlContext().sparkSession(),
         AvroConversionUtils.convertAvroSchemaToStructType(tableWriteSchema),
@@ -119,7 +119,8 @@ object DataSourceReadOptions {
     .key("hoodie.enable.data.skipping")
     .defaultValue(true)
     .sinceVersion("0.10.0")
-    .withDocumentation("enable data skipping to boost query after doing z-order optimize for current table")
+    .withDocumentation("Enables data-skipping allowing queries to leverage indexes to reduce the search space by " +
+      "skipping over files")
 
   /** @deprecated Use {@link QUERY_TYPE} and its methods instead */
   @Deprecated
@@ -37,7 +37,7 @@ import scala.collection.JavaConversions._
 import scala.util.Random
 
 @Tag("functional")
-class TestSpaceCurveLayoutOptimization extends HoodieClientTestBase {
+class TestLayoutOptimization extends HoodieClientTestBase {
   var spark: SparkSession = _
 
   val sourceTableSchema =
@@ -79,7 +79,13 @@ class TestSpaceCurveLayoutOptimization extends HoodieClientTestBase {
 
   @ParameterizedTest
   @MethodSource(Array("testLayoutOptimizationParameters"))
-  def testLayoutOptimizationFunctional(tableType: String): Unit = {
+  def testLayoutOptimizationFunctional(tableType: String,
+                                       layoutOptimizationStrategy: String,
+                                       spatialCurveCompositionStrategy: String): Unit = {
+    val curveCompositionStrategy =
+      Option(spatialCurveCompositionStrategy)
+        .getOrElse(HoodieClusteringConfig.LAYOUT_OPTIMIZE_SPATIAL_CURVE_BUILD_METHOD.defaultValue())
+
     val targetRecordsCount = 10000
     // Bulk Insert Operation
     val records = recordsToStrings(dataGen.generateInserts("001", targetRecordsCount)).toList
@@ -98,8 +104,9 @@ class TestSpaceCurveLayoutOptimization extends HoodieClientTestBase {
       .option("hoodie.clustering.plan.strategy.small.file.limit", "629145600")
       .option("hoodie.clustering.plan.strategy.max.bytes.per.group", Long.MaxValue.toString)
       .option("hoodie.clustering.plan.strategy.target.file.max.bytes", String.valueOf(64 * 1024 * 1024L))
-      .option(HoodieClusteringConfig.LAYOUT_OPTIMIZE_ENABLE.key, "true")
-      .option(HoodieClusteringConfig.PLAN_STRATEGY_SORT_COLUMNS.key, "begin_lat, begin_lon")
+      .option(HoodieClusteringConfig.LAYOUT_OPTIMIZE_STRATEGY.key(), layoutOptimizationStrategy)
+      .option(HoodieClusteringConfig.LAYOUT_OPTIMIZE_SPATIAL_CURVE_BUILD_METHOD.key(), curveCompositionStrategy)
+      .option(HoodieClusteringConfig.PLAN_STRATEGY_SORT_COLUMNS.key, "begin_lat,begin_lon")
       .mode(SaveMode.Overwrite)
       .save(basePath)
 
@@ -162,14 +169,20 @@ class TestSpaceCurveLayoutOptimization extends HoodieClientTestBase {
     }
   }
 
-object TestSpaceCurveLayoutOptimization {
+object TestLayoutOptimization {
   def testLayoutOptimizationParameters(): java.util.stream.Stream[Arguments] = {
     java.util.stream.Stream.of(
-      arguments("COPY_ON_WRITE", "hilbert"),
-      arguments("COPY_ON_WRITE", "z-order"),
-      arguments("MERGE_ON_READ", "hilbert"),
-      arguments("MERGE_ON_READ", "z-order")
+      arguments("COPY_ON_WRITE", "linear", null),
+      arguments("COPY_ON_WRITE", "z-order", "direct"),
+      arguments("COPY_ON_WRITE", "z-order", "sample"),
+      arguments("COPY_ON_WRITE", "hilbert", "direct"),
+      arguments("COPY_ON_WRITE", "hilbert", "sample"),
+
+      arguments("MERGE_ON_READ", "linear", null),
+      arguments("MERGE_ON_READ", "z-order", "direct"),
+      arguments("MERGE_ON_READ", "z-order", "sample"),
+      arguments("MERGE_ON_READ", "hilbert", "direct"),
+      arguments("MERGE_ON_READ", "hilbert", "sample")
     )
   }
 }
 