[HUDI-2149] Ensure and Audit docs for every configuration class in the codebase (#3272)
- Added docs when missing - Rewrote and reworded as needed - Made a couple more classes extend HoodieConfig
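For illustration, the pattern audited throughout this change looks roughly like the sketch below (the class and property names here are hypothetical, not part of the commit): every configuration class extends HoodieConfig and declares its options as ConfigProperty constants built via key/defaultValue/sinceVersion/withDocumentation, and an empty withDocumentation("") is exactly what this commit hunts down.

import org.apache.hudi.common.config.ConfigProperty;
import org.apache.hudi.common.config.HoodieConfig;

public class HoodieExampleConfig extends HoodieConfig {

  // Hypothetical property, shown only to illustrate the builder pattern used in the hunks below.
  public static final ConfigProperty<String> EXAMPLE_PROP = ConfigProperty
      .key("hoodie.example.option")
      .defaultValue("some-default")
      .sinceVersion("0.9.0")
      .withDocumentation("Every ConfigProperty should carry a meaningful, non-empty description.");

  private HoodieExampleConfig() {
    super();
  }
}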
@@ -18,6 +18,8 @@
package org.apache.hudi.client;

import org.apache.hudi.ApiMaturityLevel;
import org.apache.hudi.PublicAPIClass;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieWriteStat;

@@ -40,6 +42,7 @@ import static org.apache.hudi.common.model.DefaultHoodieRecordPayload.METADATA_E

/**
 * Status of a write operation.
 */
@PublicAPIClass(maturity = ApiMaturityLevel.STABLE)
public class WriteStatus implements Serializable {

private static final Logger LOG = LogManager.getLogger(WriteStatus.class);

@@ -97,7 +97,7 @@ public class HoodieBootstrapConfig extends HoodieConfig {

.key("hoodie.bootstrap.index.class")
.defaultValue(HFileBootstrapIndex.class.getName())
.sinceVersion("0.6.0")
.withDocumentation("");
.withDocumentation("Implementation to use, for mapping a skeleton base file to a bootstrap base file.");

private HoodieBootstrapConfig() {
super();

@@ -31,36 +31,6 @@ import java.util.Properties;

 */
public class HoodieClusteringConfig extends HoodieConfig {

public static final ConfigProperty<String> CLUSTERING_PLAN_STRATEGY_CLASS = ConfigProperty
.key("hoodie.clustering.plan.strategy.class")
.defaultValue("org.apache.hudi.client.clustering.plan.strategy.SparkRecentDaysClusteringPlanStrategy")
.sinceVersion("0.7.0")
.withDocumentation("Config to provide a strategy class to create ClusteringPlan. Class has to be subclass of ClusteringPlanStrategy");

public static final ConfigProperty<String> CLUSTERING_EXECUTION_STRATEGY_CLASS = ConfigProperty
.key("hoodie.clustering.execution.strategy.class")
.defaultValue("org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy")
.sinceVersion("0.7.0")
.withDocumentation("Config to provide a strategy class to execute a ClusteringPlan. Class has to be subclass of RunClusteringStrategy");

public static final ConfigProperty<String> INLINE_CLUSTERING_PROP = ConfigProperty
.key("hoodie.clustering.inline")
.defaultValue("false")
.sinceVersion("0.7.0")
.withDocumentation("Turn on inline clustering - clustering will be run after write operation is complete");

public static final ConfigProperty<String> INLINE_CLUSTERING_MAX_COMMIT_PROP = ConfigProperty
.key("hoodie.clustering.inline.max.commits")
.defaultValue("4")
.sinceVersion("0.7.0")
.withDocumentation("Config to control frequency of inline clustering");

public static final ConfigProperty<String> ASYNC_CLUSTERING_MAX_COMMIT_PROP = ConfigProperty
.key("hoodie.clustering.async.max.commits")
.defaultValue("4")
.sinceVersion("0.9.0")
.withDocumentation("Config to control frequency of async clustering");

// Any strategy specific params can be saved with this prefix
public static final String CLUSTERING_STRATEGY_PARAM_PREFIX = "hoodie.clustering.plan.strategy.";

@@ -70,6 +40,40 @@ public class HoodieClusteringConfig extends HoodieConfig {

.sinceVersion("0.7.0")
.withDocumentation("Number of partitions to list to create ClusteringPlan");

public static final ConfigProperty<String> CLUSTERING_PLAN_STRATEGY_CLASS = ConfigProperty
.key("hoodie.clustering.plan.strategy.class")
.defaultValue("org.apache.hudi.client.clustering.plan.strategy.SparkRecentDaysClusteringPlanStrategy")
.sinceVersion("0.7.0")
.withDocumentation("Config to provide a strategy class (subclass of ClusteringPlanStrategy) to create clustering plan "
+ "i.e select what file groups are being clustered. Default strategy looks at the last N (determined by "
+ CLUSTERING_TARGET_PARTITIONS.key() + ") day based partitions and picks the small file slices within those partitions.");

public static final ConfigProperty<String> CLUSTERING_EXECUTION_STRATEGY_CLASS = ConfigProperty
.key("hoodie.clustering.execution.strategy.class")
.defaultValue("org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy")
.sinceVersion("0.7.0")
.withDocumentation("Config to provide a strategy class (subclass of RunClusteringStrategy) to define how the "
+ " clustering plan is executed. By default, we sort the file groups in the plan by the specified columns, while "
+ " meeting the configured target file sizes.");

public static final ConfigProperty<String> INLINE_CLUSTERING_PROP = ConfigProperty
.key("hoodie.clustering.inline")
.defaultValue("false")
.sinceVersion("0.7.0")
.withDocumentation("Turn on inline clustering - clustering will be run after each write operation is complete");

public static final ConfigProperty<String> INLINE_CLUSTERING_MAX_COMMIT_PROP = ConfigProperty
.key("hoodie.clustering.inline.max.commits")
.defaultValue("4")
.sinceVersion("0.7.0")
.withDocumentation("Config to control frequency of clustering planning");

public static final ConfigProperty<String> ASYNC_CLUSTERING_MAX_COMMIT_PROP = ConfigProperty
.key("hoodie.clustering.async.max.commits")
.defaultValue("4")
.sinceVersion("0.9.0")
.withDocumentation("Config to control frequency of async clustering");

public static final ConfigProperty<String> CLUSTERING_PLAN_SMALL_FILE_LIMIT = ConfigProperty
.key(CLUSTERING_STRATEGY_PARAM_PREFIX + "small.file.limit")
.defaultValue(String.valueOf(600 * 1024 * 1024L))

@@ -80,7 +84,7 @@ public class HoodieClusteringConfig extends HoodieConfig {

.key(CLUSTERING_STRATEGY_PARAM_PREFIX + "max.bytes.per.group")
.defaultValue(String.valueOf(2 * 1024 * 1024 * 1024L))
.sinceVersion("0.7.0")
.withDocumentation("Each clustering operation can create multiple groups. Total amount of data processed by clustering operation"
.withDocumentation("Each clustering operation can create multiple output file groups. Total amount of data processed by clustering operation"
+ " is defined by below two properties (CLUSTERING_MAX_BYTES_PER_GROUP * CLUSTERING_MAX_NUM_GROUPS)."
+ " Max amount of data to be included in one group");

@@ -92,7 +96,7 @@ public class HoodieClusteringConfig extends HoodieConfig {

public static final ConfigProperty<String> CLUSTERING_TARGET_FILE_MAX_BYTES = ConfigProperty
.key(CLUSTERING_STRATEGY_PARAM_PREFIX + "target.file.max.bytes")
.defaultValue(String.valueOf(1 * 1024 * 1024 * 1024L))
.defaultValue(String.valueOf(1024 * 1024 * 1024L))
.sinceVersion("0.7.0")
.withDocumentation("Each group can produce 'N' (CLUSTERING_MAX_GROUP_SIZE/CLUSTERING_TARGET_FILE_SIZE) output file groups");

@@ -106,13 +110,14 @@ public class HoodieClusteringConfig extends HoodieConfig {

.key("hoodie.clustering.updates.strategy")
.defaultValue("org.apache.hudi.client.clustering.update.strategy.SparkRejectUpdateStrategy")
.sinceVersion("0.7.0")
.withDocumentation("When file groups is in clustering, need to handle the update to these file groups. Default strategy just reject the update");
.withDocumentation("Determines how to handle updates and deletes to file groups that are under clustering."
+ " Default strategy just rejects the update");

public static final ConfigProperty<String> ASYNC_CLUSTERING_ENABLE_OPT_KEY = ConfigProperty
.key("hoodie.clustering.async.enabled")
.defaultValue("false")
.sinceVersion("0.7.0")
.withDocumentation("Async clustering");
.withDocumentation("Enable running of clustering service, asynchronously as inserts happen on the table.");

private HoodieClusteringConfig() {
super();
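// Illustrative aside (not part of this commit): one way the clustering options documented above
// are typically supplied, via a plain java.util.Properties bag keyed by the strings shown in the
// hunks. The values here are examples only, not recommendations.
java.util.Properties clusteringProps = new java.util.Properties();
clusteringProps.setProperty("hoodie.clustering.inline", "true");             // run clustering after writes
clusteringProps.setProperty("hoodie.clustering.inline.max.commits", "4");    // attempt clustering planning every 4 commits
clusteringProps.setProperty("hoodie.clustering.plan.strategy.small.file.limit",
    String.valueOf(600 * 1024 * 1024L));                                     // file slices below 600MB are clustering candidates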
@@ -33,7 +33,9 @@ import javax.annotation.concurrent.Immutable;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Arrays;
import java.util.Properties;
import java.util.stream.Collectors;

/**
 * Compaction related config.

@@ -41,101 +43,114 @@ import java.util.Properties;

@Immutable
public class HoodieCompactionConfig extends HoodieConfig {

public static final ConfigProperty<String> CLEANER_POLICY_PROP = ConfigProperty
.key("hoodie.cleaner.policy")
.defaultValue(HoodieCleaningPolicy.KEEP_LATEST_COMMITS.name())
.withDocumentation("Cleaning policy to be used. Hudi will delete older versions of parquet files to re-claim space."
+ " Any Query/Computation referring to this version of the file will fail. "
+ "It is good to make sure that the data is retained for more than the maximum query execution time.");

public static final ConfigProperty<String> AUTO_CLEAN_PROP = ConfigProperty
.key("hoodie.clean.automatic")
.defaultValue("true")
.withDocumentation("Should cleanup if there is anything to cleanup immediately after the commit");
.withDocumentation("When enabled, the cleaner table service is invoked immediately after each commit,"
+ " to delete older file slices. It's recommended to enable this, to ensure metadata and data storage"
+ " growth is bounded.");

public static final ConfigProperty<String> ASYNC_CLEAN_PROP = ConfigProperty
.key("hoodie.clean.async")
.defaultValue("false")
.withDocumentation("Only applies when #withAutoClean is turned on. When turned on runs cleaner async with writing.");

public static final ConfigProperty<String> INLINE_COMPACT_PROP = ConfigProperty
.key("hoodie.compact.inline")
.defaultValue("false")
.withDocumentation("When set to true, compaction is triggered by the ingestion itself, "
+ "right after a commit/deltacommit action as part of insert/upsert/bulk_insert");

public static final ConfigProperty<String> INLINE_COMPACT_NUM_DELTA_COMMITS_PROP = ConfigProperty
.key("hoodie.compact.inline.max.delta.commits")
.defaultValue("5")
.withDocumentation("Number of max delta commits to keep before triggering an inline compaction");

public static final ConfigProperty<String> INLINE_COMPACT_TIME_DELTA_SECONDS_PROP = ConfigProperty
.key("hoodie.compact.inline.max.delta.seconds")
.defaultValue(String.valueOf(60 * 60))
.withDocumentation("Run a compaction when time elapsed > N seconds since last compaction");

public static final ConfigProperty<String> INLINE_COMPACT_TRIGGER_STRATEGY_PROP = ConfigProperty
.key("hoodie.compact.inline.trigger.strategy")
.defaultValue(CompactionTriggerStrategy.NUM_COMMITS.name())
.withDocumentation("");

public static final ConfigProperty<String> CLEANER_FILE_VERSIONS_RETAINED_PROP = ConfigProperty
.key("hoodie.cleaner.fileversions.retained")
.defaultValue("3")
.withDocumentation("");
.withDocumentation("Only applies when " + AUTO_CLEAN_PROP.key() + " is turned on. "
+ "When turned on runs cleaner async with writing, which can speed up overall write performance.");

public static final ConfigProperty<String> CLEANER_COMMITS_RETAINED_PROP = ConfigProperty
.key("hoodie.cleaner.commits.retained")
.defaultValue("10")
.withDocumentation("Number of commits to retain. So data will be retained for num_of_commits * time_between_commits "
+ "(scheduled). This also directly translates into how much you can incrementally pull on this table");
.withDocumentation("Number of commits to retain, without cleaning. This will be retained for num_of_commits * time_between_commits "
+ "(scheduled). This also directly translates into how much data retention the table supports for incremental queries.");

public static final ConfigProperty<String> CLEANER_POLICY_PROP = ConfigProperty
.key("hoodie.cleaner.policy")
.defaultValue(HoodieCleaningPolicy.KEEP_LATEST_COMMITS.name())
.withDocumentation("Cleaning policy to be used. The cleaner service deletes older file slices to re-claim space."
+ " By default, cleaner spares the file slices written by the last N commits, determined by " + CLEANER_COMMITS_RETAINED_PROP.key()
+ ". Long running query plans may often refer to older file slices and will break if those are cleaned, before the query has had"
+ " a chance to run. So, it is good to make sure that the data is retained for more than the maximum query execution time");

public static final ConfigProperty<String> INLINE_COMPACT_PROP = ConfigProperty
.key("hoodie.compact.inline")
.defaultValue("false")
.withDocumentation("When set to true, compaction service is triggered after each write. While being "
+ " simpler operationally, this adds extra latency on the write path.");

public static final ConfigProperty<String> INLINE_COMPACT_NUM_DELTA_COMMITS_PROP = ConfigProperty
.key("hoodie.compact.inline.max.delta.commits")
.defaultValue("5")
.withDocumentation("Number of delta commits after the last compaction, before scheduling of a new compaction is attempted.");

public static final ConfigProperty<String> INLINE_COMPACT_TIME_DELTA_SECONDS_PROP = ConfigProperty
.key("hoodie.compact.inline.max.delta.seconds")
.defaultValue(String.valueOf(60 * 60))
.withDocumentation("Number of elapsed seconds after the last compaction, before scheduling a new one.");

public static final ConfigProperty<String> INLINE_COMPACT_TRIGGER_STRATEGY_PROP = ConfigProperty
.key("hoodie.compact.inline.trigger.strategy")
.defaultValue(CompactionTriggerStrategy.NUM_COMMITS.name())
.withDocumentation("Controls how compaction scheduling is triggered, by time or num delta commits or combination of both. "
+ "Valid options: " + Arrays.stream(CompactionTriggerStrategy.values()).map(Enum::name).collect(Collectors.joining(",")));

public static final ConfigProperty<String> CLEANER_FILE_VERSIONS_RETAINED_PROP = ConfigProperty
.key("hoodie.cleaner.fileversions.retained")
.defaultValue("3")
.withDocumentation("When " + HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS.name() + " cleaning policy is used, "
+ " the minimum number of file slices to retain in each file group, during cleaning.");

public static final ConfigProperty<String> CLEANER_INCREMENTAL_MODE = ConfigProperty
.key("hoodie.cleaner.incremental.mode")
.defaultValue("true")
.withDocumentation("");
.withDocumentation("When enabled, the plans for each cleaner service run are computed incrementally off the events "
+ " in the timeline, since the last cleaner run. This is much more efficient than obtaining listings for the full"
+ " table for each planning (even with a metadata table).");

public static final ConfigProperty<String> MAX_COMMITS_TO_KEEP_PROP = ConfigProperty
.key("hoodie.keep.max.commits")
.defaultValue("30")
.withDocumentation("Each commit is a small file in the .hoodie directory. Since DFS typically does not favor lots of "
+ "small files, Hudi archives older commits into a sequential log. A commit is published atomically "
+ "by a rename of the commit file.");
.withDocumentation("Archiving service moves older entries from timeline into an archived log after each write, to "
+ " keep the metadata overhead constant, even as the table size grows."
+ " This config controls the maximum number of instants to retain in the active timeline.");

public static final ConfigProperty<String> MIN_COMMITS_TO_KEEP_PROP = ConfigProperty
.key("hoodie.keep.min.commits")
.defaultValue("20")
.withDocumentation("Each commit is a small file in the .hoodie directory. Since DFS typically does not favor lots of "
+ "small files, Hudi archives older commits into a sequential log. A commit is published atomically "
+ "by a rename of the commit file.");
.withDocumentation("Similar to " + MAX_COMMITS_TO_KEEP_PROP.key() + ", but controls the minimum number of"
+ " instants to retain in the active timeline.");

public static final ConfigProperty<String> COMMITS_ARCHIVAL_BATCH_SIZE_PROP = ConfigProperty
.key("hoodie.commits.archival.batch")
.defaultValue(String.valueOf(10))
.withDocumentation("This controls the number of commit instants read in memory as a batch and archived together.");
.withDocumentation("Archiving of instants is batched in best-effort manner, to pack more instants into a single"
+ " archive log. This config controls such archival batch size.");

public static final ConfigProperty<String> CLEANER_BOOTSTRAP_BASE_FILE_ENABLED = ConfigProperty
.key("hoodie.cleaner.delete.bootstrap.base.file")
.defaultValue("false")
.withDocumentation("Set true to clean bootstrap source files when necessary");
.withDocumentation("When set to true, cleaner also deletes the bootstrap base file when its skeleton base file is "
+ " cleaned. Turn this to true, if you want to ensure the bootstrap dataset storage is reclaimed over time, as the"
+ " table receives updates/deletes. Another reason to turn this on, would be to ensure data residing in bootstrap "
+ " base files are also physically deleted, to comply with data privacy enforcement processes.");

public static final ConfigProperty<String> PARQUET_SMALL_FILE_LIMIT_BYTES = ConfigProperty
.key("hoodie.parquet.small.file.limit")
.defaultValue(String.valueOf(104857600))
.withDocumentation("Upsert uses this file size to compact new data onto existing files. "
+ "By default, treat any file <= 100MB as a small file.");
.withDocumentation("During upsert operation, we opportunistically expand existing small files on storage, instead of writing"
+ " new files, to keep number of files to an optimum. This config sets the file size limit below which a file on storage "
+ " becomes a candidate to be selected as such a `small file`. By default, treat any file <= 100MB as a small file.");

public static final ConfigProperty<String> RECORD_SIZE_ESTIMATION_THRESHOLD_PROP = ConfigProperty
.key("hoodie.record.size.estimation.threshold")
.defaultValue("1.0")
.withDocumentation("Hudi will use the previous commit to calculate the estimated record size by totalBytesWritten/totalRecordsWritten. "
+ "If the previous commit is too small to make an accurate estimation, Hudi will search commits in the reverse order, "
+ "until find a commit has totalBytesWritten larger than (PARQUET_SMALL_FILE_LIMIT_BYTES * RECORD_SIZE_ESTIMATION_THRESHOLD)");
.withDocumentation("We use the previous commits' metadata to calculate the estimated record size and use it "
+ " to bin pack records into partitions. If the previous commit is too small to make an accurate estimation, "
+ " Hudi will search commits in the reverse order, until we find a commit that has totalBytesWritten "
+ " larger than (PARQUET_SMALL_FILE_LIMIT_BYTES * this_threshold)");

public static final ConfigProperty<String> CLEANER_PARALLELISM = ConfigProperty
.key("hoodie.cleaner.parallelism")
.defaultValue("200")
.withDocumentation("Increase this if cleaning becomes slow.");
.withDocumentation("Parallelism for the cleaning operation. Increase this if cleaning becomes slow.");

// 500GB of target IO per compaction (both read and write)
public static final ConfigProperty<String> TARGET_IO_PER_COMPACTION_IN_MB_PROP = ConfigProperty

@@ -161,15 +176,15 @@ public class HoodieCompactionConfig extends HoodieConfig {

public static final ConfigProperty<String> COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP = ConfigProperty
.key("hoodie.compaction.lazy.block.read")
.defaultValue("false")
.withDocumentation("When a CompactedLogScanner merges all log files, this config helps to choose whether the logblocks "
+ "should be read lazily or not. Choose true to use I/O intensive lazy block reading (low memory usage) or false "
+ "for Memory intensive immediate block read (high memory usage)");
.withDocumentation("When merging the delta log files, this config helps to choose whether the log blocks "
+ "should be read lazily or not. Choose true to use lazy block reading (low memory usage, but incurs seeks to each block"
+ " header) or false for immediate block read (higher memory usage)");

public static final ConfigProperty<String> COMPACTION_REVERSE_LOG_READ_ENABLED_PROP = ConfigProperty
.key("hoodie.compaction.reverse.log.read")
.defaultValue("false")
.withDocumentation("HoodieLogFormatReader reads a logfile in the forward direction starting from pos=0 to pos=file_length. "
+ "If this config is set to true, the Reader reads the logfile in reverse direction, from pos=file_length to pos=0");
+ "If this config is set to true, the reader reads the logfile in reverse direction, from pos=file_length to pos=0");

public static final ConfigProperty<String> FAILED_WRITES_CLEANER_POLICY_PROP = ConfigProperty
.key("hoodie.cleaner.policy.failed.writes")

@@ -190,22 +205,24 @@ public class HoodieCompactionConfig extends HoodieConfig {

public static final ConfigProperty<String> COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = ConfigProperty
.key("hoodie.copyonwrite.insert.split.size")
.defaultValue(String.valueOf(500000))
.withDocumentation("Number of inserts, that will be put each partition/bucket for writing. "
+ "The rationale to pick the insert parallelism is the following. Writing out 100MB files, "
+ "with at least 1kb records, means 100K records per file. we just over provision to 500K.");
.withDocumentation("Number of inserts assigned for each partition/bucket for writing. "
+ "We based the default on writing out 100MB files, with at least 1kb records (100K records per file), and "
+ " over provision to 500K. As long as auto-tuning of splits is turned on, this only affects the first "
+ " write, where there is no history to learn record sizes from.");

public static final ConfigProperty<String> COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS = ConfigProperty
.key("hoodie.copyonwrite.insert.auto.split")
.defaultValue("true")
.withDocumentation("Config to control whether we control insert split sizes automatically based on average"
+ " record sizes.");
+ " record sizes. It's recommended to keep this turned on, since hand tuning is otherwise extremely"
+ " cumbersome.");

public static final ConfigProperty<String> COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = ConfigProperty
.key("hoodie.copyonwrite.record.size.estimate")
.defaultValue(String.valueOf(1024))
.withDocumentation("The average record size. If specified, hudi will use this and not compute dynamically "
+ "based on the last 24 commit’s metadata. No value set as default. This is critical in computing "
+ "the insert parallelism and bin-packing inserts into small files. See above.");
.withDocumentation("The average record size. If not explicitly specified, hudi will compute the "
+ "record size estimate dynamically, based on commit metadata. "
+ " This is critical in computing the insert parallelism and bin-packing inserts into small files.");

private HoodieCompactionConfig() {
super();
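// Illustrative aside (not part of this commit): a sketch of wiring the cleaning and archival
// knobs documented above through java.util.Properties. Values are examples only; as the docs
// above note, retention should comfortably exceed the longest expected query execution time.
java.util.Properties tableServiceProps = new java.util.Properties();
tableServiceProps.setProperty("hoodie.clean.automatic", "true");          // invoke cleaner right after each commit
tableServiceProps.setProperty("hoodie.cleaner.commits.retained", "10");   // keep file slices written by the last 10 commits
tableServiceProps.setProperty("hoodie.keep.min.commits", "20");           // archive the timeline down to 20 instants...
tableServiceProps.setProperty("hoodie.keep.max.commits", "30");           // ...once it grows beyond 30 instants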
@@ -48,7 +48,8 @@ public class HoodieHBaseIndexConfig extends HoodieConfig {

public static final ConfigProperty<Integer> HBASE_GET_BATCH_SIZE_PROP = ConfigProperty
.key("hoodie.index.hbase.get.batch.size")
.defaultValue(100)
.withDocumentation("");
.withDocumentation("Controls the batch size for performing gets against HBase. "
+ "Batching improves throughput, by saving round trips.");

public static final ConfigProperty<String> HBASE_ZK_ZNODEPARENT = ConfigProperty
.key("hoodie.index.hbase.zknode.path")

@@ -59,12 +60,14 @@ public class HoodieHBaseIndexConfig extends HoodieConfig {

public static final ConfigProperty<Integer> HBASE_PUT_BATCH_SIZE_PROP = ConfigProperty
.key("hoodie.index.hbase.put.batch.size")
.defaultValue(100)
.withDocumentation("");
.withDocumentation("Controls the batch size for performing puts against HBase. "
+ "Batching improves throughput, by saving round trips.");

public static final ConfigProperty<String> HBASE_INDEX_QPS_ALLOCATOR_CLASS = ConfigProperty
.key("hoodie.index.hbase.qps.allocator.class")
.defaultValue(DefaultHBaseQPSResourceAllocator.class.getName())
.withDocumentation("Property to set which implementation of HBase QPS resource allocator to be used");
.withDocumentation("Property to set which implementation of HBase QPS resource allocator to be used, which"
+ " controls the batching rate dynamically.");

public static final ConfigProperty<String> HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP = ConfigProperty
.key("hoodie.index.hbase.put.batch.size.autocompute")

@@ -90,17 +93,17 @@ public class HoodieHBaseIndexConfig extends HoodieConfig {

public static final ConfigProperty<Boolean> HOODIE_INDEX_COMPUTE_QPS_DYNAMICALLY = ConfigProperty
.key("hoodie.index.hbase.dynamic_qps")
.defaultValue(false)
.withDocumentation("Property to decide if HBASE_QPS_FRACTION_PROP is dynamically calculated based on volume");
.withDocumentation("Property to decide if HBASE_QPS_FRACTION_PROP is dynamically calculated based on write volume.");

public static final ConfigProperty<String> HBASE_MIN_QPS_FRACTION_PROP = ConfigProperty
.key("hoodie.index.hbase.min.qps.fraction")
.noDefaultValue()
.withDocumentation("Min for HBASE_QPS_FRACTION_PROP to stabilize skewed volume workloads");
.withDocumentation("Minimum for HBASE_QPS_FRACTION_PROP to stabilize skewed write workloads");

public static final ConfigProperty<String> HBASE_MAX_QPS_FRACTION_PROP = ConfigProperty
.key("hoodie.index.hbase.max.qps.fraction")
.noDefaultValue()
.withDocumentation("Max for HBASE_QPS_FRACTION_PROP to stabilize skewed volume workloads");
.withDocumentation("Maximum for HBASE_QPS_FRACTION_PROP to stabilize skewed write workloads");

public static final ConfigProperty<Integer> HOODIE_INDEX_DESIRED_PUTS_TIME_IN_SECS = ConfigProperty
.key("hoodie.index.hbase.desired_puts_time_in_secs")

@@ -120,17 +123,18 @@ public class HoodieHBaseIndexConfig extends HoodieConfig {

public static final ConfigProperty<Integer> HOODIE_INDEX_HBASE_ZK_SESSION_TIMEOUT_MS = ConfigProperty
.key("hoodie.index.hbase.zk.session_timeout_ms")
.defaultValue(60 * 1000)
.withDocumentation("");
.withDocumentation("Session timeout value to use for Zookeeper failure detection, for the HBase client."
+ " Lower this value, if you want to fail faster.");

public static final ConfigProperty<Integer> HOODIE_INDEX_HBASE_ZK_CONNECTION_TIMEOUT_MS = ConfigProperty
.key("hoodie.index.hbase.zk.connection_timeout_ms")
.defaultValue(15 * 1000)
.withDocumentation("");
.withDocumentation("Timeout to use for establishing connection with zookeeper, from HBase client.");

public static final ConfigProperty<String> HBASE_ZK_PATH_QPS_ROOT = ConfigProperty
.key("hoodie.index.hbase.zkpath.qps_root")
.defaultValue("/QPS_ROOT")
.withDocumentation("");
.withDocumentation("chroot in zookeeper, to use for all qps allocation co-ordination.");

public static final ConfigProperty<Boolean> HBASE_INDEX_UPDATE_PARTITION_PATH = ConfigProperty
.key("hoodie.hbase.index.update.partition.path")

@@ -58,13 +58,12 @@ public class HoodieIndexConfig extends HoodieConfig {

.defaultValue("60000")
.withDocumentation("Only applies if index type is BLOOM. "
+ "This is the number of entries to be stored in the bloom filter. "
+ "We assume the maxParquetFileSize is 128MB and averageRecordSize is 1024B and "
+ "The rationale for the default: Assume the maxParquetFileSize is 128MB and averageRecordSize is 1kb and "
+ "hence we approx a total of 130K records in a file. The default (60000) is roughly half of this approximation. "
+ "HUDI-56 tracks computing this dynamically. Warning: Setting this very low, "
+ "will generate a lot of false positives and index lookup will have to scan a lot more files "
+ "than it has to and Setting this to a very high number will increase the size every data file linearly "
+ "(roughly 4KB for every 50000 entries). "
+ "This config is also used with DYNNAMIC bloom filter which determines the initial size for the bloom.");
+ "Warning: Setting this very low, will generate a lot of false positives and index lookup "
+ "will have to scan a lot more files than it has to and setting this to a very high number will "
+ "increase the size of every base file linearly (roughly 4KB for every 50000 entries). "
+ "This config is also used with DYNAMIC bloom filter which determines the initial size for the bloom.");

public static final ConfigProperty<String> BLOOM_FILTER_FPP = ConfigProperty
.key("hoodie.index.bloom.fpp")

@@ -73,16 +72,15 @@ public class HoodieIndexConfig extends HoodieConfig {

+ "Error rate allowed given the number of entries. This is used to calculate how many bits should be "
+ "assigned for the bloom filter and the number of hash functions. This is usually set very low (default: 0.000000001), "
+ "we like to tradeoff disk space for lower false positives. "
+ "If the number of entries added to bloom filter exceeds the congfigured value (hoodie.index.bloom.num_entries), "
+ "If the number of entries added to bloom filter exceeds the configured value (hoodie.index.bloom.num_entries), "
+ "then this fpp may not be honored.");

public static final ConfigProperty<String> BLOOM_INDEX_PARALLELISM_PROP = ConfigProperty
.key("hoodie.bloom.index.parallelism")
.defaultValue("0")
.withDocumentation("Only applies if index type is BLOOM. "
+ "This is the amount of parallelism for index lookup, which involves a Spark Shuffle. "
+ "By default, this is auto computed based on input workload characteristics. "
+ "Disable explicit bloom index parallelism setting by default - hoodie auto computes");
+ "This is the amount of parallelism for index lookup, which involves a shuffle. "
+ "By default, this is auto computed based on input workload characteristics.");

public static final ConfigProperty<String> BLOOM_INDEX_PRUNE_BY_RANGES_PROP = ConfigProperty
.key("hoodie.bloom.index.prune.by.ranges")

@@ -90,7 +88,8 @@ public class HoodieIndexConfig extends HoodieConfig {

.withDocumentation("Only applies if index type is BLOOM. "
+ "When true, range information from files is leveraged to speed up index lookups. Particularly helpful, "
+ "if the key has a monotonously increasing prefix, such as timestamp. "
+ "If the record key is completely random, it is better to turn this off.");
+ "If the record key is completely random, it is better to turn this off, since range pruning will only "
+ " add extra overhead to the index lookup.");

public static final ConfigProperty<String> BLOOM_INDEX_USE_CACHING_PROP = ConfigProperty
.key("hoodie.bloom.index.use.caching")

@@ -131,7 +130,7 @@ public class HoodieIndexConfig extends HoodieConfig {

.key("hoodie.simple.index.use.caching")
.defaultValue("true")
.withDocumentation("Only applies if index type is SIMPLE. "
+ "When true, the input RDD will cached to speed up index lookup by reducing IO "
+ "When true, the incoming writes will be cached to speed up index lookup by reducing IO "
+ "for computing parallelism or affected partitions");

public static final ConfigProperty<String> SIMPLE_INDEX_PARALLELISM_PROP = ConfigProperty

@@ -187,7 +186,7 @@ public class HoodieIndexConfig extends HoodieConfig {

public static final ConfigProperty<String> SIMPLE_INDEX_UPDATE_PARTITION_PATH = ConfigProperty
.key("hoodie.simple.index.update.partition.path")
.defaultValue("false")
.withDocumentation("");
.withDocumentation("Similar to " + BLOOM_INDEX_UPDATE_PARTITION_PATH + ", but for simple index.");

private EngineType engineType;
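// Illustrative aside (not part of this commit): the arithmetic behind the 60000 default for
// hoodie.index.bloom.num_entries, as described in the documentation above. The figures are the
// documented assumptions (128MB base files, ~1kb average record size), not measured values.
long maxParquetFileSize = 128L * 1024 * 1024;                        // assumed base file size
long averageRecordSize = 1024L;                                      // assumed average record size (1kb)
long approxRecordsPerFile = maxParquetFileSize / averageRecordSize;  // ~131072, i.e. roughly 130K records
long defaultNumEntries = approxRecordsPerFile / 2;                   // ~65536; the default rounds this to 60000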
@@ -18,6 +18,7 @@

package org.apache.hudi.config;

import org.apache.hudi.common.config.ConfigProperty;
import org.apache.hudi.common.config.HoodieConfig;

/**

@@ -30,13 +31,18 @@ public class HoodieInternalConfig extends HoodieConfig {

public static final String BULKINSERT_ARE_PARTITIONER_RECORDS_SORTED = "hoodie.bulkinsert.are.partitioner.records.sorted";
public static final Boolean DEFAULT_BULKINSERT_ARE_PARTITIONER_RECORDS_SORTED = false;

public static final ConfigProperty<String> BULKINSERT_INPUT_DATA_SCHEMA_DDL = ConfigProperty
.key("hoodie.bulkinsert.schema.ddl")
.noDefaultValue()
.withDocumentation("Schema set for row writer/bulk insert.");

/**
 * Returns if partition records are sorted or not.
 *
 * @param propertyValue value for property BULKINSERT_ARE_PARTITIONER_RECORDS_SORTED.
 * @return the property value.
 */
public static Boolean getBulkInsertIsPartitionRecordsSorted(String propertyValue) {
return propertyValue != null ? Boolean.parseBoolean(propertyValue) : DEFAULT_BULKINSERT_ARE_PARTITIONER_RECORDS_SORTED;
}

}
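// Illustrative aside (not part of this commit): how the helper above resolves the flag.
// A null property value falls back to the false default; otherwise the string is parsed.
Boolean sortedExplicit = HoodieInternalConfig.getBulkInsertIsPartitionRecordsSorted("true"); // -> true
Boolean sortedDefault = HoodieInternalConfig.getBulkInsertIsPartitionRecordsSorted(null);    // -> false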
@@ -17,8 +17,8 @@

package org.apache.hudi.config;

import org.apache.hudi.client.transaction.SimpleConcurrentFileWritesConflictResolutionStrategy;
import org.apache.hudi.client.transaction.ConflictResolutionStrategy;
import org.apache.hudi.client.transaction.SimpleConcurrentFileWritesConflictResolutionStrategy;
import org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider;
import org.apache.hudi.common.config.ConfigProperty;
import org.apache.hudi.common.config.HoodieConfig;

@@ -61,94 +61,94 @@ public class HoodieLockConfig extends HoodieConfig {

.key(LOCK_ACQUIRE_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY)
.defaultValue(DEFAULT_LOCK_ACQUIRE_RETRY_WAIT_TIME_IN_MILLIS)
.sinceVersion("0.8.0")
.withDocumentation("Parameter used in the exponential backoff retry policy. Stands for the Initial amount "
+ "of time to wait between retries by lock provider client");
.withDocumentation("Initial amount of time to wait between retries to acquire locks, "
+ " subsequent retries will exponentially backoff.");

public static final ConfigProperty<String> LOCK_ACQUIRE_RETRY_MAX_WAIT_TIME_IN_MILLIS_PROP = ConfigProperty
.key(LOCK_ACQUIRE_RETRY_MAX_WAIT_TIME_IN_MILLIS_PROP_KEY)
.defaultValue(String.valueOf(5000L))
.sinceVersion("0.8.0")
.withDocumentation("Parameter used in the exponential backoff retry policy. Stands for the maximum amount "
+ "of time to wait between retries by lock provider client");
.withDocumentation("Maximum amount of time to wait between retries by lock provider client. This bounds"
+ " the maximum delay from the exponential backoff. Currently used by ZK based lock provider only.");

public static final ConfigProperty<String> LOCK_ACQUIRE_CLIENT_RETRY_WAIT_TIME_IN_MILLIS_PROP = ConfigProperty
.key(LOCK_ACQUIRE_CLIENT_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY)
.defaultValue(String.valueOf(10000L))
.sinceVersion("0.8.0")
.withDocumentation("Amount of time to wait between retries from the hudi client");
.withDocumentation("Amount of time to wait between retries on the lock provider by the lock manager");

public static final ConfigProperty<String> LOCK_ACQUIRE_NUM_RETRIES_PROP = ConfigProperty
.key(LOCK_ACQUIRE_NUM_RETRIES_PROP_KEY)
.defaultValue(DEFAULT_LOCK_ACQUIRE_NUM_RETRIES)
.sinceVersion("0.8.0")
.withDocumentation("Maximum number of times to retry by lock provider client");
.withDocumentation("Maximum number of times to retry lock acquire, at each lock provider");

public static final ConfigProperty<String> LOCK_ACQUIRE_CLIENT_NUM_RETRIES_PROP = ConfigProperty
.key(LOCK_ACQUIRE_CLIENT_NUM_RETRIES_PROP_KEY)
.defaultValue(String.valueOf(0))
.sinceVersion("0.8.0")
.withDocumentation("Maximum number of times to retry to acquire lock additionally from the hudi client");
.withDocumentation("Maximum number of times to retry to acquire lock additionally from the lock manager.");

public static final ConfigProperty<Integer> LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP = ConfigProperty
.key(LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY)
.defaultValue(60 * 1000)
.sinceVersion("0.8.0")
.withDocumentation("");
.withDocumentation("Timeout in ms, to wait on an individual lock acquire() call, at the lock provider.");

public static final ConfigProperty<String> FILESYSTEM_LOCK_PATH_PROP = ConfigProperty
.key(FILESYSTEM_LOCK_PATH_PROP_KEY)
.noDefaultValue()
.sinceVersion("0.8.0")
.withDocumentation("");
.withDocumentation("For DFS based lock providers, path to store the locks under.");

public static final ConfigProperty<String> HIVE_DATABASE_NAME_PROP = ConfigProperty
.key(HIVE_DATABASE_NAME_PROP_KEY)
.noDefaultValue()
.sinceVersion("0.8.0")
.withDocumentation("The Hive database to acquire lock against");
.withDocumentation("For Hive based lock provider, the Hive database to acquire lock against");

public static final ConfigProperty<String> HIVE_TABLE_NAME_PROP = ConfigProperty
.key(HIVE_TABLE_NAME_PROP_KEY)
.noDefaultValue()
.sinceVersion("0.8.0")
.withDocumentation("The Hive table under the hive database to acquire lock against");
.withDocumentation("For Hive based lock provider, the Hive table to acquire lock against");

public static final ConfigProperty<String> HIVE_METASTORE_URI_PROP = ConfigProperty
.key(HIVE_METASTORE_URI_PROP_KEY)
.noDefaultValue()
.sinceVersion("0.8.0")
.withDocumentation("");
.withDocumentation("For Hive based lock provider, the Hive metastore URI to acquire locks against.");

public static final ConfigProperty<String> ZK_BASE_PATH_PROP = ConfigProperty
.key(ZK_BASE_PATH_PROP_KEY)
.noDefaultValue()
.sinceVersion("0.8.0")
.withDocumentation("The base path on Zookeeper under which to create a ZNode to acquire the lock. "
+ "This should be common for all jobs writing to the same table");
.withDocumentation("The base path on Zookeeper under which to create lock related ZNodes. "
+ "This should be same for all concurrent writers to the same table");

public static final ConfigProperty<Integer> ZK_SESSION_TIMEOUT_MS_PROP = ConfigProperty
.key(ZK_SESSION_TIMEOUT_MS_PROP_KEY)
.defaultValue(DEFAULT_ZK_SESSION_TIMEOUT_MS)
.sinceVersion("0.8.0")
.withDocumentation("How long to wait after losing a connection to ZooKeeper before the session is expired");
.withDocumentation("Timeout in ms, to wait after losing connection to ZooKeeper, before the session is expired");

public static final ConfigProperty<Integer> ZK_CONNECTION_TIMEOUT_MS_PROP = ConfigProperty
.key(ZK_CONNECTION_TIMEOUT_MS_PROP_KEY)
.defaultValue(DEFAULT_ZK_CONNECTION_TIMEOUT_MS)
.sinceVersion("0.8.0")
.withDocumentation("How long to wait when connecting to ZooKeeper before considering the connection a failure");
.withDocumentation("Timeout in ms, to wait for establishing connection with Zookeeper.");

public static final ConfigProperty<String> ZK_CONNECT_URL_PROP = ConfigProperty
.key(ZK_CONNECT_URL_PROP_KEY)
.noDefaultValue()
.sinceVersion("0.8.0")
.withDocumentation("Set the list of comma separated servers to connect to");
.withDocumentation("Zookeeper URL to connect to.");

public static final ConfigProperty<String> ZK_PORT_PROP = ConfigProperty
.key(ZK_PORT_PROP_KEY)
.noDefaultValue()
.sinceVersion("0.8.0")
.withDocumentation("The connection port to be used for Zookeeper");
.withDocumentation("Zookeeper port to connect to.");

public static final ConfigProperty<String> ZK_LOCK_KEY_PROP = ConfigProperty
.key(ZK_LOCK_KEY_PROP_KEY)
@@ -59,22 +59,22 @@ public class HoodieMemoryConfig extends HoodieConfig {

public static final ConfigProperty<Long> MAX_MEMORY_FOR_MERGE_PROP = ConfigProperty
.key("hoodie.memory.merge.max.size")
.defaultValue(DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES)
.withDocumentation("Property to set the max memory for merge");
.withDocumentation("Maximum amount of memory used for merge operations, before spilling to local storage.");

public static final ConfigProperty<String> MAX_MEMORY_FOR_COMPACTION_PROP = ConfigProperty
.key("hoodie.memory.compaction.max.size")
.noDefaultValue()
.withDocumentation("Property to set the max memory for compaction");
.withDocumentation("Maximum amount of memory used for compaction operations, before spilling to local storage.");

public static final ConfigProperty<Integer> MAX_DFS_STREAM_BUFFER_SIZE_PROP = ConfigProperty
.key("hoodie.memory.dfs.buffer.max.size")
.defaultValue(16 * 1024 * 1024)
.withDocumentation("Property to set the max memory for dfs inputstream buffer size");
.withDocumentation("Property to control the max memory for dfs input stream buffer size");

public static final ConfigProperty<String> SPILLABLE_MAP_BASE_PATH_PROP = ConfigProperty
.key("hoodie.memory.spillable.map.path")
.defaultValue("/tmp/")
.withDocumentation("Default file path prefix for spillable file");
.withDocumentation("Default file path prefix for spillable map");

public static final ConfigProperty<Double> WRITESTATUS_FAILURE_FRACTION_PROP = ConfigProperty
.key("hoodie.memory.writestatus.failure.fraction")

@@ -41,7 +41,7 @@ public class HoodieMetricsDatadogConfig extends HoodieConfig {

.key(DATADOG_PREFIX + ".report.period.seconds")
.defaultValue(30)
.sinceVersion("0.6.0")
.withDocumentation("Datadog report period in seconds. Default to 30.");
.withDocumentation("Datadog reporting period in seconds. Default to 30.");

public static final ConfigProperty<String> DATADOG_API_SITE = ConfigProperty
.key(DATADOG_PREFIX + ".api.site")

@@ -34,19 +34,19 @@ public class HoodieMetricsPrometheusConfig extends HoodieConfig {

.key(PUSHGATEWAY_PREFIX + ".host")
.defaultValue("localhost")
.sinceVersion("0.6.0")
.withDocumentation("");
.withDocumentation("Hostname of the prometheus push gateway");

public static final ConfigProperty<Integer> PUSHGATEWAY_PORT = ConfigProperty
.key(PUSHGATEWAY_PREFIX + ".port")
.defaultValue(9091)
.sinceVersion("0.6.0")
.withDocumentation("");
.withDocumentation("Port for the push gateway.");

public static final ConfigProperty<Integer> PUSHGATEWAY_REPORT_PERIOD_SECONDS = ConfigProperty
.key(PUSHGATEWAY_PREFIX + ".report.period.seconds")
.defaultValue(30)
.sinceVersion("0.6.0")
.withDocumentation("");
.withDocumentation("Reporting interval in seconds.");

public static final ConfigProperty<Boolean> PUSHGATEWAY_DELETE_ON_SHUTDOWN = ConfigProperty
.key(PUSHGATEWAY_PREFIX + ".delete.on.shutdown")

@@ -58,7 +58,7 @@ public class HoodieMetricsPrometheusConfig extends HoodieConfig {

.key(PUSHGATEWAY_PREFIX + ".job.name")
.defaultValue("")
.sinceVersion("0.6.0")
.withDocumentation("");
.withDocumentation("Name of the push gateway job.");

public static final ConfigProperty<Boolean> PUSHGATEWAY_RANDOM_JOB_NAME_SUFFIX = ConfigProperty
.key(PUSHGATEWAY_PREFIX + ".random.job.name.suffix")

@@ -73,7 +73,7 @@ public class HoodieMetricsPrometheusConfig extends HoodieConfig {

.key(PROMETHEUS_PREFIX + ".port")
.defaultValue(9090)
.sinceVersion("0.6.0")
.withDocumentation("");
.withDocumentation("Port for prometheus server.");

private HoodieMetricsPrometheusConfig() {
super();

@@ -37,12 +37,14 @@ public class HoodiePayloadConfig extends HoodieConfig {

public static final ConfigProperty<String> PAYLOAD_ORDERING_FIELD_PROP = ConfigProperty
.key(PAYLOAD_ORDERING_FIELD_PROP_KEY)
.defaultValue("ts")
.withDocumentation("Property to hold the payload ordering field name");
.withDocumentation("Table column/field name to order records that have the same key, before "
+ "merging and writing to storage.");

public static final ConfigProperty<String> PAYLOAD_EVENT_TIME_FIELD_PROP = ConfigProperty
.key(PAYLOAD_EVENT_TIME_FIELD_PROP_KEY)
.defaultValue("ts")
.withDocumentation("Property for payload event time field");
.withDocumentation("Table column/field name to derive timestamp associated with the records. This can"
+ " be useful, e.g., for determining the freshness of the table.");

private HoodiePayloadConfig() {
super();
@@ -43,19 +43,19 @@ public class HoodieStorageConfig extends HoodieConfig {
|
||||
public static final ConfigProperty<String> PARQUET_BLOCK_SIZE_BYTES = ConfigProperty
|
||||
.key("hoodie.parquet.block.size")
|
||||
.defaultValue(String.valueOf(120 * 1024 * 1024))
|
||||
.withDocumentation("Parquet RowGroup size. Its better this is same as the file size, so that a single column "
|
||||
+ "within a file is stored continuously on disk");
|
||||
.withDocumentation("Parquet RowGroup size. It's recommended to make this large enough that scan costs can be"
|
||||
+ " amortized by packing enough column values into a single row group.");
|
||||
|
||||
public static final ConfigProperty<String> PARQUET_PAGE_SIZE_BYTES = ConfigProperty
|
||||
.key("hoodie.parquet.page.size")
|
||||
.defaultValue(String.valueOf(1 * 1024 * 1024))
|
||||
.withDocumentation("Parquet page size. Page is the unit of read within a parquet file. "
|
||||
+ "Within a block, pages are compressed seperately.");
|
||||
+ "Within a block, pages are compressed separately.");
|
||||
|
||||
public static final ConfigProperty<String> ORC_FILE_MAX_BYTES = ConfigProperty
|
||||
.key("hoodie.orc.max.file.size")
|
||||
.defaultValue(String.valueOf(120 * 1024 * 1024))
|
||||
.withDocumentation("");
|
||||
.withDocumentation("Target file size for ORC base files.");
|
||||
|
||||
public static final ConfigProperty<String> ORC_STRIPE_SIZE = ConfigProperty
|
||||
.key("hoodie.orc.stripe.size")
|
||||
@@ -65,17 +65,18 @@ public class HoodieStorageConfig extends HoodieConfig {
|
||||
public static final ConfigProperty<String> ORC_BLOCK_SIZE = ConfigProperty
|
||||
.key("hoodie.orc.block.size")
|
||||
.defaultValue(ORC_FILE_MAX_BYTES.defaultValue())
|
||||
.withDocumentation("File system block size");
|
||||
.withDocumentation("ORC block size, recommended to be aligned with the target file size.");
|
||||
|
||||
public static final ConfigProperty<String> HFILE_FILE_MAX_BYTES = ConfigProperty
|
||||
.key("hoodie.hfile.max.file.size")
|
||||
.defaultValue(String.valueOf(120 * 1024 * 1024))
|
||||
.withDocumentation("");
|
||||
.withDocumentation("Target file size for HFile base files.");
|
||||
|
||||
public static final ConfigProperty<String> HFILE_BLOCK_SIZE_BYTES = ConfigProperty
|
||||
.key("hoodie.hfile.block.size")
|
||||
.defaultValue(String.valueOf(1 * 1024 * 1024))
|
||||
.withDocumentation("");
|
||||
.defaultValue(String.valueOf(1024 * 1024))
|
||||
.withDocumentation("Lower values increase the size of metadata tracked within HFile, but can offer potentially "
|
||||
+ "faster lookup times.");
|
||||
|
||||
// used to size log files
|
||||
public static final ConfigProperty<String> LOGFILE_SIZE_MAX_BYTES = ConfigProperty
|
||||
@@ -107,12 +108,12 @@ public class HoodieStorageConfig extends HoodieConfig {
|
||||
public static final ConfigProperty<String> HFILE_COMPRESSION_ALGORITHM = ConfigProperty
|
||||
.key("hoodie.hfile.compression.algorithm")
|
||||
.defaultValue("GZ")
|
||||
.withDocumentation("");
|
||||
.withDocumentation("Compression codec to use for hfile base files.");
|
||||
|
||||
public static final ConfigProperty<String> ORC_COMPRESSION_CODEC = ConfigProperty
|
||||
.key("hoodie.orc.compression.codec")
|
||||
.defaultValue("ZLIB")
|
||||
.withDocumentation("");
|
||||
.withDocumentation("Compression codec to use for ORC base files.");
|
||||
|
||||
// Default compression ratio for log file to parquet, general 3x
|
||||
public static final ConfigProperty<String> LOGFILE_TO_PARQUET_COMPRESSION_RATIO = ConfigProperty
|
||||
|
||||
@@ -36,7 +36,7 @@ public class HoodieWriteCommitCallbackConfig extends HoodieConfig {
|
||||
.key(CALLBACK_PREFIX + "on")
|
||||
.defaultValue(false)
|
||||
.sinceVersion("0.6.0")
|
||||
.withDocumentation("Turn callback on/off. off by default.");
|
||||
.withDocumentation("Turn commit callback on/off. off by default.");
|
||||
|
||||
public static final ConfigProperty<String> CALLBACK_CLASS_PROP = ConfigProperty
|
||||
.key(CALLBACK_PREFIX + "class")
|
||||
|
||||
@@ -18,7 +18,6 @@
|
||||
|
||||
package org.apache.hudi.config;
|
||||
|
||||
import org.apache.hadoop.hbase.io.compress.Compression;
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.client.bootstrap.BootstrapMode;
|
||||
import org.apache.hudi.client.transaction.ConflictResolutionStrategy;
|
||||
@@ -27,8 +26,8 @@ import org.apache.hudi.common.config.HoodieConfig;
|
||||
import org.apache.hudi.common.config.HoodieMetadataConfig;
|
||||
import org.apache.hudi.common.engine.EngineType;
|
||||
import org.apache.hudi.common.fs.ConsistencyGuardConfig;
|
||||
import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy;
|
||||
import org.apache.hudi.common.model.HoodieCleaningPolicy;
|
||||
import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy;
|
||||
import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload;
|
||||
import org.apache.hudi.common.model.WriteConcurrencyMode;
|
||||
import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion;
|
||||
@@ -43,10 +42,13 @@ import org.apache.hudi.metrics.MetricsReporterType;
|
||||
import org.apache.hudi.metrics.datadog.DatadogHttpClient.ApiSite;
|
||||
import org.apache.hudi.table.action.compact.CompactionTriggerStrategy;
|
||||
import org.apache.hudi.table.action.compact.strategy.CompactionStrategy;
|
||||
|
||||
import org.apache.hadoop.hbase.io.compress.Compression;
|
||||
import org.apache.orc.CompressionKind;
|
||||
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
|
||||
|
||||
import javax.annotation.concurrent.Immutable;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
@@ -71,7 +73,7 @@ public class HoodieWriteConfig extends HoodieConfig {
|
||||
public static final ConfigProperty<String> TABLE_NAME = ConfigProperty
|
||||
.key("hoodie.table.name")
|
||||
.noDefaultValue()
|
||||
.withDocumentation("Table name that will be used for registering with Hive. Needs to be same across runs.");
|
||||
.withDocumentation("Table name that will be used for registering with metastores like HMS. Needs to be same across runs.");
|
||||
|
||||
public static final ConfigProperty<String> PRECOMBINE_FIELD_PROP = ConfigProperty
|
||||
.key("hoodie.datasource.write.precombine.field")
|
||||
@@ -88,12 +90,14 @@ public class HoodieWriteConfig extends HoodieConfig {
|
||||
public static final ConfigProperty<String> KEYGENERATOR_CLASS_PROP = ConfigProperty
|
||||
.key("hoodie.datasource.write.keygenerator.class")
|
||||
.noDefaultValue()
|
||||
.withDocumentation("Key generator class, that implements will extract the key out of incoming Row object");
|
||||
.withDocumentation("Key generator class, that implements `org.apache.hudi.keygen.KeyGenerator` "
|
||||
+ "extract a key out of incoming records.");
|
||||
|
||||
public static final ConfigProperty<String> KEYGENERATOR_TYPE_PROP = ConfigProperty
|
||||
.key("hoodie.datasource.write.keygenerator.type")
|
||||
.defaultValue(KeyGeneratorType.SIMPLE.name())
|
||||
.withDocumentation("Type of build-in key generator, currently support SIMPLE, COMPLEX, TIMESTAMP, CUSTOM, NON_PARTITION, GLOBAL_DELETE");
|
||||
.withDocumentation("Easily configure one the built-in key generators, instead of specifying the key generator class."
|
||||
+ "Currently supports SIMPLE, COMPLEX, TIMESTAMP, CUSTOM, NON_PARTITION, GLOBAL_DELETE");
|
||||
|
||||
public static final ConfigProperty<String> ROLLBACK_USING_MARKERS = ConfigProperty
.key("hoodie.rollback.using.markers")
@@ -104,206 +108,220 @@ public class HoodieWriteConfig extends HoodieConfig {
public static final ConfigProperty<String> TIMELINE_LAYOUT_VERSION = ConfigProperty
.key("hoodie.timeline.layout.version")
.noDefaultValue()
.withDocumentation("");
.sinceVersion("0.5.1")
.withDocumentation("Controls the layout of the timeline. Version 0 relied on renames, Version 1 (default) models "
+ "the timeline as an immutable log relying only on atomic writes for object storage.");

public static final ConfigProperty<String> BASE_PATH_PROP = ConfigProperty
.key("hoodie.base.path")
.noDefaultValue()
.withDocumentation("Base DFS path under which all the data partitions are created. "
.withDocumentation("Base path on lake storage, under which all the table data is stored. "
+ "Always prefix it explicitly with the storage scheme (e.g hdfs://, s3:// etc). "
+ "Hudi stores all the main meta-data about commits, savepoints, cleaning audit logs "
+ "etc in .hoodie directory under the base directory.");
+ "etc in .hoodie directory under this base path directory.");

public static final ConfigProperty<String> AVRO_SCHEMA = ConfigProperty
.key("hoodie.avro.schema")
.noDefaultValue()
.withDocumentation("This is the current reader avro schema for the table. This is a string of the entire schema. "
+ "HoodieWriteClient uses this schema to pass on to implementations of HoodieRecordPayload to convert "
+ "from the source format to avro record. This is also used when re-writing records during an update.");
.withDocumentation("Schema string representing the current write schema of the table. Hudi passes this to "
+ "implementations of HoodieRecordPayload to convert incoming records to avro. This is also used as the write schema "
+ "when evolving records during an update.");
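The base path, write schema and table name are typically the first inputs a write config is built from. A hedged sketch using the HoodieWriteConfig builder; the bucket path and table name are placeholders:

import org.apache.hudi.config.HoodieWriteConfig;

public class WriteConfigExample {
  public static HoodieWriteConfig buildConfig(String avroSchemaJson) {
    return HoodieWriteConfig.newBuilder()
        // Always include the storage scheme in the base path (hoodie.base.path).
        .withPath("s3://my-bucket/warehouse/my_table")
        // Schema string handed to HoodieRecordPayload implementations (hoodie.avro.schema).
        .withSchema(avroSchemaJson)
        // Table name used when registering with metastores; placeholder here.
        .forTable("my_table")
        .build();
  }
}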
public static final ConfigProperty<String> AVRO_SCHEMA_VALIDATE = ConfigProperty
.key("hoodie.avro.schema.validate")
.defaultValue("false")
.withDocumentation("");
.withDocumentation("Validate the schema used for the write against the latest schema, for backwards compatibility.");

public static final ConfigProperty<String> INSERT_PARALLELISM = ConfigProperty
.key("hoodie.insert.shuffle.parallelism")
.defaultValue("1500")
.withDocumentation("Once data has been initially imported, this parallelism controls initial parallelism for reading input records. "
+ "Ensure this value is high enough say: 1 partition for 1 GB of input data");
.withDocumentation("Parallelism for inserting records into the table. Inserts can shuffle data before writing to tune file sizes and optimize the storage layout.");

public static final ConfigProperty<String> BULKINSERT_PARALLELISM = ConfigProperty
.key("hoodie.bulkinsert.shuffle.parallelism")
.defaultValue("1500")
.withDocumentation("Bulk insert is meant to be used for large initial imports and this parallelism determines "
+ "the initial number of files in your table. Tune this to achieve a desired optimal size during initial import.");
.withDocumentation("For large initial imports using bulk_insert operation, controls the parallelism to use for sort modes or custom partitioning done"
+ " before writing records to the table.");

public static final ConfigProperty<String> BULKINSERT_USER_DEFINED_PARTITIONER_CLASS = ConfigProperty
.key("hoodie.bulkinsert.user.defined.partitioner.class")
.noDefaultValue()
.withDocumentation("If specified, this class will be used to re-partition input records before they are inserted.");

public static final ConfigProperty<String> BULKINSERT_INPUT_DATA_SCHEMA_DDL = ConfigProperty
.key("hoodie.bulkinsert.schema.ddl")
.noDefaultValue()
.withDocumentation("");
.withDocumentation("If specified, this class will be used to re-partition records before they are bulk inserted. This can be used to sort, pack, cluster data"
+ " optimally for common query patterns.");

public static final ConfigProperty<String> UPSERT_PARALLELISM = ConfigProperty
.key("hoodie.upsert.shuffle.parallelism")
.defaultValue("1500")
.withDocumentation("Once data has been initially imported, this parallelism controls initial parallelism for reading input records. "
+ "Ensure this value is high enough say: 1 partition for 1 GB of input data");
.withDocumentation("Parallelism to use for upsert operation on the table. Upserts can shuffle data to perform index lookups, file sizing, bin packing records optimally"
+ " into file groups.");

public static final ConfigProperty<String> DELETE_PARALLELISM = ConfigProperty
.key("hoodie.delete.shuffle.parallelism")
.defaultValue("1500")
.withDocumentation("This parallelism is Used for “delete” operation while deduping or repartioning.");
.withDocumentation("Parallelism used for “delete” operation. Delete operations also perform shuffles, similar to the upsert operation.");
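The four shuffle-parallelism knobs above are usually tuned together for a given input volume. A small sketch using plain properties; the value 200 is an arbitrary example, not a recommendation:

import java.util.Properties;

public class ParallelismConfigExample {
  public static Properties parallelismProps() {
    Properties props = new Properties();
    // Example values only; tune to input volume and cluster size.
    props.setProperty("hoodie.insert.shuffle.parallelism", "200");
    props.setProperty("hoodie.upsert.shuffle.parallelism", "200");
    props.setProperty("hoodie.bulkinsert.shuffle.parallelism", "200");
    props.setProperty("hoodie.delete.shuffle.parallelism", "200");
    return props;
  }
}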
public static final ConfigProperty<String> ROLLBACK_PARALLELISM = ConfigProperty
.key("hoodie.rollback.parallelism")
.defaultValue("100")
.withDocumentation("Determines the parallelism for rollback of commits.");
.withDocumentation("Parallelism for rollback of commits. Rollbacks perform delete of files or logging delete blocks to file groups on storage in parallel.");

public static final ConfigProperty<String> WRITE_BUFFER_LIMIT_BYTES = ConfigProperty
.key("hoodie.write.buffer.limit.bytes")
.defaultValue(String.valueOf(4 * 1024 * 1024))
.withDocumentation("");
.withDocumentation("Size of in-memory buffer used for parallelizing network reads and lake storage writes.");

public static final ConfigProperty<String> COMBINE_BEFORE_INSERT_PROP = ConfigProperty
.key("hoodie.combine.before.insert")
.defaultValue("false")
.withDocumentation("Flag which first combines the input RDD and merges multiple partial records into a single record "
+ "before inserting or updating in DFS");
.withDocumentation("When inserted records share same key, controls whether they should be first combined (i.e de-duplicated) before"
+ " writing to storage.");

public static final ConfigProperty<String> COMBINE_BEFORE_UPSERT_PROP = ConfigProperty
.key("hoodie.combine.before.upsert")
.defaultValue("true")
.withDocumentation("Flag which first combines the input RDD and merges multiple partial records into a single record "
+ "before inserting or updating in DFS");
.withDocumentation("When upserted records share same key, controls whether they should be first combined (i.e de-duplicated) before"
+ " writing to storage. This should be turned off only if you are absolutely certain that there are no duplicates incoming, "
+ " otherwise it can lead to duplicate keys and violate the uniqueness guarantees.");

public static final ConfigProperty<String> COMBINE_BEFORE_DELETE_PROP = ConfigProperty
.key("hoodie.combine.before.delete")
.defaultValue("true")
.withDocumentation("Flag which first combines the input RDD and merges multiple partial records into a single record "
+ "before deleting in DFS");
.withDocumentation("During delete operations, controls whether we should combine deletes (and potentially also upserts) before "
+ " writing to storage.");
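Conceptually, combining before upsert reduces an incoming batch to one record per key, keeping the record with the largest precombine value. A toy, Hudi-independent sketch of that reduction, not the actual implementation:

import java.util.HashMap;
import java.util.Map;

public class CombineSketch {
  // keys[i]/ts[i] describe one incoming record: its record key and precombine value.
  public static Map<String, Long> combine(String[] keys, long[] ts) {
    Map<String, Long> latest = new HashMap<>();
    for (int i = 0; i < keys.length; i++) {
      // Keep only the record with the largest precombine value per key.
      latest.merge(keys[i], ts[i], Math::max);
    }
    return latest;
  }
}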
public static final ConfigProperty<String> WRITE_STATUS_STORAGE_LEVEL = ConfigProperty
.key("hoodie.write.status.storage.level")
.defaultValue("MEMORY_AND_DISK_SER")
.withDocumentation("HoodieWriteClient.insert and HoodieWriteClient.upsert returns a persisted RDD[WriteStatus], "
+ "this is because the Client can choose to inspect the WriteStatus and choose and commit or not based on the failures. "
+ "This is a configuration for the storage level for this RDD");
.withDocumentation("Write status objects hold metadata about a write (stats, errors), that is not yet committed to storage. "
+ "This controls how that information is cached for inspection by clients. We rarely expect this to be changed.");

public static final ConfigProperty<String> HOODIE_AUTO_COMMIT_PROP = ConfigProperty
.key("hoodie.auto.commit")
.defaultValue("true")
.withDocumentation("Should HoodieWriteClient autoCommit after insert and upsert. "
+ "The client can choose to turn off auto-commit and commit on a “defined success condition”");
.withDocumentation("Controls whether a write operation should auto commit. This can be turned off to perform inspection"
+ " of the uncommitted write before deciding to commit.");

public static final ConfigProperty<String> HOODIE_WRITE_STATUS_CLASS_PROP = ConfigProperty
.key("hoodie.writestatus.class")
.defaultValue(WriteStatus.class.getName())
.withDocumentation("");
.withDocumentation("Subclass of " + WriteStatus.class.getName() + " to be used to collect information about a write. Can be "
+ "overridden to collect additional metrics/statistics about the data if needed.");
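When auto commit is turned off, the caller inspects the returned WriteStatus objects and then decides whether to commit. A hedged sketch against the Spark write client, assuming hoodie.auto.commit=false on its config; method names follow the client API but the exact signatures should be treated as approximate:

import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.common.model.HoodieAvroPayload;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.spark.api.java.JavaRDD;

public class ManualCommitExample {
  // Assumes hoodie.auto.commit=false on the client's write config.
  public static boolean writeAndMaybeCommit(SparkRDDWriteClient<HoodieAvroPayload> client,
                                            JavaRDD<HoodieRecord<HoodieAvroPayload>> records) {
    String instantTime = client.startCommit();
    JavaRDD<WriteStatus> statuses = client.upsert(records, instantTime);
    // Only commit if no partition reported record-level failures.
    boolean hasErrors = !statuses.filter(WriteStatus::hasErrors).isEmpty();
    if (hasErrors) {
      return false; // leave the instant uncommitted; it can be rolled back later
    }
    return client.commit(instantTime, statuses);
  }
}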
public static final ConfigProperty<String> FINALIZE_WRITE_PARALLELISM = ConfigProperty
.key("hoodie.finalize.write.parallelism")
.defaultValue("1500")
.withDocumentation("");
.withDocumentation("Parallelism for the write finalization internal operation, which involves removing any partially written "
+ "files from lake storage, before committing the write. Reduce this value, if the high number of tasks incur delays for smaller tables "
+ "or low latency writes.");

public static final ConfigProperty<String> MARKERS_DELETE_PARALLELISM = ConfigProperty
.key("hoodie.markers.delete.parallelism")
.defaultValue("100")
.withDocumentation("Determines the parallelism for deleting marker files.");
.withDocumentation("Determines the parallelism for deleting marker files, which are used to track all files (valid or invalid/partial) written during "
+ "a write operation. Increase this value if delays are observed, with large batch writes.");
public static final ConfigProperty<String> BULKINSERT_SORT_MODE = ConfigProperty
.key("hoodie.bulkinsert.sort.mode")
.defaultValue(BulkInsertSortMode.GLOBAL_SORT.toString())
.withDocumentation("Sorting modes to use for sorting records for bulk insert. This is leveraged when user "
+ "defined partitioner is not configured. Default is GLOBAL_SORT. Available values are - GLOBAL_SORT: "
+ "this ensures best file sizes, with lowest memory overhead at cost of sorting. PARTITION_SORT: "
+ "Strikes a balance by only sorting within a partition, still keeping the memory overhead of writing "
+ "lowest and best effort file sizing. NONE: No sorting. Fastest and matches spark.write.parquet() "
+ "in terms of number of files, overheads");
.withDocumentation("Sorting modes to use for sorting records for bulk insert. This is used when "
+ BULKINSERT_USER_DEFINED_PARTITIONER_CLASS.key() + " is not configured. Available values are - "
+ "GLOBAL_SORT: this ensures best file sizes, with lowest memory overhead at cost of sorting. "
+ "PARTITION_SORT: Strikes a balance by only sorting within a partition, still keeping the memory overhead of writing "
+ "lowest and best effort file sizing. "
+ "NONE: No sorting. Fastest and matches `spark.write.parquet()` in terms of number of files, overheads");
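A hedged illustration of choosing a bulk insert sort mode; the selection heuristic is invented for the example, only the config key and the three mode names come from this diff:

import java.util.Properties;

public class SortModeExample {
  public static Properties sortMode(boolean needGlobalOrdering, boolean memoryConstrained) {
    Properties props = new Properties();
    // GLOBAL_SORT: best file sizing, lowest memory overhead, pays a full sort.
    // PARTITION_SORT: sorts within a partition only, a middle ground.
    // NONE: fastest, file counts similar to spark.write.parquet().
    String mode = needGlobalOrdering ? "GLOBAL_SORT" : (memoryConstrained ? "PARTITION_SORT" : "NONE");
    props.setProperty("hoodie.bulkinsert.sort.mode", mode);
    return props;
  }
}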
public static final ConfigProperty<String> EMBEDDED_TIMELINE_SERVER_ENABLED = ConfigProperty
.key("hoodie.embed.timeline.server")
.defaultValue("true")
.withDocumentation("");
.withDocumentation("When true, spins up an instance of the timeline server (meta server that serves cached file listings, statistics),"
+ "running on each writer's driver process, accepting requests during the write from executors.");

public static final ConfigProperty<String> EMBEDDED_TIMELINE_SERVER_REUSE_ENABLED = ConfigProperty
.key("hoodie.embed.timeline.server.reuse.enabled")
.defaultValue("false")
.withDocumentation("");
.withDocumentation("Controls whether the timeline server instance should be cached and reused across the JVM (across task lifecycles)"
+ "to avoid startup costs. This should rarely be changed.");

public static final ConfigProperty<String> EMBEDDED_TIMELINE_SERVER_PORT = ConfigProperty
.key("hoodie.embed.timeline.server.port")
.defaultValue("0")
.withDocumentation("");
.withDocumentation("Port at which the timeline server listens for requests. When running embedded in each writer, it picks "
+ "a free port and communicates to all the executors. This should rarely be changed.");

public static final ConfigProperty<String> EMBEDDED_TIMELINE_SERVER_THREADS = ConfigProperty
.key("hoodie.embed.timeline.server.threads")
.defaultValue("-1")
.withDocumentation("");
.withDocumentation("Number of threads to serve requests in the timeline server. By default, auto configured based on the number of underlying cores.");

public static final ConfigProperty<String> EMBEDDED_TIMELINE_SERVER_COMPRESS_OUTPUT = ConfigProperty
.key("hoodie.embed.timeline.server.gzip")
.defaultValue("true")
.withDocumentation("");
.withDocumentation("Controls whether gzip compression is used, for large responses from the timeline server, to improve latency.");

public static final ConfigProperty<String> EMBEDDED_TIMELINE_SERVER_USE_ASYNC = ConfigProperty
.key("hoodie.embed.timeline.server.async")
.defaultValue("false")
.withDocumentation("");
.withDocumentation("Controls whether or not, the requests to the timeline server are processed in asynchronous fashion, "
+ "potentially improving throughput.");
public static final ConfigProperty<String> FAIL_ON_TIMELINE_ARCHIVING_ENABLED_PROP = ConfigProperty
.key("hoodie.fail.on.timeline.archiving")
.defaultValue("true")
.withDocumentation("");
.withDocumentation("Timeline archiving removes older instants from the timeline, after each write operation, to minimize metadata overhead. "
+ "Controls whether or not, the write should be failed as well, if such archiving fails.");

public static final ConfigProperty<Long> INITIAL_CONSISTENCY_CHECK_INTERVAL_MS_PROP = ConfigProperty
.key("hoodie.consistency.check.initial_interval_ms")
.defaultValue(2000L)
.withDocumentation("Time between successive attempts to ensure written data's metadata is consistent on storage");
.withDocumentation("Initial time between successive attempts to ensure written data's metadata is consistent on storage. Grows with exponential"
+ " backoff after the initial value.");

public static final ConfigProperty<Long> MAX_CONSISTENCY_CHECK_INTERVAL_MS_PROP = ConfigProperty
.key("hoodie.consistency.check.max_interval_ms")
.defaultValue(300000L)
.withDocumentation("Max interval time for consistency check");
.withDocumentation("Max time to wait between successive attempts at performing consistency checks");

public static final ConfigProperty<Integer> MAX_CONSISTENCY_CHECKS_PROP = ConfigProperty
.key("hoodie.consistency.check.max_checks")
.defaultValue(7)
.withDocumentation("Maximum number of checks, for consistency of written data. Will wait upto 256 Secs");
.withDocumentation("Maximum number of checks, for consistency of written data.");
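To make the retry budget concrete: assuming the interval doubles on each attempt (the exponential backoff mentioned above) and is capped at the max interval, the defaults of 2000 ms, 300000 ms and 7 checks add up to 2+4+8+16+32+64+128 = 254 seconds of waiting, which matches the old "Will wait upto 256 Secs" wording. A small sketch of that arithmetic:

public class ConsistencyCheckBudget {
  // Total wait, assuming the interval doubles each attempt and is capped at maxIntervalMs.
  public static long totalWaitMs(long initialIntervalMs, long maxIntervalMs, int maxChecks) {
    long total = 0;
    long interval = initialIntervalMs;
    for (int i = 0; i < maxChecks; i++) {
      total += interval;
      interval = Math.min(interval * 2, maxIntervalMs);
    }
    return total;
  }

  public static void main(String[] args) {
    // Defaults above: 2000 ms initial, 300000 ms max, 7 checks -> 254000 ms (~254 s).
    System.out.println(totalWaitMs(2000L, 300000L, 7));
  }
}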
public static final ConfigProperty<String> MERGE_DATA_VALIDATION_CHECK_ENABLED = ConfigProperty
.key("hoodie.merge.data.validation.enabled")
.defaultValue("false")
.withDocumentation("Data validation check performed during merges before actual commits");
.withDocumentation("When enabled, data validation checks are performed during merges to ensure expected "
+ "number of records after merge operation.");

public static final ConfigProperty<String> MERGE_ALLOW_DUPLICATE_ON_INSERTS = ConfigProperty
.key("hoodie.merge.allow.duplicate.on.inserts")
.defaultValue("false")
.withDocumentation("Allow duplicates with inserts while merging with existing records");
.withDocumentation("When enabled, we allow duplicate keys even if inserts are routed to merge with an existing file (for ensuring file sizing)."
+ " This is only relevant for insert operation, since upsert, delete operations will ensure unique key constraints are maintained.");

public static final ConfigProperty<ExternalSpillableMap.DiskMapType> SPILLABLE_DISK_MAP_TYPE = ConfigProperty
.key("hoodie.spillable.diskmap.type")
.defaultValue(ExternalSpillableMap.DiskMapType.BITCASK)
.withDocumentation("Enable usage of either BITCASK or ROCKS_DB as disk map for External Spillable Map");
.withDocumentation("When handling input data that cannot be held in memory, to merge with a file on storage, a spillable diskmap is employed. "
+ "By default, we use a persistent hashmap based loosely on bitcask, that offers O(1) inserts, lookups. "
+ "Change this to `ROCKS_DB` to prefer using rocksDB, for handling the spill.");
public static final ConfigProperty<Integer> CLIENT_HEARTBEAT_INTERVAL_IN_MS_PROP = ConfigProperty
.key("hoodie.client.heartbeat.interval_in_ms")
.defaultValue(60 * 1000)
.withDocumentation("");
.withDocumentation("Writers perform heartbeats to indicate liveness. Controls how often (in ms), such heartbeats are registered to lake storage.");

public static final ConfigProperty<Integer> CLIENT_HEARTBEAT_NUM_TOLERABLE_MISSES_PROP = ConfigProperty
.key("hoodie.client.heartbeat.tolerable.misses")
.defaultValue(2)
.withDocumentation("");
.withDocumentation("Number of heartbeat misses, before a writer is deemed not alive and all pending writes are aborted.");
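A rough way to reason about the two heartbeat settings: a writer is only treated as expired after more than the tolerable number of heartbeats have been missed, so with the defaults the liveness window is on the order of (2 + 1) * 60 s = 180 s. The exact detection logic in the heartbeat client may differ; this is only a back-of-the-envelope estimate:

public class HeartbeatWindow {
  // Approximate time after which a silent writer is considered expired.
  public static long approxExpiryMs(long intervalMs, int tolerableMisses) {
    return (tolerableMisses + 1L) * intervalMs;
  }

  public static void main(String[] args) {
    // Defaults above: 60000 ms interval, 2 tolerable misses -> 180000 ms (~3 minutes).
    System.out.println(approxExpiryMs(60_000L, 2));
  }
}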
public static final ConfigProperty<String> WRITE_CONCURRENCY_MODE_PROP = ConfigProperty
.key("hoodie.write.concurrency.mode")
.defaultValue(WriteConcurrencyMode.SINGLE_WRITER.name())
.withDocumentation("Enable different concurrency support");
.withDocumentation("Enable different concurrency modes. Options are "
+ "SINGLE_WRITER: Only one active writer to the table. Maximizes throughput. "
+ "OPTIMISTIC_CONCURRENCY_CONTROL: Multiple writers can operate on the table and exactly one of them succeeds "
+ "if a conflict (writes affect the same file group) is detected.");
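Switching to multi-writer mode means changing the key below and also configuring a lock provider; the lock-related keys are outside this diff, so they are omitted from this sketch:

import java.util.Properties;

public class ConcurrencyModeExample {
  public static Properties multiWriterProps() {
    Properties props = new Properties();
    // Allow multiple writers; if two writes touch the same file group, only one commits.
    props.setProperty("hoodie.write.concurrency.mode", "OPTIMISTIC_CONCURRENCY_CONTROL");
    // A lock provider must also be configured for optimistic concurrency; those keys are not shown in this diff.
    return props;
  }
}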
public static final ConfigProperty<String> WRITE_META_KEY_PREFIXES_PROP = ConfigProperty
.key("hoodie.write.meta.key.prefixes")
@@ -312,16 +330,14 @@ public class HoodieWriteConfig extends HoodieConfig {
+ "during overlapping commits via multi writing");

/**
* The specified write schema. In most case, we do not need set this parameter,
* but for the case the write schema is not equal to the specified table schema, we can
* specify the write schema by this parameter.
*
* Currently the MergeIntoHoodieTableCommand use this to specify the write schema.
* Currently the use this to specify the write schema.
*/
public static final ConfigProperty<String> WRITE_SCHEMA_PROP = ConfigProperty
.key("hoodie.write.schema")
.noDefaultValue()
.withDocumentation("");
.withDocumentation("The specified write schema. In most cases, we do not need to set this parameter,"
+ " but for the case the write schema is not equal to the specified table schema, we can"
+ " specify the write schema by this parameter. Used by MergeIntoHoodieTableCommand");
/**
* HUDI-858 : There are users who had been directly using RDD APIs and have relied on a behavior in 0.4.x to allow
@@ -342,7 +358,8 @@ public class HoodieWriteConfig extends HoodieConfig {
.key(AVRO_SCHEMA.key() + ".external.transformation")
.defaultValue("false")
.withAlternatives(AVRO_SCHEMA.key() + ".externalTransformation")
.withDocumentation("");
.withDocumentation("When enabled, records in older schema are rewritten into newer schema during upsert,delete and background"
+ " compaction,clustering operations.");

private ConsistencyGuardConfig consistencyGuardConfig;

@@ -352,7 +369,6 @@ public class HoodieWriteConfig extends HoodieConfig {
private FileSystemViewStorageConfig viewStorageConfig;
private HoodiePayloadConfig hoodiePayloadConfig;
private HoodieMetadataConfig metadataConfig;

private EngineType engineType;

/**
@@ -19,13 +19,14 @@
package org.apache.hudi.keygen.constant;

import org.apache.hudi.common.config.ConfigProperty;
import org.apache.hudi.common.config.HoodieConfig;

public class KeyGeneratorOptions {
public class KeyGeneratorOptions extends HoodieConfig {

public static final ConfigProperty<String> URL_ENCODE_PARTITIONING_OPT_KEY = ConfigProperty
.key("hoodie.datasource.write.partitionpath.urlencode")
.defaultValue("false")
.withDocumentation("");
.withDocumentation("Should we url encode the partition path value, before creating the folder structure.");

public static final ConfigProperty<String> HIVE_STYLE_PARTITIONING_OPT_KEY = ConfigProperty
.key("hoodie.datasource.write.hive_style_partitioning")
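A brief sketch of how the two partition-path options interact: with hive-style partitioning the folder is written as partition=<value>, and URL encoding keeps characters such as '/' in a value from creating extra folder levels. The sample values are made up:

import java.util.Properties;

public class PartitionPathOptionsExample {
  public static Properties partitionPathProps() {
    Properties props = new Properties();
    // Write folders as "country=US/" instead of just "US/".
    props.setProperty("hoodie.datasource.write.hive_style_partitioning", "true");
    // Encode partition values so characters like '/' do not create extra folder levels.
    props.setProperty("hoodie.datasource.write.partitionpath.urlencode", "true");
    return props;
  }
}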
@@ -60,7 +60,7 @@ public class HoodieAvroKeyGeneratorFactory {
props.getString(HoodieWriteConfig.KEYGENERATOR_TYPE_PROP.key(), null);

if (StringUtils.isNullOrEmpty(keyGeneratorType)) {
LOG.info("The value of {} is empty, use SIMPLE", HoodieWriteConfig.KEYGENERATOR_TYPE_PROP.key());
LOG.info("The value of {} is empty, using SIMPLE", HoodieWriteConfig.KEYGENERATOR_TYPE_PROP.key());
keyGeneratorType = KeyGeneratorType.SIMPLE.name();
}
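For completeness, the fallback above means a caller that sets neither the key generator class nor the type still gets a SIMPLE key generator. A hedged usage sketch of the factory, assuming the createKeyGenerator entry point; treat the exact signature as approximate:

import org.apache.hudi.common.config.TypedProperties;
import org.apache.hudi.keygen.factory.HoodieAvroKeyGeneratorFactory;

public class KeyGeneratorFallbackExample {
  public static void main(String[] args) throws Exception {
    TypedProperties props = new TypedProperties();
    // Record key / partition path fields are placeholders; no keygenerator.type or .class is set,
    // so the factory logs "The value of ... is empty, using SIMPLE" and falls back to SIMPLE.
    props.setProperty("hoodie.datasource.write.recordkey.field", "uuid");
    props.setProperty("hoodie.datasource.write.partitionpath.field", "partition");
    System.out.println(HoodieAvroKeyGeneratorFactory.createKeyGenerator(props).getClass().getName());
  }
}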