1
0

[HUDI-2149] Ensure and Audit docs for every configuration class in the codebase (#3272)

- Added docs when missing
 - Rewrote, reworded as needed
 - Made couple more classes extend HoodieConfig
This commit is contained in:
vinoth chandar
2021-07-14 10:56:08 -07:00
committed by GitHub
parent c1810f210e
commit 75040ee9e5
28 changed files with 406 additions and 400 deletions

View File

@@ -18,6 +18,8 @@
package org.apache.hudi.client;
import org.apache.hudi.ApiMaturityLevel;
import org.apache.hudi.PublicAPIClass;
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.model.HoodieRecord;
import org.apache.hudi.common.model.HoodieWriteStat;
@@ -40,6 +42,7 @@ import static org.apache.hudi.common.model.DefaultHoodieRecordPayload.METADATA_E
/**
* Status of a write operation.
*/
@PublicAPIClass(maturity = ApiMaturityLevel.STABLE)
public class WriteStatus implements Serializable {
private static final Logger LOG = LogManager.getLogger(WriteStatus.class);

View File

@@ -97,7 +97,7 @@ public class HoodieBootstrapConfig extends HoodieConfig {
.key("hoodie.bootstrap.index.class")
.defaultValue(HFileBootstrapIndex.class.getName())
.sinceVersion("0.6.0")
.withDocumentation("");
.withDocumentation("Implementation to use, for mapping a skeleton base file to a bootstrap base file.");
private HoodieBootstrapConfig() {
super();

View File

@@ -31,36 +31,6 @@ import java.util.Properties;
*/
public class HoodieClusteringConfig extends HoodieConfig {
public static final ConfigProperty<String> CLUSTERING_PLAN_STRATEGY_CLASS = ConfigProperty
.key("hoodie.clustering.plan.strategy.class")
.defaultValue("org.apache.hudi.client.clustering.plan.strategy.SparkRecentDaysClusteringPlanStrategy")
.sinceVersion("0.7.0")
.withDocumentation("Config to provide a strategy class to create ClusteringPlan. Class has to be subclass of ClusteringPlanStrategy");
public static final ConfigProperty<String> CLUSTERING_EXECUTION_STRATEGY_CLASS = ConfigProperty
.key("hoodie.clustering.execution.strategy.class")
.defaultValue("org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy")
.sinceVersion("0.7.0")
.withDocumentation("Config to provide a strategy class to execute a ClusteringPlan. Class has to be subclass of RunClusteringStrategy");
public static final ConfigProperty<String> INLINE_CLUSTERING_PROP = ConfigProperty
.key("hoodie.clustering.inline")
.defaultValue("false")
.sinceVersion("0.7.0")
.withDocumentation("Turn on inline clustering - clustering will be run after write operation is complete");
public static final ConfigProperty<String> INLINE_CLUSTERING_MAX_COMMIT_PROP = ConfigProperty
.key("hoodie.clustering.inline.max.commits")
.defaultValue("4")
.sinceVersion("0.7.0")
.withDocumentation("Config to control frequency of inline clustering");
public static final ConfigProperty<String> ASYNC_CLUSTERING_MAX_COMMIT_PROP = ConfigProperty
.key("hoodie.clustering.async.max.commits")
.defaultValue("4")
.sinceVersion("0.9.0")
.withDocumentation("Config to control frequency of async clustering");
// Any strategy specific params can be saved with this prefix
public static final String CLUSTERING_STRATEGY_PARAM_PREFIX = "hoodie.clustering.plan.strategy.";
@@ -70,6 +40,40 @@ public class HoodieClusteringConfig extends HoodieConfig {
.sinceVersion("0.7.0")
.withDocumentation("Number of partitions to list to create ClusteringPlan");
public static final ConfigProperty<String> CLUSTERING_PLAN_STRATEGY_CLASS = ConfigProperty
.key("hoodie.clustering.plan.strategy.class")
.defaultValue("org.apache.hudi.client.clustering.plan.strategy.SparkRecentDaysClusteringPlanStrategy")
.sinceVersion("0.7.0")
.withDocumentation("Config to provide a strategy class (subclass of ClusteringPlanStrategy) to create clustering plan "
+ "i.e select what file groups are being clustered. Default strategy, looks at the last N (determined by "
+ CLUSTERING_TARGET_PARTITIONS.key() + ") day based partitions picks the small file slices within those partitions.");
public static final ConfigProperty<String> CLUSTERING_EXECUTION_STRATEGY_CLASS = ConfigProperty
.key("hoodie.clustering.execution.strategy.class")
.defaultValue("org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy")
.sinceVersion("0.7.0")
.withDocumentation("Config to provide a strategy class (subclass of RunClusteringStrategy) to define how the "
+ " clustering plan is executed. By default, we sort the file groups in the plan by the specified columns, while "
+ " meeting the configured target file sizes.");
public static final ConfigProperty<String> INLINE_CLUSTERING_PROP = ConfigProperty
.key("hoodie.clustering.inline")
.defaultValue("false")
.sinceVersion("0.7.0")
.withDocumentation("Turn on inline clustering - clustering will be run after each write operation is complete");
public static final ConfigProperty<String> INLINE_CLUSTERING_MAX_COMMIT_PROP = ConfigProperty
.key("hoodie.clustering.inline.max.commits")
.defaultValue("4")
.sinceVersion("0.7.0")
.withDocumentation("Config to control frequency of clustering planning");
public static final ConfigProperty<String> ASYNC_CLUSTERING_MAX_COMMIT_PROP = ConfigProperty
.key("hoodie.clustering.async.max.commits")
.defaultValue("4")
.sinceVersion("0.9.0")
.withDocumentation("Config to control frequency of async clustering");
public static final ConfigProperty<String> CLUSTERING_PLAN_SMALL_FILE_LIMIT = ConfigProperty
.key(CLUSTERING_STRATEGY_PARAM_PREFIX + "small.file.limit")
.defaultValue(String.valueOf(600 * 1024 * 1024L))
@@ -80,7 +84,7 @@ public class HoodieClusteringConfig extends HoodieConfig {
.key(CLUSTERING_STRATEGY_PARAM_PREFIX + "max.bytes.per.group")
.defaultValue(String.valueOf(2 * 1024 * 1024 * 1024L))
.sinceVersion("0.7.0")
.withDocumentation("Each clustering operation can create multiple groups. Total amount of data processed by clustering operation"
.withDocumentation("Each clustering operation can create multiple output file groups. Total amount of data processed by clustering operation"
+ " is defined by below two properties (CLUSTERING_MAX_BYTES_PER_GROUP * CLUSTERING_MAX_NUM_GROUPS)."
+ " Max amount of data to be included in one group");
@@ -92,7 +96,7 @@ public class HoodieClusteringConfig extends HoodieConfig {
public static final ConfigProperty<String> CLUSTERING_TARGET_FILE_MAX_BYTES = ConfigProperty
.key(CLUSTERING_STRATEGY_PARAM_PREFIX + "target.file.max.bytes")
.defaultValue(String.valueOf(1 * 1024 * 1024 * 1024L))
.defaultValue(String.valueOf(1024 * 1024 * 1024L))
.sinceVersion("0.7.0")
.withDocumentation("Each group can produce 'N' (CLUSTERING_MAX_GROUP_SIZE/CLUSTERING_TARGET_FILE_SIZE) output file groups");
@@ -106,13 +110,14 @@ public class HoodieClusteringConfig extends HoodieConfig {
.key("hoodie.clustering.updates.strategy")
.defaultValue("org.apache.hudi.client.clustering.update.strategy.SparkRejectUpdateStrategy")
.sinceVersion("0.7.0")
.withDocumentation("When file groups is in clustering, need to handle the update to these file groups. Default strategy just reject the update");
.withDocumentation("Determines how to handle updates, deletes to file groups that are under clustering."
+ " Default strategy just rejects the update");
public static final ConfigProperty<String> ASYNC_CLUSTERING_ENABLE_OPT_KEY = ConfigProperty
.key("hoodie.clustering.async.enabled")
.defaultValue("false")
.sinceVersion("0.7.0")
.withDocumentation("Async clustering");
.withDocumentation("Enable running of clustering service, asynchronously as inserts happen on the table.");
private HoodieClusteringConfig() {
super();

View File

@@ -33,7 +33,9 @@ import javax.annotation.concurrent.Immutable;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Arrays;
import java.util.Properties;
import java.util.stream.Collectors;
/**
* Compaction related config.
@@ -41,101 +43,114 @@ import java.util.Properties;
@Immutable
public class HoodieCompactionConfig extends HoodieConfig {
public static final ConfigProperty<String> CLEANER_POLICY_PROP = ConfigProperty
.key("hoodie.cleaner.policy")
.defaultValue(HoodieCleaningPolicy.KEEP_LATEST_COMMITS.name())
.withDocumentation("Cleaning policy to be used. Hudi will delete older versions of parquet files to re-claim space."
+ " Any Query/Computation referring to this version of the file will fail. "
+ "It is good to make sure that the data is retained for more than the maximum query execution time.");
public static final ConfigProperty<String> AUTO_CLEAN_PROP = ConfigProperty
.key("hoodie.clean.automatic")
.defaultValue("true")
.withDocumentation("Should cleanup if there is anything to cleanup immediately after the commit");
.withDocumentation("When enabled, the cleaner table service is invoked immediately after each commit,"
+ " to delete older file slices. It's recommended to enable this, to ensure metadata and data storage"
+ " growth is bounded.");
public static final ConfigProperty<String> ASYNC_CLEAN_PROP = ConfigProperty
.key("hoodie.clean.async")
.defaultValue("false")
.withDocumentation("Only applies when #withAutoClean is turned on. When turned on runs cleaner async with writing.");
public static final ConfigProperty<String> INLINE_COMPACT_PROP = ConfigProperty
.key("hoodie.compact.inline")
.defaultValue("false")
.withDocumentation("When set to true, compaction is triggered by the ingestion itself, "
+ "right after a commit/deltacommit action as part of insert/upsert/bulk_insert");
public static final ConfigProperty<String> INLINE_COMPACT_NUM_DELTA_COMMITS_PROP = ConfigProperty
.key("hoodie.compact.inline.max.delta.commits")
.defaultValue("5")
.withDocumentation("Number of max delta commits to keep before triggering an inline compaction");
public static final ConfigProperty<String> INLINE_COMPACT_TIME_DELTA_SECONDS_PROP = ConfigProperty
.key("hoodie.compact.inline.max.delta.seconds")
.defaultValue(String.valueOf(60 * 60))
.withDocumentation("Run a compaction when time elapsed > N seconds since last compaction");
public static final ConfigProperty<String> INLINE_COMPACT_TRIGGER_STRATEGY_PROP = ConfigProperty
.key("hoodie.compact.inline.trigger.strategy")
.defaultValue(CompactionTriggerStrategy.NUM_COMMITS.name())
.withDocumentation("");
public static final ConfigProperty<String> CLEANER_FILE_VERSIONS_RETAINED_PROP = ConfigProperty
.key("hoodie.cleaner.fileversions.retained")
.defaultValue("3")
.withDocumentation("");
.withDocumentation("Only applies when " + AUTO_CLEAN_PROP.key() + " is turned on. "
+ "When turned on runs cleaner async with writing, which can speed up overall write performance.");
public static final ConfigProperty<String> CLEANER_COMMITS_RETAINED_PROP = ConfigProperty
.key("hoodie.cleaner.commits.retained")
.defaultValue("10")
.withDocumentation("Number of commits to retain. So data will be retained for num_of_commits * time_between_commits "
+ "(scheduled). This also directly translates into how much you can incrementally pull on this table");
.withDocumentation("Number of commits to retain, without cleaning. This will be retained for num_of_commits * time_between_commits "
+ "(scheduled). This also directly translates into how much data retention the table supports for incremental queries.");
public static final ConfigProperty<String> CLEANER_POLICY_PROP = ConfigProperty
.key("hoodie.cleaner.policy")
.defaultValue(HoodieCleaningPolicy.KEEP_LATEST_COMMITS.name())
.withDocumentation("Cleaning policy to be used. The cleaner service deletes older file slices files to re-claim space."
+ " By default, cleaner spares the file slices written by the last N commits, determined by " + CLEANER_COMMITS_RETAINED_PROP.key()
+ " Long running query plans may often refer to older file slices and will break if those are cleaned, before the query has had"
+ " a chance to run. So, it is good to make sure that the data is retained for more than the maximum query execution time");
public static final ConfigProperty<String> INLINE_COMPACT_PROP = ConfigProperty
.key("hoodie.compact.inline")
.defaultValue("false")
.withDocumentation("When set to true, compaction service is triggered after each write. While being "
+ " simpler operationally, this adds extra latency on the write path.");
public static final ConfigProperty<String> INLINE_COMPACT_NUM_DELTA_COMMITS_PROP = ConfigProperty
.key("hoodie.compact.inline.max.delta.commits")
.defaultValue("5")
.withDocumentation("Number of delta commits after the last compaction, before scheduling of a new compaction is attempted.");
public static final ConfigProperty<String> INLINE_COMPACT_TIME_DELTA_SECONDS_PROP = ConfigProperty
.key("hoodie.compact.inline.max.delta.seconds")
.defaultValue(String.valueOf(60 * 60))
.withDocumentation("Number of elapsed seconds after the last compaction, before scheduling a new one.");
public static final ConfigProperty<String> INLINE_COMPACT_TRIGGER_STRATEGY_PROP = ConfigProperty
.key("hoodie.compact.inline.trigger.strategy")
.defaultValue(CompactionTriggerStrategy.NUM_COMMITS.name())
.withDocumentation("Controls how compaction scheduling is triggered, by time or num delta commits or combination of both. "
+ "Valid options: " + Arrays.stream(CompactionTriggerStrategy.values()).map(Enum::name).collect(Collectors.joining(",")));
public static final ConfigProperty<String> CLEANER_FILE_VERSIONS_RETAINED_PROP = ConfigProperty
.key("hoodie.cleaner.fileversions.retained")
.defaultValue("3")
.withDocumentation("When " + HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS.name() + " cleaning policy is used, "
+ " the minimum number of file slices to retain in each file group, during cleaning.");
public static final ConfigProperty<String> CLEANER_INCREMENTAL_MODE = ConfigProperty
.key("hoodie.cleaner.incremental.mode")
.defaultValue("true")
.withDocumentation("");
.withDocumentation("When enabled, the plans for each cleaner service run is computed incrementally off the events "
+ " in the timeline, since the last cleaner run. This is much more efficient than obtaining listings for the full"
+ " table for each planning (even with a metadata table).");
public static final ConfigProperty<String> MAX_COMMITS_TO_KEEP_PROP = ConfigProperty
.key("hoodie.keep.max.commits")
.defaultValue("30")
.withDocumentation("Each commit is a small file in the .hoodie directory. Since DFS typically does not favor lots of "
+ "small files, Hudi archives older commits into a sequential log. A commit is published atomically "
+ "by a rename of the commit file.");
.withDocumentation("Archiving service moves older entries from timeline into an archived log after each write, to "
+ " keep the metadata overhead constant, even as the table size grows."
+ "This config controls the maximum number of instants to retain in the active timeline. ");
public static final ConfigProperty<String> MIN_COMMITS_TO_KEEP_PROP = ConfigProperty
.key("hoodie.keep.min.commits")
.defaultValue("20")
.withDocumentation("Each commit is a small file in the .hoodie directory. Since DFS typically does not favor lots of "
+ "small files, Hudi archives older commits into a sequential log. A commit is published atomically "
+ "by a rename of the commit file.");
.withDocumentation("Similar to " + MAX_COMMITS_TO_KEEP_PROP.key() + ", but controls the minimum number of"
+ " instants to retain in the active timeline.");
public static final ConfigProperty<String> COMMITS_ARCHIVAL_BATCH_SIZE_PROP = ConfigProperty
.key("hoodie.commits.archival.batch")
.defaultValue(String.valueOf(10))
.withDocumentation("This controls the number of commit instants read in memory as a batch and archived together.");
.withDocumentation("Archiving of instants is batched in best-effort manner, to pack more instants into a single"
+ " archive log. This config controls such archival batch size.");
public static final ConfigProperty<String> CLEANER_BOOTSTRAP_BASE_FILE_ENABLED = ConfigProperty
.key("hoodie.cleaner.delete.bootstrap.base.file")
.defaultValue("false")
.withDocumentation("Set true to clean bootstrap source files when necessary");
.withDocumentation("When set to true, cleaner also deletes the bootstrap base file when its skeleton base file is "
+ " cleaned. Turn this to true, if you want to ensure the bootstrap dataset storage is reclaimed over time, as the"
+ " table receives updates/deletes. Another reason to turn this on, would be to ensure data residing in bootstrap "
+ " base files are also physically deleted, to comply with data privacy enforcement processes.");
public static final ConfigProperty<String> PARQUET_SMALL_FILE_LIMIT_BYTES = ConfigProperty
.key("hoodie.parquet.small.file.limit")
.defaultValue(String.valueOf(104857600))
.withDocumentation("Upsert uses this file size to compact new data onto existing files. "
+ "By default, treat any file <= 100MB as a small file.");
.withDocumentation("During upsert operation, we opportunistically expand existing small files on storage, instead of writing"
+ " new files, to keep number of files to an optimum. This config sets the file size limit below which a file on storage "
+ " becomes a candidate to be selected as such a `small file`. By default, treat any file <= 100MB as a small file.");
public static final ConfigProperty<String> RECORD_SIZE_ESTIMATION_THRESHOLD_PROP = ConfigProperty
.key("hoodie.record.size.estimation.threshold")
.defaultValue("1.0")
.withDocumentation("Hudi will use the previous commit to calculate the estimated record size by totalBytesWritten/totalRecordsWritten. "
+ "If the previous commit is too small to make an accurate estimation, Hudi will search commits in the reverse order, "
+ "until find a commit has totalBytesWritten larger than (PARQUET_SMALL_FILE_LIMIT_BYTES * RECORD_SIZE_ESTIMATION_THRESHOLD)");
.withDocumentation("We use the previous commits' metadata to calculate the estimated record size and use it "
+ " to bin pack records into partitions. If the previous commit is too small to make an accurate estimation, "
+ " Hudi will search commits in the reverse order, until we find a commit that has totalBytesWritten "
+ " larger than (PARQUET_SMALL_FILE_LIMIT_BYTES * this_threshold)");
public static final ConfigProperty<String> CLEANER_PARALLELISM = ConfigProperty
.key("hoodie.cleaner.parallelism")
.defaultValue("200")
.withDocumentation("Increase this if cleaning becomes slow.");
.withDocumentation("Parallelism for the cleaning operation. Increase this if cleaning becomes slow.");
// 500GB of target IO per compaction (both read and write
public static final ConfigProperty<String> TARGET_IO_PER_COMPACTION_IN_MB_PROP = ConfigProperty
@@ -161,15 +176,15 @@ public class HoodieCompactionConfig extends HoodieConfig {
public static final ConfigProperty<String> COMPACTION_LAZY_BLOCK_READ_ENABLED_PROP = ConfigProperty
.key("hoodie.compaction.lazy.block.read")
.defaultValue("false")
.withDocumentation("When a CompactedLogScanner merges all log files, this config helps to choose whether the logblocks "
+ "should be read lazily or not. Choose true to use I/O intensive lazy block reading (low memory usage) or false "
+ "for Memory intensive immediate block read (high memory usage)");
.withDocumentation("When merging the delta log files, this config helps to choose whether the log blocks "
+ "should be read lazily or not. Choose true to use lazy block reading (low memory usage, but incurs seeks to each block"
+ " header) or false for immediate block read (higher memory usage)");
public static final ConfigProperty<String> COMPACTION_REVERSE_LOG_READ_ENABLED_PROP = ConfigProperty
.key("hoodie.compaction.reverse.log.read")
.defaultValue("false")
.withDocumentation("HoodieLogFormatReader reads a logfile in the forward direction starting from pos=0 to pos=file_length. "
+ "If this config is set to true, the Reader reads the logfile in reverse direction, from pos=file_length to pos=0");
+ "If this config is set to true, the reader reads the logfile in reverse direction, from pos=file_length to pos=0");
public static final ConfigProperty<String> FAILED_WRITES_CLEANER_POLICY_PROP = ConfigProperty
.key("hoodie.cleaner.policy.failed.writes")
@@ -190,22 +205,24 @@ public class HoodieCompactionConfig extends HoodieConfig {
public static final ConfigProperty<String> COPY_ON_WRITE_TABLE_INSERT_SPLIT_SIZE = ConfigProperty
.key("hoodie.copyonwrite.insert.split.size")
.defaultValue(String.valueOf(500000))
.withDocumentation("Number of inserts, that will be put each partition/bucket for writing. "
+ "The rationale to pick the insert parallelism is the following. Writing out 100MB files, "
+ "with at least 1kb records, means 100K records per file. we just over provision to 500K.");
.withDocumentation("Number of inserts assigned for each partition/bucket for writing. "
+ "We based the default on writing out 100MB files, with at least 1kb records (100K records per file), and "
+ " over provision to 500K. As long as auto-tuning of splits is turned on, this only affects the first "
+ " write, where there is no history to learn record sizes from.");
public static final ConfigProperty<String> COPY_ON_WRITE_TABLE_AUTO_SPLIT_INSERTS = ConfigProperty
.key("hoodie.copyonwrite.insert.auto.split")
.defaultValue("true")
.withDocumentation("Config to control whether we control insert split sizes automatically based on average"
+ " record sizes.");
+ " record sizes. It's recommended to keep this turned on, since hand tuning is otherwise extremely"
+ " cumbersome.");
public static final ConfigProperty<String> COPY_ON_WRITE_TABLE_RECORD_SIZE_ESTIMATE = ConfigProperty
.key("hoodie.copyonwrite.record.size.estimate")
.defaultValue(String.valueOf(1024))
.withDocumentation("The average record size. If specified, hudi will use this and not compute dynamically "
+ "based on the last 24 commits metadata. No value set as default. This is critical in computing "
+ "the insert parallelism and bin-packing inserts into small files. See above.");
.withDocumentation("The average record size. If not explicitly specified, hudi will compute the "
+ "record size estimate dynamically based on commit metadata. "
+ " This is critical in computing the insert parallelism and bin-packing inserts into small files.");
private HoodieCompactionConfig() {
super();

View File

@@ -48,7 +48,8 @@ public class HoodieHBaseIndexConfig extends HoodieConfig {
public static final ConfigProperty<Integer> HBASE_GET_BATCH_SIZE_PROP = ConfigProperty
.key("hoodie.index.hbase.get.batch.size")
.defaultValue(100)
.withDocumentation("");
.withDocumentation("Controls the batch size for performing gets against HBase. "
+ "Batching improves throughput, by saving round trips.");
public static final ConfigProperty<String> HBASE_ZK_ZNODEPARENT = ConfigProperty
.key("hoodie.index.hbase.zknode.path")
@@ -59,12 +60,14 @@ public class HoodieHBaseIndexConfig extends HoodieConfig {
public static final ConfigProperty<Integer> HBASE_PUT_BATCH_SIZE_PROP = ConfigProperty
.key("hoodie.index.hbase.put.batch.size")
.defaultValue(100)
.withDocumentation("");
.withDocumentation("Controls the batch size for performing puts against HBase. "
+ "Batching improves throughput, by saving round trips.");
public static final ConfigProperty<String> HBASE_INDEX_QPS_ALLOCATOR_CLASS = ConfigProperty
.key("hoodie.index.hbase.qps.allocator.class")
.defaultValue(DefaultHBaseQPSResourceAllocator.class.getName())
.withDocumentation("Property to set which implementation of HBase QPS resource allocator to be used");
.withDocumentation("Property to set which implementation of HBase QPS resource allocator to be used, which"
+ "controls the batching rate dynamically.");
public static final ConfigProperty<String> HBASE_PUT_BATCH_SIZE_AUTO_COMPUTE_PROP = ConfigProperty
.key("hoodie.index.hbase.put.batch.size.autocompute")
@@ -90,17 +93,17 @@ public class HoodieHBaseIndexConfig extends HoodieConfig {
public static final ConfigProperty<Boolean> HOODIE_INDEX_COMPUTE_QPS_DYNAMICALLY = ConfigProperty
.key("hoodie.index.hbase.dynamic_qps")
.defaultValue(false)
.withDocumentation("Property to decide if HBASE_QPS_FRACTION_PROP is dynamically calculated based on volume");
.withDocumentation("Property to decide if HBASE_QPS_FRACTION_PROP is dynamically calculated based on write volume.");
public static final ConfigProperty<String> HBASE_MIN_QPS_FRACTION_PROP = ConfigProperty
.key("hoodie.index.hbase.min.qps.fraction")
.noDefaultValue()
.withDocumentation("Min for HBASE_QPS_FRACTION_PROP to stabilize skewed volume workloads");
.withDocumentation("Minimum for HBASE_QPS_FRACTION_PROP to stabilize skewed write workloads");
public static final ConfigProperty<String> HBASE_MAX_QPS_FRACTION_PROP = ConfigProperty
.key("hoodie.index.hbase.max.qps.fraction")
.noDefaultValue()
.withDocumentation("Max for HBASE_QPS_FRACTION_PROP to stabilize skewed volume workloads");
.withDocumentation("Maximum for HBASE_QPS_FRACTION_PROP to stabilize skewed write workloads");
public static final ConfigProperty<Integer> HOODIE_INDEX_DESIRED_PUTS_TIME_IN_SECS = ConfigProperty
.key("hoodie.index.hbase.desired_puts_time_in_secs")
@@ -120,17 +123,18 @@ public class HoodieHBaseIndexConfig extends HoodieConfig {
public static final ConfigProperty<Integer> HOODIE_INDEX_HBASE_ZK_SESSION_TIMEOUT_MS = ConfigProperty
.key("hoodie.index.hbase.zk.session_timeout_ms")
.defaultValue(60 * 1000)
.withDocumentation("");
.withDocumentation("Session timeout value to use for Zookeeper failure detection, for the HBase client."
+ "Lower this value, if you want to fail faster.");
public static final ConfigProperty<Integer> HOODIE_INDEX_HBASE_ZK_CONNECTION_TIMEOUT_MS = ConfigProperty
.key("hoodie.index.hbase.zk.connection_timeout_ms")
.defaultValue(15 * 1000)
.withDocumentation("");
.withDocumentation("Timeout to use for establishing connection with zookeeper, from HBase client.");
public static final ConfigProperty<String> HBASE_ZK_PATH_QPS_ROOT = ConfigProperty
.key("hoodie.index.hbase.zkpath.qps_root")
.defaultValue("/QPS_ROOT")
.withDocumentation("");
.withDocumentation("chroot in zookeeper, to use for all qps allocation co-ordination.");
public static final ConfigProperty<Boolean> HBASE_INDEX_UPDATE_PARTITION_PATH = ConfigProperty
.key("hoodie.hbase.index.update.partition.path")

View File

@@ -58,13 +58,12 @@ public class HoodieIndexConfig extends HoodieConfig {
.defaultValue("60000")
.withDocumentation("Only applies if index type is BLOOM. "
+ "This is the number of entries to be stored in the bloom filter. "
+ "We assume the maxParquetFileSize is 128MB and averageRecordSize is 1024B and "
+ "The rationale for the default: Assume the maxParquetFileSize is 128MB and averageRecordSize is 1kb and "
+ "hence we approx a total of 130K records in a file. The default (60000) is roughly half of this approximation. "
+ "HUDI-56 tracks computing this dynamically. Warning: Setting this very low, "
+ "will generate a lot of false positives and index lookup will have to scan a lot more files "
+ "than it has to and Setting this to a very high number will increase the size every data file linearly "
+ "(roughly 4KB for every 50000 entries). "
+ "This config is also used with DYNNAMIC bloom filter which determines the initial size for the bloom.");
+ "Warning: Setting this very low, will generate a lot of false positives and index lookup "
+ "will have to scan a lot more files than it has to and setting this to a very high number will "
+ "increase the size of every base file linearly (roughly 4KB for every 50000 entries). "
+ "This config is also used with DYNAMIC bloom filter which determines the initial size for the bloom.");
public static final ConfigProperty<String> BLOOM_FILTER_FPP = ConfigProperty
.key("hoodie.index.bloom.fpp")
@@ -73,16 +72,15 @@ public class HoodieIndexConfig extends HoodieConfig {
+ "Error rate allowed given the number of entries. This is used to calculate how many bits should be "
+ "assigned for the bloom filter and the number of hash functions. This is usually set very low (default: 0.000000001), "
+ "we like to tradeoff disk space for lower false positives. "
+ "If the number of entries added to bloom filter exceeds the congfigured value (hoodie.index.bloom.num_entries), "
+ "If the number of entries added to bloom filter exceeds the configured value (hoodie.index.bloom.num_entries), "
+ "then this fpp may not be honored.");
public static final ConfigProperty<String> BLOOM_INDEX_PARALLELISM_PROP = ConfigProperty
.key("hoodie.bloom.index.parallelism")
.defaultValue("0")
.withDocumentation("Only applies if index type is BLOOM. "
+ "This is the amount of parallelism for index lookup, which involves a Spark Shuffle. "
+ "By default, this is auto computed based on input workload characteristics. "
+ "Disable explicit bloom index parallelism setting by default - hoodie auto computes");
+ "This is the amount of parallelism for index lookup, which involves a shuffle. "
+ "By default, this is auto computed based on input workload characteristics.");
public static final ConfigProperty<String> BLOOM_INDEX_PRUNE_BY_RANGES_PROP = ConfigProperty
.key("hoodie.bloom.index.prune.by.ranges")
@@ -90,7 +88,8 @@ public class HoodieIndexConfig extends HoodieConfig {
.withDocumentation("Only applies if index type is BLOOM. "
+ "When true, range information from files is leveraged to speed up index lookups. Particularly helpful, "
+ "if the key has a monotonously increasing prefix, such as timestamp. "
+ "If the record key is completely random, it is better to turn this off.");
+ "If the record key is completely random, it is better to turn this off, since range pruning will only "
+ " add extra overhead to the index lookup.");
public static final ConfigProperty<String> BLOOM_INDEX_USE_CACHING_PROP = ConfigProperty
.key("hoodie.bloom.index.use.caching")
@@ -131,7 +130,7 @@ public class HoodieIndexConfig extends HoodieConfig {
.key("hoodie.simple.index.use.caching")
.defaultValue("true")
.withDocumentation("Only applies if index type is SIMPLE. "
+ "When true, the input RDD will cached to speed up index lookup by reducing IO "
+ "When true, the incoming writes will be cached to speed up index lookup by reducing IO "
+ "for computing parallelism or affected partitions");
public static final ConfigProperty<String> SIMPLE_INDEX_PARALLELISM_PROP = ConfigProperty
@@ -187,7 +186,7 @@ public class HoodieIndexConfig extends HoodieConfig {
public static final ConfigProperty<String> SIMPLE_INDEX_UPDATE_PARTITION_PATH = ConfigProperty
.key("hoodie.simple.index.update.partition.path")
.defaultValue("false")
.withDocumentation("");
.withDocumentation("Similar to " + BLOOM_INDEX_UPDATE_PARTITION_PATH + ", but for simple index.");
private EngineType engineType;

View File

@@ -18,6 +18,7 @@
package org.apache.hudi.config;
import org.apache.hudi.common.config.ConfigProperty;
import org.apache.hudi.common.config.HoodieConfig;
/**
@@ -30,13 +31,18 @@ public class HoodieInternalConfig extends HoodieConfig {
public static final String BULKINSERT_ARE_PARTITIONER_RECORDS_SORTED = "hoodie.bulkinsert.are.partitioner.records.sorted";
public static final Boolean DEFAULT_BULKINSERT_ARE_PARTITIONER_RECORDS_SORTED = false;
public static final ConfigProperty<String> BULKINSERT_INPUT_DATA_SCHEMA_DDL = ConfigProperty
.key("hoodie.bulkinsert.schema.ddl")
.noDefaultValue()
.withDocumentation("Schema set for row writer/bulk insert.");
/**
* Returns if partition records are sorted or not.
*
* @param propertyValue value for property BULKINSERT_ARE_PARTITIONER_RECORDS_SORTED.
* @return the property value.
*/
public static Boolean getBulkInsertIsPartitionRecordsSorted(String propertyValue) {
return propertyValue != null ? Boolean.parseBoolean(propertyValue) : DEFAULT_BULKINSERT_ARE_PARTITIONER_RECORDS_SORTED;
}
}

View File

@@ -17,8 +17,8 @@
package org.apache.hudi.config;
import org.apache.hudi.client.transaction.SimpleConcurrentFileWritesConflictResolutionStrategy;
import org.apache.hudi.client.transaction.ConflictResolutionStrategy;
import org.apache.hudi.client.transaction.SimpleConcurrentFileWritesConflictResolutionStrategy;
import org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider;
import org.apache.hudi.common.config.ConfigProperty;
import org.apache.hudi.common.config.HoodieConfig;
@@ -61,94 +61,94 @@ public class HoodieLockConfig extends HoodieConfig {
.key(LOCK_ACQUIRE_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY)
.defaultValue(DEFAULT_LOCK_ACQUIRE_RETRY_WAIT_TIME_IN_MILLIS)
.sinceVersion("0.8.0")
.withDocumentation("Parameter used in the exponential backoff retry policy. Stands for the Initial amount "
+ "of time to wait between retries by lock provider client");
.withDocumentation("Initial amount of time to wait between retries to acquire locks, "
+ " subsequent retries will exponentially backoff.");
public static final ConfigProperty<String> LOCK_ACQUIRE_RETRY_MAX_WAIT_TIME_IN_MILLIS_PROP = ConfigProperty
.key(LOCK_ACQUIRE_RETRY_MAX_WAIT_TIME_IN_MILLIS_PROP_KEY)
.defaultValue(String.valueOf(5000L))
.sinceVersion("0.8.0")
.withDocumentation("Parameter used in the exponential backoff retry policy. Stands for the maximum amount "
+ "of time to wait between retries by lock provider client");
.withDocumentation("Maximum amount of time to wait between retries by lock provider client. This bounds"
+ " the maximum delay from the exponential backoff. Currently used by ZK based lock provider only.");
public static final ConfigProperty<String> LOCK_ACQUIRE_CLIENT_RETRY_WAIT_TIME_IN_MILLIS_PROP = ConfigProperty
.key(LOCK_ACQUIRE_CLIENT_RETRY_WAIT_TIME_IN_MILLIS_PROP_KEY)
.defaultValue(String.valueOf(10000L))
.sinceVersion("0.8.0")
.withDocumentation("Amount of time to wait between retries from the hudi client");
.withDocumentation("Amount of time to wait between retries on the lock provider by the lock manager");
public static final ConfigProperty<String> LOCK_ACQUIRE_NUM_RETRIES_PROP = ConfigProperty
.key(LOCK_ACQUIRE_NUM_RETRIES_PROP_KEY)
.defaultValue(DEFAULT_LOCK_ACQUIRE_NUM_RETRIES)
.sinceVersion("0.8.0")
.withDocumentation("Maximum number of times to retry by lock provider client");
.withDocumentation("Maximum number of times to retry lock acquire, at each lock provider");
public static final ConfigProperty<String> LOCK_ACQUIRE_CLIENT_NUM_RETRIES_PROP = ConfigProperty
.key(LOCK_ACQUIRE_CLIENT_NUM_RETRIES_PROP_KEY)
.defaultValue(String.valueOf(0))
.sinceVersion("0.8.0")
.withDocumentation("Maximum number of times to retry to acquire lock additionally from the hudi client");
.withDocumentation("Maximum number of times to retry to acquire lock additionally from the lock manager.");
public static final ConfigProperty<Integer> LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP = ConfigProperty
.key(LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY)
.defaultValue(60 * 1000)
.sinceVersion("0.8.0")
.withDocumentation("");
.withDocumentation("Timeout in ms, to wait on an individual lock acquire() call, at the lock provider.");
public static final ConfigProperty<String> FILESYSTEM_LOCK_PATH_PROP = ConfigProperty
.key(FILESYSTEM_LOCK_PATH_PROP_KEY)
.noDefaultValue()
.sinceVersion("0.8.0")
.withDocumentation("");
.withDocumentation("For DFS based lock providers, path to store the locks under.");
public static final ConfigProperty<String> HIVE_DATABASE_NAME_PROP = ConfigProperty
.key(HIVE_DATABASE_NAME_PROP_KEY)
.noDefaultValue()
.sinceVersion("0.8.0")
.withDocumentation("The Hive database to acquire lock against");
.withDocumentation("For Hive based lock provider, the Hive database to acquire lock against");
public static final ConfigProperty<String> HIVE_TABLE_NAME_PROP = ConfigProperty
.key(HIVE_TABLE_NAME_PROP_KEY)
.noDefaultValue()
.sinceVersion("0.8.0")
.withDocumentation("The Hive table under the hive database to acquire lock against");
.withDocumentation("For Hive based lock provider, the Hive table to acquire lock against");
public static final ConfigProperty<String> HIVE_METASTORE_URI_PROP = ConfigProperty
.key(HIVE_METASTORE_URI_PROP_KEY)
.noDefaultValue()
.sinceVersion("0.8.0")
.withDocumentation("");
.withDocumentation("For Hive based lock provider, the Hive metastore URI to acquire locks against.");
public static final ConfigProperty<String> ZK_BASE_PATH_PROP = ConfigProperty
.key(ZK_BASE_PATH_PROP_KEY)
.noDefaultValue()
.sinceVersion("0.8.0")
.withDocumentation("The base path on Zookeeper under which to create a ZNode to acquire the lock. "
+ "This should be common for all jobs writing to the same table");
.withDocumentation("The base path on Zookeeper under which to create lock related ZNodes. "
+ "This should be same for all concurrent writers to the same table");
public static final ConfigProperty<Integer> ZK_SESSION_TIMEOUT_MS_PROP = ConfigProperty
.key(ZK_SESSION_TIMEOUT_MS_PROP_KEY)
.defaultValue(DEFAULT_ZK_SESSION_TIMEOUT_MS)
.sinceVersion("0.8.0")
.withDocumentation("How long to wait after losing a connection to ZooKeeper before the session is expired");
.withDocumentation("Timeout in ms, to wait after losing connection to ZooKeeper, before the session is expired");
public static final ConfigProperty<Integer> ZK_CONNECTION_TIMEOUT_MS_PROP = ConfigProperty
.key(ZK_CONNECTION_TIMEOUT_MS_PROP_KEY)
.defaultValue(DEFAULT_ZK_CONNECTION_TIMEOUT_MS)
.sinceVersion("0.8.0")
.withDocumentation("How long to wait when connecting to ZooKeeper before considering the connection a failure");
.withDocumentation("Timeout in ms, to wait for establishing connection with Zookeeper.");
public static final ConfigProperty<String> ZK_CONNECT_URL_PROP = ConfigProperty
.key(ZK_CONNECT_URL_PROP_KEY)
.noDefaultValue()
.sinceVersion("0.8.0")
.withDocumentation("Set the list of comma separated servers to connect to");
.withDocumentation("Zookeeper URL to connect to.");
public static final ConfigProperty<String> ZK_PORT_PROP = ConfigProperty
.key(ZK_PORT_PROP_KEY)
.noDefaultValue()
.sinceVersion("0.8.0")
.withDocumentation("The connection port to be used for Zookeeper");
.withDocumentation("Zookeeper port to connect to.");
public static final ConfigProperty<String> ZK_LOCK_KEY_PROP = ConfigProperty
.key(ZK_LOCK_KEY_PROP_KEY)

View File

@@ -59,22 +59,22 @@ public class HoodieMemoryConfig extends HoodieConfig {
public static final ConfigProperty<Long> MAX_MEMORY_FOR_MERGE_PROP = ConfigProperty
.key("hoodie.memory.merge.max.size")
.defaultValue(DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES)
.withDocumentation("Property to set the max memory for merge");
.withDocumentation("Maximum amount of memory used for merge operations, before spilling to local storage.");
public static final ConfigProperty<String> MAX_MEMORY_FOR_COMPACTION_PROP = ConfigProperty
.key("hoodie.memory.compaction.max.size")
.noDefaultValue()
.withDocumentation("Property to set the max memory for compaction");
.withDocumentation("Maximum amount of memory used for compaction operations, before spilling to local storage.");
public static final ConfigProperty<Integer> MAX_DFS_STREAM_BUFFER_SIZE_PROP = ConfigProperty
.key("hoodie.memory.dfs.buffer.max.size")
.defaultValue(16 * 1024 * 1024)
.withDocumentation("Property to set the max memory for dfs inputstream buffer size");
.withDocumentation("Property to control the max memory for dfs input stream buffer size");
public static final ConfigProperty<String> SPILLABLE_MAP_BASE_PATH_PROP = ConfigProperty
.key("hoodie.memory.spillable.map.path")
.defaultValue("/tmp/")
.withDocumentation("Default file path prefix for spillable file");
.withDocumentation("Default file path prefix for spillable map");
public static final ConfigProperty<Double> WRITESTATUS_FAILURE_FRACTION_PROP = ConfigProperty
.key("hoodie.memory.writestatus.failure.fraction")

View File

@@ -41,7 +41,7 @@ public class HoodieMetricsDatadogConfig extends HoodieConfig {
.key(DATADOG_PREFIX + ".report.period.seconds")
.defaultValue(30)
.sinceVersion("0.6.0")
.withDocumentation("Datadog report period in seconds. Default to 30.");
.withDocumentation("Datadog reporting period in seconds. Default to 30.");
public static final ConfigProperty<String> DATADOG_API_SITE = ConfigProperty
.key(DATADOG_PREFIX + ".api.site")

View File

@@ -34,19 +34,19 @@ public class HoodieMetricsPrometheusConfig extends HoodieConfig {
.key(PUSHGATEWAY_PREFIX + ".host")
.defaultValue("localhost")
.sinceVersion("0.6.0")
.withDocumentation("");
.withDocumentation("Hostname of the prometheus push gateway");
public static final ConfigProperty<Integer> PUSHGATEWAY_PORT = ConfigProperty
.key(PUSHGATEWAY_PREFIX + ".port")
.defaultValue(9091)
.sinceVersion("0.6.0")
.withDocumentation("");
.withDocumentation("Port for the push gateway.");
public static final ConfigProperty<Integer> PUSHGATEWAY_REPORT_PERIOD_SECONDS = ConfigProperty
.key(PUSHGATEWAY_PREFIX + ".report.period.seconds")
.defaultValue(30)
.sinceVersion("0.6.0")
.withDocumentation("");
.withDocumentation("Reporting interval in seconds.");
public static final ConfigProperty<Boolean> PUSHGATEWAY_DELETE_ON_SHUTDOWN = ConfigProperty
.key(PUSHGATEWAY_PREFIX + ".delete.on.shutdown")
@@ -58,7 +58,7 @@ public class HoodieMetricsPrometheusConfig extends HoodieConfig {
.key(PUSHGATEWAY_PREFIX + ".job.name")
.defaultValue("")
.sinceVersion("0.6.0")
.withDocumentation("");
.withDocumentation("Name of the push gateway job.");
public static final ConfigProperty<Boolean> PUSHGATEWAY_RANDOM_JOB_NAME_SUFFIX = ConfigProperty
.key(PUSHGATEWAY_PREFIX + ".random.job.name.suffix")
@@ -73,7 +73,7 @@ public class HoodieMetricsPrometheusConfig extends HoodieConfig {
.key(PROMETHEUS_PREFIX + ".port")
.defaultValue(9090)
.sinceVersion("0.6.0")
.withDocumentation("");
.withDocumentation("Port for prometheus server.");
private HoodieMetricsPrometheusConfig() {
super();

View File

@@ -37,12 +37,14 @@ public class HoodiePayloadConfig extends HoodieConfig {
public static final ConfigProperty<String> PAYLOAD_ORDERING_FIELD_PROP = ConfigProperty
.key(PAYLOAD_ORDERING_FIELD_PROP_KEY)
.defaultValue("ts")
.withDocumentation("Property to hold the payload ordering field name");
.withDocumentation("Table column/field name to order records that have the same key, before "
+ "merging and writing to storage.");
public static final ConfigProperty<String> PAYLOAD_EVENT_TIME_FIELD_PROP = ConfigProperty
.key(PAYLOAD_EVENT_TIME_FIELD_PROP_KEY)
.defaultValue("ts")
.withDocumentation("Property for payload event time field");
.withDocumentation("Table column/field name to derive timestamp associated with the records. This can "
+ "be useful for e.g, determining the freshness of the table.");
private HoodiePayloadConfig() {
super();

View File

@@ -43,19 +43,19 @@ public class HoodieStorageConfig extends HoodieConfig {
public static final ConfigProperty<String> PARQUET_BLOCK_SIZE_BYTES = ConfigProperty
.key("hoodie.parquet.block.size")
.defaultValue(String.valueOf(120 * 1024 * 1024))
.withDocumentation("Parquet RowGroup size. Its better this is same as the file size, so that a single column "
+ "within a file is stored continuously on disk");
.withDocumentation("Parquet RowGroup size. It's recommended to make this large enough that scan costs can be"
+ " amortized by packing enough column values into a single row group.");
public static final ConfigProperty<String> PARQUET_PAGE_SIZE_BYTES = ConfigProperty
.key("hoodie.parquet.page.size")
.defaultValue(String.valueOf(1 * 1024 * 1024))
.withDocumentation("Parquet page size. Page is the unit of read within a parquet file. "
+ "Within a block, pages are compressed seperately.");
+ "Within a block, pages are compressed separately.");
public static final ConfigProperty<String> ORC_FILE_MAX_BYTES = ConfigProperty
.key("hoodie.orc.max.file.size")
.defaultValue(String.valueOf(120 * 1024 * 1024))
.withDocumentation("");
.withDocumentation("Target file size for ORC base files.");
public static final ConfigProperty<String> ORC_STRIPE_SIZE = ConfigProperty
.key("hoodie.orc.stripe.size")
@@ -65,17 +65,18 @@ public class HoodieStorageConfig extends HoodieConfig {
public static final ConfigProperty<String> ORC_BLOCK_SIZE = ConfigProperty
.key("hoodie.orc.block.size")
.defaultValue(ORC_FILE_MAX_BYTES.defaultValue())
.withDocumentation("File system block size");
.withDocumentation("ORC block size, recommended to be aligned with the target file size.");
public static final ConfigProperty<String> HFILE_FILE_MAX_BYTES = ConfigProperty
.key("hoodie.hfile.max.file.size")
.defaultValue(String.valueOf(120 * 1024 * 1024))
.withDocumentation("");
.withDocumentation("Target file size for HFile base files.");
public static final ConfigProperty<String> HFILE_BLOCK_SIZE_BYTES = ConfigProperty
.key("hoodie.hfile.block.size")
.defaultValue(String.valueOf(1 * 1024 * 1024))
.withDocumentation("");
.defaultValue(String.valueOf(1024 * 1024))
.withDocumentation("Lower values increase the size of metadata tracked within HFile, but can offer potentially "
+ "faster lookup times.");
// used to size log files
public static final ConfigProperty<String> LOGFILE_SIZE_MAX_BYTES = ConfigProperty
@@ -107,12 +108,12 @@ public class HoodieStorageConfig extends HoodieConfig {
public static final ConfigProperty<String> HFILE_COMPRESSION_ALGORITHM = ConfigProperty
.key("hoodie.hfile.compression.algorithm")
.defaultValue("GZ")
.withDocumentation("");
.withDocumentation("Compression codec to use for hfile base files.");
public static final ConfigProperty<String> ORC_COMPRESSION_CODEC = ConfigProperty
.key("hoodie.orc.compression.codec")
.defaultValue("ZLIB")
.withDocumentation("");
.withDocumentation("Compression codec to use for ORC base files.");
// Default compression ratio for log file to parquet, general 3x
public static final ConfigProperty<String> LOGFILE_TO_PARQUET_COMPRESSION_RATIO = ConfigProperty

View File

@@ -36,7 +36,7 @@ public class HoodieWriteCommitCallbackConfig extends HoodieConfig {
.key(CALLBACK_PREFIX + "on")
.defaultValue(false)
.sinceVersion("0.6.0")
.withDocumentation("Turn callback on/off. off by default.");
.withDocumentation("Turn commit callback on/off. off by default.");
public static final ConfigProperty<String> CALLBACK_CLASS_PROP = ConfigProperty
.key(CALLBACK_PREFIX + "class")

View File

@@ -18,7 +18,6 @@
package org.apache.hudi.config;
import org.apache.hadoop.hbase.io.compress.Compression;
import org.apache.hudi.client.WriteStatus;
import org.apache.hudi.client.bootstrap.BootstrapMode;
import org.apache.hudi.client.transaction.ConflictResolutionStrategy;
@@ -27,8 +26,8 @@ import org.apache.hudi.common.config.HoodieConfig;
import org.apache.hudi.common.config.HoodieMetadataConfig;
import org.apache.hudi.common.engine.EngineType;
import org.apache.hudi.common.fs.ConsistencyGuardConfig;
import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy;
import org.apache.hudi.common.model.HoodieCleaningPolicy;
import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy;
import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload;
import org.apache.hudi.common.model.WriteConcurrencyMode;
import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion;
@@ -43,10 +42,13 @@ import org.apache.hudi.metrics.MetricsReporterType;
import org.apache.hudi.metrics.datadog.DatadogHttpClient.ApiSite;
import org.apache.hudi.table.action.compact.CompactionTriggerStrategy;
import org.apache.hudi.table.action.compact.strategy.CompactionStrategy;
import org.apache.hadoop.hbase.io.compress.Compression;
import org.apache.orc.CompressionKind;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import javax.annotation.concurrent.Immutable;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
@@ -71,7 +73,7 @@ public class HoodieWriteConfig extends HoodieConfig {
public static final ConfigProperty<String> TABLE_NAME = ConfigProperty
.key("hoodie.table.name")
.noDefaultValue()
.withDocumentation("Table name that will be used for registering with Hive. Needs to be same across runs.");
.withDocumentation("Table name that will be used for registering with metastores like HMS. Needs to be same across runs.");
public static final ConfigProperty<String> PRECOMBINE_FIELD_PROP = ConfigProperty
.key("hoodie.datasource.write.precombine.field")
@@ -88,12 +90,14 @@ public class HoodieWriteConfig extends HoodieConfig {
public static final ConfigProperty<String> KEYGENERATOR_CLASS_PROP = ConfigProperty
.key("hoodie.datasource.write.keygenerator.class")
.noDefaultValue()
.withDocumentation("Key generator class, that implements will extract the key out of incoming Row object");
.withDocumentation("Key generator class, that implements `org.apache.hudi.keygen.KeyGenerator` "
+ "to extract a key out of incoming records.");
public static final ConfigProperty<String> KEYGENERATOR_TYPE_PROP = ConfigProperty
.key("hoodie.datasource.write.keygenerator.type")
.defaultValue(KeyGeneratorType.SIMPLE.name())
.withDocumentation("Type of build-in key generator, currently support SIMPLE, COMPLEX, TIMESTAMP, CUSTOM, NON_PARTITION, GLOBAL_DELETE");
.withDocumentation("Easily configure one of the built-in key generators, instead of specifying the key generator class. "
+ "Currently supports SIMPLE, COMPLEX, TIMESTAMP, CUSTOM, NON_PARTITION, GLOBAL_DELETE");
public static final ConfigProperty<String> ROLLBACK_USING_MARKERS = ConfigProperty
.key("hoodie.rollback.using.markers")
@@ -104,206 +108,220 @@ public class HoodieWriteConfig extends HoodieConfig {
public static final ConfigProperty<String> TIMELINE_LAYOUT_VERSION = ConfigProperty
.key("hoodie.timeline.layout.version")
.noDefaultValue()
.withDocumentation("");
.sinceVersion("0.5.1")
.withDocumentation("Controls the layout of the timeline. Version 0 relied on renames, Version 1 (default) models "
+ "the timeline as an immutable log relying only on atomic writes for object storage.");
public static final ConfigProperty<String> BASE_PATH_PROP = ConfigProperty
.key("hoodie.base.path")
.noDefaultValue()
.withDocumentation("Base DFS path under which all the data partitions are created. "
.withDocumentation("Base path on lake storage, under which all the table data is stored. "
+ "Always prefix it explicitly with the storage scheme (e.g hdfs://, s3:// etc). "
+ "Hudi stores all the main meta-data about commits, savepoints, cleaning audit logs "
+ "etc in .hoodie directory under the base directory.");
+ "etc in .hoodie directory under this base path directory.");
public static final ConfigProperty<String> AVRO_SCHEMA = ConfigProperty
.key("hoodie.avro.schema")
.noDefaultValue()
.withDocumentation("This is the current reader avro schema for the table. This is a string of the entire schema. "
+ "HoodieWriteClient uses this schema to pass on to implementations of HoodieRecordPayload to convert "
+ "from the source format to avro record. This is also used when re-writing records during an update.");
.withDocumentation("Schema string representing the current write schema of the table. Hudi passes this to "
+ "implementations of HoodieRecordPayload to convert incoming records to avro. This is also used as the write schema "
+ "for evolving records during an update.");
public static final ConfigProperty<String> AVRO_SCHEMA_VALIDATE = ConfigProperty
.key("hoodie.avro.schema.validate")
.defaultValue("false")
.withDocumentation("");
.withDocumentation("Validate the schema used for the write against the latest schema, for backwards compatibility.");
public static final ConfigProperty<String> INSERT_PARALLELISM = ConfigProperty
.key("hoodie.insert.shuffle.parallelism")
.defaultValue("1500")
.withDocumentation("Once data has been initially imported, this parallelism controls initial parallelism for reading input records. "
+ "Ensure this value is high enough say: 1 partition for 1 GB of input data");
.withDocumentation("Parallelism for inserting records into the table. Inserts can shuffle data before writing to tune file sizes and optimize the storage layout.");
public static final ConfigProperty<String> BULKINSERT_PARALLELISM = ConfigProperty
.key("hoodie.bulkinsert.shuffle.parallelism")
.defaultValue("1500")
.withDocumentation("Bulk insert is meant to be used for large initial imports and this parallelism determines "
+ "the initial number of files in your table. Tune this to achieve a desired optimal size during initial import.");
.withDocumentation("For large initial imports using bulk_insert operation, controls the parallelism to use for sort modes or custom partitioning done "
+ "before writing records to the table.");
public static final ConfigProperty<String> BULKINSERT_USER_DEFINED_PARTITIONER_CLASS = ConfigProperty
.key("hoodie.bulkinsert.user.defined.partitioner.class")
.noDefaultValue()
.withDocumentation("If specified, this class will be used to re-partition input records before they are inserted.");
public static final ConfigProperty<String> BULKINSERT_INPUT_DATA_SCHEMA_DDL = ConfigProperty
.key("hoodie.bulkinsert.schema.ddl")
.noDefaultValue()
.withDocumentation("");
.withDocumentation("If specified, this class will be used to re-partition records before they are bulk inserted. This can be used to sort, pack, cluster data"
+ " optimally for common query patterns.");
public static final ConfigProperty<String> UPSERT_PARALLELISM = ConfigProperty
.key("hoodie.upsert.shuffle.parallelism")
.defaultValue("1500")
.withDocumentation("Once data has been initially imported, this parallelism controls initial parallelism for reading input records. "
+ "Ensure this value is high enough say: 1 partition for 1 GB of input data");
.withDocumentation("Parallelism to use for upsert operation on the table. Upserts can shuffle data to perform index lookups, file sizing, bin packing records optimally "
+ "into file groups.");
public static final ConfigProperty<String> DELETE_PARALLELISM = ConfigProperty
.key("hoodie.delete.shuffle.parallelism")
.defaultValue("1500")
.withDocumentation("This parallelism is Used for “delete” operation while deduping or repartioning.");
.withDocumentation("Parallelism used for “delete” operation. Delete operations also perform shuffles, similar to upsert operation.");
public static final ConfigProperty<String> ROLLBACK_PARALLELISM = ConfigProperty
.key("hoodie.rollback.parallelism")
.defaultValue("100")
.withDocumentation("Determines the parallelism for rollback of commits.");
.withDocumentation("Parallelism for rollback of commits. Rollbacks perform delete of files or logging delete blocks to file groups on storage in parallel.");
public static final ConfigProperty<String> WRITE_BUFFER_LIMIT_BYTES = ConfigProperty
.key("hoodie.write.buffer.limit.bytes")
.defaultValue(String.valueOf(4 * 1024 * 1024))
.withDocumentation("");
.withDocumentation("Size of in-memory buffer used for parallelizing network reads and lake storage writes.");
public static final ConfigProperty<String> COMBINE_BEFORE_INSERT_PROP = ConfigProperty
.key("hoodie.combine.before.insert")
.defaultValue("false")
.withDocumentation("Flag which first combines the input RDD and merges multiple partial records into a single record "
+ "before inserting or updating in DFS");
.withDocumentation("When inserted records share same key, controls whether they should be first combined (i.e de-duplicated) before"
+ " writing to storage.");
public static final ConfigProperty<String> COMBINE_BEFORE_UPSERT_PROP = ConfigProperty
.key("hoodie.combine.before.upsert")
.defaultValue("true")
.withDocumentation("Flag which first combines the input RDD and merges multiple partial records into a single record "
+ "before inserting or updating in DFS");
.withDocumentation("When upserted records share same key, controls whether they should be first combined (i.e de-duplicated) before"
+ " writing to storage. This should be turned off only if you are absolutely certain that there are no duplicates incoming, "
+ " otherwise it can lead to duplicate keys and violate the uniqueness guarantees.");
public static final ConfigProperty<String> COMBINE_BEFORE_DELETE_PROP = ConfigProperty
.key("hoodie.combine.before.delete")
.defaultValue("true")
.withDocumentation("Flag which first combines the input RDD and merges multiple partial records into a single record "
+ "before deleting in DFS");
.withDocumentation("During delete operations, controls whether we should combine deletes (and potentially also upserts) before "
+ " writing to storage.");
public static final ConfigProperty<String> WRITE_STATUS_STORAGE_LEVEL = ConfigProperty
.key("hoodie.write.status.storage.level")
.defaultValue("MEMORY_AND_DISK_SER")
.withDocumentation("HoodieWriteClient.insert and HoodieWriteClient.upsert returns a persisted RDD[WriteStatus], "
+ "this is because the Client can choose to inspect the WriteStatus and choose and commit or not based on the failures. "
+ "This is a configuration for the storage level for this RDD");
.withDocumentation("Write status objects hold metadata about a write (stats, errors), that is not yet committed to storage. "
+ "This controls how that information is cached for inspection by clients. We rarely expect this to be changed.");
public static final ConfigProperty<String> HOODIE_AUTO_COMMIT_PROP = ConfigProperty
.key("hoodie.auto.commit")
.defaultValue("true")
.withDocumentation("Should HoodieWriteClient autoCommit after insert and upsert. "
+ "The client can choose to turn off auto-commit and commit on a “defined success condition”");
.withDocumentation("Controls whether a write operation should auto commit. This can be turned off to perform inspection"
+ " of the uncommitted write before deciding to commit.");
public static final ConfigProperty<String> HOODIE_WRITE_STATUS_CLASS_PROP = ConfigProperty
.key("hoodie.writestatus.class")
.defaultValue(WriteStatus.class.getName())
.withDocumentation("");
.withDocumentation("Subclass of " + WriteStatus.class.getName() + " to be used to collect information about a write. Can be "
+ "overridden to collect additional metrics/statistics about the data if needed.");
public static final ConfigProperty<String> FINALIZE_WRITE_PARALLELISM = ConfigProperty
.key("hoodie.finalize.write.parallelism")
.defaultValue("1500")
.withDocumentation("");
.withDocumentation("Parallelism for the write finalization internal operation, which involves removing any partially written "
+ "files from lake storage, before committing the write. Reduce this value, if a high number of tasks incurs delays for smaller tables "
+ "or low latency writes.");
public static final ConfigProperty<String> MARKERS_DELETE_PARALLELISM = ConfigProperty
.key("hoodie.markers.delete.parallelism")
.defaultValue("100")
.withDocumentation("Determines the parallelism for deleting marker files.");
.withDocumentation("Determines the parallelism for deleting marker files, which are used to track all files (valid or invalid/partial) written during "
+ "a write operation. Increase this value if delays are observed, with large batch writes.");
public static final ConfigProperty<String> BULKINSERT_SORT_MODE = ConfigProperty
.key("hoodie.bulkinsert.sort.mode")
.defaultValue(BulkInsertSortMode.GLOBAL_SORT.toString())
.withDocumentation("Sorting modes to use for sorting records for bulk insert. This is leveraged when user "
+ "defined partitioner is not configured. Default is GLOBAL_SORT. Available values are - GLOBAL_SORT: "
+ "this ensures best file sizes, with lowest memory overhead at cost of sorting. PARTITION_SORT: "
+ "Strikes a balance by only sorting within a partition, still keeping the memory overhead of writing "
+ "lowest and best effort file sizing. NONE: No sorting. Fastest and matches spark.write.parquet() "
+ "in terms of number of files, overheads");
.withDocumentation("Sorting modes to use for sorting records for bulk insert. This is used when user "
+ BULKINSERT_USER_DEFINED_PARTITIONER_CLASS.key() + " is not configured. Available values are - "
+ "GLOBAL_SORT: this ensures best file sizes, with lowest memory overhead at cost of sorting. "
+ "PARTITION_SORT: Strikes a balance by only sorting within a partition, still keeping the memory overhead of writing "
+ "lowest and best effort file sizing. "
+ "NONE: No sorting. Fastest and matches `spark.write.parquet()` in terms of number of files, overheads");
public static final ConfigProperty<String> EMBEDDED_TIMELINE_SERVER_ENABLED = ConfigProperty
.key("hoodie.embed.timeline.server")
.defaultValue("true")
.withDocumentation("");
.withDocumentation("When true, spins up an instance of the timeline server (meta server that serves cached file listings, statistics), "
+ "running on each writer's driver process, accepting requests during the write from executors.");
public static final ConfigProperty<String> EMBEDDED_TIMELINE_SERVER_REUSE_ENABLED = ConfigProperty
.key("hoodie.embed.timeline.server.reuse.enabled")
.defaultValue("false")
.withDocumentation("");
.withDocumentation("Controls whether the timeline server instance should be cached and reused across the JVM (across task lifecycles) "
+ "to avoid startup costs. This should rarely be changed.");
public static final ConfigProperty<String> EMBEDDED_TIMELINE_SERVER_PORT = ConfigProperty
.key("hoodie.embed.timeline.server.port")
.defaultValue("0")
.withDocumentation("");
.withDocumentation("Port at which the timeline server listens for requests. When running embedded in each writer, it picks "
+ "a free port and communicates to all the executors. This should rarely be changed.");
public static final ConfigProperty<String> EMBEDDED_TIMELINE_SERVER_THREADS = ConfigProperty
.key("hoodie.embed.timeline.server.threads")
.defaultValue("-1")
.withDocumentation("");
.withDocumentation("Number of threads to serve requests in the timeline server. By default, auto configured based on the number of underlying cores.");
public static final ConfigProperty<String> EMBEDDED_TIMELINE_SERVER_COMPRESS_OUTPUT = ConfigProperty
.key("hoodie.embed.timeline.server.gzip")
.defaultValue("true")
.withDocumentation("");
.withDocumentation("Controls whether gzip compression is used, for large responses from the timeline server, to improve latency.");
public static final ConfigProperty<String> EMBEDDED_TIMELINE_SERVER_USE_ASYNC = ConfigProperty
.key("hoodie.embed.timeline.server.async")
.defaultValue("false")
.withDocumentation("");
.withDocumentation("Controls whether or not, the requests to the timeline server are processed in asynchronous fashion, "
+ "potentially improving throughput.");
public static final ConfigProperty<String> FAIL_ON_TIMELINE_ARCHIVING_ENABLED_PROP = ConfigProperty
.key("hoodie.fail.on.timeline.archiving")
.defaultValue("true")
.withDocumentation("");
.withDocumentation("Timeline archiving removes older instants from the timeline, after each write operation, to minimize metadata overhead. "
+ "Controls whether or not, the write should be failed as well, if such archiving fails.");
public static final ConfigProperty<Long> INITIAL_CONSISTENCY_CHECK_INTERVAL_MS_PROP = ConfigProperty
.key("hoodie.consistency.check.initial_interval_ms")
.defaultValue(2000L)
.withDocumentation("Time between successive attempts to ensure written data's metadata is consistent on storage");
.withDocumentation("Initial time between successive attempts to ensure written data's metadata is consistent on storage. Grows with exponential"
+ " backoff after the initial value.");
public static final ConfigProperty<Long> MAX_CONSISTENCY_CHECK_INTERVAL_MS_PROP = ConfigProperty
.key("hoodie.consistency.check.max_interval_ms")
.defaultValue(300000L)
.withDocumentation("Max interval time for consistency check");
.withDocumentation("Max time to wait between successive attempts at performing consistency checks");
public static final ConfigProperty<Integer> MAX_CONSISTENCY_CHECKS_PROP = ConfigProperty
.key("hoodie.consistency.check.max_checks")
.defaultValue(7)
.withDocumentation("Maximum number of checks, for consistency of written data. Will wait upto 256 Secs");
.withDocumentation("Maximum number of checks, for consistency of written data.");
public static final ConfigProperty<String> MERGE_DATA_VALIDATION_CHECK_ENABLED = ConfigProperty
.key("hoodie.merge.data.validation.enabled")
.defaultValue("false")
.withDocumentation("Data validation check performed during merges before actual commits");
.withDocumentation("When enabled, data validation checks are performed during merges to ensure expected "
+ "number of records after merge operation.");
public static final ConfigProperty<String> MERGE_ALLOW_DUPLICATE_ON_INSERTS = ConfigProperty
.key("hoodie.merge.allow.duplicate.on.inserts")
.defaultValue("false")
.withDocumentation("Allow duplicates with inserts while merging with existing records");
.withDocumentation("When enabled, we allow duplicate keys even if inserts are routed to merge with an existing file (for ensuring file sizing)."
+ " This is only relevant for insert operation, since upsert, delete operations will ensure unique key constraints are maintained.");
public static final ConfigProperty<ExternalSpillableMap.DiskMapType> SPILLABLE_DISK_MAP_TYPE = ConfigProperty
.key("hoodie.spillable.diskmap.type")
.defaultValue(ExternalSpillableMap.DiskMapType.BITCASK)
.withDocumentation("Enable usage of either BITCASK or ROCKS_DB as disk map for External Spillable Map");
.withDocumentation("When handling input data that cannot be held in memory, to merge with a file on storage, a spillable diskmap is employed. "
+ "By default, we use a persistent hashmap based loosely on bitcask, that offers O(1) inserts, lookups. "
+ "Change this to `ROCKS_DB` to prefer using rocksDB, for handling the spill.");
public static final ConfigProperty<Integer> CLIENT_HEARTBEAT_INTERVAL_IN_MS_PROP = ConfigProperty
.key("hoodie.client.heartbeat.interval_in_ms")
.defaultValue(60 * 1000)
.withDocumentation("");
.withDocumentation("Writers perform heartbeats to indicate liveness. Controls how often (in ms), such heartbeats are registered to lake storage.");
public static final ConfigProperty<Integer> CLIENT_HEARTBEAT_NUM_TOLERABLE_MISSES_PROP = ConfigProperty
.key("hoodie.client.heartbeat.tolerable.misses")
.defaultValue(2)
.withDocumentation("");
.withDocumentation("Number of heartbeat misses, before a writer is deemed not alive and all pending writes are aborted.");
public static final ConfigProperty<String> WRITE_CONCURRENCY_MODE_PROP = ConfigProperty
.key("hoodie.write.concurrency.mode")
.defaultValue(WriteConcurrencyMode.SINGLE_WRITER.name())
.withDocumentation("Enable different concurrency support");
.withDocumentation("Enable different concurrency modes. Options are "
+ "SINGLE_WRITER: Only one active writer to the table. Maximizes throughput"
+ "OPTIMISTIC_CONCURRENCY_CONTROL: Multiple writers can operate on the table and exactly one of them succeed "
+ "if a conflict (writes affect the same file group) is detected.");
public static final ConfigProperty<String> WRITE_META_KEY_PREFIXES_PROP = ConfigProperty
.key("hoodie.write.meta.key.prefixes")
@@ -312,16 +330,14 @@ public class HoodieWriteConfig extends HoodieConfig {
+ "during overlapping commits via multi writing");
/**
* The specified write schema. In most case, we do not need set this parameter,
* but for the case the write schema is not equal to the specified table schema, we can
* specify the write schema by this parameter.
*
* Currently the MergeIntoHoodieTableCommand use this to specify the write schema.
* Currently the use this to specify the write schema.
*/
public static final ConfigProperty<String> WRITE_SCHEMA_PROP = ConfigProperty
.key("hoodie.write.schema")
.noDefaultValue()
.withDocumentation("");
.withDocumentation("The specified write schema. In most case, we do not need set this parameter,"
+ " but for the case the write schema is not equal to the specified table schema, we can"
+ " specify the write schema by this parameter. Used by MergeIntoHoodieTableCommand");
/**
* HUDI-858 : There are users who had been directly using RDD APIs and have relied on a behavior in 0.4.x to allow
@@ -342,7 +358,8 @@ public class HoodieWriteConfig extends HoodieConfig {
.key(AVRO_SCHEMA.key() + ".external.transformation")
.defaultValue("false")
.withAlternatives(AVRO_SCHEMA.key() + ".externalTransformation")
.withDocumentation("");
.withDocumentation("When enabled, records in older schema are rewritten into newer schema during upsert,delete and background"
+ " compaction,clustering operations.");
private ConsistencyGuardConfig consistencyGuardConfig;
@@ -352,7 +369,6 @@ public class HoodieWriteConfig extends HoodieConfig {
// Nested configuration objects composed into this write config.
// NOTE(review): initialization happens outside this view — confirm against the builder/constructor.
private FileSystemViewStorageConfig viewStorageConfig;
private HoodiePayloadConfig hoodiePayloadConfig;
private HoodieMetadataConfig metadataConfig;
private EngineType engineType;
/**

View File

@@ -19,13 +19,14 @@
package org.apache.hudi.keygen.constant;
import org.apache.hudi.common.config.ConfigProperty;
import org.apache.hudi.common.config.HoodieConfig;
public class KeyGeneratorOptions {
public class KeyGeneratorOptions extends HoodieConfig {
public static final ConfigProperty<String> URL_ENCODE_PARTITIONING_OPT_KEY = ConfigProperty
.key("hoodie.datasource.write.partitionpath.urlencode")
.defaultValue("false")
.withDocumentation("");
.withDocumentation("Should we url encode the partition path value, before creating the folder structure.");
public static final ConfigProperty<String> HIVE_STYLE_PARTITIONING_OPT_KEY = ConfigProperty
.key("hoodie.datasource.write.hive_style_partitioning")

View File

@@ -60,7 +60,7 @@ public class HoodieAvroKeyGeneratorFactory {
props.getString(HoodieWriteConfig.KEYGENERATOR_TYPE_PROP.key(), null);
if (StringUtils.isNullOrEmpty(keyGeneratorType)) {
LOG.info("The value of {} is empty, use SIMPLE", HoodieWriteConfig.KEYGENERATOR_TYPE_PROP.key());
LOG.info("The value of {} is empty, using SIMPLE", HoodieWriteConfig.KEYGENERATOR_TYPE_PROP.key());
keyGeneratorType = KeyGeneratorType.SIMPLE.name();
}