/*
 * Copyright (c) 2017 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *          http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.uber.hoodie

import com.uber.hoodie.common.model.HoodieTableType
import com.uber.hoodie.hive.SlashEncodedDayPartitionValueExtractor

/**
  * List of options that can be passed to the Hoodie datasource,
  * in addition to the hoodie client configs
  */
/**
  * Options supported for reading hoodie datasets.
  */
object DataSourceReadOptions {

  /**
    * Selects how the dataset is read:
    *  - read_optimized: latest view, based on columnar data
    *  - incremental:    only new data written since a given instantTime
    *  - realtime:       latest view, based on both row & columnar data
    *
    * Default: READ_OPTIMIZED
    */
  val VIEW_TYPE_OPT_KEY = "hoodie.datasource.view.type"
  val VIEW_TYPE_READ_OPTIMIZED_OPT_VAL = "read_optimized"
  val VIEW_TYPE_INCREMENTAL_OPT_VAL = "incremental"
  val VIEW_TYPE_REALTIME_OPT_VAL = "realtime"
  val DEFAULT_VIEW_TYPE_OPT_VAL = VIEW_TYPE_READ_OPTIMIZED_OPT_VAL

  /**
    * Instant time to start incrementally pulling data from. The instant time here need
    * not correspond to an actual instant on the timeline: any new data written with an
    * `instant_time > BEGIN_INSTANTTIME` is fetched out. For e.g: '20170901080000' will
    * get all new data written after Sep 1, 2017 08:00AM.
    *
    * Default: None (Mandatory in incremental mode)
    */
  val BEGIN_INSTANTTIME_OPT_KEY = "hoodie.datasource.read.begin.instanttime"

  /**
    * Instant time to limit incrementally fetched data to. New data written with an
    * `instant_time <= END_INSTANTTIME` are fetched out.
    *
    * Default: latest instant (i.e fetches all new data since begin instant time)
    */
  val END_INSTANTTIME_OPT_KEY = "hoodie.datasource.read.end.instanttime"
}
|
|
|
|
/**
  * Options supported for writing hoodie datasets.
  */
object DataSourceWriteOptions {

  /**
    * The client operation, that this write should do
    *
    * Default: upsert()
    */
  val OPERATION_OPT_KEY = "hoodie.datasource.write.operation"
  val BULK_INSERT_OPERATION_OPT_VAL = "bulk_insert"
  val INSERT_OPERATION_OPT_VAL = "insert"
  val UPSERT_OPERATION_OPT_VAL = "upsert"
  // FIX: dropped stray trailing semicolon — inconsistent with the rest of the file
  val DEFAULT_OPERATION_OPT_VAL = UPSERT_OPERATION_OPT_VAL

  /**
    * The storage type for the underlying data, for this write.
    * Note that this can't change across writes.
    *
    * Default: COPY_ON_WRITE
    */
  val STORAGE_TYPE_OPT_KEY = "hoodie.datasource.write.storage.type"
  val COW_STORAGE_TYPE_OPT_VAL = HoodieTableType.COPY_ON_WRITE.name
  val MOR_STORAGE_TYPE_OPT_VAL = HoodieTableType.MERGE_ON_READ.name
  val DEFAULT_STORAGE_TYPE_OPT_VAL = COW_STORAGE_TYPE_OPT_VAL

  /**
    * Hive table name, to register the dataset into.
    *
    * Default: None (mandatory)
    */
  val TABLE_NAME_OPT_KEY = "hoodie.datasource.write.table.name"

  /**
    * Field used in preCombining before actual write. When two records have the same
    * key value, we will pick the one with the largest value for the precombine field,
    * determined by Object.compareTo(..)
    */
  val PRECOMBINE_FIELD_OPT_KEY = "hoodie.datasource.write.precombine.field"
  val DEFAULT_PRECOMBINE_FIELD_OPT_VAL = "ts"

  /**
    * Payload class used. Override this, if you like to roll your own merge logic, when upserting/inserting.
    * This will render any value set for `PRECOMBINE_FIELD_OPT_KEY` ineffective
    */
  // FIX: comment previously referenced nonexistent `PRECOMBINE_FIELD_OPT_VAL`
  val PAYLOAD_CLASS_OPT_KEY = "hoodie.datasource.write.payload.class"
  val DEFAULT_PAYLOAD_OPT_VAL = classOf[OverwriteWithLatestAvroPayload].getName

  /**
    * Record key field. Value to be used as the `recordKey` component of `HoodieKey`. Actual value
    * will be obtained by invoking .toString() on the field value. Nested fields can be specified using
    * the dot notation eg: `a.b.c`
    */
  val RECORDKEY_FIELD_OPT_KEY = "hoodie.datasource.write.recordkey.field"
  val DEFAULT_RECORDKEY_FIELD_OPT_VAL = "uuid"

  /**
    * Partition path field. Value to be used at the `partitionPath` component of `HoodieKey`. Actual
    * value obtained by invoking .toString()
    */
  // FIX: typo "ontained" -> "obtained"
  val PARTITIONPATH_FIELD_OPT_KEY = "hoodie.datasource.write.partitionpath.field"
  val DEFAULT_PARTITIONPATH_FIELD_OPT_VAL = "partitionpath"

  /**
    * Key generator class, that implements will extract the key out of incoming record
    */
  val KEYGENERATOR_CLASS_OPT_KEY = "hoodie.datasource.write.keygenerator.class"
  val DEFAULT_KEYGENERATOR_CLASS_OPT_VAL = classOf[SimpleKeyGenerator].getName

  /**
    * Option keys beginning with this prefix, are automatically added to the commit/deltacommit metadata.
    * This is useful to store checkpointing information, in a consistent way with the hoodie timeline
    */
  val COMMIT_METADATA_KEYPREFIX_OPT_KEY = "hoodie.datasource.write.commitmeta.key.prefix"
  val DEFAULT_COMMIT_METADATA_KEYPREFIX_OPT_VAL = "_"

  /**
    * Flag to indicate whether to drop duplicates upon insert.
    * By default insert will accept duplicates, to gain extra performance.
    */
  val INSERT_DROP_DUPS_OPT_KEY = "hoodie.datasource.write.insert.drop.duplicates"
  val DEFAULT_INSERT_DROP_DUPS_OPT_VAL = "false"

  /**
    * Flag to indicate how many times streaming job should retry for a failed microbatch
    * By default 3
    */
  val STREAMING_RETRY_CNT_OPT_KEY = "hoodie.datasource.write.streaming.retry.count"
  val DEFAULT_STREAMING_RETRY_CNT_OPT_VAL = "3"

  /**
    * Flag to indicate how long (by millisecond) before a retry should issued for failed microbatch
    * By default 2000 and it will be doubled by every retry
    */
  val STREAMING_RETRY_INTERVAL_MS_OPT_KEY = "hoodie.datasource.write.streaming.retry.interval.ms"
  val DEFAULT_STREAMING_RETRY_INTERVAL_MS_OPT_VAL = "2000"

  /**
    * Flag to indicate whether to ignore any non exception error (e.g. writestatus error)
    * within a streaming microbatch
    * By default true (in favor of streaming progressing over data integrity)
    */
  val STREAMING_IGNORE_FAILED_BATCH_OPT_KEY = "hoodie.datasource.write.streaming.ignore.failed.batch"
  val DEFAULT_STREAMING_IGNORE_FAILED_BATCH_OPT_VAL = "true"

  // HIVE SYNC SPECIFIC CONFIGS
  // NOTE: DO NOT USE uppercase for the keys as they are internally lower-cased. Using upper-cases causes
  // unexpected issues with config getting reset
  val HIVE_SYNC_ENABLED_OPT_KEY = "hoodie.datasource.hive_sync.enable"
  val HIVE_DATABASE_OPT_KEY = "hoodie.datasource.hive_sync.database"
  val HIVE_TABLE_OPT_KEY = "hoodie.datasource.hive_sync.table"
  val HIVE_USER_OPT_KEY = "hoodie.datasource.hive_sync.username"
  val HIVE_PASS_OPT_KEY = "hoodie.datasource.hive_sync.password"
  val HIVE_URL_OPT_KEY = "hoodie.datasource.hive_sync.jdbcurl"
  val HIVE_PARTITION_FIELDS_OPT_KEY = "hoodie.datasource.hive_sync.partition_fields"
  val HIVE_PARTITION_EXTRACTOR_CLASS_OPT_KEY = "hoodie.datasource.hive_sync.partition_extractor_class"
  val HIVE_ASSUME_DATE_PARTITION_OPT_KEY = "hoodie.datasource.hive_sync.assume_date_partitioning"

  // DEFAULT FOR HIVE SPECIFIC CONFIGS
  val DEFAULT_HIVE_SYNC_ENABLED_OPT_VAL = "false"
  val DEFAULT_HIVE_DATABASE_OPT_VAL = "default"
  val DEFAULT_HIVE_TABLE_OPT_VAL = "unknown"
  val DEFAULT_HIVE_USER_OPT_VAL = "hive"
  val DEFAULT_HIVE_PASS_OPT_VAL = "hive"
  val DEFAULT_HIVE_URL_OPT_VAL = "jdbc:hive2://localhost:10000"
  val DEFAULT_HIVE_PARTITION_FIELDS_OPT_VAL = ""
  val DEFAULT_HIVE_PARTITION_EXTRACTOR_CLASS_OPT_VAL = classOf[SlashEncodedDayPartitionValueExtractor].getCanonicalName
  val DEFAULT_HIVE_ASSUME_DATE_PARTITION_OPT_VAL = "false"
}
|