[HUDI-3504] Support bootstrap command based on Call Produce Command (#5977)
This commit is contained in:
@@ -0,0 +1,254 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hudi.cli;
|
||||||
|
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.apache.hudi.DataSourceWriteOptions;
|
||||||
|
import org.apache.hudi.client.SparkRDDWriteClient;
|
||||||
|
import org.apache.hudi.client.common.HoodieSparkEngineContext;
|
||||||
|
import org.apache.hudi.common.config.TypedProperties;
|
||||||
|
import org.apache.hudi.common.table.HoodieTableConfig;
|
||||||
|
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||||
|
import org.apache.hudi.common.util.Option;
|
||||||
|
import org.apache.hudi.common.util.ReflectionUtils;
|
||||||
|
import org.apache.hudi.common.util.StringUtils;
|
||||||
|
import org.apache.hudi.common.util.ValidationUtils;
|
||||||
|
import org.apache.hudi.config.HoodieCompactionConfig;
|
||||||
|
import org.apache.hudi.config.HoodieIndexConfig;
|
||||||
|
import org.apache.hudi.config.HoodieWriteConfig;
|
||||||
|
import org.apache.hudi.exception.HoodieException;
|
||||||
|
import org.apache.hudi.hive.HiveSyncConfig;
|
||||||
|
import org.apache.hudi.hive.HiveSyncTool;
|
||||||
|
import org.apache.hudi.index.HoodieIndex;
|
||||||
|
import org.apache.hudi.sync.common.HoodieSyncConfig;
|
||||||
|
import org.apache.log4j.LogManager;
|
||||||
|
import org.apache.log4j.Logger;
|
||||||
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.HashMap;
|
||||||
|
|
||||||
|
import static org.apache.hudi.common.table.HoodieTableConfig.ARCHIVELOG_FOLDER;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Performs bootstrap from a non-hudi source.
|
||||||
|
*/
|
||||||
|
public class BootstrapExecutorUtils implements Serializable {
|
||||||
|
|
||||||
|
private static final Logger LOG = LogManager.getLogger(BootstrapExecutorUtils.class);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Config.
|
||||||
|
*/
|
||||||
|
private final Config cfg;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Spark context.
|
||||||
|
*/
|
||||||
|
private final transient JavaSparkContext jssc;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Bag of properties with source, hoodie client, key generator etc.
|
||||||
|
*/
|
||||||
|
private final TypedProperties props;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Hadoop Configuration.
|
||||||
|
*/
|
||||||
|
private final Configuration configuration;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Bootstrap Configuration.
|
||||||
|
*/
|
||||||
|
private final HoodieWriteConfig bootstrapConfig;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* FileSystem instance.
|
||||||
|
*/
|
||||||
|
private final transient FileSystem fs;
|
||||||
|
|
||||||
|
private final String bootstrapBasePath;
|
||||||
|
|
||||||
|
public static final String CHECKPOINT_KEY = HoodieWriteConfig.DELTASTREAMER_CHECKPOINT_KEY;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Bootstrap Executor.
|
||||||
|
*
|
||||||
|
* @param cfg DeltaStreamer Config
|
||||||
|
* @param jssc Java Spark Context
|
||||||
|
* @param fs File System
|
||||||
|
* @param properties Bootstrap Writer Properties
|
||||||
|
* @throws IOException
|
||||||
|
*/
|
||||||
|
public BootstrapExecutorUtils(Config cfg, JavaSparkContext jssc, FileSystem fs, Configuration conf,
|
||||||
|
TypedProperties properties) throws IOException {
|
||||||
|
this.cfg = cfg;
|
||||||
|
this.jssc = jssc;
|
||||||
|
this.fs = fs;
|
||||||
|
this.configuration = conf;
|
||||||
|
this.props = properties;
|
||||||
|
|
||||||
|
ValidationUtils.checkArgument(properties.containsKey(HoodieTableConfig.BOOTSTRAP_BASE_PATH
|
||||||
|
.key()),
|
||||||
|
HoodieTableConfig.BOOTSTRAP_BASE_PATH.key() + " must be specified.");
|
||||||
|
this.bootstrapBasePath = properties.getString(HoodieTableConfig.BOOTSTRAP_BASE_PATH.key());
|
||||||
|
|
||||||
|
// Add more defaults if full bootstrap requested
|
||||||
|
this.props.putIfAbsent(DataSourceWriteOptions.PAYLOAD_CLASS_NAME().key(),
|
||||||
|
DataSourceWriteOptions.PAYLOAD_CLASS_NAME().defaultValue());
|
||||||
|
/**
|
||||||
|
* Schema provider that supplies the command for reading the input and writing out the target table.
|
||||||
|
*/
|
||||||
|
SchemaProvider schemaProvider = createSchemaProvider(cfg.schemaProviderClass, props, jssc);
|
||||||
|
HoodieWriteConfig.Builder builder =
|
||||||
|
HoodieWriteConfig.newBuilder().withPath(cfg.basePath)
|
||||||
|
.withCompactionConfig(HoodieCompactionConfig.newBuilder().withInlineCompaction(false).build())
|
||||||
|
.forTable(cfg.tableName)
|
||||||
|
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
|
||||||
|
.withAutoCommit(true)
|
||||||
|
.withProps(props);
|
||||||
|
|
||||||
|
if (null != schemaProvider && null != schemaProvider.getTargetSchema()) {
|
||||||
|
builder = builder.withSchema(schemaProvider.getTargetSchema().toString());
|
||||||
|
}
|
||||||
|
this.bootstrapConfig = builder.build();
|
||||||
|
LOG.info("Created bootstrap executor with configs : " + bootstrapConfig.getProps());
|
||||||
|
}
|
||||||
|
|
||||||
|
public static SchemaProvider createSchemaProvider(String schemaProviderClass, TypedProperties cfg,
|
||||||
|
JavaSparkContext jssc) throws IOException {
|
||||||
|
try {
|
||||||
|
return StringUtils.isNullOrEmpty(schemaProviderClass) ? null
|
||||||
|
: (SchemaProvider) ReflectionUtils.loadClass(schemaProviderClass, cfg, jssc);
|
||||||
|
} catch (Throwable e) {
|
||||||
|
throw new IOException("Could not load schema provider class " + schemaProviderClass, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Executes Bootstrap.
|
||||||
|
*/
|
||||||
|
public void execute() throws IOException {
|
||||||
|
initializeTable();
|
||||||
|
|
||||||
|
try (SparkRDDWriteClient bootstrapClient = new SparkRDDWriteClient(new HoodieSparkEngineContext(jssc), bootstrapConfig)) {
|
||||||
|
HashMap<String, String> checkpointCommitMetadata = new HashMap<>();
|
||||||
|
checkpointCommitMetadata.put(CHECKPOINT_KEY, Config.checkpoint);
|
||||||
|
bootstrapClient.bootstrap(Option.of(checkpointCommitMetadata));
|
||||||
|
syncHive();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sync to Hive.
|
||||||
|
*/
|
||||||
|
private void syncHive() {
|
||||||
|
if (cfg.enableHiveSync) {
|
||||||
|
TypedProperties metaProps = new TypedProperties();
|
||||||
|
metaProps.putAll(props);
|
||||||
|
metaProps.put(HoodieSyncConfig.META_SYNC_BASE_PATH.key(), cfg.basePath);
|
||||||
|
metaProps.put(HoodieSyncConfig.META_SYNC_BASE_FILE_FORMAT.key(), cfg.baseFileFormat);
|
||||||
|
if (props.getBoolean(HiveSyncConfig.HIVE_SYNC_BUCKET_SYNC.key(), HiveSyncConfig.HIVE_SYNC_BUCKET_SYNC.defaultValue())) {
|
||||||
|
metaProps.put(HiveSyncConfig.HIVE_SYNC_BUCKET_SYNC_SPEC.key(), HiveSyncConfig.getBucketSpec(props.getString(HoodieIndexConfig.BUCKET_INDEX_HASH_FIELD.key()),
|
||||||
|
props.getInteger(HoodieIndexConfig.BUCKET_INDEX_NUM_BUCKETS.key())));
|
||||||
|
}
|
||||||
|
|
||||||
|
new HiveSyncTool(metaProps, configuration, fs).syncHoodieTable();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void initializeTable() throws IOException {
|
||||||
|
Path basePath = new Path(cfg.basePath);
|
||||||
|
if (fs.exists(basePath)) {
|
||||||
|
if (cfg.bootstrapOverwrite) {
|
||||||
|
LOG.warn("Target base path already exists, overwrite it");
|
||||||
|
fs.delete(basePath, true);
|
||||||
|
} else {
|
||||||
|
throw new HoodieException("target base path already exists at " + cfg.basePath
|
||||||
|
+ ". Cannot bootstrap data on top of an existing table");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
HoodieTableMetaClient.withPropertyBuilder()
|
||||||
|
.fromProperties(props)
|
||||||
|
.setTableType(cfg.tableType)
|
||||||
|
.setTableName(cfg.tableName)
|
||||||
|
.setArchiveLogFolder(ARCHIVELOG_FOLDER.defaultValue())
|
||||||
|
.setPayloadClassName(cfg.payloadClass)
|
||||||
|
.setBaseFileFormat(cfg.baseFileFormat)
|
||||||
|
.setBootstrapIndexClass(cfg.bootstrapIndexClass)
|
||||||
|
.setBootstrapBasePath(bootstrapBasePath)
|
||||||
|
.initTable(new Configuration(jssc.hadoopConfiguration()), cfg.basePath);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static class Config {
|
||||||
|
private String tableName;
|
||||||
|
private String tableType;
|
||||||
|
|
||||||
|
private String basePath;
|
||||||
|
|
||||||
|
private String baseFileFormat;
|
||||||
|
private String bootstrapIndexClass;
|
||||||
|
private String schemaProviderClass;
|
||||||
|
private String payloadClass;
|
||||||
|
private Boolean enableHiveSync;
|
||||||
|
|
||||||
|
private Boolean bootstrapOverwrite;
|
||||||
|
|
||||||
|
public static String checkpoint = null;
|
||||||
|
|
||||||
|
public void setTableName(String tableName) {
|
||||||
|
this.tableName = tableName;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setTableType(String tableType) {
|
||||||
|
this.tableType = tableType;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setBasePath(String basePath) {
|
||||||
|
this.basePath = basePath;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setBaseFileFormat(String baseFileFormat) {
|
||||||
|
this.baseFileFormat = baseFileFormat;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setBootstrapIndexClass(String bootstrapIndexClass) {
|
||||||
|
this.bootstrapIndexClass = bootstrapIndexClass;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setSchemaProviderClass(String schemaProviderClass) {
|
||||||
|
this.schemaProviderClass = schemaProviderClass;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setPayloadClass(String payloadClass) {
|
||||||
|
this.payloadClass = payloadClass;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setEnableHiveSync(Boolean enableHiveSync) {
|
||||||
|
this.enableHiveSync = enableHiveSync;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setBootstrapOverwrite(Boolean bootstrapOverwrite) {
|
||||||
|
this.bootstrapOverwrite = bootstrapOverwrite;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,58 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hudi.cli;
|
||||||
|
|
||||||
|
import org.apache.avro.Schema;
|
||||||
|
import org.apache.hudi.ApiMaturityLevel;
|
||||||
|
import org.apache.hudi.PublicAPIClass;
|
||||||
|
import org.apache.hudi.PublicAPIMethod;
|
||||||
|
import org.apache.hudi.common.config.TypedProperties;
|
||||||
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Class to provide schema for reading data and also writing into a Hoodie table,
|
||||||
|
* used by deltastreamer (runs over Spark).
|
||||||
|
*/
|
||||||
|
@PublicAPIClass(maturity = ApiMaturityLevel.STABLE)
|
||||||
|
public abstract class SchemaProvider implements Serializable {
|
||||||
|
|
||||||
|
protected TypedProperties config;
|
||||||
|
|
||||||
|
protected JavaSparkContext jssc;
|
||||||
|
|
||||||
|
public SchemaProvider(TypedProperties props) {
|
||||||
|
this(props, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected SchemaProvider(TypedProperties props, JavaSparkContext jssc) {
|
||||||
|
this.config = props;
|
||||||
|
this.jssc = jssc;
|
||||||
|
}
|
||||||
|
|
||||||
|
@PublicAPIMethod(maturity = ApiMaturityLevel.STABLE)
|
||||||
|
public abstract Schema getSourceSchema();
|
||||||
|
|
||||||
|
@PublicAPIMethod(maturity = ApiMaturityLevel.STABLE)
|
||||||
|
public Schema getTargetSchema() {
|
||||||
|
// by default, use source schema as target for hoodie table as well
|
||||||
|
return getSourceSchema();
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -55,6 +55,9 @@ object HoodieProcedures {
|
|||||||
mapBuilder.put(StatsWriteAmplificationProcedure.NAME, StatsWriteAmplificationProcedure.builder)
|
mapBuilder.put(StatsWriteAmplificationProcedure.NAME, StatsWriteAmplificationProcedure.builder)
|
||||||
mapBuilder.put(StatsFileSizeProcedure.NAME, StatsFileSizeProcedure.builder)
|
mapBuilder.put(StatsFileSizeProcedure.NAME, StatsFileSizeProcedure.builder)
|
||||||
mapBuilder.put(HdfsParquetImportProcedure.NAME, HdfsParquetImportProcedure.builder)
|
mapBuilder.put(HdfsParquetImportProcedure.NAME, HdfsParquetImportProcedure.builder)
|
||||||
|
mapBuilder.put(RunBootstrapProcedure.NAME, RunBootstrapProcedure.builder)
|
||||||
|
mapBuilder.put(ShowBootstrapMappingProcedure.NAME, ShowBootstrapMappingProcedure.builder)
|
||||||
|
mapBuilder.put(ShowBootstrapPartitionsProcedure.NAME, ShowBootstrapPartitionsProcedure.builder)
|
||||||
mapBuilder.build
|
mapBuilder.build
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,144 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.spark.sql.hudi.command.procedures
|
||||||
|
|
||||||
|
import org.apache.hadoop.fs.Path
|
||||||
|
import org.apache.hudi.DataSourceWriteOptions
|
||||||
|
import org.apache.hudi.cli.BootstrapExecutorUtils
|
||||||
|
import org.apache.hudi.cli.HDFSParquetImporterUtils.{buildProperties, readConfig}
|
||||||
|
import org.apache.hudi.common.config.TypedProperties
|
||||||
|
import org.apache.hudi.common.fs.FSUtils
|
||||||
|
import org.apache.hudi.common.util.StringUtils
|
||||||
|
import org.apache.hudi.config.HoodieBootstrapConfig
|
||||||
|
import org.apache.hudi.keygen.constant.KeyGeneratorType
|
||||||
|
import org.apache.spark.internal.Logging
|
||||||
|
import org.apache.spark.sql.Row
|
||||||
|
import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType}
|
||||||
|
|
||||||
|
import java.util
|
||||||
|
import java.util.Locale
|
||||||
|
import java.util.function.Supplier
|
||||||
|
|
||||||
|
class RunBootstrapProcedure extends BaseProcedure with ProcedureBuilder with Logging {
|
||||||
|
private val PARAMETERS = Array[ProcedureParameter](
|
||||||
|
ProcedureParameter.required(0, "table", DataTypes.StringType, None),
|
||||||
|
ProcedureParameter.required(1, "tableType", DataTypes.StringType, None),
|
||||||
|
ProcedureParameter.required(2, "bootstrapPath", DataTypes.StringType, None),
|
||||||
|
ProcedureParameter.required(3, "basePath", DataTypes.StringType, None),
|
||||||
|
ProcedureParameter.required(4, "rowKeyField", DataTypes.StringType, None),
|
||||||
|
ProcedureParameter.optional(5, "baseFileFormat", DataTypes.StringType, "PARQUET"),
|
||||||
|
ProcedureParameter.optional(6, "partitionPathField", DataTypes.StringType, ""),
|
||||||
|
ProcedureParameter.optional(7, "bootstrapIndexClass", DataTypes.StringType, "org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex"),
|
||||||
|
ProcedureParameter.optional(8, "selectorClass", DataTypes.StringType, "org.apache.hudi.client.bootstrap.selector.MetadataOnlyBootstrapModeSelector"),
|
||||||
|
ProcedureParameter.optional(9, "keyGeneratorClass", DataTypes.StringType, "org.apache.hudi.keygen.SimpleKeyGenerator"),
|
||||||
|
ProcedureParameter.optional(10, "fullBootstrapInputProvider", DataTypes.StringType, "org.apache.hudi.bootstrap.SparkParquetBootstrapDataProvider"),
|
||||||
|
ProcedureParameter.optional(11, "schemaProviderClass", DataTypes.StringType, ""),
|
||||||
|
ProcedureParameter.optional(12, "payloadClass", DataTypes.StringType, "org.apache.hudi.common.model.OverwriteWithLatestAvroPayload"),
|
||||||
|
ProcedureParameter.optional(13, "parallelism", DataTypes.IntegerType, 1500),
|
||||||
|
ProcedureParameter.optional(14, "enableHiveSync", DataTypes.BooleanType, false),
|
||||||
|
ProcedureParameter.optional(15, "propsFilePath", DataTypes.StringType, ""),
|
||||||
|
ProcedureParameter.optional(16, "bootstrapOverwrite", DataTypes.BooleanType, false)
|
||||||
|
)
|
||||||
|
|
||||||
|
private val OUTPUT_TYPE = new StructType(Array[StructField](
|
||||||
|
StructField("status", DataTypes.IntegerType, nullable = true, Metadata.empty))
|
||||||
|
)
|
||||||
|
|
||||||
|
def parameters: Array[ProcedureParameter] = PARAMETERS
|
||||||
|
|
||||||
|
def outputType: StructType = OUTPUT_TYPE
|
||||||
|
|
||||||
|
override def call(args: ProcedureArgs): Seq[Row] = {
|
||||||
|
super.checkArgs(PARAMETERS, args)
|
||||||
|
|
||||||
|
val tableName = getArgValueOrDefault(args, PARAMETERS(0))
|
||||||
|
val tableType = getArgValueOrDefault(args, PARAMETERS(1)).get.asInstanceOf[String]
|
||||||
|
val bootstrapPath = getArgValueOrDefault(args, PARAMETERS(2)).get.asInstanceOf[String]
|
||||||
|
val basePath = getArgValueOrDefault(args, PARAMETERS(3)).get.asInstanceOf[String]
|
||||||
|
val rowKeyField = getArgValueOrDefault(args, PARAMETERS(4)).get.asInstanceOf[String]
|
||||||
|
val baseFileFormat = getArgValueOrDefault(args, PARAMETERS(5)).get.asInstanceOf[String]
|
||||||
|
val partitionPathField = getArgValueOrDefault(args, PARAMETERS(6)).get.asInstanceOf[String]
|
||||||
|
val bootstrapIndexClass = getArgValueOrDefault(args, PARAMETERS(7)).get.asInstanceOf[String]
|
||||||
|
val selectorClass = getArgValueOrDefault(args, PARAMETERS(8)).get.asInstanceOf[String]
|
||||||
|
val keyGeneratorClass = getArgValueOrDefault(args, PARAMETERS(9)).get.asInstanceOf[String]
|
||||||
|
val fullBootstrapInputProvider = getArgValueOrDefault(args, PARAMETERS(10)).get.asInstanceOf[String]
|
||||||
|
val schemaProviderClass = getArgValueOrDefault(args, PARAMETERS(11)).get.asInstanceOf[String]
|
||||||
|
val payloadClass = getArgValueOrDefault(args, PARAMETERS(12)).get.asInstanceOf[String]
|
||||||
|
val parallelism = getArgValueOrDefault(args, PARAMETERS(13)).get.asInstanceOf[Int]
|
||||||
|
val enableHiveSync = getArgValueOrDefault(args, PARAMETERS(14)).get.asInstanceOf[Boolean]
|
||||||
|
val propsFilePath = getArgValueOrDefault(args, PARAMETERS(15)).get.asInstanceOf[String]
|
||||||
|
val bootstrapOverwrite = getArgValueOrDefault(args, PARAMETERS(16)).get.asInstanceOf[Boolean]
|
||||||
|
|
||||||
|
val configs: util.List[String] = new util.ArrayList[String]
|
||||||
|
|
||||||
|
val properties: TypedProperties = if (propsFilePath == null || propsFilePath.isEmpty) buildProperties(configs)
|
||||||
|
else readConfig(jsc.hadoopConfiguration, new Path(propsFilePath), configs).getProps(true)
|
||||||
|
|
||||||
|
properties.setProperty(HoodieBootstrapConfig.BASE_PATH.key, bootstrapPath)
|
||||||
|
|
||||||
|
if (!StringUtils.isNullOrEmpty(keyGeneratorClass) && KeyGeneratorType.getNames.contains(keyGeneratorClass.toUpperCase(Locale.ROOT))) {
|
||||||
|
properties.setProperty(HoodieBootstrapConfig.KEYGEN_TYPE.key, keyGeneratorClass.toUpperCase(Locale.ROOT))
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
properties.setProperty(HoodieBootstrapConfig.KEYGEN_CLASS_NAME.key, keyGeneratorClass)
|
||||||
|
}
|
||||||
|
|
||||||
|
properties.setProperty(HoodieBootstrapConfig.FULL_BOOTSTRAP_INPUT_PROVIDER_CLASS_NAME.key, fullBootstrapInputProvider)
|
||||||
|
properties.setProperty(HoodieBootstrapConfig.PARALLELISM_VALUE.key, parallelism.toString)
|
||||||
|
properties.setProperty(HoodieBootstrapConfig.MODE_SELECTOR_CLASS_NAME.key, selectorClass)
|
||||||
|
properties.setProperty(DataSourceWriteOptions.RECORDKEY_FIELD.key, rowKeyField)
|
||||||
|
properties.setProperty(DataSourceWriteOptions.PARTITIONPATH_FIELD.key, partitionPathField)
|
||||||
|
|
||||||
|
val fs = FSUtils.getFs(basePath, jsc.hadoopConfiguration)
|
||||||
|
|
||||||
|
val cfg = new BootstrapExecutorUtils.Config()
|
||||||
|
cfg.setTableName(tableName.get.asInstanceOf[String])
|
||||||
|
cfg.setTableType(tableType)
|
||||||
|
cfg.setBasePath(basePath)
|
||||||
|
cfg.setBaseFileFormat(baseFileFormat)
|
||||||
|
cfg.setBootstrapIndexClass(bootstrapIndexClass)
|
||||||
|
cfg.setSchemaProviderClass(schemaProviderClass)
|
||||||
|
cfg.setPayloadClass(payloadClass)
|
||||||
|
cfg.setEnableHiveSync(enableHiveSync)
|
||||||
|
cfg.setBootstrapOverwrite(bootstrapOverwrite)
|
||||||
|
|
||||||
|
try {
|
||||||
|
new BootstrapExecutorUtils(cfg, jsc, fs, jsc.hadoopConfiguration, properties).execute()
|
||||||
|
} catch {
|
||||||
|
case e: Exception =>
|
||||||
|
logWarning(s"Run bootstrap failed due to", e)
|
||||||
|
Seq(Row(-1))
|
||||||
|
}
|
||||||
|
Seq(Row(0))
|
||||||
|
}
|
||||||
|
|
||||||
|
override def build = new RunBootstrapProcedure()
|
||||||
|
}
|
||||||
|
|
||||||
|
object RunBootstrapProcedure {
|
||||||
|
val NAME = "run_bootstrap"
|
||||||
|
|
||||||
|
def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] {
|
||||||
|
override def get() = new RunBootstrapProcedure
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -0,0 +1,117 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.spark.sql.hudi.command.procedures
|
||||||
|
|
||||||
|
import com.google.common.collect.Lists
|
||||||
|
import org.apache.hudi.common.bootstrap.index.BootstrapIndex
|
||||||
|
import org.apache.hudi.common.model.{BootstrapFileMapping, HoodieFileGroupId}
|
||||||
|
import org.apache.hudi.common.table.HoodieTableMetaClient
|
||||||
|
import org.apache.hudi.exception.HoodieException
|
||||||
|
import org.apache.spark.sql.Row
|
||||||
|
import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType}
|
||||||
|
|
||||||
|
import java.util
|
||||||
|
import java.util.function.Supplier
|
||||||
|
import scala.collection.JavaConversions._
|
||||||
|
import scala.collection.JavaConverters._
|
||||||
|
|
||||||
|
class ShowBootstrapMappingProcedure extends BaseProcedure with ProcedureBuilder {
|
||||||
|
private val PARAMETERS = Array[ProcedureParameter](
|
||||||
|
ProcedureParameter.required(0, "table", DataTypes.StringType, None),
|
||||||
|
ProcedureParameter.optional(1, "partitionPath", DataTypes.StringType, ""),
|
||||||
|
ProcedureParameter.optional(2, "fileIds", DataTypes.StringType, ""),
|
||||||
|
ProcedureParameter.optional(3, "limit", DataTypes.IntegerType, 10),
|
||||||
|
ProcedureParameter.optional(4, "sortBy", DataTypes.StringType, "partition"),
|
||||||
|
ProcedureParameter.optional(5, "desc", DataTypes.BooleanType, false)
|
||||||
|
)
|
||||||
|
|
||||||
|
private val OUTPUT_TYPE = new StructType(Array[StructField](
|
||||||
|
StructField("partition", DataTypes.StringType, nullable = true, Metadata.empty),
|
||||||
|
StructField("fileid", DataTypes.StringType, nullable = true, Metadata.empty),
|
||||||
|
StructField("source_basepath", DataTypes.StringType, nullable = true, Metadata.empty),
|
||||||
|
StructField("source_partition", DataTypes.StringType, nullable = true, Metadata.empty),
|
||||||
|
StructField("source_file", DataTypes.StringType, nullable = true, Metadata.empty))
|
||||||
|
)
|
||||||
|
|
||||||
|
def parameters: Array[ProcedureParameter] = PARAMETERS
|
||||||
|
|
||||||
|
def outputType: StructType = OUTPUT_TYPE
|
||||||
|
|
||||||
|
override def call(args: ProcedureArgs): Seq[Row] = {
|
||||||
|
super.checkArgs(PARAMETERS, args)
|
||||||
|
|
||||||
|
val tableName = getArgValueOrDefault(args, PARAMETERS(0))
|
||||||
|
val partitionPath = getArgValueOrDefault(args, PARAMETERS(1)).get.asInstanceOf[String]
|
||||||
|
val fileIds = getArgValueOrDefault(args, PARAMETERS(2)).get.asInstanceOf[String]
|
||||||
|
val limit = getArgValueOrDefault(args, PARAMETERS(3)).get.asInstanceOf[Int]
|
||||||
|
val sortBy = getArgValueOrDefault(args, PARAMETERS(4)).get.asInstanceOf[String]
|
||||||
|
val desc = getArgValueOrDefault(args, PARAMETERS(5)).get.asInstanceOf[Boolean]
|
||||||
|
|
||||||
|
val basePath: String = getBasePath(tableName)
|
||||||
|
val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build
|
||||||
|
|
||||||
|
if (partitionPath.isEmpty && fileIds.nonEmpty) throw new IllegalStateException("PartitionPath is mandatory when passing fileIds.")
|
||||||
|
|
||||||
|
val indexReader = createBootstrapIndexReader(metaClient)
|
||||||
|
val indexedPartitions = indexReader.getIndexedPartitionPaths
|
||||||
|
|
||||||
|
if (partitionPath.nonEmpty && !indexedPartitions.contains(partitionPath)) new HoodieException(partitionPath + " is not an valid indexed partition")
|
||||||
|
|
||||||
|
val mappingList: util.ArrayList[BootstrapFileMapping] = new util.ArrayList[BootstrapFileMapping]
|
||||||
|
if (fileIds.nonEmpty) {
|
||||||
|
val fileGroupIds = fileIds.split(",").toList.map((fileId: String) => new HoodieFileGroupId(partitionPath, fileId)).asJava
|
||||||
|
mappingList.addAll(indexReader.getSourceFileMappingForFileIds(fileGroupIds).values)
|
||||||
|
} else if (partitionPath.nonEmpty) mappingList.addAll(indexReader.getSourceFileMappingForPartition(partitionPath))
|
||||||
|
else {
|
||||||
|
for (part <- indexedPartitions) {
|
||||||
|
mappingList.addAll(indexReader.getSourceFileMappingForPartition(part))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
val rows: java.util.List[Row] = mappingList
|
||||||
|
.map(mapping => Row(mapping.getPartitionPath, mapping.getFileId, mapping.getBootstrapBasePath,
|
||||||
|
mapping.getBootstrapPartitionPath, mapping.getBootstrapFileStatus.getPath.getUri)).toList
|
||||||
|
|
||||||
|
val df = spark.createDataFrame(rows, OUTPUT_TYPE)
|
||||||
|
|
||||||
|
if (desc) {
|
||||||
|
df.orderBy(df(sortBy).desc).limit(limit).collect()
|
||||||
|
} else {
|
||||||
|
df.orderBy(df(sortBy).asc).limit(limit).collect()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private def createBootstrapIndexReader(metaClient: HoodieTableMetaClient) = {
|
||||||
|
val index = BootstrapIndex.getBootstrapIndex(metaClient)
|
||||||
|
if (!index.useIndex) throw new HoodieException("This is not a bootstrapped Hudi table. Don't have any index info")
|
||||||
|
index.createReader
|
||||||
|
}
|
||||||
|
|
||||||
|
override def build: Procedure = new ShowBootstrapMappingProcedure()
|
||||||
|
}
|
||||||
|
|
||||||
|
object ShowBootstrapMappingProcedure {
|
||||||
|
val NAME = "show_bootstrap_mapping"
|
||||||
|
|
||||||
|
def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] {
|
||||||
|
override def get() = new ShowBootstrapMappingProcedure
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -0,0 +1,75 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.spark.sql.hudi.command.procedures
|
||||||
|
|
||||||
|
import org.apache.hudi.common.bootstrap.index.BootstrapIndex
|
||||||
|
import org.apache.hudi.common.table.HoodieTableMetaClient
|
||||||
|
import org.apache.hudi.exception.HoodieException
|
||||||
|
import org.apache.spark.sql.Row
|
||||||
|
import org.apache.spark.sql.types.{DataTypes, Metadata, StructField, StructType}
|
||||||
|
|
||||||
|
import java.util.function.Supplier
|
||||||
|
|
||||||
|
class ShowBootstrapPartitionsProcedure extends BaseProcedure with ProcedureBuilder {
|
||||||
|
private val PARAMETERS = Array[ProcedureParameter](
|
||||||
|
ProcedureParameter.required(0, "table", DataTypes.StringType, None)
|
||||||
|
)
|
||||||
|
|
||||||
|
private val OUTPUT_TYPE = new StructType(Array[StructField](
|
||||||
|
StructField("indexed_partitions", DataTypes.StringType, nullable = true, Metadata.empty))
|
||||||
|
)
|
||||||
|
|
||||||
|
def parameters: Array[ProcedureParameter] = PARAMETERS
|
||||||
|
|
||||||
|
def outputType: StructType = OUTPUT_TYPE
|
||||||
|
|
||||||
|
override def call(args: ProcedureArgs): Seq[Row] = {
|
||||||
|
super.checkArgs(PARAMETERS, args)
|
||||||
|
|
||||||
|
val tableName = getArgValueOrDefault(args, PARAMETERS(0))
|
||||||
|
|
||||||
|
val basePath: String = getBasePath(tableName)
|
||||||
|
val metaClient = HoodieTableMetaClient.builder.setConf(jsc.hadoopConfiguration()).setBasePath(basePath).build
|
||||||
|
|
||||||
|
val indexReader = createBootstrapIndexReader(metaClient)
|
||||||
|
val indexedPartitions = indexReader.getIndexedPartitionPaths
|
||||||
|
|
||||||
|
indexedPartitions.stream().toArray.map(r => Row(r)).toList
|
||||||
|
}
|
||||||
|
|
||||||
|
private def createBootstrapIndexReader(metaClient: HoodieTableMetaClient) = {
|
||||||
|
val index = BootstrapIndex.getBootstrapIndex(metaClient)
|
||||||
|
if (!index.useIndex) throw new HoodieException("This is not a bootstrapped Hudi table. Don't have any index info")
|
||||||
|
index.createReader
|
||||||
|
}
|
||||||
|
|
||||||
|
override def build = new ShowBootstrapPartitionsProcedure()
|
||||||
|
}
|
||||||
|
|
||||||
|
object ShowBootstrapPartitionsProcedure {
|
||||||
|
val NAME = "show_bootstrap_partitions"
|
||||||
|
|
||||||
|
def builder: Supplier[ProcedureBuilder] = new Supplier[ProcedureBuilder] {
|
||||||
|
override def get() = new ShowBootstrapPartitionsProcedure
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -0,0 +1,89 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more
|
||||||
|
* contributor license agreements. See the NOTICE file distributed with
|
||||||
|
* this work for additional information regarding copyright ownership.
|
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0
|
||||||
|
* (the "License"); you may not use this file except in compliance with
|
||||||
|
* the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.spark.sql.hudi.procedure
|
||||||
|
|
||||||
|
import org.apache.hadoop.fs.Path
|
||||||
|
import org.apache.hudi.common.model.HoodieTableType
|
||||||
|
import org.apache.hudi.functional.TestBootstrap
|
||||||
|
import org.apache.spark.api.java.JavaSparkContext
|
||||||
|
import org.apache.spark.sql.hudi.HoodieSparkSqlTestBase
|
||||||
|
import org.apache.spark.sql.{Dataset, Row}
|
||||||
|
|
||||||
|
import java.time.Instant
|
||||||
|
import java.util
|
||||||
|
|
||||||
|
class TestBootstrapProcedure extends HoodieSparkSqlTestBase {
|
||||||
|
|
||||||
|
test("Test Call run_bootstrap Procedure") {
|
||||||
|
withTempDir { tmp =>
|
||||||
|
val NUM_OF_RECORDS = 100
|
||||||
|
val PARTITION_FIELD = "datestr"
|
||||||
|
val RECORD_KEY_FIELD = "_row_key"
|
||||||
|
|
||||||
|
val tableName = generateTableName
|
||||||
|
val basePath = s"${tmp.getCanonicalPath}"
|
||||||
|
|
||||||
|
val srcName: String = "source"
|
||||||
|
val sourcePath = basePath + Path.SEPARATOR + srcName
|
||||||
|
val tablePath = basePath + Path.SEPARATOR + tableName
|
||||||
|
val jsc = new JavaSparkContext(spark.sparkContext)
|
||||||
|
|
||||||
|
// generate test data
|
||||||
|
val partitions = util.Arrays.asList("2018", "2019", "2020")
|
||||||
|
val timestamp: Long = Instant.now.toEpochMilli
|
||||||
|
for (i <- 0 until partitions.size) {
|
||||||
|
val df: Dataset[Row] = TestBootstrap.generateTestRawTripDataset(timestamp, i * NUM_OF_RECORDS, i * NUM_OF_RECORDS + NUM_OF_RECORDS, null, jsc, spark.sqlContext)
|
||||||
|
df.write.parquet(sourcePath + Path.SEPARATOR + PARTITION_FIELD + "=" + partitions.get(i))
|
||||||
|
}
|
||||||
|
|
||||||
|
// run bootstrap
|
||||||
|
checkAnswer(
|
||||||
|
s"""call run_bootstrap(
|
||||||
|
|table => '$tableName',
|
||||||
|
|basePath => '$tablePath',
|
||||||
|
|tableType => '${HoodieTableType.COPY_ON_WRITE.name}',
|
||||||
|
|bootstrapPath => '$sourcePath',
|
||||||
|
|rowKeyField => '$RECORD_KEY_FIELD',
|
||||||
|
|partitionPathField => '$PARTITION_FIELD',
|
||||||
|
|bootstrapOverwrite => true)""".stripMargin) {
|
||||||
|
Seq(0)
|
||||||
|
}
|
||||||
|
|
||||||
|
// create table
|
||||||
|
spark.sql(
|
||||||
|
s"""
|
||||||
|
|create table $tableName using hudi
|
||||||
|
|location '$tablePath'
|
||||||
|
|tblproperties(primaryKey = '$RECORD_KEY_FIELD')
|
||||||
|
|""".stripMargin)
|
||||||
|
|
||||||
|
// show bootstrap's index partitions
|
||||||
|
var result = spark.sql(s"""call show_bootstrap_partitions(table => '$tableName')""".stripMargin).collect()
|
||||||
|
assertResult(3) {
|
||||||
|
result.length
|
||||||
|
}
|
||||||
|
|
||||||
|
// show bootstrap's index mapping
|
||||||
|
result = spark.sql(
|
||||||
|
s"""call show_bootstrap_mapping(table => '$tableName')""".stripMargin).collect()
|
||||||
|
assertResult(3) {
|
||||||
|
result.length
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user