[HUDI-875] Abstract hudi-sync-common, and support hudi-hive-sync, hudi-dla-sync (#1810)
- Generalize the hive-sync module for syncing to multiple metastores - Added new options for datasource - Added new command line for delta streamer Co-authored-by: Vinoth Chandar <vinoth@apache.org>
This commit is contained in:
46
hudi-sync/hudi-hive-sync/src/assembly/src.xml
Normal file
46
hudi-sync/hudi-hive-sync/src/assembly/src.xml
Normal file
@@ -0,0 +1,46 @@
|
||||
<!--
|
||||
Licensed to the Apache Software Foundation (ASF) under one
|
||||
or more contributor license agreements. See the NOTICE file
|
||||
distributed with this work for additional information
|
||||
regarding copyright ownership. The ASF licenses this file
|
||||
to you under the Apache License, Version 2.0 (the
|
||||
"License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
<assembly xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.3"
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.3 http://maven.apache.org/xsd/assembly-1.1.3.xsd">
  <!-- Descriptor for a single jar that bundles the module's dependencies. -->
  <id>jar-with-dependencies</id>
  <formats>
    <format>jar</format>
  </formats>

  <!-- Do not nest the content under a top-level directory inside the archive. -->
  <includeBaseDirectory>false</includeBaseDirectory>
  <dependencySets>

    <!-- Runtime-scoped dependencies, unpacked into the archive root, minus the exclusions below. -->
    <dependencySet>
      <outputDirectory>/</outputDirectory>
      <unpack>true</unpack>
      <scope>runtime</scope>
      <excludes>
        <exclude>junit:junit</exclude>
        <exclude>com.google.code.findbugs:*</exclude>
        <exclude>org.apache.hbase:*</exclude>
      </excludes>
    </dependencySet>

    <!-- Provided-scoped dependencies are unpacked into the archive as well. -->
    <dependencySet>
      <unpack>true</unpack>
      <scope>provided</scope>
    </dependencySet>
  </dependencySets>
</assembly>
|
||||
@@ -0,0 +1,103 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.hive;
|
||||
|
||||
import com.beust.jcommander.Parameter;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Configs needed to sync data into Hive.
|
||||
*/
|
||||
public class HiveSyncConfig implements Serializable {
|
||||
|
||||
@Parameter(names = {"--database"}, description = "name of the target database in Hive", required = true)
|
||||
public String databaseName;
|
||||
|
||||
@Parameter(names = {"--table"}, description = "name of the target table in Hive", required = true)
|
||||
public String tableName;
|
||||
|
||||
@Parameter(names = {"--base-file-format"}, description = "Format of the base files (PARQUET (or) HFILE)")
|
||||
public String baseFileFormat = "PARQUET";
|
||||
|
||||
@Parameter(names = {"--user"}, description = "Hive username", required = true)
|
||||
public String hiveUser;
|
||||
|
||||
@Parameter(names = {"--pass"}, description = "Hive password", required = true)
|
||||
public String hivePass;
|
||||
|
||||
@Parameter(names = {"--jdbc-url"}, description = "Hive jdbc connect url", required = true)
|
||||
public String jdbcUrl;
|
||||
|
||||
@Parameter(names = {"--base-path"}, description = "Basepath of hoodie table to sync", required = true)
|
||||
public String basePath;
|
||||
|
||||
@Parameter(names = "--partitioned-by", description = "Fields in the schema partitioned by")
|
||||
public List<String> partitionFields = new ArrayList<>();
|
||||
|
||||
@Parameter(names = "--partition-value-extractor", description = "Class which implements PartitionValueExtractor "
|
||||
+ "to extract the partition values from HDFS path")
|
||||
public String partitionValueExtractorClass = SlashEncodedDayPartitionValueExtractor.class.getName();
|
||||
|
||||
@Parameter(names = {"--assume-date-partitioning"}, description = "Assume standard yyyy/mm/dd partitioning, this"
|
||||
+ " exists to support backward compatibility. If you use hoodie 0.3.x, do not set this parameter")
|
||||
public Boolean assumeDatePartitioning = false;
|
||||
|
||||
@Parameter(names = {"--use-pre-apache-input-format"},
|
||||
description = "Use InputFormat under com.uber.hoodie package "
|
||||
+ "instead of org.apache.hudi package. Use this when you are in the process of migrating from "
|
||||
+ "com.uber.hoodie to org.apache.hudi. Stop using this after you migrated the table definition to "
|
||||
+ "org.apache.hudi input format.")
|
||||
public Boolean usePreApacheInputFormat = false;
|
||||
|
||||
@Parameter(names = {"--use-jdbc"}, description = "Hive jdbc connect url")
|
||||
public Boolean useJdbc = true;
|
||||
|
||||
@Parameter(names = {"--skip-ro-suffix"}, description = "Skip the `_ro` suffix for Read optimized table, when registering")
|
||||
public Boolean skipROSuffix = false;
|
||||
|
||||
@Parameter(names = {"--help", "-h"}, help = true)
|
||||
public Boolean help = false;
|
||||
|
||||
public static HiveSyncConfig copy(HiveSyncConfig cfg) {
|
||||
HiveSyncConfig newConfig = new HiveSyncConfig();
|
||||
newConfig.basePath = cfg.basePath;
|
||||
newConfig.assumeDatePartitioning = cfg.assumeDatePartitioning;
|
||||
newConfig.databaseName = cfg.databaseName;
|
||||
newConfig.hivePass = cfg.hivePass;
|
||||
newConfig.hiveUser = cfg.hiveUser;
|
||||
newConfig.partitionFields = cfg.partitionFields;
|
||||
newConfig.partitionValueExtractorClass = cfg.partitionValueExtractorClass;
|
||||
newConfig.jdbcUrl = cfg.jdbcUrl;
|
||||
newConfig.tableName = cfg.tableName;
|
||||
newConfig.usePreApacheInputFormat = cfg.usePreApacheInputFormat;
|
||||
return newConfig;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "HiveSyncConfig{databaseName='" + databaseName + '\'' + ", tableName='" + tableName + '\''
|
||||
+ ", hiveUser='" + hiveUser + '\'' + ", hivePass='" + hivePass + '\'' + ", jdbcUrl='" + jdbcUrl + '\''
|
||||
+ ", basePath='" + basePath + '\'' + ", partitionFields=" + partitionFields + ", partitionValueExtractorClass='"
|
||||
+ partitionValueExtractorClass + '\'' + ", assumeDatePartitioning=" + assumeDatePartitioning
|
||||
+ ", usePreApacheInputFormat=" + usePreApacheInputFormat + ", useJdbc=" + useJdbc + ", help=" + help + '}';
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,228 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.hive;
|
||||
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.HoodieFileFormat;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.exception.InvalidTableException;
|
||||
import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils;
|
||||
import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent;
|
||||
import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent.PartitionEventType;
|
||||
import org.apache.hudi.hive.util.HiveSchemaUtil;
|
||||
|
||||
import com.beust.jcommander.JCommander;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.hive.conf.HiveConf;
|
||||
import org.apache.hadoop.hive.metastore.api.Partition;
|
||||
import org.apache.hudi.sync.common.AbstractSyncTool;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* Tool to sync a hoodie HDFS table with a hive metastore table. Either use it as a api
|
||||
* HiveSyncTool.syncHoodieTable(HiveSyncConfig) or as a command line java -cp hoodie-hive-sync.jar HiveSyncTool [args]
|
||||
* <p>
|
||||
* This utility will get the schema from the latest commit and will sync hive table schema Also this will sync the
|
||||
* partitions incrementally (all the partitions modified since the last commit)
|
||||
*/
|
||||
@SuppressWarnings("WeakerAccess")
|
||||
public class HiveSyncTool extends AbstractSyncTool {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(HiveSyncTool.class);
|
||||
public static final String SUFFIX_SNAPSHOT_TABLE = "_rt";
|
||||
public static final String SUFFIX_READ_OPTIMIZED_TABLE = "_ro";
|
||||
|
||||
private final HiveSyncConfig cfg;
|
||||
private final HoodieHiveClient hoodieHiveClient;
|
||||
private final String snapshotTableName;
|
||||
private final Option<String> roTableTableName;
|
||||
|
||||
public HiveSyncTool(HiveSyncConfig cfg, HiveConf configuration, FileSystem fs) {
|
||||
super(configuration.getAllProperties(), fs);
|
||||
this.hoodieHiveClient = new HoodieHiveClient(cfg, configuration, fs);
|
||||
this.cfg = cfg;
|
||||
// Set partitionFields to empty, when the NonPartitionedExtractor is used
|
||||
if (NonPartitionedExtractor.class.getName().equals(cfg.partitionValueExtractorClass)) {
|
||||
LOG.warn("Set partitionFields to empty, since the NonPartitionedExtractor is used");
|
||||
cfg.partitionFields = new ArrayList<>();
|
||||
}
|
||||
switch (hoodieHiveClient.getTableType()) {
|
||||
case COPY_ON_WRITE:
|
||||
this.snapshotTableName = cfg.tableName;
|
||||
this.roTableTableName = Option.empty();
|
||||
break;
|
||||
case MERGE_ON_READ:
|
||||
this.snapshotTableName = cfg.tableName + SUFFIX_SNAPSHOT_TABLE;
|
||||
this.roTableTableName = cfg.skipROSuffix ? Option.of(cfg.tableName) :
|
||||
Option.of(cfg.tableName + SUFFIX_READ_OPTIMIZED_TABLE);
|
||||
break;
|
||||
default:
|
||||
LOG.error("Unknown table type " + hoodieHiveClient.getTableType());
|
||||
throw new InvalidTableException(hoodieHiveClient.getBasePath());
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void syncHoodieTable() {
|
||||
try {
|
||||
switch (hoodieHiveClient.getTableType()) {
|
||||
case COPY_ON_WRITE:
|
||||
syncHoodieTable(snapshotTableName, false);
|
||||
break;
|
||||
case MERGE_ON_READ:
|
||||
// sync a RO table for MOR
|
||||
syncHoodieTable(roTableTableName.get(), false);
|
||||
// sync a RT table for MOR
|
||||
syncHoodieTable(snapshotTableName, true);
|
||||
break;
|
||||
default:
|
||||
LOG.error("Unknown table type " + hoodieHiveClient.getTableType());
|
||||
throw new InvalidTableException(hoodieHiveClient.getBasePath());
|
||||
}
|
||||
} catch (RuntimeException re) {
|
||||
LOG.error("Got runtime exception when hive syncing", re);
|
||||
} finally {
|
||||
hoodieHiveClient.close();
|
||||
}
|
||||
}
|
||||
|
||||
private void syncHoodieTable(String tableName, boolean useRealtimeInputFormat) {
|
||||
LOG.info("Trying to sync hoodie table " + tableName + " with base path " + hoodieHiveClient.getBasePath()
|
||||
+ " of type " + hoodieHiveClient.getTableType());
|
||||
// Check if the necessary table exists
|
||||
boolean tableExists = hoodieHiveClient.doesTableExist(tableName);
|
||||
|
||||
// check if the database exists else create it
|
||||
try {
|
||||
hoodieHiveClient.updateHiveSQL("create database if not exists " + cfg.databaseName);
|
||||
} catch (Exception e) {
|
||||
// this is harmless since table creation will fail anyways, creation of DB is needed for in-memory testing
|
||||
LOG.warn("Unable to create database", e);
|
||||
}
|
||||
|
||||
// Get the parquet schema for this table looking at the latest commit
|
||||
MessageType schema = hoodieHiveClient.getDataSchema();
|
||||
// Sync schema if needed
|
||||
syncSchema(tableName, tableExists, useRealtimeInputFormat, schema);
|
||||
|
||||
LOG.info("Schema sync complete. Syncing partitions for " + tableName);
|
||||
// Get the last time we successfully synced partitions
|
||||
Option<String> lastCommitTimeSynced = Option.empty();
|
||||
if (tableExists) {
|
||||
lastCommitTimeSynced = hoodieHiveClient.getLastCommitTimeSynced(tableName);
|
||||
}
|
||||
LOG.info("Last commit time synced was found to be " + lastCommitTimeSynced.orElse("null"));
|
||||
List<String> writtenPartitionsSince = hoodieHiveClient.getPartitionsWrittenToSince(lastCommitTimeSynced);
|
||||
LOG.info("Storage partitions scan complete. Found " + writtenPartitionsSince.size());
|
||||
// Sync the partitions if needed
|
||||
syncPartitions(tableName, writtenPartitionsSince);
|
||||
|
||||
hoodieHiveClient.updateLastCommitTimeSynced(tableName);
|
||||
LOG.info("Sync complete for " + tableName);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the latest schema from the last commit and check if its in sync with the hive table schema. If not, evolves the
|
||||
* table schema.
|
||||
*
|
||||
* @param tableExists - does table exist
|
||||
* @param schema - extracted schema
|
||||
*/
|
||||
private void syncSchema(String tableName, boolean tableExists, boolean useRealTimeInputFormat, MessageType schema) {
|
||||
// Check and sync schema
|
||||
if (!tableExists) {
|
||||
LOG.info("Hive table " + tableName + " is not found. Creating it");
|
||||
HoodieFileFormat baseFileFormat = HoodieFileFormat.valueOf(cfg.baseFileFormat.toUpperCase());
|
||||
String inputFormatClassName = HoodieInputFormatUtils.getInputFormatClassName(baseFileFormat, useRealTimeInputFormat,
|
||||
new Configuration());
|
||||
|
||||
if (baseFileFormat.equals(HoodieFileFormat.PARQUET) && cfg.usePreApacheInputFormat) {
|
||||
// Parquet input format had an InputFormat class visible under the old naming scheme.
|
||||
inputFormatClassName = useRealTimeInputFormat
|
||||
? com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat.class.getName()
|
||||
: com.uber.hoodie.hadoop.HoodieInputFormat.class.getName();
|
||||
}
|
||||
|
||||
String outputFormatClassName = HoodieInputFormatUtils.getOutputFormatClassName(baseFileFormat);
|
||||
String serDeFormatClassName = HoodieInputFormatUtils.getSerDeClassName(baseFileFormat);
|
||||
|
||||
// Custom serde will not work with ALTER TABLE REPLACE COLUMNS
|
||||
// https://github.com/apache/hive/blob/release-1.1.0/ql/src/java/org/apache/hadoop/hive
|
||||
// /ql/exec/DDLTask.java#L3488
|
||||
hoodieHiveClient.createTable(tableName, schema, inputFormatClassName, outputFormatClassName, serDeFormatClassName);
|
||||
} else {
|
||||
// Check if the table schema has evolved
|
||||
Map<String, String> tableSchema = hoodieHiveClient.getTableSchema(tableName);
|
||||
SchemaDifference schemaDiff = HiveSchemaUtil.getSchemaDifference(schema, tableSchema, cfg.partitionFields);
|
||||
if (!schemaDiff.isEmpty()) {
|
||||
LOG.info("Schema difference found for " + tableName);
|
||||
hoodieHiveClient.updateTableDefinition(tableName, schema);
|
||||
} else {
|
||||
LOG.info("No Schema difference for " + tableName);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Syncs the list of storage parititions passed in (checks if the partition is in hive, if not adds it or if the
|
||||
* partition path does not match, it updates the partition path).
|
||||
*/
|
||||
private void syncPartitions(String tableName, List<String> writtenPartitionsSince) {
|
||||
try {
|
||||
List<Partition> hivePartitions = hoodieHiveClient.scanTablePartitions(tableName);
|
||||
List<PartitionEvent> partitionEvents =
|
||||
hoodieHiveClient.getPartitionEvents(hivePartitions, writtenPartitionsSince);
|
||||
List<String> newPartitions = filterPartitions(partitionEvents, PartitionEventType.ADD);
|
||||
LOG.info("New Partitions " + newPartitions);
|
||||
hoodieHiveClient.addPartitionsToTable(tableName, newPartitions);
|
||||
List<String> updatePartitions = filterPartitions(partitionEvents, PartitionEventType.UPDATE);
|
||||
LOG.info("Changed Partitions " + updatePartitions);
|
||||
hoodieHiveClient.updatePartitionsToTable(tableName, updatePartitions);
|
||||
} catch (Exception e) {
|
||||
throw new HoodieHiveSyncException("Failed to sync partitions for table " + tableName, e);
|
||||
}
|
||||
}
|
||||
|
||||
private List<String> filterPartitions(List<PartitionEvent> events, PartitionEventType eventType) {
|
||||
return events.stream().filter(s -> s.eventType == eventType).map(s -> s.storagePartition)
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
public static void main(String[] args) {
|
||||
// parse the params
|
||||
final HiveSyncConfig cfg = new HiveSyncConfig();
|
||||
JCommander cmd = new JCommander(cfg, null, args);
|
||||
if (cfg.help || args.length == 0) {
|
||||
cmd.usage();
|
||||
System.exit(1);
|
||||
}
|
||||
FileSystem fs = FSUtils.getFs(cfg.basePath, new Configuration());
|
||||
HiveConf hiveConf = new HiveConf();
|
||||
hiveConf.addResource(fs.getConf());
|
||||
new HiveSyncTool(cfg, hiveConf, fs).syncHoodieTable();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,482 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.hive;
|
||||
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.fs.StorageSchemes;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.ValidationUtils;
|
||||
import org.apache.hudi.hive.util.HiveSchemaUtil;
|
||||
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hive.conf.HiveConf;
|
||||
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
|
||||
import org.apache.hadoop.hive.metastore.api.FieldSchema;
|
||||
import org.apache.hadoop.hive.metastore.api.MetaException;
|
||||
import org.apache.hadoop.hive.metastore.api.Partition;
|
||||
import org.apache.hadoop.hive.metastore.api.Table;
|
||||
import org.apache.hadoop.hive.ql.metadata.Hive;
|
||||
import org.apache.hadoop.hive.ql.metadata.HiveException;
|
||||
import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse;
|
||||
import org.apache.hadoop.hive.ql.session.SessionState;
|
||||
import org.apache.hive.jdbc.HiveDriver;
|
||||
import org.apache.hudi.sync.common.AbstractSyncHoodieClient;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
import org.apache.thrift.TException;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.Connection;
|
||||
import java.sql.DatabaseMetaData;
|
||||
import java.sql.DriverManager;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.sql.Statement;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
public class HoodieHiveClient extends AbstractSyncHoodieClient {
|
||||
|
||||
// Presumably the key under which the last synced commit time is stored; its use is outside this view.
private static final String HOODIE_LAST_COMMIT_TIME_SYNC = "last_commit_time_sync";
// Make sure we have the hive JDBC driver in classpath
private static String driverName = HiveDriver.class.getName();
// Quote character placed around database, table and column identifiers in generated statements.
private static final String HIVE_ESCAPE_CHARACTER = HiveSchemaUtil.HIVE_ESCAPE_CHARACTER;

// Load the JDBC driver class once when this class is initialized; fail fast if it is missing.
static {
  try {
    Class.forName(driverName);
  } catch (ClassNotFoundException e) {
    throw new IllegalStateException("Could not find " + driverName + " in classpath. ", e);
  }
}

private static final Logger LOG = LogManager.getLogger(HoodieHiveClient.class);
// Turns a storage partition path into the list of partition values.
private final PartitionValueExtractor partitionValueExtractor;
// Hive metastore client obtained from the Hive session in the constructor.
private IMetaStoreClient client;
private HiveSyncConfig syncConfig;
private FileSystem fs;
// JDBC connection to Hive; read by the JDBC code path of getTableSchema.
private Connection connection;
// Completed instants of the commits timeline, captured at construction time.
private HoodieTimeline activeTimeline;
private HiveConf configuration;
|
||||
|
||||
public HoodieHiveClient(HiveSyncConfig cfg, HiveConf configuration, FileSystem fs) {
|
||||
super(cfg.basePath, cfg.assumeDatePartitioning, fs);
|
||||
this.syncConfig = cfg;
|
||||
this.fs = fs;
|
||||
|
||||
this.configuration = configuration;
|
||||
// Support both JDBC and metastore based implementations for backwards compatiblity. Future users should
|
||||
// disable jdbc and depend on metastore client for all hive registrations
|
||||
if (cfg.useJdbc) {
|
||||
LOG.info("Creating hive connection " + cfg.jdbcUrl);
|
||||
createHiveConnection();
|
||||
}
|
||||
try {
|
||||
this.client = Hive.get(configuration).getMSC();
|
||||
} catch (MetaException | HiveException e) {
|
||||
throw new HoodieHiveSyncException("Failed to create HiveMetaStoreClient", e);
|
||||
}
|
||||
|
||||
try {
|
||||
this.partitionValueExtractor =
|
||||
(PartitionValueExtractor) Class.forName(cfg.partitionValueExtractorClass).newInstance();
|
||||
} catch (Exception e) {
|
||||
throw new HoodieHiveSyncException(
|
||||
"Failed to initialize PartitionValueExtractor class " + cfg.partitionValueExtractorClass, e);
|
||||
}
|
||||
|
||||
activeTimeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
|
||||
}
|
||||
|
||||
public HoodieTimeline getActiveTimeline() {
|
||||
return activeTimeline;
|
||||
}
|
||||
|
||||
/**
|
||||
* Add the (NEW) partitions to the table.
|
||||
*/
|
||||
@Override
|
||||
public void addPartitionsToTable(String tableName, List<String> partitionsToAdd) {
|
||||
if (partitionsToAdd.isEmpty()) {
|
||||
LOG.info("No partitions to add for " + tableName);
|
||||
return;
|
||||
}
|
||||
LOG.info("Adding partitions " + partitionsToAdd.size() + " to table " + tableName);
|
||||
String sql = constructAddPartitions(tableName, partitionsToAdd);
|
||||
updateHiveSQL(sql);
|
||||
}
|
||||
|
||||
/**
|
||||
* Partition path has changed - update the path for te following partitions.
|
||||
*/
|
||||
@Override
|
||||
public void updatePartitionsToTable(String tableName, List<String> changedPartitions) {
|
||||
if (changedPartitions.isEmpty()) {
|
||||
LOG.info("No partitions to change for " + tableName);
|
||||
return;
|
||||
}
|
||||
LOG.info("Changing partitions " + changedPartitions.size() + " on " + tableName);
|
||||
List<String> sqls = constructChangePartitions(tableName, changedPartitions);
|
||||
for (String sql : sqls) {
|
||||
updateHiveSQL(sql);
|
||||
}
|
||||
}
|
||||
|
||||
private String constructAddPartitions(String tableName, List<String> partitions) {
|
||||
StringBuilder alterSQL = new StringBuilder("ALTER TABLE ");
|
||||
alterSQL.append(HIVE_ESCAPE_CHARACTER).append(syncConfig.databaseName)
|
||||
.append(HIVE_ESCAPE_CHARACTER).append(".").append(HIVE_ESCAPE_CHARACTER)
|
||||
.append(tableName).append(HIVE_ESCAPE_CHARACTER).append(" ADD IF NOT EXISTS ");
|
||||
for (String partition : partitions) {
|
||||
String partitionClause = getPartitionClause(partition);
|
||||
String fullPartitionPath = FSUtils.getPartitionPath(syncConfig.basePath, partition).toString();
|
||||
alterSQL.append(" PARTITION (").append(partitionClause).append(") LOCATION '").append(fullPartitionPath)
|
||||
.append("' ");
|
||||
}
|
||||
return alterSQL.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate Hive Partition from partition values.
|
||||
*
|
||||
* @param partition Partition path
|
||||
* @return
|
||||
*/
|
||||
private String getPartitionClause(String partition) {
|
||||
List<String> partitionValues = partitionValueExtractor.extractPartitionValuesInPath(partition);
|
||||
ValidationUtils.checkArgument(syncConfig.partitionFields.size() == partitionValues.size(),
|
||||
"Partition key parts " + syncConfig.partitionFields + " does not match with partition values " + partitionValues
|
||||
+ ". Check partition strategy. ");
|
||||
List<String> partBuilder = new ArrayList<>();
|
||||
for (int i = 0; i < syncConfig.partitionFields.size(); i++) {
|
||||
partBuilder.add("`" + syncConfig.partitionFields.get(i) + "`='" + partitionValues.get(i) + "'");
|
||||
}
|
||||
return String.join(",", partBuilder);
|
||||
}
|
||||
|
||||
private List<String> constructChangePartitions(String tableName, List<String> partitions) {
|
||||
List<String> changePartitions = new ArrayList<>();
|
||||
// Hive 2.x doesn't like db.table name for operations, hence we need to change to using the database first
|
||||
String useDatabase = "USE " + HIVE_ESCAPE_CHARACTER + syncConfig.databaseName + HIVE_ESCAPE_CHARACTER;
|
||||
changePartitions.add(useDatabase);
|
||||
String alterTable = "ALTER TABLE " + HIVE_ESCAPE_CHARACTER + tableName + HIVE_ESCAPE_CHARACTER;
|
||||
for (String partition : partitions) {
|
||||
String partitionClause = getPartitionClause(partition);
|
||||
Path partitionPath = FSUtils.getPartitionPath(syncConfig.basePath, partition);
|
||||
String partitionScheme = partitionPath.toUri().getScheme();
|
||||
String fullPartitionPath = StorageSchemes.HDFS.getScheme().equals(partitionScheme)
|
||||
? FSUtils.getDFSFullPartitionPath(fs, partitionPath) : partitionPath.toString();
|
||||
String changePartition =
|
||||
alterTable + " PARTITION (" + partitionClause + ") SET LOCATION '" + fullPartitionPath + "'";
|
||||
changePartitions.add(changePartition);
|
||||
}
|
||||
return changePartitions;
|
||||
}
|
||||
|
||||
/**
|
||||
* Iterate over the storage partitions and find if there are any new partitions that need to be added or updated.
|
||||
* Generate a list of PartitionEvent based on the changes required.
|
||||
*/
|
||||
List<PartitionEvent> getPartitionEvents(List<Partition> tablePartitions, List<String> partitionStoragePartitions) {
|
||||
Map<String, String> paths = new HashMap<>();
|
||||
for (Partition tablePartition : tablePartitions) {
|
||||
List<String> hivePartitionValues = tablePartition.getValues();
|
||||
Collections.sort(hivePartitionValues);
|
||||
String fullTablePartitionPath =
|
||||
Path.getPathWithoutSchemeAndAuthority(new Path(tablePartition.getSd().getLocation())).toUri().getPath();
|
||||
paths.put(String.join(", ", hivePartitionValues), fullTablePartitionPath);
|
||||
}
|
||||
|
||||
List<PartitionEvent> events = new ArrayList<>();
|
||||
for (String storagePartition : partitionStoragePartitions) {
|
||||
Path storagePartitionPath = FSUtils.getPartitionPath(syncConfig.basePath, storagePartition);
|
||||
String fullStoragePartitionPath = Path.getPathWithoutSchemeAndAuthority(storagePartitionPath).toUri().getPath();
|
||||
// Check if the partition values or if hdfs path is the same
|
||||
List<String> storagePartitionValues = partitionValueExtractor.extractPartitionValuesInPath(storagePartition);
|
||||
Collections.sort(storagePartitionValues);
|
||||
if (!storagePartitionValues.isEmpty()) {
|
||||
String storageValue = String.join(", ", storagePartitionValues);
|
||||
if (!paths.containsKey(storageValue)) {
|
||||
events.add(PartitionEvent.newPartitionAddEvent(storagePartition));
|
||||
} else if (!paths.get(storageValue).equals(fullStoragePartitionPath)) {
|
||||
events.add(PartitionEvent.newPartitionUpdateEvent(storagePartition));
|
||||
}
|
||||
}
|
||||
}
|
||||
return events;
|
||||
}
|
||||
|
||||
/**
|
||||
* Scan table partitions.
|
||||
*/
|
||||
public List<Partition> scanTablePartitions(String tableName) throws TException {
|
||||
return client.listPartitions(syncConfig.databaseName, tableName, (short) -1);
|
||||
}
|
||||
|
||||
void updateTableDefinition(String tableName, MessageType newSchema) {
|
||||
try {
|
||||
String newSchemaStr = HiveSchemaUtil.generateSchemaString(newSchema, syncConfig.partitionFields);
|
||||
// Cascade clause should not be present for non-partitioned tables
|
||||
String cascadeClause = syncConfig.partitionFields.size() > 0 ? " cascade" : "";
|
||||
StringBuilder sqlBuilder = new StringBuilder("ALTER TABLE ").append(HIVE_ESCAPE_CHARACTER)
|
||||
.append(syncConfig.databaseName).append(HIVE_ESCAPE_CHARACTER).append(".")
|
||||
.append(HIVE_ESCAPE_CHARACTER).append(tableName)
|
||||
.append(HIVE_ESCAPE_CHARACTER).append(" REPLACE COLUMNS(")
|
||||
.append(newSchemaStr).append(" )").append(cascadeClause);
|
||||
LOG.info("Updating table definition with " + sqlBuilder);
|
||||
updateHiveSQL(sqlBuilder.toString());
|
||||
} catch (IOException e) {
|
||||
throw new HoodieHiveSyncException("Failed to update table for " + tableName, e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void createTable(String tableName, MessageType storageSchema, String inputFormatClass, String outputFormatClass, String serdeClass) {
|
||||
try {
|
||||
String createSQLQuery =
|
||||
HiveSchemaUtil.generateCreateDDL(tableName, storageSchema, syncConfig, inputFormatClass, outputFormatClass, serdeClass);
|
||||
LOG.info("Creating table with " + createSQLQuery);
|
||||
updateHiveSQL(createSQLQuery);
|
||||
} catch (IOException e) {
|
||||
throw new HoodieHiveSyncException("Failed to create table " + tableName, e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the table schema.
|
||||
*/
|
||||
@Override
|
||||
public Map<String, String> getTableSchema(String tableName) {
|
||||
if (syncConfig.useJdbc) {
|
||||
if (!doesTableExist(tableName)) {
|
||||
throw new IllegalArgumentException(
|
||||
"Failed to get schema for table " + tableName + " does not exist");
|
||||
}
|
||||
Map<String, String> schema = new HashMap<>();
|
||||
ResultSet result = null;
|
||||
try {
|
||||
DatabaseMetaData databaseMetaData = connection.getMetaData();
|
||||
result = databaseMetaData.getColumns(null, syncConfig.databaseName, tableName, null);
|
||||
while (result.next()) {
|
||||
String columnName = result.getString(4);
|
||||
String columnType = result.getString(6);
|
||||
if ("DECIMAL".equals(columnType)) {
|
||||
int columnSize = result.getInt("COLUMN_SIZE");
|
||||
int decimalDigits = result.getInt("DECIMAL_DIGITS");
|
||||
columnType += String.format("(%s,%s)", columnSize, decimalDigits);
|
||||
}
|
||||
schema.put(columnName, columnType);
|
||||
}
|
||||
return schema;
|
||||
} catch (SQLException e) {
|
||||
throw new HoodieHiveSyncException("Failed to get table schema for " + tableName, e);
|
||||
} finally {
|
||||
closeQuietly(result, null);
|
||||
}
|
||||
} else {
|
||||
return getTableSchemaUsingMetastoreClient(tableName);
|
||||
}
|
||||
}
|
||||
|
||||
public Map<String, String> getTableSchemaUsingMetastoreClient(String tableName) {
|
||||
try {
|
||||
// HiveMetastoreClient returns partition keys separate from Columns, hence get both and merge to
|
||||
// get the Schema of the table.
|
||||
final long start = System.currentTimeMillis();
|
||||
Table table = this.client.getTable(syncConfig.databaseName, tableName);
|
||||
Map<String, String> partitionKeysMap =
|
||||
table.getPartitionKeys().stream().collect(Collectors.toMap(FieldSchema::getName, f -> f.getType().toUpperCase()));
|
||||
|
||||
Map<String, String> columnsMap =
|
||||
table.getSd().getCols().stream().collect(Collectors.toMap(FieldSchema::getName, f -> f.getType().toUpperCase()));
|
||||
|
||||
Map<String, String> schema = new HashMap<>();
|
||||
schema.putAll(columnsMap);
|
||||
schema.putAll(partitionKeysMap);
|
||||
final long end = System.currentTimeMillis();
|
||||
LOG.info(String.format("Time taken to getTableSchema: %s ms", (end - start)));
|
||||
return schema;
|
||||
} catch (Exception e) {
|
||||
throw new HoodieHiveSyncException("Failed to get table schema for : " + tableName, e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @return true if the configured table exists
|
||||
*/
|
||||
@Override
|
||||
public boolean doesTableExist(String tableName) {
|
||||
try {
|
||||
return client.tableExists(syncConfig.databaseName, tableName);
|
||||
} catch (TException e) {
|
||||
throw new HoodieHiveSyncException("Failed to check if table exists " + tableName, e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Execute a update in hive metastore with this SQL.
|
||||
*
|
||||
* @param s SQL to execute
|
||||
*/
|
||||
public void updateHiveSQL(String s) {
|
||||
if (syncConfig.useJdbc) {
|
||||
Statement stmt = null;
|
||||
try {
|
||||
stmt = connection.createStatement();
|
||||
LOG.info("Executing SQL " + s);
|
||||
stmt.execute(s);
|
||||
} catch (SQLException e) {
|
||||
throw new HoodieHiveSyncException("Failed in executing SQL " + s, e);
|
||||
} finally {
|
||||
closeQuietly(null, stmt);
|
||||
}
|
||||
} else {
|
||||
updateHiveSQLUsingHiveDriver(s);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Execute a update in hive using Hive Driver.
|
||||
*
|
||||
* @param sql SQL statement to execute
|
||||
*/
|
||||
public CommandProcessorResponse updateHiveSQLUsingHiveDriver(String sql) {
|
||||
List<CommandProcessorResponse> responses = updateHiveSQLs(Collections.singletonList(sql));
|
||||
return responses.get(responses.size() - 1);
|
||||
}
|
||||
|
||||
private List<CommandProcessorResponse> updateHiveSQLs(List<String> sqls) {
|
||||
SessionState ss = null;
|
||||
org.apache.hadoop.hive.ql.Driver hiveDriver = null;
|
||||
List<CommandProcessorResponse> responses = new ArrayList<>();
|
||||
try {
|
||||
final long startTime = System.currentTimeMillis();
|
||||
ss = SessionState.start(configuration);
|
||||
ss.setCurrentDatabase(syncConfig.databaseName);
|
||||
hiveDriver = new org.apache.hadoop.hive.ql.Driver(configuration);
|
||||
final long endTime = System.currentTimeMillis();
|
||||
LOG.info(String.format("Time taken to start SessionState and create Driver: %s ms", (endTime - startTime)));
|
||||
for (String sql : sqls) {
|
||||
final long start = System.currentTimeMillis();
|
||||
responses.add(hiveDriver.run(sql));
|
||||
final long end = System.currentTimeMillis();
|
||||
LOG.info(String.format("Time taken to execute [%s]: %s ms", sql, (end - start)));
|
||||
}
|
||||
} catch (Exception e) {
|
||||
throw new HoodieHiveSyncException("Failed in executing SQL", e);
|
||||
} finally {
|
||||
if (ss != null) {
|
||||
try {
|
||||
ss.close();
|
||||
} catch (IOException ie) {
|
||||
LOG.error("Error while closing SessionState", ie);
|
||||
}
|
||||
}
|
||||
if (hiveDriver != null) {
|
||||
try {
|
||||
hiveDriver.close();
|
||||
} catch (Exception e) {
|
||||
LOG.error("Error while closing hiveDriver", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
return responses;
|
||||
}
|
||||
|
||||
private void createHiveConnection() {
|
||||
if (connection == null) {
|
||||
try {
|
||||
Class.forName(HiveDriver.class.getCanonicalName());
|
||||
} catch (ClassNotFoundException e) {
|
||||
LOG.error("Unable to load Hive driver class", e);
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
this.connection = DriverManager.getConnection(syncConfig.jdbcUrl, syncConfig.hiveUser, syncConfig.hivePass);
|
||||
LOG.info("Successfully established Hive connection to " + syncConfig.jdbcUrl);
|
||||
} catch (SQLException e) {
|
||||
throw new HoodieHiveSyncException("Cannot create hive connection " + getHiveJdbcUrlWithDefaultDBName(), e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private String getHiveJdbcUrlWithDefaultDBName() {
|
||||
String hiveJdbcUrl = syncConfig.jdbcUrl;
|
||||
String urlAppend = null;
|
||||
// If the hive url contains addition properties like ;transportMode=http;httpPath=hs2
|
||||
if (hiveJdbcUrl.contains(";")) {
|
||||
urlAppend = hiveJdbcUrl.substring(hiveJdbcUrl.indexOf(";"));
|
||||
hiveJdbcUrl = hiveJdbcUrl.substring(0, hiveJdbcUrl.indexOf(";"));
|
||||
}
|
||||
if (!hiveJdbcUrl.endsWith("/")) {
|
||||
hiveJdbcUrl = hiveJdbcUrl + "/";
|
||||
}
|
||||
return hiveJdbcUrl + (urlAppend == null ? "" : urlAppend);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Option<String> getLastCommitTimeSynced(String tableName) {
|
||||
// Get the last commit time from the TBLproperties
|
||||
try {
|
||||
Table database = client.getTable(syncConfig.databaseName, tableName);
|
||||
return Option.ofNullable(database.getParameters().getOrDefault(HOODIE_LAST_COMMIT_TIME_SYNC, null));
|
||||
} catch (Exception e) {
|
||||
throw new HoodieHiveSyncException("Failed to get the last commit time synced from the database", e);
|
||||
}
|
||||
}
|
||||
|
||||
public void close() {
|
||||
try {
|
||||
if (connection != null) {
|
||||
connection.close();
|
||||
}
|
||||
if (client != null) {
|
||||
Hive.closeCurrent();
|
||||
client = null;
|
||||
}
|
||||
} catch (SQLException e) {
|
||||
LOG.error("Could not close connection ", e);
|
||||
}
|
||||
}
|
||||
|
||||
List<String> getAllTables(String db) throws Exception {
|
||||
return client.getAllTables(db);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void updateLastCommitTimeSynced(String tableName) {
|
||||
// Set the last commit time from the TBLproperties
|
||||
String lastCommitSynced = activeTimeline.lastInstant().get().getTimestamp();
|
||||
try {
|
||||
Table table = client.getTable(syncConfig.databaseName, tableName);
|
||||
table.putToParameters(HOODIE_LAST_COMMIT_TIME_SYNC, lastCommitSynced);
|
||||
client.alter_table(syncConfig.databaseName, tableName, table);
|
||||
} catch (Exception e) {
|
||||
throw new HoodieHiveSyncException("Failed to get update last commit time synced to " + lastCommitSynced, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,42 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.hive;
|
||||
|
||||
/**
 * Unchecked exception used by the hive sync code to report failures.
 */
public class HoodieHiveSyncException extends RuntimeException {

  /** Creates an exception without message or cause. */
  public HoodieHiveSyncException() {
    super();
  }

  /**
   * @param message description of the failure
   */
  public HoodieHiveSyncException(String message) {
    super(message);
  }

  /**
   * @param message description of the failure
   * @param t underlying cause
   */
  public HoodieHiveSyncException(String message, Throwable t) {
    super(message, t);
  }

  /**
   * @param t underlying cause
   */
  public HoodieHiveSyncException(Throwable t) {
    super(t);
  }

  /**
   * Formats a message with {@link String#format(String, Object...)}.
   *
   * @param message format string; a null value is rendered as the text {@code null}
   * @param args format arguments
   * @return the formatted message
   */
  protected static String format(String message, Object... args) {
    final String template = String.valueOf(message);
    return String.format(template, args);
  }
}
|
||||
@@ -0,0 +1,44 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.hive;
|
||||
|
||||
import org.apache.hudi.common.util.ValidationUtils;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/**
|
||||
* Partition Key extractor treating each value delimited by slash as separate key.
|
||||
*/
|
||||
public class MultiPartKeysValueExtractor implements PartitionValueExtractor {
|
||||
|
||||
@Override
|
||||
public List<String> extractPartitionValuesInPath(String partitionPath) {
|
||||
String[] splits = partitionPath.split("/");
|
||||
return Arrays.stream(splits).map(s -> {
|
||||
if (s.contains("=")) {
|
||||
String[] moreSplit = s.split("=");
|
||||
ValidationUtils.checkArgument(moreSplit.length == 2, "Partition Field (" + s + ") not in expected format");
|
||||
return moreSplit[1];
|
||||
}
|
||||
return s;
|
||||
}).collect(Collectors.toList());
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,33 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.hive;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Extractor for Non-partitioned hive tables.
|
||||
*/
|
||||
public class NonPartitionedExtractor implements PartitionValueExtractor {
|
||||
|
||||
@Override
|
||||
public List<String> extractPartitionValuesInPath(String partitionPath) {
|
||||
return new ArrayList<>();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,33 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.hive;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
/**
 * An HDFS path contains the hive partition values for the keys a table is partitioned on. The mapping
 * from path to values is not straightforward, so it is provided by a pluggable implementation of this
 * interface.
 * <p>
 * e.g. Hive table partitioned by datestr=yyyy-mm-dd and hdfs path /app/hoodie/dataset1/YYYY=[yyyy]/MM=[mm]/DD=[dd]
 */
public interface PartitionValueExtractor extends Serializable {

  /**
   * Extracts the partition values encoded in a partition path.
   *
   * @param partitionPath partition path to read the values from
   * @return the partition values found in the path
   */
  List<String> extractPartitionValuesInPath(String partitionPath);
}
|
||||
@@ -0,0 +1,118 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.hive;
|
||||
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.StringJoiner;
|
||||
|
||||
|
||||
/**
|
||||
* Represents the schema difference between the storage schema and hive table schema.
|
||||
*/
|
||||
public class SchemaDifference {
|
||||
|
||||
private final MessageType storageSchema;
|
||||
private final Map<String, String> tableSchema;
|
||||
private final List<String> deleteColumns;
|
||||
private final Map<String, String> updateColumnTypes;
|
||||
private final Map<String, String> addColumnTypes;
|
||||
|
||||
private SchemaDifference(MessageType storageSchema, Map<String, String> tableSchema, List<String> deleteColumns,
|
||||
Map<String, String> updateColumnTypes, Map<String, String> addColumnTypes) {
|
||||
this.storageSchema = storageSchema;
|
||||
this.tableSchema = tableSchema;
|
||||
this.deleteColumns = Collections.unmodifiableList(deleteColumns);
|
||||
this.updateColumnTypes = Collections.unmodifiableMap(updateColumnTypes);
|
||||
this.addColumnTypes = Collections.unmodifiableMap(addColumnTypes);
|
||||
}
|
||||
|
||||
public List<String> getDeleteColumns() {
|
||||
return deleteColumns;
|
||||
}
|
||||
|
||||
public Map<String, String> getUpdateColumnTypes() {
|
||||
return updateColumnTypes;
|
||||
}
|
||||
|
||||
public Map<String, String> getAddColumnTypes() {
|
||||
return addColumnTypes;
|
||||
}
|
||||
|
||||
public static Builder newBuilder(MessageType storageSchema, Map<String, String> tableSchema) {
|
||||
return new Builder(storageSchema, tableSchema);
|
||||
}
|
||||
|
||||
public boolean isEmpty() {
|
||||
return deleteColumns.isEmpty() && updateColumnTypes.isEmpty() && addColumnTypes.isEmpty();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return new StringJoiner(", ", SchemaDifference.class.getSimpleName() + "[", "]")
|
||||
.add("storageSchema=" + storageSchema)
|
||||
.add("tableSchema=" + tableSchema)
|
||||
.add("deleteColumns=" + deleteColumns)
|
||||
.add("updateColumnTypes=" + updateColumnTypes)
|
||||
.add("addColumnTypes=" + addColumnTypes)
|
||||
.toString();
|
||||
}
|
||||
|
||||
public static class Builder {
|
||||
|
||||
private final MessageType storageSchema;
|
||||
private final Map<String, String> tableSchema;
|
||||
private List<String> deleteColumns;
|
||||
private Map<String, String> updateColumnTypes;
|
||||
private Map<String, String> addColumnTypes;
|
||||
|
||||
public Builder(MessageType storageSchema, Map<String, String> tableSchema) {
|
||||
this.storageSchema = storageSchema;
|
||||
this.tableSchema = tableSchema;
|
||||
deleteColumns = new ArrayList<>();
|
||||
updateColumnTypes = new HashMap<>();
|
||||
addColumnTypes = new LinkedHashMap<>();
|
||||
}
|
||||
|
||||
public Builder deleteTableColumn(String column) {
|
||||
deleteColumns.add(column);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder updateTableColumn(String column, String storageColumnType) {
|
||||
updateColumnTypes.put(column, storageColumnType);
|
||||
return this;
|
||||
}
|
||||
|
||||
public Builder addTableColumn(String name, String type) {
|
||||
addColumnTypes.put(name, type);
|
||||
return this;
|
||||
}
|
||||
|
||||
public SchemaDifference build() {
|
||||
return new SchemaDifference(storageSchema, tableSchema, deleteColumns, updateColumnTypes, addColumnTypes);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,65 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.hive;
|
||||
|
||||
import org.joda.time.DateTime;
|
||||
import org.joda.time.format.DateTimeFormat;
|
||||
import org.joda.time.format.DateTimeFormatter;
|
||||
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* HDFS Path contain hive partition values for the keys it is partitioned on. This mapping is not straight forward and
|
||||
* requires a pluggable implementation to extract the partition value from HDFS path.
|
||||
* <p>
|
||||
* This implementation extracts datestr=yyyy-mm-dd from path of type /yyyy/mm/dd
|
||||
*/
|
||||
public class SlashEncodedDayPartitionValueExtractor implements PartitionValueExtractor {
|
||||
|
||||
private static final long serialVersionUID = 1L;
|
||||
private transient DateTimeFormatter dtfOut;
|
||||
|
||||
public SlashEncodedDayPartitionValueExtractor() {
|
||||
this.dtfOut = DateTimeFormat.forPattern("yyyy-MM-dd");
|
||||
}
|
||||
|
||||
private DateTimeFormatter getDtfOut() {
|
||||
if (dtfOut == null) {
|
||||
dtfOut = DateTimeFormat.forPattern("yyyy-MM-dd");
|
||||
}
|
||||
return dtfOut;
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> extractPartitionValuesInPath(String partitionPath) {
|
||||
// partition path is expected to be in this format yyyy/mm/dd
|
||||
String[] splits = partitionPath.split("/");
|
||||
if (splits.length != 3) {
|
||||
throw new IllegalArgumentException("Partition path " + partitionPath + " is not in the form yyyy/mm/dd ");
|
||||
}
|
||||
// Get the partition part and remove the / as well at the end
|
||||
int year = Integer.parseInt(splits[0].contains("=") ? splits[0].split("=")[1] : splits[0]);
|
||||
int mm = Integer.parseInt(splits[1].contains("=") ? splits[1].split("=")[1] : splits[1]);
|
||||
int dd = Integer.parseInt(splits[2].contains("=") ? splits[2].split("=")[1] : splits[2]);
|
||||
DateTime dateTime = new DateTime(year, mm, dd, 0, 0);
|
||||
|
||||
return Collections.singletonList(getDtfOut().print(dateTime));
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,50 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.hive.util;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
|
||||
/**
 * Translates column names by applying a fixed set of regex replacements.
 */
public class ColumnNameXLator {

  // regex -> replacement, applied by translateNestedColumn
  private static Map<String, String> xformMap = new HashMap<>();

  static {
    xformMap.put("\\$", "_dollar_");
  }

  /**
   * Applies every configured replacement to the column name.
   *
   * @param colName column name to translate
   * @return the name with each {@code $} replaced by {@code _dollar_}
   */
  public static String translateNestedColumn(String colName) {
    String translated = colName;
    Iterator<Map.Entry<String, String>> rules = xformMap.entrySet().iterator();
    while (rules.hasNext()) {
      Map.Entry<String, String> rule = rules.next();
      translated = translated.replaceAll(rule.getKey(), rule.getValue());
    }
    return translated;
  }

  /**
   * Returns the column name unchanged.
   *
   * @param colName column name
   * @return {@code colName}
   */
  public static String translateColumn(String colName) {
    return colName;
  }

  /**
   * Translates a column name, choosing the nested or the plain variant.
   *
   * @param colName column name to translate
   * @param nestedColumn whether to apply the nested-column replacements
   * @return the translated name
   */
  public static String translate(String colName, boolean nestedColumn) {
    if (nestedColumn) {
      return translateNestedColumn(colName);
    }
    return translateColumn(colName);
  }
}
|
||||
@@ -0,0 +1,419 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.hive.util;
|
||||
|
||||
import org.apache.hudi.hive.HiveSyncConfig;
|
||||
import org.apache.hudi.hive.HoodieHiveSyncException;
|
||||
import org.apache.hudi.hive.SchemaDifference;
|
||||
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.parquet.schema.DecimalMetadata;
|
||||
import org.apache.parquet.schema.GroupType;
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
import org.apache.parquet.schema.OriginalType;
|
||||
import org.apache.parquet.schema.PrimitiveType;
|
||||
import org.apache.parquet.schema.Type;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
|
||||
/**
|
||||
* Schema Utilities.
|
||||
*/
|
||||
public class HiveSchemaUtil {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(HiveSchemaUtil.class);
|
||||
public static final String HIVE_ESCAPE_CHARACTER = "`";
|
||||
|
||||
/**
|
||||
* Get the schema difference between the storage schema and hive table schema.
|
||||
*/
|
||||
public static SchemaDifference getSchemaDifference(MessageType storageSchema, Map<String, String> tableSchema,
|
||||
List<String> partitionKeys) {
|
||||
Map<String, String> newTableSchema;
|
||||
try {
|
||||
newTableSchema = convertParquetSchemaToHiveSchema(storageSchema);
|
||||
} catch (IOException e) {
|
||||
throw new HoodieHiveSyncException("Failed to convert parquet schema to hive schema", e);
|
||||
}
|
||||
LOG.info("Getting schema difference for " + tableSchema + "\r\n\r\n" + newTableSchema);
|
||||
SchemaDifference.Builder schemaDiffBuilder = SchemaDifference.newBuilder(storageSchema, tableSchema);
|
||||
Set<String> tableColumns = new HashSet<>();
|
||||
|
||||
for (Map.Entry<String, String> field : tableSchema.entrySet()) {
|
||||
String fieldName = field.getKey().toLowerCase();
|
||||
String tickSurroundedFieldName = tickSurround(fieldName);
|
||||
if (!isFieldExistsInSchema(newTableSchema, tickSurroundedFieldName) && !partitionKeys.contains(fieldName)) {
|
||||
schemaDiffBuilder.deleteTableColumn(fieldName);
|
||||
} else {
|
||||
// check type
|
||||
String tableColumnType = field.getValue();
|
||||
if (!isFieldExistsInSchema(newTableSchema, tickSurroundedFieldName)) {
|
||||
if (partitionKeys.contains(fieldName)) {
|
||||
// Partition key does not have to be part of the storage schema
|
||||
continue;
|
||||
}
|
||||
// We will log this and continue. Hive schema is a superset of all parquet schemas
|
||||
LOG.warn("Ignoring table column " + fieldName + " as its not present in the parquet schema");
|
||||
continue;
|
||||
}
|
||||
tableColumnType = tableColumnType.replaceAll("\\s+", "");
|
||||
|
||||
String expectedType = getExpectedType(newTableSchema, tickSurroundedFieldName);
|
||||
expectedType = expectedType.replaceAll("\\s+", "");
|
||||
expectedType = expectedType.replaceAll("`", "");
|
||||
|
||||
if (!tableColumnType.equalsIgnoreCase(expectedType)) {
|
||||
// check for incremental queries, the schema type change is allowed as per evolution
|
||||
// rules
|
||||
if (!isSchemaTypeUpdateAllowed(tableColumnType, expectedType)) {
|
||||
throw new HoodieHiveSyncException("Could not convert field Type from " + tableColumnType + " to "
|
||||
+ expectedType + " for field " + fieldName);
|
||||
}
|
||||
schemaDiffBuilder.updateTableColumn(fieldName, getExpectedType(newTableSchema, tickSurroundedFieldName));
|
||||
}
|
||||
}
|
||||
tableColumns.add(tickSurroundedFieldName);
|
||||
}
|
||||
|
||||
for (Map.Entry<String, String> entry : newTableSchema.entrySet()) {
|
||||
if (!tableColumns.contains(entry.getKey().toLowerCase())) {
|
||||
schemaDiffBuilder.addTableColumn(entry.getKey(), entry.getValue());
|
||||
}
|
||||
}
|
||||
LOG.info("Difference between schemas: " + schemaDiffBuilder.build().toString());
|
||||
|
||||
return schemaDiffBuilder.build();
|
||||
}
|
||||
|
||||
private static String getExpectedType(Map<String, String> newTableSchema, String fieldName) {
|
||||
for (Map.Entry<String, String> entry : newTableSchema.entrySet()) {
|
||||
if (entry.getKey().toLowerCase().equals(fieldName)) {
|
||||
return entry.getValue();
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
private static boolean isFieldExistsInSchema(Map<String, String> newTableSchema, String fieldName) {
|
||||
for (String entry : newTableSchema.keySet()) {
|
||||
if (entry.toLowerCase().equals(fieldName)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns equivalent Hive table schema read from a parquet file.
|
||||
*
|
||||
* @param messageType : Parquet Schema
|
||||
* @return : Hive Table schema read from parquet file MAP[String,String]
|
||||
*/
|
||||
public static Map<String, String> convertParquetSchemaToHiveSchema(MessageType messageType) throws IOException {
|
||||
Map<String, String> schema = new LinkedHashMap<>();
|
||||
List<Type> parquetFields = messageType.getFields();
|
||||
for (Type parquetType : parquetFields) {
|
||||
StringBuilder result = new StringBuilder();
|
||||
String key = parquetType.getName();
|
||||
if (parquetType.isRepetition(Type.Repetition.REPEATED)) {
|
||||
result.append(createHiveArray(parquetType, ""));
|
||||
} else {
|
||||
result.append(convertField(parquetType));
|
||||
}
|
||||
|
||||
schema.put(hiveCompatibleFieldName(key, false), result.toString());
|
||||
}
|
||||
return schema;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert one field data type of parquet schema into an equivalent Hive schema.
|
||||
*
|
||||
* @param parquetType : Single paruet field
|
||||
* @return : Equivalent sHive schema
|
||||
*/
|
||||
private static String convertField(final Type parquetType) {
|
||||
StringBuilder field = new StringBuilder();
|
||||
if (parquetType.isPrimitive()) {
|
||||
final PrimitiveType.PrimitiveTypeName parquetPrimitiveTypeName =
|
||||
parquetType.asPrimitiveType().getPrimitiveTypeName();
|
||||
final OriginalType originalType = parquetType.getOriginalType();
|
||||
if (originalType == OriginalType.DECIMAL) {
|
||||
final DecimalMetadata decimalMetadata = parquetType.asPrimitiveType().getDecimalMetadata();
|
||||
return field.append("DECIMAL(").append(decimalMetadata.getPrecision()).append(" , ")
|
||||
.append(decimalMetadata.getScale()).append(")").toString();
|
||||
} else if (originalType == OriginalType.DATE) {
|
||||
return field.append("DATE").toString();
|
||||
}
|
||||
// TODO - fix the method naming here
|
||||
return parquetPrimitiveTypeName.convert(new PrimitiveType.PrimitiveTypeNameConverter<String, RuntimeException>() {
|
||||
@Override
|
||||
public String convertBOOLEAN(PrimitiveType.PrimitiveTypeName primitiveTypeName) {
|
||||
return "boolean";
|
||||
}
|
||||
|
||||
@Override
|
||||
public String convertINT32(PrimitiveType.PrimitiveTypeName primitiveTypeName) {
|
||||
return "int";
|
||||
}
|
||||
|
||||
@Override
|
||||
public String convertINT64(PrimitiveType.PrimitiveTypeName primitiveTypeName) {
|
||||
return "bigint";
|
||||
}
|
||||
|
||||
@Override
|
||||
public String convertINT96(PrimitiveType.PrimitiveTypeName primitiveTypeName) {
|
||||
return "timestamp-millis";
|
||||
}
|
||||
|
||||
@Override
|
||||
public String convertFLOAT(PrimitiveType.PrimitiveTypeName primitiveTypeName) {
|
||||
return "float";
|
||||
}
|
||||
|
||||
@Override
|
||||
public String convertDOUBLE(PrimitiveType.PrimitiveTypeName primitiveTypeName) {
|
||||
return "double";
|
||||
}
|
||||
|
||||
@Override
|
||||
public String convertFIXED_LEN_BYTE_ARRAY(PrimitiveType.PrimitiveTypeName primitiveTypeName) {
|
||||
return "binary";
|
||||
}
|
||||
|
||||
@Override
|
||||
public String convertBINARY(PrimitiveType.PrimitiveTypeName primitiveTypeName) {
|
||||
if (originalType == OriginalType.UTF8 || originalType == OriginalType.ENUM) {
|
||||
return "string";
|
||||
} else {
|
||||
return "binary";
|
||||
}
|
||||
}
|
||||
});
|
||||
} else {
|
||||
GroupType parquetGroupType = parquetType.asGroupType();
|
||||
OriginalType originalType = parquetGroupType.getOriginalType();
|
||||
if (originalType != null) {
|
||||
switch (originalType) {
|
||||
case LIST:
|
||||
if (parquetGroupType.getFieldCount() != 1) {
|
||||
throw new UnsupportedOperationException("Invalid list type " + parquetGroupType);
|
||||
}
|
||||
Type elementType = parquetGroupType.getType(0);
|
||||
if (!elementType.isRepetition(Type.Repetition.REPEATED)) {
|
||||
throw new UnsupportedOperationException("Invalid list type " + parquetGroupType);
|
||||
}
|
||||
return createHiveArray(elementType, parquetGroupType.getName());
|
||||
case MAP:
|
||||
if (parquetGroupType.getFieldCount() != 1 || parquetGroupType.getType(0).isPrimitive()) {
|
||||
throw new UnsupportedOperationException("Invalid map type " + parquetGroupType);
|
||||
}
|
||||
GroupType mapKeyValType = parquetGroupType.getType(0).asGroupType();
|
||||
if (!mapKeyValType.isRepetition(Type.Repetition.REPEATED)
|
||||
|| !mapKeyValType.getOriginalType().equals(OriginalType.MAP_KEY_VALUE)
|
||||
|| mapKeyValType.getFieldCount() != 2) {
|
||||
throw new UnsupportedOperationException("Invalid map type " + parquetGroupType);
|
||||
}
|
||||
Type keyType = mapKeyValType.getType(0);
|
||||
if (!keyType.isPrimitive()
|
||||
|| !keyType.asPrimitiveType().getPrimitiveTypeName().equals(PrimitiveType.PrimitiveTypeName.BINARY)
|
||||
|| !keyType.getOriginalType().equals(OriginalType.UTF8)) {
|
||||
throw new UnsupportedOperationException("Map key type must be binary (UTF8): " + keyType);
|
||||
}
|
||||
Type valueType = mapKeyValType.getType(1);
|
||||
return createHiveMap(convertField(keyType), convertField(valueType));
|
||||
case ENUM:
|
||||
case UTF8:
|
||||
return "string";
|
||||
case MAP_KEY_VALUE:
|
||||
// MAP_KEY_VALUE was supposed to be used to annotate key and
|
||||
// value group levels in a
|
||||
// MAP. However, that is always implied by the structure of
|
||||
// MAP. Hence, PARQUET-113
|
||||
// dropped the requirement for having MAP_KEY_VALUE.
|
||||
default:
|
||||
throw new UnsupportedOperationException("Cannot convert Parquet type " + parquetType);
|
||||
}
|
||||
} else {
|
||||
// if no original type then it's a record
|
||||
return createHiveStruct(parquetGroupType.getFields());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return a 'struct' Hive schema from a list of Parquet fields.
|
||||
*
|
||||
* @param parquetFields : list of parquet fields
|
||||
* @return : Equivalent 'struct' Hive schema
|
||||
*/
|
||||
private static String createHiveStruct(List<Type> parquetFields) {
|
||||
StringBuilder struct = new StringBuilder();
|
||||
struct.append("STRUCT< ");
|
||||
for (Type field : parquetFields) {
|
||||
// TODO: struct field name is only translated to support special char($)
|
||||
// We will need to extend it to other collection type
|
||||
struct.append(hiveCompatibleFieldName(field.getName(), true)).append(" : ");
|
||||
struct.append(convertField(field)).append(", ");
|
||||
}
|
||||
struct.delete(struct.length() - 2, struct.length()); // Remove the last
|
||||
// ", "
|
||||
struct.append(">");
|
||||
String finalStr = struct.toString();
|
||||
// Struct cannot have - in them. userstore_udr_entities has uuid in struct. This breaks the
|
||||
// schema.
|
||||
// HDrone sync should not fail because of this.
|
||||
finalStr = finalStr.replaceAll("-", "_");
|
||||
return finalStr;
|
||||
}
|
||||
|
||||
private static String hiveCompatibleFieldName(String fieldName, boolean isNested) {
|
||||
String result = fieldName;
|
||||
if (isNested) {
|
||||
result = ColumnNameXLator.translateNestedColumn(fieldName);
|
||||
}
|
||||
return tickSurround(result);
|
||||
}
|
||||
|
||||
private static String tickSurround(String result) {
|
||||
if (!result.startsWith("`")) {
|
||||
result = "`" + result;
|
||||
}
|
||||
if (!result.endsWith("`")) {
|
||||
result = result + "`";
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private static String removeSurroundingTick(String result) {
|
||||
if (result.startsWith("`") && result.endsWith("`")) {
|
||||
result = result.substring(1, result.length() - 1);
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a 'Map' schema from Parquet map field.
|
||||
*/
|
||||
private static String createHiveMap(String keyType, String valueType) {
|
||||
return "MAP< " + keyType + ", " + valueType + ">";
|
||||
}
|
||||
|
||||
/**
|
||||
* Create an Array Hive schema from equivalent parquet list type.
|
||||
*/
|
||||
private static String createHiveArray(Type elementType, String elementName) {
|
||||
StringBuilder array = new StringBuilder();
|
||||
array.append("ARRAY< ");
|
||||
if (elementType.isPrimitive()) {
|
||||
array.append(convertField(elementType));
|
||||
} else {
|
||||
final GroupType groupType = elementType.asGroupType();
|
||||
final List<Type> groupFields = groupType.getFields();
|
||||
if (groupFields.size() > 1 || (groupFields.size() == 1
|
||||
&& (elementType.getName().equals("array") || elementType.getName().equals(elementName + "_tuple")))) {
|
||||
array.append(convertField(elementType));
|
||||
} else {
|
||||
array.append(convertField(groupType.getFields().get(0)));
|
||||
}
|
||||
}
|
||||
array.append(">");
|
||||
return array.toString();
|
||||
}
|
||||
|
||||
public static boolean isSchemaTypeUpdateAllowed(String prevType, String newType) {
|
||||
if (prevType == null || prevType.trim().isEmpty() || newType == null || newType.trim().isEmpty()) {
|
||||
return false;
|
||||
}
|
||||
prevType = prevType.toLowerCase();
|
||||
newType = newType.toLowerCase();
|
||||
if (prevType.equals(newType)) {
|
||||
return true;
|
||||
} else if (prevType.equalsIgnoreCase("int") && newType.equalsIgnoreCase("bigint")) {
|
||||
return true;
|
||||
} else if (prevType.equalsIgnoreCase("float") && newType.equalsIgnoreCase("double")) {
|
||||
return true;
|
||||
} else {
|
||||
return prevType.contains("struct") && newType.toLowerCase().contains("struct");
|
||||
}
|
||||
}
|
||||
|
||||
public static String generateSchemaString(MessageType storageSchema) throws IOException {
|
||||
return generateSchemaString(storageSchema, new ArrayList<>());
|
||||
}
|
||||
|
||||
public static String generateSchemaString(MessageType storageSchema, List<String> colsToSkip) throws IOException {
|
||||
Map<String, String> hiveSchema = convertParquetSchemaToHiveSchema(storageSchema);
|
||||
StringBuilder columns = new StringBuilder();
|
||||
for (Map.Entry<String, String> hiveSchemaEntry : hiveSchema.entrySet()) {
|
||||
if (!colsToSkip.contains(removeSurroundingTick(hiveSchemaEntry.getKey()))) {
|
||||
columns.append(hiveSchemaEntry.getKey()).append(" ");
|
||||
columns.append(hiveSchemaEntry.getValue()).append(", ");
|
||||
}
|
||||
}
|
||||
// Remove the last ", "
|
||||
columns.delete(columns.length() - 2, columns.length());
|
||||
return columns.toString();
|
||||
}
|
||||
|
||||
public static String generateCreateDDL(String tableName, MessageType storageSchema, HiveSyncConfig config, String inputFormatClass,
|
||||
String outputFormatClass, String serdeClass) throws IOException {
|
||||
Map<String, String> hiveSchema = convertParquetSchemaToHiveSchema(storageSchema);
|
||||
String columns = generateSchemaString(storageSchema, config.partitionFields);
|
||||
|
||||
List<String> partitionFields = new ArrayList<>();
|
||||
for (String partitionKey : config.partitionFields) {
|
||||
String partitionKeyWithTicks = tickSurround(partitionKey);
|
||||
partitionFields.add(new StringBuilder().append(partitionKeyWithTicks).append(" ")
|
||||
.append(getPartitionKeyType(hiveSchema, partitionKeyWithTicks)).toString());
|
||||
}
|
||||
|
||||
String partitionsStr = String.join(",", partitionFields);
|
||||
StringBuilder sb = new StringBuilder("CREATE EXTERNAL TABLE IF NOT EXISTS ");
|
||||
sb.append(HIVE_ESCAPE_CHARACTER).append(config.databaseName).append(HIVE_ESCAPE_CHARACTER)
|
||||
.append(".").append(HIVE_ESCAPE_CHARACTER).append(tableName).append(HIVE_ESCAPE_CHARACTER);
|
||||
sb.append("( ").append(columns).append(")");
|
||||
if (!config.partitionFields.isEmpty()) {
|
||||
sb.append(" PARTITIONED BY (").append(partitionsStr).append(")");
|
||||
}
|
||||
sb.append(" ROW FORMAT SERDE '").append(serdeClass).append("'");
|
||||
sb.append(" STORED AS INPUTFORMAT '").append(inputFormatClass).append("'");
|
||||
sb.append(" OUTPUTFORMAT '").append(outputFormatClass).append("' LOCATION '").append(config.basePath).append("'");
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private static String getPartitionKeyType(Map<String, String> hiveSchema, String partitionKey) {
|
||||
if (hiveSchema.containsKey(partitionKey)) {
|
||||
return hiveSchema.get(partitionKey);
|
||||
}
|
||||
// Default the unknown partition fields to be String
|
||||
// TODO - all partition fields should be part of the schema. datestr is treated as special.
|
||||
// Dont do that
|
||||
return "String";
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,538 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.hive;
|
||||
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.testutils.SchemaTestUtil;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent;
|
||||
import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent.PartitionEventType;
|
||||
import org.apache.hudi.hive.testutils.HiveTestUtil;
|
||||
import org.apache.hudi.hive.util.HiveSchemaUtil;
|
||||
|
||||
import org.apache.hadoop.hive.metastore.api.Partition;
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
import org.apache.parquet.schema.OriginalType;
|
||||
import org.apache.parquet.schema.PrimitiveType;
|
||||
import org.apache.parquet.schema.Types;
|
||||
import org.joda.time.DateTime;
|
||||
import org.junit.jupiter.api.AfterAll;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.params.ParameterizedTest;
|
||||
import org.junit.jupiter.params.provider.MethodSource;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
public class TestHiveSyncTool {
|
||||
|
||||
private static Stream<Boolean> useJdbc() {
|
||||
return Stream.of(false, true);
|
||||
}
|
||||
|
||||
private static Iterable<Object[]> useJdbcAndSchemaFromCommitMetadata() {
|
||||
return Arrays.asList(new Object[][] { { true, true }, { true, false }, { false, true }, { false, false } });
|
||||
}
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() throws IOException, InterruptedException {
|
||||
HiveTestUtil.setUp();
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
public void teardown() throws IOException {
|
||||
HiveTestUtil.clear();
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void cleanUpClass() {
|
||||
HiveTestUtil.shutdown();
|
||||
}
|
||||
|
||||
/**
|
||||
* Testing converting array types to Hive field declaration strings.
|
||||
* <p>
|
||||
* Refer to the Parquet-113 spec: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists
|
||||
*/
|
||||
@Test
|
||||
public void testSchemaConvertArray() throws IOException {
|
||||
// Testing the 3-level annotation structure
|
||||
MessageType schema = Types.buildMessage().optionalGroup().as(OriginalType.LIST).repeatedGroup()
|
||||
.optional(PrimitiveType.PrimitiveTypeName.INT32).named("element").named("list").named("int_list")
|
||||
.named("ArrayOfInts");
|
||||
|
||||
String schemaString = HiveSchemaUtil.generateSchemaString(schema);
|
||||
assertEquals("`int_list` ARRAY< int>", schemaString);
|
||||
|
||||
// A array of arrays
|
||||
schema = Types.buildMessage().optionalGroup().as(OriginalType.LIST).repeatedGroup().requiredGroup()
|
||||
.as(OriginalType.LIST).repeatedGroup().required(PrimitiveType.PrimitiveTypeName.INT32).named("element")
|
||||
.named("list").named("element").named("list").named("int_list_list").named("ArrayOfArrayOfInts");
|
||||
|
||||
schemaString = HiveSchemaUtil.generateSchemaString(schema);
|
||||
assertEquals("`int_list_list` ARRAY< ARRAY< int>>", schemaString);
|
||||
|
||||
// A list of integers
|
||||
schema = Types.buildMessage().optionalGroup().as(OriginalType.LIST).repeated(PrimitiveType.PrimitiveTypeName.INT32)
|
||||
.named("element").named("int_list").named("ArrayOfInts");
|
||||
|
||||
schemaString = HiveSchemaUtil.generateSchemaString(schema);
|
||||
assertEquals("`int_list` ARRAY< int>", schemaString);
|
||||
|
||||
// A list of structs with two fields
|
||||
schema = Types.buildMessage().optionalGroup().as(OriginalType.LIST).repeatedGroup()
|
||||
.required(PrimitiveType.PrimitiveTypeName.BINARY).named("str").required(PrimitiveType.PrimitiveTypeName.INT32)
|
||||
.named("num").named("element").named("tuple_list").named("ArrayOfTuples");
|
||||
|
||||
schemaString = HiveSchemaUtil.generateSchemaString(schema);
|
||||
assertEquals("`tuple_list` ARRAY< STRUCT< `str` : binary, `num` : int>>", schemaString);
|
||||
|
||||
// A list of structs with a single field
|
||||
// For this case, since the inner group name is "array", we treat the
|
||||
// element type as a one-element struct.
|
||||
schema = Types.buildMessage().optionalGroup().as(OriginalType.LIST).repeatedGroup()
|
||||
.required(PrimitiveType.PrimitiveTypeName.BINARY).named("str").named("array").named("one_tuple_list")
|
||||
.named("ArrayOfOneTuples");
|
||||
|
||||
schemaString = HiveSchemaUtil.generateSchemaString(schema);
|
||||
assertEquals("`one_tuple_list` ARRAY< STRUCT< `str` : binary>>", schemaString);
|
||||
|
||||
// A list of structs with a single field
|
||||
// For this case, since the inner group name ends with "_tuple", we also treat the
|
||||
// element type as a one-element struct.
|
||||
schema = Types.buildMessage().optionalGroup().as(OriginalType.LIST).repeatedGroup()
|
||||
.required(PrimitiveType.PrimitiveTypeName.BINARY).named("str").named("one_tuple_list_tuple")
|
||||
.named("one_tuple_list").named("ArrayOfOneTuples2");
|
||||
|
||||
schemaString = HiveSchemaUtil.generateSchemaString(schema);
|
||||
assertEquals("`one_tuple_list` ARRAY< STRUCT< `str` : binary>>", schemaString);
|
||||
|
||||
// A list of structs with a single field
|
||||
// Unlike the above two cases, for this the element type is the type of the
|
||||
// only field in the struct.
|
||||
schema = Types.buildMessage().optionalGroup().as(OriginalType.LIST).repeatedGroup()
|
||||
.required(PrimitiveType.PrimitiveTypeName.BINARY).named("str").named("one_tuple_list").named("one_tuple_list")
|
||||
.named("ArrayOfOneTuples3");
|
||||
|
||||
schemaString = HiveSchemaUtil.generateSchemaString(schema);
|
||||
assertEquals("`one_tuple_list` ARRAY< binary>", schemaString);
|
||||
|
||||
// A list of maps
|
||||
schema = Types.buildMessage().optionalGroup().as(OriginalType.LIST).repeatedGroup().as(OriginalType.MAP)
|
||||
.repeatedGroup().as(OriginalType.MAP_KEY_VALUE).required(PrimitiveType.PrimitiveTypeName.BINARY)
|
||||
.as(OriginalType.UTF8).named("string_key").required(PrimitiveType.PrimitiveTypeName.INT32).named("int_value")
|
||||
.named("key_value").named("array").named("map_list").named("ArrayOfMaps");
|
||||
|
||||
schemaString = HiveSchemaUtil.generateSchemaString(schema);
|
||||
assertEquals("`map_list` ARRAY< MAP< string, int>>", schemaString);
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@MethodSource({"useJdbcAndSchemaFromCommitMetadata"})
|
||||
public void testBasicSync(boolean useJdbc, boolean useSchemaFromCommitMetadata) throws Exception {
|
||||
HiveTestUtil.hiveSyncConfig.useJdbc = useJdbc;
|
||||
String instantTime = "100";
|
||||
HiveTestUtil.createCOWTable(instantTime, 5, useSchemaFromCommitMetadata);
|
||||
HoodieHiveClient hiveClient =
|
||||
new HoodieHiveClient(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
|
||||
assertFalse(hiveClient.doesTableExist(HiveTestUtil.hiveSyncConfig.tableName),
|
||||
"Table " + HiveTestUtil.hiveSyncConfig.tableName + " should not exist initially");
|
||||
// Lets do the sync
|
||||
HiveSyncTool tool = new HiveSyncTool(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
|
||||
tool.syncHoodieTable();
|
||||
assertTrue(hiveClient.doesTableExist(HiveTestUtil.hiveSyncConfig.tableName),
|
||||
"Table " + HiveTestUtil.hiveSyncConfig.tableName + " should exist after sync completes");
|
||||
assertEquals(hiveClient.getTableSchema(HiveTestUtil.hiveSyncConfig.tableName).size(),
|
||||
hiveClient.getDataSchema().getColumns().size() + 1,
|
||||
"Hive Schema should match the table schema + partition field");
|
||||
assertEquals(5, hiveClient.scanTablePartitions(HiveTestUtil.hiveSyncConfig.tableName).size(),
|
||||
"Table partitions should match the number of partitions we wrote");
|
||||
assertEquals(instantTime, hiveClient.getLastCommitTimeSynced(HiveTestUtil.hiveSyncConfig.tableName).get(),
|
||||
"The last commit that was sycned should be updated in the TBLPROPERTIES");
|
||||
|
||||
// Adding of new partitions
|
||||
List<String> newPartition = Arrays.asList("2050/01/01");
|
||||
hiveClient.addPartitionsToTable(HiveTestUtil.hiveSyncConfig.tableName, Arrays.asList());
|
||||
assertEquals(5, hiveClient.scanTablePartitions(HiveTestUtil.hiveSyncConfig.tableName).size(),
|
||||
"No new partition should be added");
|
||||
hiveClient.addPartitionsToTable(HiveTestUtil.hiveSyncConfig.tableName, newPartition);
|
||||
assertEquals(6, hiveClient.scanTablePartitions(HiveTestUtil.hiveSyncConfig.tableName).size(),
|
||||
"New partition should be added");
|
||||
|
||||
// Update partitions
|
||||
hiveClient.updatePartitionsToTable(HiveTestUtil.hiveSyncConfig.tableName, Arrays.asList());
|
||||
assertEquals(6, hiveClient.scanTablePartitions(HiveTestUtil.hiveSyncConfig.tableName).size(),
|
||||
"Partition count should remain the same");
|
||||
hiveClient.updatePartitionsToTable(HiveTestUtil.hiveSyncConfig.tableName, newPartition);
|
||||
assertEquals(6, hiveClient.scanTablePartitions(HiveTestUtil.hiveSyncConfig.tableName).size(),
|
||||
"Partition count should remain the same");
|
||||
|
||||
// Alter partitions
|
||||
// Manually change a hive partition location to check if the sync will detect
|
||||
// it and generage a partition update event for it.
|
||||
hiveClient.updateHiveSQL("ALTER TABLE `" + HiveTestUtil.hiveSyncConfig.tableName
|
||||
+ "` PARTITION (`datestr`='2050-01-01') SET LOCATION '/some/new/location'");
|
||||
|
||||
hiveClient = new HoodieHiveClient(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
|
||||
List<Partition> hivePartitions = hiveClient.scanTablePartitions(HiveTestUtil.hiveSyncConfig.tableName);
|
||||
List<String> writtenPartitionsSince = hiveClient.getPartitionsWrittenToSince(Option.empty());
|
||||
writtenPartitionsSince.add(newPartition.get(0));
|
||||
List<PartitionEvent> partitionEvents = hiveClient.getPartitionEvents(hivePartitions, writtenPartitionsSince);
|
||||
assertEquals(1, partitionEvents.size(), "There should be only one paritition event");
|
||||
assertEquals(PartitionEventType.UPDATE, partitionEvents.iterator().next().eventType,
|
||||
"The one partition event must of type UPDATE");
|
||||
|
||||
tool = new HiveSyncTool(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
|
||||
tool.syncHoodieTable();
|
||||
// Sync should update the changed partition to correct path
|
||||
List<Partition> tablePartitions = hiveClient.scanTablePartitions(HiveTestUtil.hiveSyncConfig.tableName);
|
||||
assertEquals(6, tablePartitions.size(), "The one partition we wrote should be added to hive");
|
||||
assertEquals(instantTime, hiveClient.getLastCommitTimeSynced(HiveTestUtil.hiveSyncConfig.tableName).get(),
|
||||
"The last commit that was sycned should be 100");
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@MethodSource("useJdbc")
|
||||
public void testSyncIncremental(boolean useJdbc) throws Exception {
|
||||
HiveTestUtil.hiveSyncConfig.useJdbc = useJdbc;
|
||||
String commitTime1 = "100";
|
||||
HiveTestUtil.createCOWTable(commitTime1, 5, true);
|
||||
HoodieHiveClient hiveClient =
|
||||
new HoodieHiveClient(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
|
||||
// Lets do the sync
|
||||
HiveSyncTool tool = new HiveSyncTool(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
|
||||
tool.syncHoodieTable();
|
||||
assertEquals(5, hiveClient.scanTablePartitions(HiveTestUtil.hiveSyncConfig.tableName).size(),
|
||||
"Table partitions should match the number of partitions we wrote");
|
||||
assertEquals(commitTime1, hiveClient.getLastCommitTimeSynced(HiveTestUtil.hiveSyncConfig.tableName).get(),
|
||||
"The last commit that was sycned should be updated in the TBLPROPERTIES");
|
||||
|
||||
// Now lets create more parititions and these are the only ones which needs to be synced
|
||||
DateTime dateTime = DateTime.now().plusDays(6);
|
||||
String commitTime2 = "101";
|
||||
HiveTestUtil.addCOWPartitions(1, true, true, dateTime, commitTime2);
|
||||
|
||||
// Lets do the sync
|
||||
hiveClient = new HoodieHiveClient(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
|
||||
List<String> writtenPartitionsSince = hiveClient.getPartitionsWrittenToSince(Option.of(commitTime1));
|
||||
assertEquals(1, writtenPartitionsSince.size(), "We should have one partition written after 100 commit");
|
||||
List<Partition> hivePartitions = hiveClient.scanTablePartitions(HiveTestUtil.hiveSyncConfig.tableName);
|
||||
List<PartitionEvent> partitionEvents = hiveClient.getPartitionEvents(hivePartitions, writtenPartitionsSince);
|
||||
assertEquals(1, partitionEvents.size(), "There should be only one paritition event");
|
||||
assertEquals(PartitionEventType.ADD, partitionEvents.iterator().next().eventType, "The one partition event must of type ADD");
|
||||
|
||||
tool = new HiveSyncTool(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
|
||||
tool.syncHoodieTable();
|
||||
// Sync should add the one partition
|
||||
assertEquals(6, hiveClient.scanTablePartitions(HiveTestUtil.hiveSyncConfig.tableName).size(),
|
||||
"The one partition we wrote should be added to hive");
|
||||
assertEquals(commitTime2, hiveClient.getLastCommitTimeSynced(HiveTestUtil.hiveSyncConfig.tableName).get(),
|
||||
"The last commit that was sycned should be 101");
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@MethodSource("useJdbc")
|
||||
public void testSyncIncrementalWithSchemaEvolution(boolean useJdbc) throws Exception {
|
||||
HiveTestUtil.hiveSyncConfig.useJdbc = useJdbc;
|
||||
String commitTime1 = "100";
|
||||
HiveTestUtil.createCOWTable(commitTime1, 5, true);
|
||||
HoodieHiveClient hiveClient =
|
||||
new HoodieHiveClient(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
|
||||
// Lets do the sync
|
||||
HiveSyncTool tool = new HiveSyncTool(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
|
||||
tool.syncHoodieTable();
|
||||
|
||||
int fields = hiveClient.getTableSchema(HiveTestUtil.hiveSyncConfig.tableName).size();
|
||||
|
||||
// Now lets create more parititions and these are the only ones which needs to be synced
|
||||
DateTime dateTime = DateTime.now().plusDays(6);
|
||||
String commitTime2 = "101";
|
||||
HiveTestUtil.addCOWPartitions(1, false, true, dateTime, commitTime2);
|
||||
|
||||
// Lets do the sync
|
||||
tool = new HiveSyncTool(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
|
||||
tool.syncHoodieTable();
|
||||
|
||||
assertEquals(fields + 3, hiveClient.getTableSchema(HiveTestUtil.hiveSyncConfig.tableName).size(),
|
||||
"Hive Schema has evolved and should not be 3 more field");
|
||||
assertEquals("BIGINT", hiveClient.getTableSchema(HiveTestUtil.hiveSyncConfig.tableName).get("favorite_number"),
|
||||
"Hive Schema has evolved - Field favorite_number has evolved from int to long");
|
||||
assertTrue(hiveClient.getTableSchema(HiveTestUtil.hiveSyncConfig.tableName).containsKey("favorite_movie"),
|
||||
"Hive Schema has evolved - Field favorite_movie was added");
|
||||
|
||||
// Sync should add the one partition
|
||||
assertEquals(6, hiveClient.scanTablePartitions(HiveTestUtil.hiveSyncConfig.tableName).size(),
|
||||
"The one partition we wrote should be added to hive");
|
||||
assertEquals(commitTime2, hiveClient.getLastCommitTimeSynced(HiveTestUtil.hiveSyncConfig.tableName).get(),
|
||||
"The last commit that was sycned should be 101");
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@MethodSource("useJdbcAndSchemaFromCommitMetadata")
|
||||
public void testSyncMergeOnRead(boolean useJdbc, boolean useSchemaFromCommitMetadata) throws Exception {
|
||||
HiveTestUtil.hiveSyncConfig.useJdbc = useJdbc;
|
||||
String instantTime = "100";
|
||||
String deltaCommitTime = "101";
|
||||
HiveTestUtil.createMORTable(instantTime, deltaCommitTime, 5, true,
|
||||
useSchemaFromCommitMetadata);
|
||||
|
||||
String roTableName = HiveTestUtil.hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_READ_OPTIMIZED_TABLE;
|
||||
HoodieHiveClient hiveClient = new HoodieHiveClient(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
|
||||
assertFalse(hiveClient.doesTableExist(roTableName), "Table " + HiveTestUtil.hiveSyncConfig.tableName + " should not exist initially");
|
||||
// Lets do the sync
|
||||
HiveSyncTool tool = new HiveSyncTool(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
|
||||
tool.syncHoodieTable();
|
||||
|
||||
assertTrue(hiveClient.doesTableExist(roTableName), "Table " + roTableName + " should exist after sync completes");
|
||||
|
||||
if (useSchemaFromCommitMetadata) {
|
||||
assertEquals(hiveClient.getTableSchema(roTableName).size(),
|
||||
SchemaTestUtil.getSimpleSchema().getFields().size() + HiveTestUtil.hiveSyncConfig.partitionFields.size()
|
||||
+ HoodieRecord.HOODIE_META_COLUMNS.size(),
|
||||
"Hive Schema should match the table schema + partition field");
|
||||
} else {
|
||||
// The data generated and schema in the data file do not have metadata columns, so we need a separate check.
|
||||
assertEquals(hiveClient.getTableSchema(roTableName).size(),
|
||||
SchemaTestUtil.getSimpleSchema().getFields().size() + HiveTestUtil.hiveSyncConfig.partitionFields.size(),
|
||||
"Hive Schema should match the table schema + partition field");
|
||||
}
|
||||
|
||||
assertEquals(5, hiveClient.scanTablePartitions(roTableName).size(),
|
||||
"Table partitions should match the number of partitions we wrote");
|
||||
assertEquals(deltaCommitTime, hiveClient.getLastCommitTimeSynced(roTableName).get(),
|
||||
"The last commit that was sycned should be updated in the TBLPROPERTIES");
|
||||
|
||||
// Now lets create more partitions and these are the only ones which needs to be synced
|
||||
DateTime dateTime = DateTime.now().plusDays(6);
|
||||
String commitTime2 = "102";
|
||||
String deltaCommitTime2 = "103";
|
||||
|
||||
HiveTestUtil.addCOWPartitions(1, true, useSchemaFromCommitMetadata, dateTime, commitTime2);
|
||||
HiveTestUtil.addMORPartitions(1, true, false,
|
||||
useSchemaFromCommitMetadata, dateTime, commitTime2, deltaCommitTime2);
|
||||
// Lets do the sync
|
||||
tool = new HiveSyncTool(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
|
||||
tool.syncHoodieTable();
|
||||
hiveClient = new HoodieHiveClient(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
|
||||
|
||||
if (useSchemaFromCommitMetadata) {
|
||||
assertEquals(hiveClient.getTableSchema(roTableName).size(),
|
||||
SchemaTestUtil.getEvolvedSchema().getFields().size() + HiveTestUtil.hiveSyncConfig.partitionFields.size()
|
||||
+ HoodieRecord.HOODIE_META_COLUMNS.size(),
|
||||
"Hive Schema should match the evolved table schema + partition field");
|
||||
} else {
|
||||
// The data generated and schema in the data file do not have metadata columns, so we need a separate check.
|
||||
assertEquals(hiveClient.getTableSchema(roTableName).size(),
|
||||
SchemaTestUtil.getEvolvedSchema().getFields().size() + HiveTestUtil.hiveSyncConfig.partitionFields.size(),
|
||||
"Hive Schema should match the evolved table schema + partition field");
|
||||
}
|
||||
// Sync should add the one partition
|
||||
assertEquals(6, hiveClient.scanTablePartitions(roTableName).size(),
|
||||
"The 2 partitions we wrote should be added to hive");
|
||||
assertEquals(deltaCommitTime2, hiveClient.getLastCommitTimeSynced(roTableName).get(),
|
||||
"The last commit that was synced should be 103");
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@MethodSource("useJdbcAndSchemaFromCommitMetadata")
|
||||
public void testSyncMergeOnReadRT(boolean useJdbc, boolean useSchemaFromCommitMetadata) throws Exception {
|
||||
HiveTestUtil.hiveSyncConfig.useJdbc = useJdbc;
|
||||
String instantTime = "100";
|
||||
String deltaCommitTime = "101";
|
||||
String snapshotTableName = HiveTestUtil.hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE;
|
||||
HiveTestUtil.createMORTable(instantTime, deltaCommitTime, 5, true, useSchemaFromCommitMetadata);
|
||||
HoodieHiveClient hiveClientRT =
|
||||
new HoodieHiveClient(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
|
||||
|
||||
assertFalse(hiveClientRT.doesTableExist(snapshotTableName),
|
||||
"Table " + HiveTestUtil.hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE
|
||||
+ " should not exist initially");
|
||||
|
||||
// Lets do the sync
|
||||
HiveSyncTool tool = new HiveSyncTool(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
|
||||
tool.syncHoodieTable();
|
||||
|
||||
assertTrue(hiveClientRT.doesTableExist(snapshotTableName),
|
||||
"Table " + HiveTestUtil.hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE
|
||||
+ " should exist after sync completes");
|
||||
|
||||
if (useSchemaFromCommitMetadata) {
|
||||
assertEquals(hiveClientRT.getTableSchema(snapshotTableName).size(),
|
||||
SchemaTestUtil.getSimpleSchema().getFields().size() + HiveTestUtil.hiveSyncConfig.partitionFields.size()
|
||||
+ HoodieRecord.HOODIE_META_COLUMNS.size(),
|
||||
"Hive Schema should match the table schema + partition field");
|
||||
} else {
|
||||
// The data generated and schema in the data file do not have metadata columns, so we need a separate check.
|
||||
assertEquals(hiveClientRT.getTableSchema(snapshotTableName).size(),
|
||||
SchemaTestUtil.getSimpleSchema().getFields().size() + HiveTestUtil.hiveSyncConfig.partitionFields.size(),
|
||||
"Hive Schema should match the table schema + partition field");
|
||||
}
|
||||
|
||||
assertEquals(5, hiveClientRT.scanTablePartitions(snapshotTableName).size(),
|
||||
"Table partitions should match the number of partitions we wrote");
|
||||
assertEquals(deltaCommitTime, hiveClientRT.getLastCommitTimeSynced(snapshotTableName).get(),
|
||||
"The last commit that was synced should be updated in the TBLPROPERTIES");
|
||||
|
||||
// Now lets create more parititions and these are the only ones which needs to be synced
|
||||
DateTime dateTime = DateTime.now().plusDays(6);
|
||||
String commitTime2 = "102";
|
||||
String deltaCommitTime2 = "103";
|
||||
|
||||
HiveTestUtil.addCOWPartitions(1, true, useSchemaFromCommitMetadata, dateTime, commitTime2);
|
||||
HiveTestUtil.addMORPartitions(1, true, false, useSchemaFromCommitMetadata, dateTime, commitTime2, deltaCommitTime2);
|
||||
// Lets do the sync
|
||||
tool = new HiveSyncTool(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
|
||||
tool.syncHoodieTable();
|
||||
hiveClientRT = new HoodieHiveClient(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
|
||||
|
||||
if (useSchemaFromCommitMetadata) {
|
||||
assertEquals(hiveClientRT.getTableSchema(snapshotTableName).size(),
|
||||
SchemaTestUtil.getEvolvedSchema().getFields().size() + HiveTestUtil.hiveSyncConfig.partitionFields.size()
|
||||
+ HoodieRecord.HOODIE_META_COLUMNS.size(),
|
||||
"Hive Schema should match the evolved table schema + partition field");
|
||||
} else {
|
||||
// The data generated and schema in the data file do not have metadata columns, so we need a separate check.
|
||||
assertEquals(hiveClientRT.getTableSchema(snapshotTableName).size(),
|
||||
SchemaTestUtil.getEvolvedSchema().getFields().size() + HiveTestUtil.hiveSyncConfig.partitionFields.size(),
|
||||
"Hive Schema should match the evolved table schema + partition field");
|
||||
}
|
||||
// Sync should add the one partition
|
||||
assertEquals(6, hiveClientRT.scanTablePartitions(snapshotTableName).size(),
|
||||
"The 2 partitions we wrote should be added to hive");
|
||||
assertEquals(deltaCommitTime2, hiveClientRT.getLastCommitTimeSynced(snapshotTableName).get(),
|
||||
"The last commit that was sycned should be 103");
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@MethodSource("useJdbc")
|
||||
public void testMultiPartitionKeySync(boolean useJdbc) throws Exception {
|
||||
HiveTestUtil.hiveSyncConfig.useJdbc = useJdbc;
|
||||
String instantTime = "100";
|
||||
HiveTestUtil.createCOWTable(instantTime, 5, true);
|
||||
|
||||
HiveSyncConfig hiveSyncConfig = HiveSyncConfig.copy(HiveTestUtil.hiveSyncConfig);
|
||||
hiveSyncConfig.partitionValueExtractorClass = MultiPartKeysValueExtractor.class.getCanonicalName();
|
||||
hiveSyncConfig.tableName = "multi_part_key";
|
||||
hiveSyncConfig.partitionFields = Arrays.asList("year", "month", "day");
|
||||
HiveTestUtil.getCreatedTablesSet().add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName);
|
||||
|
||||
HoodieHiveClient hiveClient = new HoodieHiveClient(hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
|
||||
assertFalse(hiveClient.doesTableExist(hiveSyncConfig.tableName),
|
||||
"Table " + hiveSyncConfig.tableName + " should not exist initially");
|
||||
// Lets do the sync
|
||||
HiveSyncTool tool = new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
|
||||
tool.syncHoodieTable();
|
||||
assertTrue(hiveClient.doesTableExist(hiveSyncConfig.tableName),
|
||||
"Table " + hiveSyncConfig.tableName + " should exist after sync completes");
|
||||
assertEquals(hiveClient.getTableSchema(hiveSyncConfig.tableName).size(),
|
||||
hiveClient.getDataSchema().getColumns().size() + 3,
|
||||
"Hive Schema should match the table schema + partition fields");
|
||||
assertEquals(5, hiveClient.scanTablePartitions(hiveSyncConfig.tableName).size(),
|
||||
"Table partitions should match the number of partitions we wrote");
|
||||
assertEquals(instantTime, hiveClient.getLastCommitTimeSynced(hiveSyncConfig.tableName).get(),
|
||||
"The last commit that was sycned should be updated in the TBLPROPERTIES");
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@MethodSource("useJdbc")
|
||||
public void testNonPartitionedSync(boolean useJdbc) throws Exception {
|
||||
HiveTestUtil.hiveSyncConfig.useJdbc = useJdbc;
|
||||
String instantTime = "100";
|
||||
HiveTestUtil.createCOWTable(instantTime, 5, true);
|
||||
|
||||
HiveSyncConfig hiveSyncConfig = HiveSyncConfig.copy(HiveTestUtil.hiveSyncConfig);
|
||||
// Set partition value extractor to NonPartitionedExtractor
|
||||
hiveSyncConfig.partitionValueExtractorClass = NonPartitionedExtractor.class.getCanonicalName();
|
||||
hiveSyncConfig.tableName = "non_partitioned";
|
||||
hiveSyncConfig.partitionFields = Arrays.asList("year", "month", "day");
|
||||
HiveTestUtil.getCreatedTablesSet().add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName);
|
||||
|
||||
HoodieHiveClient hiveClient = new HoodieHiveClient(hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
|
||||
assertFalse(hiveClient.doesTableExist(hiveSyncConfig.tableName),
|
||||
"Table " + hiveSyncConfig.tableName + " should not exist initially");
|
||||
// Lets do the sync
|
||||
HiveSyncTool tool = new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
|
||||
tool.syncHoodieTable();
|
||||
assertTrue(hiveClient.doesTableExist(hiveSyncConfig.tableName),
|
||||
"Table " + hiveSyncConfig.tableName + " should exist after sync completes");
|
||||
assertEquals(hiveClient.getTableSchema(hiveSyncConfig.tableName).size(),
|
||||
hiveClient.getDataSchema().getColumns().size(),
|
||||
"Hive Schema should match the table schema,ignoring the partition fields");
|
||||
assertEquals(0, hiveClient.scanTablePartitions(hiveSyncConfig.tableName).size(),
|
||||
"Table should not have partitions because of the NonPartitionedExtractor");
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@MethodSource("useJdbc")
|
||||
public void testReadSchemaForMOR(boolean useJdbc) throws Exception {
|
||||
HiveTestUtil.hiveSyncConfig.useJdbc = useJdbc;
|
||||
String commitTime = "100";
|
||||
String snapshotTableName = HiveTestUtil.hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE;
|
||||
HiveTestUtil.createMORTable(commitTime, "", 5, false, true);
|
||||
HoodieHiveClient hiveClientRT =
|
||||
new HoodieHiveClient(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
|
||||
|
||||
assertFalse(hiveClientRT.doesTableExist(snapshotTableName), "Table " + HiveTestUtil.hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE
|
||||
+ " should not exist initially");
|
||||
|
||||
// Lets do the sync
|
||||
HiveSyncTool tool = new HiveSyncTool(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
|
||||
tool.syncHoodieTable();
|
||||
|
||||
assertTrue(hiveClientRT.doesTableExist(snapshotTableName), "Table " + HiveTestUtil.hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE
|
||||
+ " should exist after sync completes");
|
||||
|
||||
// Schema being read from compacted base files
|
||||
assertEquals(hiveClientRT.getTableSchema(snapshotTableName).size(),
|
||||
SchemaTestUtil.getSimpleSchema().getFields().size() + HiveTestUtil.hiveSyncConfig.partitionFields.size()
|
||||
+ HoodieRecord.HOODIE_META_COLUMNS.size(),
|
||||
"Hive Schema should match the table schema + partition field");
|
||||
assertEquals(5, hiveClientRT.scanTablePartitions(snapshotTableName).size(), "Table partitions should match the number of partitions we wrote");
|
||||
|
||||
// Now lets create more partitions and these are the only ones which needs to be synced
|
||||
DateTime dateTime = DateTime.now().plusDays(6);
|
||||
String commitTime2 = "102";
|
||||
String deltaCommitTime2 = "103";
|
||||
|
||||
HiveTestUtil.addMORPartitions(1, true, false, true, dateTime, commitTime2, deltaCommitTime2);
|
||||
// Lets do the sync
|
||||
tool = new HiveSyncTool(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
|
||||
tool.syncHoodieTable();
|
||||
hiveClientRT = new HoodieHiveClient(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
|
||||
|
||||
// Schema being read from the log files
|
||||
assertEquals(hiveClientRT.getTableSchema(snapshotTableName).size(),
|
||||
SchemaTestUtil.getEvolvedSchema().getFields().size() + HiveTestUtil.hiveSyncConfig.partitionFields.size()
|
||||
+ HoodieRecord.HOODIE_META_COLUMNS.size(),
|
||||
"Hive Schema should match the evolved table schema + partition field");
|
||||
// Sync should add the one partition
|
||||
assertEquals(6, hiveClientRT.scanTablePartitions(snapshotTableName).size(), "The 1 partition we wrote should be added to hive");
|
||||
assertEquals(deltaCommitTime2, hiveClientRT.getLastCommitTimeSynced(snapshotTableName).get(),
|
||||
"The last commit that was sycned should be 103");
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,328 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.hive.testutils;
|
||||
|
||||
import org.apache.hudi.common.testutils.HoodieTestUtils;
|
||||
import org.apache.hudi.common.util.FileIOUtils;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hive.conf.HiveConf;
|
||||
import org.apache.hadoop.hive.metastore.HiveMetaStore;
|
||||
import org.apache.hadoop.hive.metastore.HiveMetaStoreClient;
|
||||
import org.apache.hadoop.hive.metastore.IHMSHandler;
|
||||
import org.apache.hadoop.hive.metastore.RetryingHMSHandler;
|
||||
import org.apache.hadoop.hive.metastore.TSetIpAddressProcessor;
|
||||
import org.apache.hadoop.hive.metastore.TUGIBasedProcessor;
|
||||
import org.apache.hadoop.hive.metastore.api.MetaException;
|
||||
import org.apache.hadoop.hive.thrift.TUGIContainingTransport;
|
||||
import org.apache.hive.service.server.HiveServer2;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.thrift.TProcessor;
|
||||
import org.apache.thrift.protocol.TBinaryProtocol;
|
||||
import org.apache.thrift.server.TServer;
|
||||
import org.apache.thrift.server.TThreadPoolServer;
|
||||
import org.apache.thrift.transport.TFramedTransport;
|
||||
import org.apache.thrift.transport.TServerSocket;
|
||||
import org.apache.thrift.transport.TServerTransport;
|
||||
import org.apache.thrift.transport.TSocket;
|
||||
import org.apache.thrift.transport.TTransport;
|
||||
import org.apache.thrift.transport.TTransportException;
|
||||
import org.apache.thrift.transport.TTransportFactory;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.net.InetSocketAddress;
|
||||
import java.net.SocketException;
|
||||
import java.nio.file.Files;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
|
||||
public class HiveTestService {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(HiveTestService.class);
|
||||
|
||||
private static final int CONNECTION_TIMEOUT = 30000;
|
||||
|
||||
/**
|
||||
* Configuration settings.
|
||||
*/
|
||||
private Configuration hadoopConf;
|
||||
private String workDir;
|
||||
private String bindIP = "127.0.0.1";
|
||||
private int metastorePort = 9083;
|
||||
private int serverPort = 9999;
|
||||
private boolean clean = true;
|
||||
|
||||
private Map<String, String> sysProps = new HashMap<>();
|
||||
private ExecutorService executorService;
|
||||
private TServer tServer;
|
||||
private HiveServer2 hiveServer;
|
||||
|
||||
public HiveTestService(Configuration configuration) throws IOException {
|
||||
this.workDir = Files.createTempDirectory(System.currentTimeMillis() + "-").toFile().getAbsolutePath();
|
||||
}
|
||||
|
||||
public Configuration getHadoopConf() {
|
||||
return hadoopConf;
|
||||
}
|
||||
|
||||
public HiveServer2 start() throws IOException {
|
||||
Objects.requireNonNull(workDir, "The work dir must be set before starting cluster.");
|
||||
|
||||
if (hadoopConf == null) {
|
||||
hadoopConf = HoodieTestUtils.getDefaultHadoopConf();
|
||||
}
|
||||
|
||||
String localHiveLocation = getHiveLocation(workDir);
|
||||
if (clean) {
|
||||
LOG.info("Cleaning Hive cluster data at: " + localHiveLocation + " and starting fresh.");
|
||||
File file = new File(localHiveLocation);
|
||||
FileIOUtils.deleteDirectory(file);
|
||||
}
|
||||
|
||||
HiveConf serverConf = configureHive(hadoopConf, localHiveLocation);
|
||||
|
||||
executorService = Executors.newSingleThreadExecutor();
|
||||
tServer = startMetaStore(bindIP, metastorePort, serverConf);
|
||||
|
||||
serverConf.set("hive.in.test", "true");
|
||||
hiveServer = startHiveServer(serverConf);
|
||||
|
||||
String serverHostname;
|
||||
if (bindIP.equals("0.0.0.0")) {
|
||||
serverHostname = "localhost";
|
||||
} else {
|
||||
serverHostname = bindIP;
|
||||
}
|
||||
if (!waitForServerUp(serverConf, serverHostname, metastorePort, CONNECTION_TIMEOUT)) {
|
||||
throw new IOException("Waiting for startup of standalone server");
|
||||
}
|
||||
|
||||
LOG.info("Hive Minicluster service started.");
|
||||
return hiveServer;
|
||||
}
|
||||
|
||||
public void stop() {
|
||||
resetSystemProperties();
|
||||
if (tServer != null) {
|
||||
try {
|
||||
tServer.stop();
|
||||
} catch (Exception e) {
|
||||
LOG.error("Stop meta store failed", e);
|
||||
}
|
||||
}
|
||||
if (hiveServer != null) {
|
||||
try {
|
||||
hiveServer.stop();
|
||||
} catch (Exception e) {
|
||||
LOG.error("Stop hive server failed", e);
|
||||
}
|
||||
}
|
||||
if (executorService != null) {
|
||||
executorService.shutdownNow();
|
||||
}
|
||||
LOG.info("Hive Minicluster service shut down.");
|
||||
tServer = null;
|
||||
hiveServer = null;
|
||||
hadoopConf = null;
|
||||
}
|
||||
|
||||
private HiveConf configureHive(Configuration conf, String localHiveLocation) throws IOException {
|
||||
conf.set("hive.metastore.local", "false");
|
||||
conf.set(HiveConf.ConfVars.METASTOREURIS.varname, "thrift://" + bindIP + ":" + metastorePort);
|
||||
conf.set(HiveConf.ConfVars.HIVE_SERVER2_THRIFT_BIND_HOST.varname, bindIP);
|
||||
conf.setInt(HiveConf.ConfVars.HIVE_SERVER2_THRIFT_PORT.varname, serverPort);
|
||||
// The following line to turn of SASL has no effect since HiveAuthFactory calls
|
||||
// 'new HiveConf()'. This is fixed by https://issues.apache.org/jira/browse/HIVE-6657,
|
||||
// in Hive 0.14.
|
||||
// As a workaround, the property is set in hive-site.xml in this module.
|
||||
// conf.set(HiveConf.ConfVars.HIVE_SERVER2_AUTHENTICATION.varname, "NOSASL");
|
||||
File localHiveDir = new File(localHiveLocation);
|
||||
localHiveDir.mkdirs();
|
||||
File metastoreDbDir = new File(localHiveDir, "metastore_db");
|
||||
conf.set(HiveConf.ConfVars.METASTORECONNECTURLKEY.varname,
|
||||
"jdbc:derby:" + metastoreDbDir.getPath() + ";create=true");
|
||||
File derbyLogFile = new File(localHiveDir, "derby.log");
|
||||
derbyLogFile.createNewFile();
|
||||
setSystemProperty("derby.stream.error.file", derbyLogFile.getPath());
|
||||
setSystemProperty("derby.system.home", localHiveDir.getAbsolutePath());
|
||||
conf.set(HiveConf.ConfVars.METASTOREWAREHOUSE.varname,
|
||||
Files.createTempDirectory(System.currentTimeMillis() + "-").toFile().getAbsolutePath());
|
||||
conf.set("datanucleus.schema.autoCreateTables", "true");
|
||||
conf.set("hive.metastore.schema.verification", "false");
|
||||
setSystemProperty("derby.stream.error.file", derbyLogFile.getPath());
|
||||
|
||||
return new HiveConf(conf, this.getClass());
|
||||
}
|
||||
|
||||
private boolean waitForServerUp(HiveConf serverConf, String hostname, int port, int timeout) {
|
||||
long start = System.currentTimeMillis();
|
||||
while (true) {
|
||||
try {
|
||||
new HiveMetaStoreClient(serverConf);
|
||||
return true;
|
||||
} catch (MetaException e) {
|
||||
// ignore as this is expected
|
||||
LOG.info("server " + hostname + ":" + port + " not up " + e);
|
||||
}
|
||||
|
||||
if (System.currentTimeMillis() > start + timeout) {
|
||||
break;
|
||||
}
|
||||
try {
|
||||
Thread.sleep(250);
|
||||
} catch (InterruptedException e) {
|
||||
// ignore
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private void setSystemProperty(String name, String value) {
|
||||
if (!sysProps.containsKey(name)) {
|
||||
String currentValue = System.getProperty(name);
|
||||
sysProps.put(name, currentValue);
|
||||
}
|
||||
if (value != null) {
|
||||
System.setProperty(name, value);
|
||||
} else {
|
||||
System.getProperties().remove(name);
|
||||
}
|
||||
}
|
||||
|
||||
private void resetSystemProperties() {
|
||||
for (Map.Entry<String, String> entry : sysProps.entrySet()) {
|
||||
if (entry.getValue() != null) {
|
||||
System.setProperty(entry.getKey(), entry.getValue());
|
||||
} else {
|
||||
System.getProperties().remove(entry.getKey());
|
||||
}
|
||||
}
|
||||
sysProps.clear();
|
||||
}
|
||||
|
||||
private static String getHiveLocation(String baseLocation) {
|
||||
return baseLocation + Path.SEPARATOR + "hive";
|
||||
}
|
||||
|
||||
private HiveServer2 startHiveServer(HiveConf serverConf) {
|
||||
HiveServer2 hiveServer = new HiveServer2();
|
||||
hiveServer.init(serverConf);
|
||||
hiveServer.start();
|
||||
return hiveServer;
|
||||
}
|
||||
|
||||
// XXX: From org.apache.hadoop.hive.metastore.HiveMetaStore,
|
||||
// with changes to support binding to a specified IP address (not only 0.0.0.0)
|
||||
private static final class ChainedTTransportFactory extends TTransportFactory {
|
||||
|
||||
private final TTransportFactory parentTransFactory;
|
||||
private final TTransportFactory childTransFactory;
|
||||
|
||||
private ChainedTTransportFactory(TTransportFactory parentTransFactory, TTransportFactory childTransFactory) {
|
||||
this.parentTransFactory = parentTransFactory;
|
||||
this.childTransFactory = childTransFactory;
|
||||
}
|
||||
|
||||
@Override
|
||||
public TTransport getTransport(TTransport trans) {
|
||||
return childTransFactory.getTransport(parentTransFactory.getTransport(trans));
|
||||
}
|
||||
}
|
||||
|
||||
private static final class TServerSocketKeepAlive extends TServerSocket {
|
||||
|
||||
public TServerSocketKeepAlive(int port) throws TTransportException {
|
||||
super(port, 0);
|
||||
}
|
||||
|
||||
public TServerSocketKeepAlive(InetSocketAddress address) throws TTransportException {
|
||||
super(address, 0);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected TSocket acceptImpl() throws TTransportException {
|
||||
TSocket ts = super.acceptImpl();
|
||||
try {
|
||||
ts.getSocket().setKeepAlive(true);
|
||||
} catch (SocketException e) {
|
||||
throw new TTransportException(e);
|
||||
}
|
||||
return ts;
|
||||
}
|
||||
}
|
||||
|
||||
public TServer startMetaStore(String forceBindIP, int port, HiveConf conf) throws IOException {
|
||||
try {
|
||||
// Server will create new threads up to max as necessary. After an idle
|
||||
// period, it will destory threads to keep the number of threads in the
|
||||
// pool to min.
|
||||
int minWorkerThreads = conf.getIntVar(HiveConf.ConfVars.METASTORESERVERMINTHREADS);
|
||||
int maxWorkerThreads = conf.getIntVar(HiveConf.ConfVars.METASTORESERVERMAXTHREADS);
|
||||
boolean tcpKeepAlive = conf.getBoolVar(HiveConf.ConfVars.METASTORE_TCP_KEEP_ALIVE);
|
||||
boolean useFramedTransport = conf.getBoolVar(HiveConf.ConfVars.METASTORE_USE_THRIFT_FRAMED_TRANSPORT);
|
||||
|
||||
// don't support SASL yet
|
||||
// boolean useSasl = conf.getBoolVar(HiveConf.ConfVars.METASTORE_USE_THRIFT_SASL);
|
||||
|
||||
TServerTransport serverTransport;
|
||||
if (forceBindIP != null) {
|
||||
InetSocketAddress address = new InetSocketAddress(forceBindIP, port);
|
||||
serverTransport = tcpKeepAlive ? new TServerSocketKeepAlive(address) : new TServerSocket(address);
|
||||
|
||||
} else {
|
||||
serverTransport = tcpKeepAlive ? new TServerSocketKeepAlive(port) : new TServerSocket(port);
|
||||
}
|
||||
|
||||
TProcessor processor;
|
||||
TTransportFactory transFactory;
|
||||
|
||||
HiveMetaStore.HMSHandler baseHandler = new HiveMetaStore.HMSHandler("new db based metaserver", conf, false);
|
||||
IHMSHandler handler = RetryingHMSHandler.getProxy(conf, baseHandler, true);
|
||||
|
||||
if (conf.getBoolVar(HiveConf.ConfVars.METASTORE_EXECUTE_SET_UGI)) {
|
||||
transFactory = useFramedTransport
|
||||
? new ChainedTTransportFactory(new TFramedTransport.Factory(), new TUGIContainingTransport.Factory())
|
||||
: new TUGIContainingTransport.Factory();
|
||||
|
||||
processor = new TUGIBasedProcessor<>(handler);
|
||||
LOG.info("Starting DB backed MetaStore Server with SetUGI enabled");
|
||||
} else {
|
||||
transFactory = useFramedTransport ? new TFramedTransport.Factory() : new TTransportFactory();
|
||||
processor = new TSetIpAddressProcessor<>(handler);
|
||||
LOG.info("Starting DB backed MetaStore Server");
|
||||
}
|
||||
|
||||
TThreadPoolServer.Args args = new TThreadPoolServer.Args(serverTransport).processor(processor)
|
||||
.transportFactory(transFactory).protocolFactory(new TBinaryProtocol.Factory())
|
||||
.minWorkerThreads(minWorkerThreads).maxWorkerThreads(maxWorkerThreads);
|
||||
|
||||
final TServer tServer = new TThreadPoolServer(args);
|
||||
executorService.submit(tServer::serve);
|
||||
return tServer;
|
||||
} catch (Throwable x) {
|
||||
throw new IOException(x);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,385 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.hive.testutils;
|
||||
|
||||
import org.apache.hudi.avro.HoodieAvroWriteSupport;
|
||||
import org.apache.hudi.common.bloom.BloomFilter;
|
||||
import org.apache.hudi.common.bloom.BloomFilterFactory;
|
||||
import org.apache.hudi.common.bloom.BloomFilterTypeCode;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.HoodieAvroPayload;
|
||||
import org.apache.hudi.common.model.HoodieBaseFile;
|
||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||
import org.apache.hudi.common.model.HoodieDeltaWriteStat;
|
||||
import org.apache.hudi.common.model.HoodieLogFile;
|
||||
import org.apache.hudi.common.model.HoodieTableType;
|
||||
import org.apache.hudi.common.model.HoodieWriteStat;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.log.HoodieLogFormat;
|
||||
import org.apache.hudi.common.table.log.HoodieLogFormat.Writer;
|
||||
import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock;
|
||||
import org.apache.hudi.common.table.log.block.HoodieLogBlock;
|
||||
import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.testutils.SchemaTestUtil;
|
||||
import org.apache.hudi.common.testutils.minicluster.HdfsTestService;
|
||||
import org.apache.hudi.common.testutils.minicluster.ZookeeperTestService;
|
||||
import org.apache.hudi.common.util.FileIOUtils;
|
||||
import org.apache.hudi.hive.HiveSyncConfig;
|
||||
import org.apache.hudi.hive.HiveSyncTool;
|
||||
import org.apache.hudi.hive.HoodieHiveClient;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.IndexedRecord;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hdfs.MiniDFSCluster;
|
||||
import org.apache.hadoop.hive.conf.HiveConf;
|
||||
import org.apache.hive.service.server.HiveServer2;
|
||||
import org.apache.parquet.avro.AvroSchemaConverter;
|
||||
import org.apache.parquet.hadoop.ParquetWriter;
|
||||
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
|
||||
import org.apache.zookeeper.server.ZooKeeperServer;
|
||||
import org.joda.time.DateTime;
|
||||
import org.joda.time.format.DateTimeFormat;
|
||||
import org.joda.time.format.DateTimeFormatter;
|
||||
import org.junit.platform.commons.JUnitException;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.Set;
|
||||
import java.util.UUID;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.fail;
|
||||
|
||||
@SuppressWarnings("SameParameterValue")
|
||||
public class HiveTestUtil {
|
||||
|
||||
private static MiniDFSCluster dfsCluster;
|
||||
private static ZooKeeperServer zkServer;
|
||||
private static HiveServer2 hiveServer;
|
||||
private static HiveTestService hiveTestService;
|
||||
private static Configuration configuration;
|
||||
public static HiveSyncConfig hiveSyncConfig;
|
||||
private static DateTimeFormatter dtfOut;
|
||||
public static FileSystem fileSystem;
|
||||
private static Set<String> createdTablesSet = new HashSet<>();
|
||||
|
||||
public static void setUp() throws IOException, InterruptedException {
|
||||
if (dfsCluster == null) {
|
||||
HdfsTestService service = new HdfsTestService();
|
||||
dfsCluster = service.start(true);
|
||||
configuration = service.getHadoopConf();
|
||||
}
|
||||
if (zkServer == null) {
|
||||
ZookeeperTestService zkService = new ZookeeperTestService(configuration);
|
||||
zkServer = zkService.start();
|
||||
}
|
||||
if (hiveServer == null) {
|
||||
hiveTestService = new HiveTestService(configuration);
|
||||
hiveServer = hiveTestService.start();
|
||||
}
|
||||
fileSystem = FileSystem.get(configuration);
|
||||
|
||||
hiveSyncConfig = new HiveSyncConfig();
|
||||
hiveSyncConfig.jdbcUrl = "jdbc:hive2://127.0.0.1:9999/";
|
||||
hiveSyncConfig.hiveUser = "";
|
||||
hiveSyncConfig.hivePass = "";
|
||||
hiveSyncConfig.databaseName = "testdb";
|
||||
hiveSyncConfig.tableName = "test1";
|
||||
hiveSyncConfig.basePath = "/tmp/hdfs/TestHiveSyncTool/";
|
||||
hiveSyncConfig.assumeDatePartitioning = true;
|
||||
hiveSyncConfig.usePreApacheInputFormat = false;
|
||||
hiveSyncConfig.partitionFields = Collections.singletonList("datestr");
|
||||
|
||||
dtfOut = DateTimeFormat.forPattern("yyyy/MM/dd");
|
||||
|
||||
clear();
|
||||
}
|
||||
|
||||
public static void clear() throws IOException {
|
||||
fileSystem.delete(new Path(hiveSyncConfig.basePath), true);
|
||||
HoodieTableMetaClient.initTableType(configuration, hiveSyncConfig.basePath, HoodieTableType.COPY_ON_WRITE,
|
||||
hiveSyncConfig.tableName, HoodieAvroPayload.class.getName());
|
||||
|
||||
HoodieHiveClient client = new HoodieHiveClient(hiveSyncConfig, hiveServer.getHiveConf(), fileSystem);
|
||||
for (String tableName : createdTablesSet) {
|
||||
client.updateHiveSQL("drop table if exists " + tableName);
|
||||
}
|
||||
createdTablesSet.clear();
|
||||
client.updateHiveSQL("drop database if exists " + hiveSyncConfig.databaseName);
|
||||
client.updateHiveSQL("create database " + hiveSyncConfig.databaseName);
|
||||
}
|
||||
|
||||
public static HiveConf getHiveConf() {
|
||||
return hiveServer.getHiveConf();
|
||||
}
|
||||
|
||||
public static void shutdown() {
|
||||
if (hiveServer != null) {
|
||||
hiveServer.stop();
|
||||
}
|
||||
if (hiveTestService != null) {
|
||||
hiveTestService.stop();
|
||||
}
|
||||
if (dfsCluster != null) {
|
||||
dfsCluster.shutdown();
|
||||
}
|
||||
if (zkServer != null) {
|
||||
zkServer.shutdown();
|
||||
}
|
||||
}
|
||||
|
||||
public static void createCOWTable(String instantTime, int numberOfPartitions, boolean useSchemaFromCommitMetadata)
|
||||
throws IOException, URISyntaxException {
|
||||
Path path = new Path(hiveSyncConfig.basePath);
|
||||
FileIOUtils.deleteDirectory(new File(hiveSyncConfig.basePath));
|
||||
HoodieTableMetaClient.initTableType(configuration, hiveSyncConfig.basePath, HoodieTableType.COPY_ON_WRITE,
|
||||
hiveSyncConfig.tableName, HoodieAvroPayload.class.getName());
|
||||
boolean result = fileSystem.mkdirs(path);
|
||||
checkResult(result);
|
||||
DateTime dateTime = DateTime.now();
|
||||
HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions, true,
|
||||
useSchemaFromCommitMetadata, dateTime, instantTime);
|
||||
createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName);
|
||||
createCommitFile(commitMetadata, instantTime);
|
||||
}
|
||||
|
||||
public static void createMORTable(String commitTime, String deltaCommitTime, int numberOfPartitions,
|
||||
boolean createDeltaCommit, boolean useSchemaFromCommitMetadata)
|
||||
throws IOException, URISyntaxException, InterruptedException {
|
||||
Path path = new Path(hiveSyncConfig.basePath);
|
||||
FileIOUtils.deleteDirectory(new File(hiveSyncConfig.basePath));
|
||||
HoodieTableMetaClient.initTableType(configuration, hiveSyncConfig.basePath, HoodieTableType.MERGE_ON_READ,
|
||||
hiveSyncConfig.tableName, HoodieAvroPayload.class.getName());
|
||||
|
||||
boolean result = fileSystem.mkdirs(path);
|
||||
checkResult(result);
|
||||
DateTime dateTime = DateTime.now();
|
||||
HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions, true,
|
||||
useSchemaFromCommitMetadata, dateTime, commitTime);
|
||||
createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName);
|
||||
createdTablesSet
|
||||
.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE);
|
||||
HoodieCommitMetadata compactionMetadata = new HoodieCommitMetadata();
|
||||
commitMetadata.getPartitionToWriteStats()
|
||||
.forEach((key, value) -> value.forEach(l -> compactionMetadata.addWriteStat(key, l)));
|
||||
addSchemaToCommitMetadata(compactionMetadata, commitMetadata.getMetadata(HoodieCommitMetadata.SCHEMA_KEY),
|
||||
useSchemaFromCommitMetadata);
|
||||
createCompactionCommitFile(compactionMetadata, commitTime);
|
||||
if (createDeltaCommit) {
|
||||
// Write a delta commit
|
||||
HoodieCommitMetadata deltaMetadata = createLogFiles(commitMetadata.getPartitionToWriteStats(), true,
|
||||
useSchemaFromCommitMetadata);
|
||||
createDeltaCommitFile(deltaMetadata, deltaCommitTime);
|
||||
}
|
||||
}
|
||||
|
||||
public static void addCOWPartitions(int numberOfPartitions, boolean isParquetSchemaSimple,
|
||||
boolean useSchemaFromCommitMetadata, DateTime startFrom, String instantTime) throws IOException, URISyntaxException {
|
||||
HoodieCommitMetadata commitMetadata =
|
||||
createPartitions(numberOfPartitions, isParquetSchemaSimple, useSchemaFromCommitMetadata, startFrom, instantTime);
|
||||
createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName);
|
||||
createCommitFile(commitMetadata, instantTime);
|
||||
}
|
||||
|
||||
public static void addMORPartitions(int numberOfPartitions, boolean isParquetSchemaSimple, boolean isLogSchemaSimple,
|
||||
boolean useSchemaFromCommitMetadata, DateTime startFrom, String instantTime, String deltaCommitTime)
|
||||
throws IOException, URISyntaxException, InterruptedException {
|
||||
HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions, isParquetSchemaSimple,
|
||||
useSchemaFromCommitMetadata, startFrom, instantTime);
|
||||
createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_READ_OPTIMIZED_TABLE);
|
||||
createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE);
|
||||
HoodieCommitMetadata compactionMetadata = new HoodieCommitMetadata();
|
||||
commitMetadata.getPartitionToWriteStats()
|
||||
.forEach((key, value) -> value.forEach(l -> compactionMetadata.addWriteStat(key, l)));
|
||||
addSchemaToCommitMetadata(compactionMetadata, commitMetadata.getMetadata(HoodieCommitMetadata.SCHEMA_KEY),
|
||||
useSchemaFromCommitMetadata);
|
||||
createCompactionCommitFile(compactionMetadata, instantTime);
|
||||
HoodieCommitMetadata deltaMetadata = createLogFiles(commitMetadata.getPartitionToWriteStats(), isLogSchemaSimple,
|
||||
useSchemaFromCommitMetadata);
|
||||
createDeltaCommitFile(deltaMetadata, deltaCommitTime);
|
||||
}
|
||||
|
||||
private static HoodieCommitMetadata createLogFiles(Map<String, List<HoodieWriteStat>> partitionWriteStats,
|
||||
boolean isLogSchemaSimple, boolean useSchemaFromCommitMetadata)
|
||||
throws InterruptedException, IOException, URISyntaxException {
|
||||
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
|
||||
for (Entry<String, List<HoodieWriteStat>> wEntry : partitionWriteStats.entrySet()) {
|
||||
String partitionPath = wEntry.getKey();
|
||||
for (HoodieWriteStat wStat : wEntry.getValue()) {
|
||||
Path path = new Path(wStat.getPath());
|
||||
HoodieBaseFile dataFile = new HoodieBaseFile(fileSystem.getFileStatus(path));
|
||||
HoodieLogFile logFile = generateLogData(path, isLogSchemaSimple);
|
||||
HoodieDeltaWriteStat writeStat = new HoodieDeltaWriteStat();
|
||||
writeStat.setFileId(dataFile.getFileId());
|
||||
writeStat.setPath(logFile.getPath().toString());
|
||||
commitMetadata.addWriteStat(partitionPath, writeStat);
|
||||
}
|
||||
}
|
||||
addSchemaToCommitMetadata(commitMetadata, isLogSchemaSimple, useSchemaFromCommitMetadata);
|
||||
return commitMetadata;
|
||||
}
|
||||
|
||||
private static HoodieCommitMetadata createPartitions(int numberOfPartitions, boolean isParquetSchemaSimple,
|
||||
boolean useSchemaFromCommitMetadata, DateTime startFrom, String instantTime) throws IOException, URISyntaxException {
|
||||
startFrom = startFrom.withTimeAtStartOfDay();
|
||||
|
||||
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
|
||||
for (int i = 0; i < numberOfPartitions; i++) {
|
||||
String partitionPath = dtfOut.print(startFrom);
|
||||
Path partPath = new Path(hiveSyncConfig.basePath + "/" + partitionPath);
|
||||
fileSystem.makeQualified(partPath);
|
||||
fileSystem.mkdirs(partPath);
|
||||
List<HoodieWriteStat> writeStats = createTestData(partPath, isParquetSchemaSimple, instantTime);
|
||||
startFrom = startFrom.minusDays(1);
|
||||
writeStats.forEach(s -> commitMetadata.addWriteStat(partitionPath, s));
|
||||
}
|
||||
addSchemaToCommitMetadata(commitMetadata, isParquetSchemaSimple, useSchemaFromCommitMetadata);
|
||||
return commitMetadata;
|
||||
}
|
||||
|
||||
private static List<HoodieWriteStat> createTestData(Path partPath, boolean isParquetSchemaSimple, String instantTime)
|
||||
throws IOException, URISyntaxException {
|
||||
List<HoodieWriteStat> writeStats = new ArrayList<>();
|
||||
for (int i = 0; i < 5; i++) {
|
||||
// Create 5 files
|
||||
String fileId = UUID.randomUUID().toString();
|
||||
Path filePath = new Path(partPath.toString() + "/" + FSUtils.makeDataFileName(instantTime, "1-0-1", fileId));
|
||||
generateParquetData(filePath, isParquetSchemaSimple);
|
||||
HoodieWriteStat writeStat = new HoodieWriteStat();
|
||||
writeStat.setFileId(fileId);
|
||||
writeStat.setPath(filePath.toString());
|
||||
writeStats.add(writeStat);
|
||||
}
|
||||
return writeStats;
|
||||
}
|
||||
|
||||
@SuppressWarnings({"unchecked", "deprecation"})
|
||||
private static void generateParquetData(Path filePath, boolean isParquetSchemaSimple)
|
||||
throws IOException, URISyntaxException {
|
||||
Schema schema = getTestDataSchema(isParquetSchemaSimple);
|
||||
org.apache.parquet.schema.MessageType parquetSchema = new AvroSchemaConverter().convert(schema);
|
||||
BloomFilter filter = BloomFilterFactory.createBloomFilter(1000, 0.0001, -1,
|
||||
BloomFilterTypeCode.SIMPLE.name());
|
||||
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(parquetSchema, schema, filter);
|
||||
ParquetWriter writer = new ParquetWriter(filePath, writeSupport, CompressionCodecName.GZIP, 120 * 1024 * 1024,
|
||||
ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED,
|
||||
ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED, ParquetWriter.DEFAULT_WRITER_VERSION, fileSystem.getConf());
|
||||
|
||||
List<IndexedRecord> testRecords = (isParquetSchemaSimple ? SchemaTestUtil.generateTestRecords(0, 100)
|
||||
: SchemaTestUtil.generateEvolvedTestRecords(100, 100));
|
||||
testRecords.forEach(s -> {
|
||||
try {
|
||||
writer.write(s);
|
||||
} catch (IOException e) {
|
||||
fail("IOException while writing test records as parquet" + e.toString());
|
||||
}
|
||||
});
|
||||
writer.close();
|
||||
}
|
||||
|
||||
private static HoodieLogFile generateLogData(Path parquetFilePath, boolean isLogSchemaSimple)
|
||||
throws IOException, InterruptedException, URISyntaxException {
|
||||
Schema schema = getTestDataSchema(isLogSchemaSimple);
|
||||
HoodieBaseFile dataFile = new HoodieBaseFile(fileSystem.getFileStatus(parquetFilePath));
|
||||
// Write a log file for this parquet file
|
||||
Writer logWriter = HoodieLogFormat.newWriterBuilder().onParentPath(parquetFilePath.getParent())
|
||||
.withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId(dataFile.getFileId())
|
||||
.overBaseCommit(dataFile.getCommitTime()).withFs(fileSystem).build();
|
||||
List<IndexedRecord> records = (isLogSchemaSimple ? SchemaTestUtil.generateTestRecords(0, 100)
|
||||
: SchemaTestUtil.generateEvolvedTestRecords(100, 100));
|
||||
Map<HeaderMetadataType, String> header = new HashMap<>(2);
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, dataFile.getCommitTime());
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
|
||||
HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header);
|
||||
logWriter.appendBlock(dataBlock);
|
||||
logWriter.close();
|
||||
return logWriter.getLogFile();
|
||||
}
|
||||
|
||||
private static Schema getTestDataSchema(boolean isSimpleSchema) throws IOException {
|
||||
return isSimpleSchema ? SchemaTestUtil.getSimpleSchema() : SchemaTestUtil.getEvolvedSchema();
|
||||
}
|
||||
|
||||
private static void addSchemaToCommitMetadata(HoodieCommitMetadata commitMetadata, boolean isSimpleSchema,
|
||||
boolean useSchemaFromCommitMetadata) throws IOException {
|
||||
if (useSchemaFromCommitMetadata) {
|
||||
Schema dataSchema = getTestDataSchema(isSimpleSchema);
|
||||
commitMetadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, dataSchema.toString());
|
||||
}
|
||||
}
|
||||
|
||||
private static void addSchemaToCommitMetadata(HoodieCommitMetadata commitMetadata, String schema,
|
||||
boolean useSchemaFromCommitMetadata) {
|
||||
if (useSchemaFromCommitMetadata) {
|
||||
commitMetadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, schema);
|
||||
}
|
||||
}
|
||||
|
||||
private static void checkResult(boolean result) {
|
||||
if (!result) {
|
||||
throw new JUnitException("Could not initialize");
|
||||
}
|
||||
}
|
||||
|
||||
private static void createCommitFile(HoodieCommitMetadata commitMetadata, String instantTime) throws IOException {
|
||||
byte[] bytes = commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8);
|
||||
Path fullPath = new Path(hiveSyncConfig.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/"
|
||||
+ HoodieTimeline.makeCommitFileName(instantTime));
|
||||
FSDataOutputStream fsout = fileSystem.create(fullPath, true);
|
||||
fsout.write(bytes);
|
||||
fsout.close();
|
||||
}
|
||||
|
||||
private static void createCompactionCommitFile(HoodieCommitMetadata commitMetadata, String instantTime)
|
||||
throws IOException {
|
||||
byte[] bytes = commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8);
|
||||
Path fullPath = new Path(hiveSyncConfig.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/"
|
||||
+ HoodieTimeline.makeCommitFileName(instantTime));
|
||||
FSDataOutputStream fsout = fileSystem.create(fullPath, true);
|
||||
fsout.write(bytes);
|
||||
fsout.close();
|
||||
}
|
||||
|
||||
private static void createDeltaCommitFile(HoodieCommitMetadata deltaCommitMetadata, String deltaCommitTime)
|
||||
throws IOException {
|
||||
byte[] bytes = deltaCommitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8);
|
||||
Path fullPath = new Path(hiveSyncConfig.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/"
|
||||
+ HoodieTimeline.makeDeltaFileName(deltaCommitTime));
|
||||
FSDataOutputStream fsout = fileSystem.create(fullPath, true);
|
||||
fsout.write(bytes);
|
||||
fsout.close();
|
||||
}
|
||||
|
||||
public static Set<String> getCreatedTablesSet() {
|
||||
return createdTablesSet;
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,29 @@
|
||||
###
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
###
|
||||
log4j.rootLogger=WARN, CONSOLE
|
||||
log4j.logger.org.apache.hudi=DEBUG
|
||||
|
||||
# CONSOLE is set to be a ConsoleAppender.
|
||||
log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
|
||||
# CONSOLE uses PatternLayout.
|
||||
log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout
|
||||
log4j.appender.CONSOLE.layout.ConversionPattern=[%-5p] %d %c %x - %m%n
|
||||
log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter
|
||||
log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true
|
||||
log4j.appender.CONSOLE.filter.a.LevelMin=WARN
|
||||
log4j.appender.CONSOLE.filter.a.LevelMax=FATAL
|
||||
@@ -0,0 +1,30 @@
|
||||
###
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
###
|
||||
log4j.rootLogger=WARN, CONSOLE
|
||||
log4j.logger.org.apache=INFO
|
||||
log4j.logger.org.apache.hudi=DEBUG
|
||||
|
||||
# CONSOLE is set to be a ConsoleAppender.
|
||||
log4j.appender.CONSOLE=org.apache.log4j.ConsoleAppender
|
||||
# CONSOLE uses PatternLayout.
|
||||
log4j.appender.CONSOLE.layout=org.apache.log4j.PatternLayout
|
||||
log4j.appender.CONSOLE.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
|
||||
log4j.appender.CONSOLE.filter.a=org.apache.log4j.varia.LevelRangeFilter
|
||||
log4j.appender.CONSOLE.filter.a.AcceptOnMatch=true
|
||||
log4j.appender.CONSOLE.filter.a.LevelMin=WARN
|
||||
log4j.appender.CONSOLE.filter.a.LevelMax=FATAL
|
||||
Reference in New Issue
Block a user