/* * Copyright (c) 2017 Uber Technologies, Inc. (hoodie-dev-group@uber.com) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * */ package com.uber.hoodie.hive; import com.beust.jcommander.JCommander; import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.exception.InvalidDatasetException; import com.uber.hoodie.hadoop.HoodieInputFormat; import com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat; import com.uber.hoodie.hive.HoodieHiveClient.PartitionEvent; import com.uber.hoodie.hive.HoodieHiveClient.PartitionEvent.PartitionEventType; import com.uber.hoodie.hive.util.SchemaUtil; import java.util.List; import java.util.Map; import java.util.Optional; import java.util.stream.Collectors; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.metastore.api.Partition; import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat; import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import parquet.schema.MessageType; /** * Tool to sync a hoodie HDFS dataset with a hive metastore table. Either use it as a api * HiveSyncTool.syncHoodieTable(HiveSyncConfig) or as a command line java -cp hoodie-hive.jar * HiveSyncTool [args] *
 * This utility gets the schema from the latest commit and syncs the Hive table schema. It also
 * syncs the partitions incrementally (all the partitions modified since the last commit).
*/
@SuppressWarnings("WeakerAccess")
public class HiveSyncTool {
// Suffix appended to the base table name when registering the realtime (RT) view of a
// MERGE_ON_READ dataset, e.g. "trips" -> "trips_rt".
public static final String SUFFIX_REALTIME_TABLE = "_rt";
private static final Logger LOG = LoggerFactory.getLogger(HiveSyncTool.class);
// Client wrapping both the Hive metastore connection and the hoodie dataset metadata.
private final HoodieHiveClient hoodieHiveClient;
// Sync configuration; note cfg.tableName is temporarily rewritten while syncing the RT table.
private final HiveSyncConfig cfg;
/**
 * Creates a sync tool for the dataset described by the given config.
 *
 * @param cfg           sync configuration (base path, table name, credentials, ...)
 * @param configuration Hive configuration used to reach the metastore
 * @param fs            filesystem holding the hoodie dataset
 */
public HiveSyncTool(HiveSyncConfig cfg, HiveConf configuration, FileSystem fs) {
  this.cfg = cfg;
  this.hoodieHiveClient = new HoodieHiveClient(cfg, configuration, fs);
}
/**
 * Syncs the hoodie dataset with the Hive metastore. For COPY_ON_WRITE datasets a single
 * read-optimized table is synced; for MERGE_ON_READ both a read-optimized table (under the
 * configured name) and a realtime table (under the name + {@link #SUFFIX_REALTIME_TABLE})
 * are synced.
 *
 * @throws InvalidDatasetException if the dataset reports an unknown table type
 */
public void syncHoodieTable() {
  switch (hoodieHiveClient.getTableType()) {
    case COPY_ON_WRITE:
      syncHoodieTable(false);
      break;
    case MERGE_ON_READ:
      // Sync a RO table for MOR
      syncHoodieTable(false);
      String originalTableName = cfg.tableName;
      // TODO : Make realtime table registration optional using a config param
      cfg.tableName = cfg.tableName + SUFFIX_REALTIME_TABLE;
      try {
        // Sync a RT table for MOR
        syncHoodieTable(true);
      } finally {
        // Restore even if the RT sync throws, so the shared config is not left pointing
        // at the "_rt" name for any later use of cfg.
        cfg.tableName = originalTableName;
      }
      break;
    default:
      LOG.error("Unknown table type " + hoodieHiveClient.getTableType());
      throw new InvalidDatasetException(hoodieHiveClient.getBasePath());
  }
  hoodieHiveClient.close();
}
private void syncHoodieTable(boolean isRealTime) {
LOG.info("Trying to sync hoodie table " + cfg.tableName + " with base path "
+ hoodieHiveClient.getBasePath() + " of type " + hoodieHiveClient.getTableType());
// Check if the necessary table exists
boolean tableExists = hoodieHiveClient.doesTableExist();
// Get the parquet schema for this dataset looking at the latest commit
MessageType schema = hoodieHiveClient.getDataSchema();
// Sync schema if needed
syncSchema(tableExists, isRealTime, schema);
LOG.info("Schema sync complete. Syncing partitions for " + cfg.tableName);
// Get the last time we successfully synced partitions
Optional