The code-style rules follow Google style with some changes: (1) the line length is increased from 100 to 120; (2) Javadoc-related checkstyle rules are disabled, as these need more manual work. Both source and test code are checked for code style.
192 lines
7.9 KiB
Java
192 lines
7.9 KiB
Java
/*
|
|
* Copyright (c) 2017 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
|
*
|
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
* you may not use this file except in compliance with the License.
|
|
* You may obtain a copy of the License at
|
|
*
|
|
* http://www.apache.org/licenses/LICENSE-2.0
|
|
*
|
|
* Unless required by applicable law or agreed to in writing, software
|
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
* See the License for the specific language governing permissions and
|
|
* limitations under the License.
|
|
*
|
|
*
|
|
*/
|
|
|
|
package com.uber.hoodie.hive;
|
|
|
|
import com.beust.jcommander.JCommander;
|
|
import com.uber.hoodie.common.util.FSUtils;
|
|
import com.uber.hoodie.exception.InvalidDatasetException;
|
|
import com.uber.hoodie.hadoop.HoodieInputFormat;
|
|
import com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat;
|
|
import com.uber.hoodie.hive.HoodieHiveClient.PartitionEvent;
|
|
import com.uber.hoodie.hive.HoodieHiveClient.PartitionEvent.PartitionEventType;
|
|
import com.uber.hoodie.hive.util.SchemaUtil;
|
|
import java.util.List;
|
|
import java.util.Map;
|
|
import java.util.Optional;
|
|
import java.util.stream.Collectors;
|
|
import org.apache.hadoop.conf.Configuration;
|
|
import org.apache.hadoop.fs.FileSystem;
|
|
import org.apache.hadoop.hive.conf.HiveConf;
|
|
import org.apache.hadoop.hive.metastore.api.Partition;
|
|
import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat;
|
|
import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
|
|
import org.slf4j.Logger;
|
|
import org.slf4j.LoggerFactory;
|
|
import parquet.schema.MessageType;
|
|
|
|
|
|
/**
|
|
* Tool to sync a hoodie HDFS dataset with a hive metastore table. Either use it as a api
|
|
* HiveSyncTool.syncHoodieTable(HiveSyncConfig) or as a command line java -cp hoodie-hive.jar
|
|
* HiveSyncTool [args]
|
|
* <p>
|
|
* This utility will get the schema from the latest commit and will sync hive table schema Also this
|
|
* will sync the partitions incrementally (all the partitions modified since the last commit)
|
|
*/
|
|
@SuppressWarnings("WeakerAccess")
|
|
public class HiveSyncTool {
|
|
|
|
private static final Logger LOG = LoggerFactory.getLogger(HiveSyncTool.class);
|
|
private final HoodieHiveClient hoodieHiveClient;
|
|
public static final String SUFFIX_REALTIME_TABLE = "_rt";
|
|
private final HiveSyncConfig cfg;
|
|
|
|
public HiveSyncTool(HiveSyncConfig cfg, HiveConf configuration, FileSystem fs) {
|
|
this.hoodieHiveClient = new HoodieHiveClient(cfg, configuration, fs);
|
|
this.cfg = cfg;
|
|
}
|
|
|
|
public void syncHoodieTable() {
|
|
switch (hoodieHiveClient.getTableType()) {
|
|
case COPY_ON_WRITE:
|
|
syncHoodieTable(false);
|
|
break;
|
|
case MERGE_ON_READ:
|
|
//sync a RO table for MOR
|
|
syncHoodieTable(false);
|
|
String originalTableName = cfg.tableName;
|
|
//TODO : Make realtime table registration optional using a config param
|
|
cfg.tableName = cfg.tableName + SUFFIX_REALTIME_TABLE;
|
|
//sync a RT table for MOR
|
|
syncHoodieTable(true);
|
|
cfg.tableName = originalTableName;
|
|
break;
|
|
default:
|
|
LOG.error("Unknown table type " + hoodieHiveClient.getTableType());
|
|
throw new InvalidDatasetException(hoodieHiveClient.getBasePath());
|
|
}
|
|
hoodieHiveClient.close();
|
|
}
|
|
|
|
private void syncHoodieTable(boolean isRealTime) {
|
|
LOG.info("Trying to sync hoodie table " + cfg.tableName + " with base path "
|
|
+ hoodieHiveClient.getBasePath() + " of type " + hoodieHiveClient.getTableType());
|
|
|
|
// Check if the necessary table exists
|
|
boolean tableExists = hoodieHiveClient.doesTableExist();
|
|
// Get the parquet schema for this dataset looking at the latest commit
|
|
MessageType schema = hoodieHiveClient.getDataSchema();
|
|
// Sync schema if needed
|
|
syncSchema(tableExists, isRealTime, schema);
|
|
|
|
LOG.info("Schema sync complete. Syncing partitions for " + cfg.tableName);
|
|
// Get the last time we successfully synced partitions
|
|
Optional<String> lastCommitTimeSynced = Optional.empty();
|
|
if (tableExists) {
|
|
lastCommitTimeSynced = hoodieHiveClient.getLastCommitTimeSynced();
|
|
}
|
|
LOG.info("Last commit time synced was found to be " + lastCommitTimeSynced.orElse("null"));
|
|
List<String> writtenPartitionsSince = hoodieHiveClient
|
|
.getPartitionsWrittenToSince(lastCommitTimeSynced);
|
|
LOG.info("Storage partitions scan complete. Found " + writtenPartitionsSince.size());
|
|
// Sync the partitions if needed
|
|
syncPartitions(writtenPartitionsSince);
|
|
|
|
hoodieHiveClient.updateLastCommitTimeSynced();
|
|
LOG.info("Sync complete for " + cfg.tableName);
|
|
}
|
|
|
|
/**
|
|
* Get the latest schema from the last commit and check if its in sync with the hive table schema.
|
|
* If not, evolves the table schema.
|
|
*
|
|
* @param tableExists - does table exist
|
|
* @param schema - extracted schema
|
|
*/
|
|
private void syncSchema(boolean tableExists, boolean isRealTime, MessageType schema) {
|
|
// Check and sync schema
|
|
if (!tableExists) {
|
|
LOG.info("Table " + cfg.tableName + " is not found. Creating it");
|
|
if (!isRealTime) {
|
|
// TODO - RO Table for MOR only after major compaction (UnboundedCompaction is default
|
|
// for now)
|
|
hoodieHiveClient.createTable(schema, HoodieInputFormat.class.getName(),
|
|
MapredParquetOutputFormat.class.getName(), ParquetHiveSerDe.class.getName());
|
|
} else {
|
|
// Custom serde will not work with ALTER TABLE REPLACE COLUMNS
|
|
// https://github.com/apache/hive/blob/release-1.1.0/ql/src/java/org/apache/hadoop/hive
|
|
// /ql/exec/DDLTask.java#L3488
|
|
hoodieHiveClient.createTable(schema, HoodieRealtimeInputFormat.class.getName(),
|
|
MapredParquetOutputFormat.class.getName(), ParquetHiveSerDe.class.getName());
|
|
}
|
|
} else {
|
|
// Check if the dataset schema has evolved
|
|
Map<String, String> tableSchema = hoodieHiveClient.getTableSchema();
|
|
SchemaDifference schemaDiff = SchemaUtil.getSchemaDifference(schema, tableSchema,
|
|
cfg.partitionFields);
|
|
if (!schemaDiff.isEmpty()) {
|
|
LOG.info("Schema difference found for " + cfg.tableName);
|
|
hoodieHiveClient.updateTableDefinition(schema);
|
|
} else {
|
|
LOG.info("No Schema difference for " + cfg.tableName);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/**
|
|
* Syncs the list of storage parititions passed in (checks if the partition is in hive, if not
|
|
* adds it or if the partition path does not match, it updates the partition path)
|
|
*/
|
|
private void syncPartitions(List<String> writtenPartitionsSince) {
|
|
try {
|
|
List<Partition> hivePartitions = hoodieHiveClient.scanTablePartitions();
|
|
List<PartitionEvent> partitionEvents = hoodieHiveClient.getPartitionEvents(hivePartitions,
|
|
writtenPartitionsSince);
|
|
List<String> newPartitions = filterPartitions(partitionEvents, PartitionEventType.ADD);
|
|
LOG.info("New Partitions " + newPartitions);
|
|
hoodieHiveClient.addPartitionsToTable(newPartitions);
|
|
List<String> updatePartitions = filterPartitions(partitionEvents, PartitionEventType.UPDATE);
|
|
LOG.info("Changed Partitions " + updatePartitions);
|
|
hoodieHiveClient.updatePartitionsToTable(updatePartitions);
|
|
} catch (Exception e) {
|
|
throw new HoodieHiveSyncException("Failed to sync partitions for table " + cfg.tableName, e);
|
|
}
|
|
}
|
|
|
|
private List<String> filterPartitions(List<PartitionEvent> events, PartitionEventType eventType) {
|
|
return events.stream().filter(s -> s.eventType == eventType).map(s -> s.storagePartition)
|
|
.collect(Collectors.toList());
|
|
}
|
|
|
|
public static void main(String[] args) throws Exception {
|
|
// parse the params
|
|
final HiveSyncConfig cfg = new HiveSyncConfig();
|
|
JCommander cmd = new JCommander(cfg, args);
|
|
if (cfg.help || args.length == 0) {
|
|
cmd.usage();
|
|
System.exit(1);
|
|
}
|
|
FileSystem fs = FSUtils.getFs(cfg.basePath, new Configuration());
|
|
HiveConf hiveConf = new HiveConf();
|
|
hiveConf.addResource(fs.getConf());
|
|
new HiveSyncTool(cfg, hiveConf, fs).syncHoodieTable();
|
|
}
|
|
}
|