1
0

[HUDI-296] Explore use of spotless to auto fix formatting errors (#945)

- Add spotless format fixing to project
- One time reformatting for conformity
- Build fails for formatting changes and mvn spotless:apply autofixes them
This commit is contained in:
leesf
2019-10-10 20:19:40 +08:00
committed by vinoth chandar
parent 834c591955
commit b19bed442d
381 changed files with 7350 additions and 9064 deletions

View File

@@ -28,8 +28,7 @@ import java.util.List;
*/
public class HiveSyncConfig implements Serializable {
@Parameter(names = {
"--database"}, description = "name of the target database in Hive", required = true)
@Parameter(names = {"--database"}, description = "name of the target database in Hive", required = true)
public String databaseName;
@Parameter(names = {"--table"}, description = "name of the target table in Hive", required = true)
@@ -44,33 +43,25 @@ public class HiveSyncConfig implements Serializable {
@Parameter(names = {"--jdbc-url"}, description = "Hive jdbc connect url", required = true)
public String jdbcUrl;
@Parameter(names = {
"--base-path"}, description = "Basepath of hoodie dataset to sync", required = true)
@Parameter(names = {"--base-path"}, description = "Basepath of hoodie dataset to sync", required = true)
public String basePath;
@Parameter(names = "--partitioned-by", description = "Fields in the schema partitioned by")
public List<String> partitionFields = new ArrayList<>();
@Parameter(names = "--partition-value-extractor", description = "Class which implements "
+ "PartitionValueExtractor "
+ "to extract the partition "
+ "values from HDFS path")
public String partitionValueExtractorClass = SlashEncodedDayPartitionValueExtractor.class
.getName();
@Parameter(names = "--partition-value-extractor", description = "Class which implements " + "PartitionValueExtractor "
+ "to extract the partition " + "values from HDFS path")
public String partitionValueExtractorClass = SlashEncodedDayPartitionValueExtractor.class.getName();
@Parameter(names = {
"--assume-date-partitioning"}, description = "Assume standard yyyy/mm/dd partitioning, this"
+ " exists to support "
+ "backward compatibility. If"
+ " you use hoodie 0.3.x, do "
+ "not set this parameter")
@Parameter(names = {"--assume-date-partitioning"}, description = "Assume standard yyyy/mm/dd partitioning, this"
+ " exists to support " + "backward compatibility. If" + " you use hoodie 0.3.x, do " + "not set this parameter")
public Boolean assumeDatePartitioning = false;
@Parameter(names = {
"--use-pre-apache-input-format"}, description = "Use InputFormat under com.uber.hoodie package "
+ "instead of org.apache.hudi package. Use this when you are in the process of migrating from "
+ "com.uber.hoodie to org.apache.hudi. Stop using this after you migrated the table definition to "
+ "org.apache.hudi input format.")
@Parameter(names = {"--use-pre-apache-input-format"},
description = "Use InputFormat under com.uber.hoodie package "
+ "instead of org.apache.hudi package. Use this when you are in the process of migrating from "
+ "com.uber.hoodie to org.apache.hudi. Stop using this after you migrated the table definition to "
+ "org.apache.hudi input format.")
public Boolean usePreApacheInputFormat = false;
@Parameter(names = {"--use-jdbc"}, description = "Hive jdbc connect url")
@@ -96,19 +87,10 @@ public class HiveSyncConfig implements Serializable {
@Override
public String toString() {
return "HiveSyncConfig{"
+ "databaseName='" + databaseName + '\''
+ ", tableName='" + tableName + '\''
+ ", hiveUser='" + hiveUser + '\''
+ ", hivePass='" + hivePass + '\''
+ ", jdbcUrl='" + jdbcUrl + '\''
+ ", basePath='" + basePath + '\''
+ ", partitionFields=" + partitionFields
+ ", partitionValueExtractorClass='" + partitionValueExtractorClass + '\''
+ ", assumeDatePartitioning=" + assumeDatePartitioning
+ ", usePreApacheInputFormat=" + usePreApacheInputFormat
+ ", useJdbc=" + useJdbc
+ ", help=" + help
+ '}';
return "HiveSyncConfig{" + "databaseName='" + databaseName + '\'' + ", tableName='" + tableName + '\''
+ ", hiveUser='" + hiveUser + '\'' + ", hivePass='" + hivePass + '\'' + ", jdbcUrl='" + jdbcUrl + '\''
+ ", basePath='" + basePath + '\'' + ", partitionFields=" + partitionFields + ", partitionValueExtractorClass='"
+ partitionValueExtractorClass + '\'' + ", assumeDatePartitioning=" + assumeDatePartitioning
+ ", usePreApacheInputFormat=" + usePreApacheInputFormat + ", useJdbc=" + useJdbc + ", help=" + help + '}';
}
}

View File

@@ -43,11 +43,10 @@ import org.apache.parquet.schema.MessageType;
/**
* Tool to sync a hoodie HDFS dataset with a hive metastore table. Either use it as an API
* HiveSyncTool.syncHoodieTable(HiveSyncConfig) or as a command line java -cp hoodie-hive.jar
* HiveSyncTool [args]
* HiveSyncTool.syncHoodieTable(HiveSyncConfig) or as a command line java -cp hoodie-hive.jar HiveSyncTool [args]
* <p>
* This utility will get the schema from the latest commit and will sync hive table schema. Also this
* will sync the partitions incrementally (all the partitions modified since the last commit)
* This utility will get the schema from the latest commit and will sync hive table schema. Also this will sync the
* partitions incrementally (all the partitions modified since the last commit)
*/
@SuppressWarnings("WeakerAccess")
public class HiveSyncTool {
@@ -68,12 +67,12 @@ public class HiveSyncTool {
syncHoodieTable(false);
break;
case MERGE_ON_READ:
//sync a RO table for MOR
// sync a RO table for MOR
syncHoodieTable(false);
String originalTableName = cfg.tableName;
//TODO : Make realtime table registration optional using a config param
// TODO : Make realtime table registration optional using a config param
cfg.tableName = cfg.tableName + SUFFIX_REALTIME_TABLE;
//sync a RT table for MOR
// sync a RT table for MOR
syncHoodieTable(true);
cfg.tableName = originalTableName;
break;
@@ -85,8 +84,8 @@ public class HiveSyncTool {
}
private void syncHoodieTable(boolean isRealTime) throws ClassNotFoundException {
LOG.info("Trying to sync hoodie table " + cfg.tableName + " with base path "
+ hoodieHiveClient.getBasePath() + " of type " + hoodieHiveClient.getTableType());
LOG.info("Trying to sync hoodie table " + cfg.tableName + " with base path " + hoodieHiveClient.getBasePath()
+ " of type " + hoodieHiveClient.getTableType());
// Check if the necessary table exists
boolean tableExists = hoodieHiveClient.doesTableExist();
@@ -102,8 +101,7 @@ public class HiveSyncTool {
lastCommitTimeSynced = hoodieHiveClient.getLastCommitTimeSynced();
}
LOG.info("Last commit time synced was found to be " + lastCommitTimeSynced.orElse("null"));
List<String> writtenPartitionsSince = hoodieHiveClient
.getPartitionsWrittenToSince(lastCommitTimeSynced);
List<String> writtenPartitionsSince = hoodieHiveClient.getPartitionsWrittenToSince(lastCommitTimeSynced);
LOG.info("Storage partitions scan complete. Found " + writtenPartitionsSince.size());
// Sync the partitions if needed
syncPartitions(writtenPartitionsSince);
@@ -113,8 +111,8 @@ public class HiveSyncTool {
}
/**
* Get the latest schema from the last commit and check if it is in sync with the hive table schema.
* If not, evolves the table schema.
* Get the latest schema from the last commit and check if it is in sync with the hive table schema. If not, evolves the
* table schema.
*
* @param tableExists - does table exist
* @param schema - extracted schema
@@ -129,8 +127,8 @@ public class HiveSyncTool {
String inputFormatClassName =
cfg.usePreApacheInputFormat ? com.uber.hoodie.hadoop.HoodieInputFormat.class.getName()
: HoodieParquetInputFormat.class.getName();
hoodieHiveClient.createTable(schema, inputFormatClassName,
MapredParquetOutputFormat.class.getName(), ParquetHiveSerDe.class.getName());
hoodieHiveClient.createTable(schema, inputFormatClassName, MapredParquetOutputFormat.class.getName(),
ParquetHiveSerDe.class.getName());
} else {
// Custom serde will not work with ALTER TABLE REPLACE COLUMNS
// https://github.com/apache/hive/blob/release-1.1.0/ql/src/java/org/apache/hadoop/hive
@@ -138,14 +136,13 @@ public class HiveSyncTool {
String inputFormatClassName =
cfg.usePreApacheInputFormat ? com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat.class.getName()
: HoodieParquetRealtimeInputFormat.class.getName();
hoodieHiveClient.createTable(schema, inputFormatClassName,
MapredParquetOutputFormat.class.getName(), ParquetHiveSerDe.class.getName());
hoodieHiveClient.createTable(schema, inputFormatClassName, MapredParquetOutputFormat.class.getName(),
ParquetHiveSerDe.class.getName());
}
} else {
// Check if the dataset schema has evolved
Map<String, String> tableSchema = hoodieHiveClient.getTableSchema();
SchemaDifference schemaDiff = SchemaUtil.getSchemaDifference(schema, tableSchema,
cfg.partitionFields);
SchemaDifference schemaDiff = SchemaUtil.getSchemaDifference(schema, tableSchema, cfg.partitionFields);
if (!schemaDiff.isEmpty()) {
LOG.info("Schema difference found for " + cfg.tableName);
hoodieHiveClient.updateTableDefinition(schema);
@@ -157,14 +154,14 @@ public class HiveSyncTool {
/**
* Syncs the list of storage partitions passed in (checks if the partition is in hive, if not
* adds it or if the partition path does not match, it updates the partition path)
* Syncs the list of storage partitions passed in (checks if the partition is in hive, if not adds it or if the
* partition path does not match, it updates the partition path)
*/
private void syncPartitions(List<String> writtenPartitionsSince) {
try {
List<Partition> hivePartitions = hoodieHiveClient.scanTablePartitions();
List<PartitionEvent> partitionEvents = hoodieHiveClient.getPartitionEvents(hivePartitions,
writtenPartitionsSince);
List<PartitionEvent> partitionEvents =
hoodieHiveClient.getPartitionEvents(hivePartitions, writtenPartitionsSince);
List<String> newPartitions = filterPartitions(partitionEvents, PartitionEventType.ADD);
LOG.info("New Partitions " + newPartitions);
hoodieHiveClient.addPartitionsToTable(newPartitions);

View File

@@ -112,12 +112,11 @@ public class HoodieHiveClient {
}
try {
this.partitionValueExtractor = (PartitionValueExtractor) Class.forName(
cfg.partitionValueExtractorClass).newInstance();
this.partitionValueExtractor =
(PartitionValueExtractor) Class.forName(cfg.partitionValueExtractorClass).newInstance();
} catch (Exception e) {
throw new HoodieHiveSyncException(
"Failed to initialize PartitionValueExtractor class " + cfg.partitionValueExtractorClass,
e);
"Failed to initialize PartitionValueExtractor class " + cfg.partitionValueExtractorClass, e);
}
activeTimeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
@@ -157,28 +156,26 @@ public class HoodieHiveClient {
private String constructAddPartitions(List<String> partitions) {
StringBuilder alterSQL = new StringBuilder("ALTER TABLE ");
alterSQL.append(syncConfig.databaseName).append(".").append(syncConfig.tableName)
.append(" ADD IF NOT EXISTS ");
alterSQL.append(syncConfig.databaseName).append(".").append(syncConfig.tableName).append(" ADD IF NOT EXISTS ");
for (String partition : partitions) {
String partitionClause = getPartitionClause(partition);
String fullPartitionPath = FSUtils.getPartitionPath(syncConfig.basePath, partition).toString();
alterSQL.append(" PARTITION (").append(partitionClause).append(") LOCATION '")
.append(fullPartitionPath).append("' ");
alterSQL.append(" PARTITION (").append(partitionClause).append(") LOCATION '").append(fullPartitionPath)
.append("' ");
}
return alterSQL.toString();
}
/**
* Generate Hive Partition from partition values
*
* @param partition Partition path
* @return
*/
private String getPartitionClause(String partition) {
List<String> partitionValues = partitionValueExtractor
.extractPartitionValuesInPath(partition);
List<String> partitionValues = partitionValueExtractor.extractPartitionValuesInPath(partition);
Preconditions.checkArgument(syncConfig.partitionFields.size() == partitionValues.size(),
"Partition key parts " + syncConfig.partitionFields
+ " does not match with partition values " + partitionValues
"Partition key parts " + syncConfig.partitionFields + " does not match with partition values " + partitionValues
+ ". Check partition strategy. ");
List<String> partBuilder = new ArrayList<>();
for (int i = 0; i < syncConfig.partitionFields.size(); i++) {
@@ -204,17 +201,16 @@ public class HoodieHiveClient {
}
/**
* Iterate over the storage partitions and find if there are any new partitions that need to be
* added or updated. Generate a list of PartitionEvent based on the changes required.
* Iterate over the storage partitions and find if there are any new partitions that need to be added or updated.
* Generate a list of PartitionEvent based on the changes required.
*/
List<PartitionEvent> getPartitionEvents(List<Partition> tablePartitions,
List<String> partitionStoragePartitions) {
List<PartitionEvent> getPartitionEvents(List<Partition> tablePartitions, List<String> partitionStoragePartitions) {
Map<String, String> paths = Maps.newHashMap();
for (Partition tablePartition : tablePartitions) {
List<String> hivePartitionValues = tablePartition.getValues();
Collections.sort(hivePartitionValues);
String fullTablePartitionPath = Path.getPathWithoutSchemeAndAuthority(
new Path(tablePartition.getSd().getLocation())).toUri().getPath();
String fullTablePartitionPath =
Path.getPathWithoutSchemeAndAuthority(new Path(tablePartition.getSd().getLocation())).toUri().getPath();
paths.put(String.join(", ", hivePartitionValues), fullTablePartitionPath);
}
@@ -222,8 +218,7 @@ public class HoodieHiveClient {
for (String storagePartition : partitionStoragePartitions) {
String fullStoragePartitionPath = FSUtils.getPartitionPath(syncConfig.basePath, storagePartition).toString();
// Check if the partition values or if hdfs path is the same
List<String> storagePartitionValues = partitionValueExtractor
.extractPartitionValuesInPath(storagePartition);
List<String> storagePartitionValues = partitionValueExtractor.extractPartitionValuesInPath(storagePartition);
Collections.sort(storagePartitionValues);
if (!storagePartitionValues.isEmpty()) {
String storageValue = String.join(", ", storagePartitionValues);
@@ -250,11 +245,9 @@ public class HoodieHiveClient {
String newSchemaStr = SchemaUtil.generateSchemaString(newSchema, syncConfig.partitionFields);
// Cascade clause should not be present for non-partitioned tables
String cascadeClause = syncConfig.partitionFields.size() > 0 ? " cascade" : "";
StringBuilder sqlBuilder = new StringBuilder("ALTER TABLE ").append("`")
.append(syncConfig.databaseName).append(".")
.append(syncConfig.tableName).append("`")
.append(" REPLACE COLUMNS(").append(newSchemaStr).append(" )")
.append(cascadeClause);
StringBuilder sqlBuilder = new StringBuilder("ALTER TABLE ").append("`").append(syncConfig.databaseName)
.append(".").append(syncConfig.tableName).append("`").append(" REPLACE COLUMNS(").append(newSchemaStr)
.append(" )").append(cascadeClause);
LOG.info("Updating table definition with " + sqlBuilder);
updateHiveSQL(sqlBuilder.toString());
} catch (IOException e) {
@@ -262,12 +255,10 @@ public class HoodieHiveClient {
}
}
void createTable(MessageType storageSchema, String inputFormatClass, String outputFormatClass,
String serdeClass) {
void createTable(MessageType storageSchema, String inputFormatClass, String outputFormatClass, String serdeClass) {
try {
String createSQLQuery = SchemaUtil
.generateCreateDDL(storageSchema, syncConfig, inputFormatClass,
outputFormatClass, serdeClass);
String createSQLQuery =
SchemaUtil.generateCreateDDL(storageSchema, syncConfig, inputFormatClass, outputFormatClass, serdeClass);
LOG.info("Creating table with " + createSQLQuery);
updateHiveSQL(createSQLQuery);
} catch (IOException e) {
@@ -288,8 +279,7 @@ public class HoodieHiveClient {
ResultSet result = null;
try {
DatabaseMetaData databaseMetaData = connection.getMetaData();
result = databaseMetaData
.getColumns(null, syncConfig.databaseName, syncConfig.tableName, null);
result = databaseMetaData.getColumns(null, syncConfig.databaseName, syncConfig.tableName, null);
while (result.next()) {
String columnName = result.getString(4);
String columnType = result.getString(6);
@@ -302,8 +292,7 @@ public class HoodieHiveClient {
}
return schema;
} catch (SQLException e) {
throw new HoodieHiveSyncException("Failed to get table schema for " + syncConfig.tableName,
e);
throw new HoodieHiveSyncException("Failed to get table schema for " + syncConfig.tableName, e);
} finally {
closeQuietly(result, null);
}
@@ -318,11 +307,11 @@ public class HoodieHiveClient {
// get the Schema of the table.
final long start = System.currentTimeMillis();
Table table = this.client.getTable(syncConfig.databaseName, syncConfig.tableName);
Map<String, String> partitionKeysMap = table.getPartitionKeys().stream()
.collect(Collectors.toMap(f -> f.getName(), f -> f.getType().toUpperCase()));
Map<String, String> partitionKeysMap =
table.getPartitionKeys().stream().collect(Collectors.toMap(f -> f.getName(), f -> f.getType().toUpperCase()));
Map<String, String> columnsMap = table.getSd().getCols().stream()
.collect(Collectors.toMap(f -> f.getName(), f -> f.getType().toUpperCase()));
Map<String, String> columnsMap =
table.getSd().getCols().stream().collect(Collectors.toMap(f -> f.getName(), f -> f.getType().toUpperCase()));
Map<String, String> schema = new HashMap<>();
schema.putAll(columnsMap);
@@ -336,9 +325,8 @@ public class HoodieHiveClient {
}
/**
* Gets the schema for a hoodie dataset. Depending on the type of table, read from any file
* written in the latest commit. We will assume that the schema has not changed within a single
* atomic write.
* Gets the schema for a hoodie dataset. Depending on the type of table, read from any file written in the latest
* commit. We will assume that the schema has not changed within a single atomic write.
*
* @return Parquet schema for this dataset
*/
@@ -349,57 +337,49 @@ public class HoodieHiveClient {
case COPY_ON_WRITE:
// If this is COW, get the last commit and read the schema from a file written in the
// last commit
HoodieInstant lastCommit = activeTimeline.lastInstant().orElseThrow(
() -> new InvalidDatasetException(syncConfig.basePath));
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(
activeTimeline.getInstantDetails(lastCommit).get(), HoodieCommitMetadata.class);
String filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values()
.stream().findAny().orElseThrow(() -> new IllegalArgumentException(
"Could not find any data file written for commit " + lastCommit
+ ", could not get schema for dataset " + metaClient.getBasePath()
+ ", Metadata :" + commitMetadata));
HoodieInstant lastCommit =
activeTimeline.lastInstant().orElseThrow(() -> new InvalidDatasetException(syncConfig.basePath));
HoodieCommitMetadata commitMetadata = HoodieCommitMetadata
.fromBytes(activeTimeline.getInstantDetails(lastCommit).get(), HoodieCommitMetadata.class);
String filePath = commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny()
.orElseThrow(() -> new IllegalArgumentException("Could not find any data file written for commit "
+ lastCommit + ", could not get schema for dataset " + metaClient.getBasePath() + ", Metadata :"
+ commitMetadata));
return readSchemaFromDataFile(new Path(filePath));
case MERGE_ON_READ:
// If this is MOR, depending on whether the latest commit is a delta commit or
// compaction commit
// Get a datafile written and get the schema from that file
Option<HoodieInstant> lastCompactionCommit = metaClient.getActiveTimeline()
.getCommitTimeline()
.filterCompletedInstants()
.lastInstant();
Option<HoodieInstant> lastCompactionCommit =
metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants().lastInstant();
LOG.info("Found the last compaction commit as " + lastCompactionCommit);
Option<HoodieInstant> lastDeltaCommit;
if (lastCompactionCommit.isPresent()) {
lastDeltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline()
.filterCompletedInstants()
.findInstantsAfter(lastCompactionCommit.get().getTimestamp(),
Integer.MAX_VALUE).lastInstant();
lastDeltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().filterCompletedInstants()
.findInstantsAfter(lastCompactionCommit.get().getTimestamp(), Integer.MAX_VALUE).lastInstant();
} else {
lastDeltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline()
.filterCompletedInstants().lastInstant();
lastDeltaCommit =
metaClient.getActiveTimeline().getDeltaCommitTimeline().filterCompletedInstants().lastInstant();
}
LOG.info("Found the last delta commit " + lastDeltaCommit);
if (lastDeltaCommit.isPresent()) {
HoodieInstant lastDeltaInstant = lastDeltaCommit.get();
// read from the log file that was written
commitMetadata = HoodieCommitMetadata.fromBytes(
activeTimeline.getInstantDetails(lastDeltaInstant).get(), HoodieCommitMetadata.class);
commitMetadata = HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(lastDeltaInstant).get(),
HoodieCommitMetadata.class);
Pair<String, HoodieFileFormat> filePathWithFormat =
commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values()
.stream().filter(s -> s.contains(HoodieLogFile.DELTA_EXTENSION))
.findAny().map(f -> Pair.of(f, HoodieFileFormat.HOODIE_LOG))
.orElseGet(() -> {
commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream()
.filter(s -> s.contains(HoodieLogFile.DELTA_EXTENSION)).findAny()
.map(f -> Pair.of(f, HoodieFileFormat.HOODIE_LOG)).orElseGet(() -> {
// No Log files in Delta-Commit. Check if there are any parquet files
return commitMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream()
.filter(s -> s.contains((metaClient.getTableConfig().getROFileFormat().getFileExtension())))
.findAny()
.map(f -> Pair.of(f, HoodieFileFormat.PARQUET)).orElseThrow(() -> {
return new IllegalArgumentException(
"Could not find any data file written for commit " + lastDeltaInstant
+ ", could not get schema for dataset " + metaClient.getBasePath()
+ ", CommitMetadata :" + commitMetadata);
.findAny().map(f -> Pair.of(f, HoodieFileFormat.PARQUET)).orElseThrow(() -> {
return new IllegalArgumentException("Could not find any data file written for commit "
+ lastDeltaInstant + ", could not get schema for dataset " + metaClient.getBasePath()
+ ", CommitMetadata :" + commitMetadata);
});
});
switch (filePathWithFormat.getRight()) {
@@ -419,8 +399,7 @@ public class HoodieHiveClient {
throw new InvalidDatasetException(syncConfig.basePath);
}
} catch (IOException e) {
throw new HoodieHiveSyncException("Failed to get dataset schema for " + syncConfig.tableName,
e);
throw new HoodieHiveSyncException("Failed to get dataset schema for " + syncConfig.tableName, e);
}
}
@@ -428,20 +407,16 @@ public class HoodieHiveClient {
* Read schema from a data file from the last compaction commit done.
*/
@SuppressWarnings("OptionalUsedAsFieldOrParameterType")
private MessageType readSchemaFromLastCompaction(Option<HoodieInstant> lastCompactionCommitOpt)
throws IOException {
HoodieInstant lastCompactionCommit = lastCompactionCommitOpt.orElseThrow(
() -> new HoodieHiveSyncException(
"Could not read schema from last compaction, no compaction commits found on path "
+ syncConfig.basePath));
private MessageType readSchemaFromLastCompaction(Option<HoodieInstant> lastCompactionCommitOpt) throws IOException {
HoodieInstant lastCompactionCommit = lastCompactionCommitOpt.orElseThrow(() -> new HoodieHiveSyncException(
"Could not read schema from last compaction, no compaction commits found on path " + syncConfig.basePath));
// Read from the compacted file that was written
HoodieCommitMetadata compactionMetadata = HoodieCommitMetadata.fromBytes(
activeTimeline.getInstantDetails(lastCompactionCommit).get(), HoodieCommitMetadata.class);
String filePath = compactionMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values()
.stream().findAny().orElseThrow(() -> new IllegalArgumentException(
"Could not find any data file written for compaction " + lastCompactionCommit
+ ", could not get schema for dataset " + metaClient.getBasePath()));
HoodieCommitMetadata compactionMetadata = HoodieCommitMetadata
.fromBytes(activeTimeline.getInstantDetails(lastCompactionCommit).get(), HoodieCommitMetadata.class);
String filePath = compactionMetadata.getFileIdAndFullPaths(metaClient.getBasePath()).values().stream().findAny()
.orElseThrow(() -> new IllegalArgumentException("Could not find any data file written for compaction "
+ lastCompactionCommit + ", could not get schema for dataset " + metaClient.getBasePath()));
return readSchemaFromDataFile(new Path(filePath));
}
@@ -449,8 +424,8 @@ public class HoodieHiveClient {
* Read the schema from the log file on path
*/
@SuppressWarnings("OptionalUsedAsFieldOrParameterType")
private MessageType readSchemaFromLogFile(Option<HoodieInstant> lastCompactionCommitOpt,
Path path) throws IOException {
private MessageType readSchemaFromLogFile(Option<HoodieInstant> lastCompactionCommitOpt, Path path)
throws IOException {
MessageType messageType = SchemaUtil.readSchemaFromLogFile(fs, path);
// Fall back to read the schema from last compaction
if (messageType == null) {
@@ -469,8 +444,8 @@ public class HoodieHiveClient {
throw new IllegalArgumentException(
"Failed to read schema from data file " + parquetFilePath + ". File does not exist.");
}
ParquetMetadata fileFooter = ParquetFileReader.readFooter(fs.getConf(), parquetFilePath,
ParquetMetadataConverter.NO_FILTER);
ParquetMetadata fileFooter =
ParquetFileReader.readFooter(fs.getConf(), parquetFilePath, ParquetMetadataConverter.NO_FILTER);
return fileFooter.getFileMetaData().getSchema();
}
@@ -481,8 +456,7 @@ public class HoodieHiveClient {
try {
return client.tableExists(syncConfig.databaseName, syncConfig.tableName);
} catch (TException e) {
throw new HoodieHiveSyncException("Failed to check if table exists " + syncConfig.tableName,
e);
throw new HoodieHiveSyncException("Failed to check if table exists " + syncConfig.tableName, e);
}
}
@@ -623,11 +597,9 @@ public class HoodieHiveClient {
// Get the last commit time from the TBLproperties
try {
Table database = client.getTable(syncConfig.databaseName, syncConfig.tableName);
return Option.ofNullable(
database.getParameters().getOrDefault(HOODIE_LAST_COMMIT_TIME_SYNC, null));
return Option.ofNullable(database.getParameters().getOrDefault(HOODIE_LAST_COMMIT_TIME_SYNC, null));
} catch (Exception e) {
throw new HoodieHiveSyncException(
"Failed to get the last commit time synced from the database", e);
throw new HoodieHiveSyncException("Failed to get the last commit time synced from the database", e);
}
}
@@ -650,26 +622,21 @@ public class HoodieHiveClient {
if (!lastCommitTimeSynced.isPresent()) {
LOG.info("Last commit time synced is not known, listing all partitions in " + syncConfig.basePath + ",FS :" + fs);
try {
return FSUtils.getAllPartitionPaths(fs, syncConfig.basePath,
syncConfig.assumeDatePartitioning);
return FSUtils.getAllPartitionPaths(fs, syncConfig.basePath, syncConfig.assumeDatePartitioning);
} catch (IOException e) {
throw new HoodieIOException("Failed to list all partitions in " + syncConfig.basePath, e);
}
} else {
LOG.info("Last commit time synced is " + lastCommitTimeSynced.get()
+ ", Getting commits since then");
LOG.info("Last commit time synced is " + lastCommitTimeSynced.get() + ", Getting commits since then");
HoodieTimeline timelineToSync = activeTimeline.findInstantsAfter(lastCommitTimeSynced.get(),
Integer.MAX_VALUE);
HoodieTimeline timelineToSync = activeTimeline.findInstantsAfter(lastCommitTimeSynced.get(), Integer.MAX_VALUE);
return timelineToSync.getInstants().map(s -> {
try {
return HoodieCommitMetadata.fromBytes(activeTimeline.getInstantDetails(s).get(), HoodieCommitMetadata.class);
} catch (IOException e) {
throw new HoodieIOException(
"Failed to get partitions written since " + lastCommitTimeSynced, e);
throw new HoodieIOException("Failed to get partitions written since " + lastCommitTimeSynced, e);
}
}).flatMap(s -> s.getPartitionToWriteStats().keySet().stream()).distinct()
.collect(Collectors.toList());
}).flatMap(s -> s.getPartitionToWriteStats().keySet().stream()).distinct().collect(Collectors.toList());
}
}
@@ -685,8 +652,7 @@ public class HoodieHiveClient {
table.putToParameters(HOODIE_LAST_COMMIT_TIME_SYNC, lastCommitSynced);
client.alter_table(syncConfig.databaseName, syncConfig.tableName, table);
} catch (Exception e) {
throw new HoodieHiveSyncException(
"Failed to get update last commit time synced to " + lastCommitSynced, e);
throw new HoodieHiveSyncException("Failed to get update last commit time synced to " + lastCommitSynced, e);
}
}
@@ -697,8 +663,7 @@ public class HoodieHiveClient {
static class PartitionEvent {
public enum PartitionEventType {
ADD,
UPDATE
ADD, UPDATE
}
PartitionEventType eventType;
@@ -717,4 +682,4 @@ public class HoodieHiveClient {
return new PartitionEvent(PartitionEventType.UPDATE, storagePartition);
}
}
}
}

View File

@@ -34,8 +34,7 @@ public class MultiPartKeysValueExtractor implements PartitionValueExtractor {
return Arrays.stream(splits).map(s -> {
if (s.contains("=")) {
String[] moreSplit = s.split("=");
Preconditions.checkArgument(moreSplit.length == 2,
"Partition Field (" + s + ") not in expected format");
Preconditions.checkArgument(moreSplit.length == 2, "Partition Field (" + s + ") not in expected format");
return moreSplit[1];
}
return s;

View File

@@ -24,7 +24,7 @@ import java.util.List;
/**
* Extractor for Non-partitioned hive tables
*/
public class NonPartitionedExtractor implements PartitionValueExtractor {
public class NonPartitionedExtractor implements PartitionValueExtractor {
@Override
public List<String> extractPartitionValuesInPath(String partitionPath) {

View File

@@ -22,12 +22,10 @@ import java.io.Serializable;
import java.util.List;
/**
* HDFS Path contain hive partition values for the keys it is partitioned on. This mapping is not
* straight forward and requires a pluggable implementation to extract the partition value from HDFS
* path.
* HDFS Path contain hive partition values for the keys it is partitioned on. This mapping is not straight forward and
* requires a pluggable implementation to extract the partition value from HDFS path.
* <p>
* e.g. Hive table partitioned by datestr=yyyy-mm-dd and hdfs path
* /app/hoodie/dataset1/YYYY=[yyyy]/MM=[mm]/DD=[dd]
* e.g. Hive table partitioned by datestr=yyyy-mm-dd and hdfs path /app/hoodie/dataset1/YYYY=[yyyy]/MM=[mm]/DD=[dd]
*/
public interface PartitionValueExtractor extends Serializable {

View File

@@ -38,9 +38,8 @@ public class SchemaDifference {
private final Map<String, String> updateColumnTypes;
private final Map<String, String> addColumnTypes;
private SchemaDifference(MessageType storageSchema, Map<String, String> tableSchema,
List<String> deleteColumns, Map<String, String> updateColumnTypes,
Map<String, String> addColumnTypes) {
private SchemaDifference(MessageType storageSchema, Map<String, String> tableSchema, List<String> deleteColumns,
Map<String, String> updateColumnTypes, Map<String, String> addColumnTypes) {
this.storageSchema = storageSchema;
this.tableSchema = tableSchema;
this.deleteColumns = ImmutableList.copyOf(deleteColumns);
@@ -62,9 +61,8 @@ public class SchemaDifference {
@Override
public String toString() {
return Objects.toStringHelper(this).add("deleteColumns", deleteColumns)
.add("updateColumnTypes", updateColumnTypes).add("addColumnTypes", addColumnTypes)
.toString();
return Objects.toStringHelper(this).add("deleteColumns", deleteColumns).add("updateColumnTypes", updateColumnTypes)
.add("addColumnTypes", addColumnTypes).toString();
}
public static Builder newBuilder(MessageType storageSchema, Map<String, String> tableSchema) {
@@ -107,8 +105,7 @@ public class SchemaDifference {
}
public SchemaDifference build() {
return new SchemaDifference(storageSchema, tableSchema, deleteColumns, updateColumnTypes,
addColumnTypes);
return new SchemaDifference(storageSchema, tableSchema, deleteColumns, updateColumnTypes, addColumnTypes);
}
}
}

View File

@@ -25,9 +25,8 @@ import org.joda.time.format.DateTimeFormat;
import org.joda.time.format.DateTimeFormatter;
/**
* HDFS Path contain hive partition values for the keys it is partitioned on. This mapping is not
* straight forward and requires a pluggable implementation to extract the partition value from HDFS
* path.
* HDFS Path contain hive partition values for the keys it is partitioned on. This mapping is not straight forward and
* requires a pluggable implementation to extract the partition value from HDFS path.
* <p>
* This implementation extracts datestr=yyyy-mm-dd from path of type /yyyy/mm/dd
*/
@@ -51,8 +50,7 @@ public class SlashEncodedDayPartitionValueExtractor implements PartitionValueExt
// partition path is expected to be in this format yyyy/mm/dd
String[] splits = partitionPath.split("/");
if (splits.length != 3) {
throw new IllegalArgumentException(
"Partition path " + partitionPath + " is not in the form yyyy/mm/dd ");
throw new IllegalArgumentException("Partition path " + partitionPath + " is not in the form yyyy/mm/dd ");
}
// Get the partition part and remove the / as well at the end
int year = Integer.parseInt(splits[0]);

View File

@@ -28,8 +28,8 @@ public class ColumnNameXLator {
public static String translateNestedColumn(String colName) {
Map.Entry entry;
for (Iterator ic = xformMap.entrySet().iterator(); ic.hasNext();
colName = colName.replaceAll((String) entry.getKey(), (String) entry.getValue())) {
for (Iterator ic = xformMap.entrySet().iterator(); ic.hasNext(); colName =
colName.replaceAll((String) entry.getKey(), (String) entry.getValue())) {
entry = (Map.Entry) ic.next();
}

View File

@@ -56,8 +56,8 @@ public class SchemaUtil {
/**
* Get the schema difference between the storage schema and hive table schema
*/
public static SchemaDifference getSchemaDifference(MessageType storageSchema,
Map<String, String> tableSchema, List<String> partitionKeys) {
public static SchemaDifference getSchemaDifference(MessageType storageSchema, Map<String, String> tableSchema,
List<String> partitionKeys) {
Map<String, String> newTableSchema;
try {
newTableSchema = convertParquetSchemaToHiveSchema(storageSchema);
@@ -65,16 +65,13 @@ public class SchemaUtil {
throw new HoodieHiveSyncException("Failed to convert parquet schema to hive schema", e);
}
LOG.info("Getting schema difference for " + tableSchema + "\r\n\r\n" + newTableSchema);
SchemaDifference.Builder schemaDiffBuilder = SchemaDifference
.newBuilder(storageSchema, tableSchema);
SchemaDifference.Builder schemaDiffBuilder = SchemaDifference.newBuilder(storageSchema, tableSchema);
Set<String> tableColumns = Sets.newHashSet();
for (Map.Entry<String, String> field : tableSchema.entrySet()) {
String fieldName = field.getKey().toLowerCase();
String tickSurroundedFieldName = tickSurround(fieldName);
if (!isFieldExistsInSchema(newTableSchema, tickSurroundedFieldName) && !partitionKeys
.contains(
fieldName)) {
if (!isFieldExistsInSchema(newTableSchema, tickSurroundedFieldName) && !partitionKeys.contains(fieldName)) {
schemaDiffBuilder.deleteTableColumn(fieldName);
} else {
// check type
@@ -85,8 +82,7 @@ public class SchemaUtil {
continue;
}
// We will log this and continue. Hive schema is a superset of all parquet schemas
LOG.warn(
"Ignoring table column " + fieldName + " as its not present in the parquet schema");
LOG.warn("Ignoring table column " + fieldName + " as its not present in the parquet schema");
continue;
}
tableColumnType = tableColumnType.replaceAll("\\s+", "");
@@ -99,12 +95,10 @@ public class SchemaUtil {
// check for incremental datasets, the schema type change is allowed as per evolution
// rules
if (!isSchemaTypeUpdateAllowed(tableColumnType, expectedType)) {
throw new HoodieHiveSyncException(
"Could not convert field Type from " + tableColumnType + " to " + expectedType
+ " for field " + fieldName);
throw new HoodieHiveSyncException("Could not convert field Type from " + tableColumnType + " to "
+ expectedType + " for field " + fieldName);
}
schemaDiffBuilder.updateTableColumn(fieldName,
getExpectedType(newTableSchema, tickSurroundedFieldName));
schemaDiffBuilder.updateTableColumn(fieldName, getExpectedType(newTableSchema, tickSurroundedFieldName));
}
}
tableColumns.add(tickSurroundedFieldName);
@@ -129,8 +123,7 @@ public class SchemaUtil {
return null;
}
private static boolean isFieldExistsInSchema(Map<String, String> newTableSchema,
String fieldName) {
private static boolean isFieldExistsInSchema(Map<String, String> newTableSchema, String fieldName) {
for (String entry : newTableSchema.keySet()) {
if (entry.toLowerCase().equals(fieldName)) {
return true;
@@ -146,8 +139,7 @@ public class SchemaUtil {
* @param messageType : Parquet Schema
* @return : Hive Table schema read from parquet file MAP[String,String]
*/
public static Map<String, String> convertParquetSchemaToHiveSchema(MessageType messageType)
throws IOException {
public static Map<String, String> convertParquetSchemaToHiveSchema(MessageType messageType) throws IOException {
Map<String, String> schema = Maps.newLinkedHashMap();
List<Type> parquetFields = messageType.getFields();
for (Type parquetType : parquetFields) {
@@ -173,8 +165,8 @@ public class SchemaUtil {
private static String convertField(final Type parquetType) {
StringBuilder field = new StringBuilder();
if (parquetType.isPrimitive()) {
final PrimitiveType.PrimitiveTypeName parquetPrimitiveTypeName = parquetType.asPrimitiveType()
.getPrimitiveTypeName();
final PrimitiveType.PrimitiveTypeName parquetPrimitiveTypeName =
parquetType.asPrimitiveType().getPrimitiveTypeName();
final OriginalType originalType = parquetType.getOriginalType();
if (originalType == OriginalType.DECIMAL) {
final DecimalMetadata decimalMetadata = parquetType.asPrimitiveType().getDecimalMetadata();
@@ -182,53 +174,51 @@ public class SchemaUtil {
.append(decimalMetadata.getScale()).append(")").toString();
}
// TODO - fix the method naming here
return parquetPrimitiveTypeName
.convert(new PrimitiveType.PrimitiveTypeNameConverter<String, RuntimeException>() {
@Override
public String convertBOOLEAN(PrimitiveType.PrimitiveTypeName primitiveTypeName) {
return "boolean";
}
return parquetPrimitiveTypeName.convert(new PrimitiveType.PrimitiveTypeNameConverter<String, RuntimeException>() {
@Override
public String convertBOOLEAN(PrimitiveType.PrimitiveTypeName primitiveTypeName) {
return "boolean";
}
@Override
public String convertINT32(PrimitiveType.PrimitiveTypeName primitiveTypeName) {
return "int";
}
@Override
public String convertINT32(PrimitiveType.PrimitiveTypeName primitiveTypeName) {
return "int";
}
@Override
public String convertINT64(PrimitiveType.PrimitiveTypeName primitiveTypeName) {
return "bigint";
}
@Override
public String convertINT64(PrimitiveType.PrimitiveTypeName primitiveTypeName) {
return "bigint";
}
@Override
public String convertINT96(PrimitiveType.PrimitiveTypeName primitiveTypeName) {
return "timestamp-millis";
}
@Override
public String convertINT96(PrimitiveType.PrimitiveTypeName primitiveTypeName) {
return "timestamp-millis";
}
@Override
public String convertFLOAT(PrimitiveType.PrimitiveTypeName primitiveTypeName) {
return "float";
}
@Override
public String convertFLOAT(PrimitiveType.PrimitiveTypeName primitiveTypeName) {
return "float";
}
@Override
public String convertDOUBLE(PrimitiveType.PrimitiveTypeName primitiveTypeName) {
return "double";
}
@Override
public String convertDOUBLE(PrimitiveType.PrimitiveTypeName primitiveTypeName) {
return "double";
}
@Override
public String convertFIXED_LEN_BYTE_ARRAY(
PrimitiveType.PrimitiveTypeName primitiveTypeName) {
return "binary";
}
@Override
public String convertFIXED_LEN_BYTE_ARRAY(PrimitiveType.PrimitiveTypeName primitiveTypeName) {
return "binary";
}
@Override
public String convertBINARY(PrimitiveType.PrimitiveTypeName primitiveTypeName) {
if (originalType == OriginalType.UTF8 || originalType == OriginalType.ENUM) {
return "string";
} else {
return "binary";
}
}
});
@Override
public String convertBINARY(PrimitiveType.PrimitiveTypeName primitiveTypeName) {
if (originalType == OriginalType.UTF8 || originalType == OriginalType.ENUM) {
return "string";
} else {
return "binary";
}
}
});
} else {
GroupType parquetGroupType = parquetType.asGroupType();
OriginalType originalType = parquetGroupType.getOriginalType();
@@ -244,8 +234,7 @@ public class SchemaUtil {
}
return createHiveArray(elementType, parquetGroupType.getName());
case MAP:
if (parquetGroupType.getFieldCount() != 1 || parquetGroupType.getType(0)
.isPrimitive()) {
if (parquetGroupType.getFieldCount() != 1 || parquetGroupType.getType(0).isPrimitive()) {
throw new UnsupportedOperationException("Invalid map type " + parquetGroupType);
}
GroupType mapKeyValType = parquetGroupType.getType(0).asGroupType();
@@ -255,11 +244,10 @@ public class SchemaUtil {
throw new UnsupportedOperationException("Invalid map type " + parquetGroupType);
}
Type keyType = mapKeyValType.getType(0);
if (!keyType.isPrimitive() || !keyType.asPrimitiveType().getPrimitiveTypeName()
.equals(PrimitiveType.PrimitiveTypeName.BINARY)
if (!keyType.isPrimitive()
|| !keyType.asPrimitiveType().getPrimitiveTypeName().equals(PrimitiveType.PrimitiveTypeName.BINARY)
|| !keyType.getOriginalType().equals(OriginalType.UTF8)) {
throw new UnsupportedOperationException(
"Map key type must be binary (UTF8): " + keyType);
throw new UnsupportedOperationException("Map key type must be binary (UTF8): " + keyType);
}
Type valueType = mapKeyValType.getType(1);
return createHiveMap(convertField(keyType), convertField(valueType));
@@ -292,8 +280,8 @@ public class SchemaUtil {
StringBuilder struct = new StringBuilder();
struct.append("STRUCT< ");
for (Type field : parquetFields) {
//TODO: struct field name is only translated to support special char($)
//We will need to extend it to other collection type
// TODO: struct field name is only translated to support special char($)
// We will need to extend it to other collection type
struct.append(hiveCompatibleFieldName(field.getName(), true)).append(" : ");
struct.append(convertField(field)).append(", ");
}
@@ -353,9 +341,8 @@ public class SchemaUtil {
} else {
final GroupType groupType = elementType.asGroupType();
final List<Type> groupFields = groupType.getFields();
if (groupFields.size() > 1 || (groupFields.size() == 1 && (
elementType.getName().equals("array") || elementType.getName()
.equals(elementName + "_tuple")))) {
if (groupFields.size() > 1 || (groupFields.size() == 1
&& (elementType.getName().equals("array") || elementType.getName().equals(elementName + "_tuple")))) {
array.append(convertField(elementType));
} else {
array.append(convertField(groupType.getFields().get(0)));
@@ -366,8 +353,7 @@ public class SchemaUtil {
}
public static boolean isSchemaTypeUpdateAllowed(String prevType, String newType) {
if (prevType == null || prevType.trim().isEmpty() || newType == null || newType.trim()
.isEmpty()) {
if (prevType == null || prevType.trim().isEmpty() || newType == null || newType.trim().isEmpty()) {
return false;
}
prevType = prevType.toLowerCase();
@@ -402,8 +388,8 @@ public class SchemaUtil {
return columns.toString();
}
public static String generateCreateDDL(MessageType storageSchema, HiveSyncConfig config,
String inputFormatClass, String outputFormatClass, String serdeClass) throws IOException {
public static String generateCreateDDL(MessageType storageSchema, HiveSyncConfig config, String inputFormatClass,
String outputFormatClass, String serdeClass) throws IOException {
Map<String, String> hiveSchema = convertParquetSchemaToHiveSchema(storageSchema);
String columns = generateSchemaString(storageSchema, config.partitionFields);
@@ -423,8 +409,8 @@ public class SchemaUtil {
}
sb = sb.append(" ROW FORMAT SERDE '").append(serdeClass).append("'");
sb = sb.append(" STORED AS INPUTFORMAT '").append(inputFormatClass).append("'");
sb = sb.append(" OUTPUTFORMAT '").append(outputFormatClass).append("' LOCATION '")
.append(config.basePath).append("'");
sb = sb.append(" OUTPUTFORMAT '").append(outputFormatClass).append("' LOCATION '").append(config.basePath)
.append("'");
return sb.toString();
}
@@ -440,6 +426,7 @@ public class SchemaUtil {
/**
* Read the schema from the log file on path
*
* @return
*/
@SuppressWarnings("OptionalUsedAsFieldOrParameterType")

View File

@@ -57,7 +57,7 @@ public class HiveSyncToolTest {
@Parameterized.Parameters(name = "UseJdbc")
public static Collection<Boolean[]> data() {
return Arrays.asList(new Boolean[][]{{false}, {true}});
return Arrays.asList(new Boolean[][] {{false}, {true}});
}
@Before
@@ -71,45 +71,38 @@ public class HiveSyncToolTest {
}
/**
* Testing converting array types to Hive field declaration strings, according to the Parquet-113
* spec: https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists
* Testing converting array types to Hive field declaration strings, according to the Parquet-113 spec:
* https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists
*/
@Test
public void testSchemaConvertArray() throws IOException {
// Testing the 3-level annotation structure
MessageType schema = Types.buildMessage().optionalGroup()
.as(OriginalType.LIST).repeatedGroup()
.optional(PrimitiveType.PrimitiveTypeName.INT32).named("element")
.named("list").named("int_list").named("ArrayOfInts");
MessageType schema = Types.buildMessage().optionalGroup().as(OriginalType.LIST).repeatedGroup()
.optional(PrimitiveType.PrimitiveTypeName.INT32).named("element").named("list").named("int_list")
.named("ArrayOfInts");
String schemaString = SchemaUtil.generateSchemaString(schema);
assertEquals("`int_list` ARRAY< int>", schemaString);
// A array of arrays
schema = Types.buildMessage().optionalGroup()
.as(OriginalType.LIST).repeatedGroup().requiredGroup()
.as(OriginalType.LIST).repeatedGroup()
.required(PrimitiveType.PrimitiveTypeName.INT32).named("element").named("list")
.named("element").named("list").named("int_list_list").named("ArrayOfArrayOfInts");
schema = Types.buildMessage().optionalGroup().as(OriginalType.LIST).repeatedGroup().requiredGroup()
.as(OriginalType.LIST).repeatedGroup().required(PrimitiveType.PrimitiveTypeName.INT32).named("element")
.named("list").named("element").named("list").named("int_list_list").named("ArrayOfArrayOfInts");
schemaString = SchemaUtil.generateSchemaString(schema);
assertEquals("`int_list_list` ARRAY< ARRAY< int>>", schemaString);
// A list of integers
schema = Types.buildMessage().optionalGroup()
.as(OriginalType.LIST)
.repeated(PrimitiveType.PrimitiveTypeName.INT32).named("element").named("int_list")
.named("ArrayOfInts");
schema = Types.buildMessage().optionalGroup().as(OriginalType.LIST).repeated(PrimitiveType.PrimitiveTypeName.INT32)
.named("element").named("int_list").named("ArrayOfInts");
schemaString = SchemaUtil.generateSchemaString(schema);
assertEquals("`int_list` ARRAY< int>", schemaString);
// A list of structs with two fields
schema = Types.buildMessage().optionalGroup()
.as(OriginalType.LIST).repeatedGroup()
.required(PrimitiveType.PrimitiveTypeName.BINARY).named("str")
.required(PrimitiveType.PrimitiveTypeName.INT32).named("num").named("element")
.named("tuple_list").named("ArrayOfTuples");
schema = Types.buildMessage().optionalGroup().as(OriginalType.LIST).repeatedGroup()
.required(PrimitiveType.PrimitiveTypeName.BINARY).named("str").required(PrimitiveType.PrimitiveTypeName.INT32)
.named("num").named("element").named("tuple_list").named("ArrayOfTuples");
schemaString = SchemaUtil.generateSchemaString(schema);
assertEquals("`tuple_list` ARRAY< STRUCT< `str` : binary, `num` : int>>", schemaString);
@@ -117,10 +110,9 @@ public class HiveSyncToolTest {
// A list of structs with a single field
// For this case, since the inner group name is "array", we treat the
// element type as a one-element struct.
schema = Types.buildMessage().optionalGroup()
.as(OriginalType.LIST).repeatedGroup()
.required(PrimitiveType.PrimitiveTypeName.BINARY).named("str").named("array")
.named("one_tuple_list").named("ArrayOfOneTuples");
schema = Types.buildMessage().optionalGroup().as(OriginalType.LIST).repeatedGroup()
.required(PrimitiveType.PrimitiveTypeName.BINARY).named("str").named("array").named("one_tuple_list")
.named("ArrayOfOneTuples");
schemaString = SchemaUtil.generateSchemaString(schema);
assertEquals("`one_tuple_list` ARRAY< STRUCT< `str` : binary>>", schemaString);
@@ -128,10 +120,9 @@ public class HiveSyncToolTest {
// A list of structs with a single field
// For this case, since the inner group name ends with "_tuple", we also treat the
// element type as a one-element struct.
schema = Types.buildMessage().optionalGroup()
.as(OriginalType.LIST).repeatedGroup()
.required(PrimitiveType.PrimitiveTypeName.BINARY).named("str")
.named("one_tuple_list_tuple").named("one_tuple_list").named("ArrayOfOneTuples2");
schema = Types.buildMessage().optionalGroup().as(OriginalType.LIST).repeatedGroup()
.required(PrimitiveType.PrimitiveTypeName.BINARY).named("str").named("one_tuple_list_tuple")
.named("one_tuple_list").named("ArrayOfOneTuples2");
schemaString = SchemaUtil.generateSchemaString(schema);
assertEquals("`one_tuple_list` ARRAY< STRUCT< `str` : binary>>", schemaString);
@@ -139,22 +130,18 @@ public class HiveSyncToolTest {
// A list of structs with a single field
// Unlike the above two cases, for this the element type is the type of the
// only field in the struct.
schema = Types.buildMessage().optionalGroup()
.as(OriginalType.LIST).repeatedGroup()
.required(PrimitiveType.PrimitiveTypeName.BINARY).named("str")
.named("one_tuple_list").named("one_tuple_list").named("ArrayOfOneTuples3");
schema = Types.buildMessage().optionalGroup().as(OriginalType.LIST).repeatedGroup()
.required(PrimitiveType.PrimitiveTypeName.BINARY).named("str").named("one_tuple_list").named("one_tuple_list")
.named("ArrayOfOneTuples3");
schemaString = SchemaUtil.generateSchemaString(schema);
assertEquals("`one_tuple_list` ARRAY< binary>", schemaString);
// A list of maps
schema = Types.buildMessage().optionalGroup()
.as(OriginalType.LIST).repeatedGroup().as(OriginalType.MAP)
.repeatedGroup().as(OriginalType.MAP_KEY_VALUE)
.required(PrimitiveType.PrimitiveTypeName.BINARY).as(OriginalType.UTF8)
.named("string_key").required(PrimitiveType.PrimitiveTypeName.INT32)
.named("int_value").named("key_value").named("array").named("map_list")
.named("ArrayOfMaps");
schema = Types.buildMessage().optionalGroup().as(OriginalType.LIST).repeatedGroup().as(OriginalType.MAP)
.repeatedGroup().as(OriginalType.MAP_KEY_VALUE).required(PrimitiveType.PrimitiveTypeName.BINARY)
.as(OriginalType.UTF8).named("string_key").required(PrimitiveType.PrimitiveTypeName.INT32).named("int_value")
.named("key_value").named("array").named("map_list").named("ArrayOfMaps");
schemaString = SchemaUtil.generateSchemaString(schema);
assertEquals("`map_list` ARRAY< MAP< string, int>>", schemaString);
@@ -166,22 +153,21 @@ public class HiveSyncToolTest {
TestUtil.hiveSyncConfig.useJdbc = this.useJdbc;
String commitTime = "100";
TestUtil.createCOWDataset(commitTime, 5);
HoodieHiveClient hiveClient = new HoodieHiveClient(TestUtil.hiveSyncConfig,
TestUtil.getHiveConf(), TestUtil.fileSystem);
HoodieHiveClient hiveClient =
new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
assertFalse("Table " + TestUtil.hiveSyncConfig.tableName + " should not exist initially",
hiveClient.doesTableExist());
// Lets do the sync
HiveSyncTool tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(),
TestUtil.fileSystem);
HiveSyncTool tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
tool.syncHoodieTable();
assertTrue("Table " + TestUtil.hiveSyncConfig.tableName + " should exist after sync completes",
hiveClient.doesTableExist());
assertEquals("Hive Schema should match the dataset schema + partition field",
hiveClient.getTableSchema().size(), hiveClient.getDataSchema().getColumns().size() + 1);
assertEquals("Hive Schema should match the dataset schema + partition field", hiveClient.getTableSchema().size(),
hiveClient.getDataSchema().getColumns().size() + 1);
assertEquals("Table partitions should match the number of partitions we wrote", 5,
hiveClient.scanTablePartitions().size());
assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES",
commitTime, hiveClient.getLastCommitTimeSynced().get());
assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES", commitTime,
hiveClient.getLastCommitTimeSynced().get());
}
@Test
@@ -189,16 +175,15 @@ public class HiveSyncToolTest {
TestUtil.hiveSyncConfig.useJdbc = this.useJdbc;
String commitTime1 = "100";
TestUtil.createCOWDataset(commitTime1, 5);
HoodieHiveClient hiveClient = new HoodieHiveClient(TestUtil.hiveSyncConfig,
TestUtil.getHiveConf(), TestUtil.fileSystem);
HoodieHiveClient hiveClient =
new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
// Lets do the sync
HiveSyncTool tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(),
TestUtil.fileSystem);
HiveSyncTool tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
tool.syncHoodieTable();
assertEquals("Table partitions should match the number of partitions we wrote", 5,
hiveClient.scanTablePartitions().size());
assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES",
commitTime1, hiveClient.getLastCommitTimeSynced().get());
assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES", commitTime1,
hiveClient.getLastCommitTimeSynced().get());
// Now lets create more parititions and these are the only ones which needs to be synced
DateTime dateTime = DateTime.now().plusDays(6);
@@ -206,15 +191,11 @@ public class HiveSyncToolTest {
TestUtil.addCOWPartitions(1, true, dateTime, commitTime2);
// Lets do the sync
hiveClient = new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(),
TestUtil.fileSystem);
List<String> writtenPartitionsSince = hiveClient.getPartitionsWrittenToSince(
Option.of(commitTime1));
assertEquals("We should have one partition written after 100 commit", 1,
writtenPartitionsSince.size());
hiveClient = new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
List<String> writtenPartitionsSince = hiveClient.getPartitionsWrittenToSince(Option.of(commitTime1));
assertEquals("We should have one partition written after 100 commit", 1, writtenPartitionsSince.size());
List<Partition> hivePartitions = hiveClient.scanTablePartitions();
List<PartitionEvent> partitionEvents = hiveClient.getPartitionEvents(hivePartitions,
writtenPartitionsSince);
List<PartitionEvent> partitionEvents = hiveClient.getPartitionEvents(hivePartitions, writtenPartitionsSince);
assertEquals("There should be only one paritition event", 1, partitionEvents.size());
assertEquals("The one partition event must of type ADD", PartitionEventType.ADD,
partitionEvents.iterator().next().eventType);
@@ -222,8 +203,7 @@ public class HiveSyncToolTest {
tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
tool.syncHoodieTable();
// Sync should add the one partition
assertEquals("The one partition we wrote should be added to hive", 6,
hiveClient.scanTablePartitions().size());
assertEquals("The one partition we wrote should be added to hive", 6, hiveClient.scanTablePartitions().size());
assertEquals("The last commit that was sycned should be 101", commitTime2,
hiveClient.getLastCommitTimeSynced().get());
}
@@ -233,11 +213,10 @@ public class HiveSyncToolTest {
TestUtil.hiveSyncConfig.useJdbc = this.useJdbc;
String commitTime1 = "100";
TestUtil.createCOWDataset(commitTime1, 5);
HoodieHiveClient hiveClient = new HoodieHiveClient(TestUtil.hiveSyncConfig,
TestUtil.getHiveConf(), TestUtil.fileSystem);
HoodieHiveClient hiveClient =
new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
// Lets do the sync
HiveSyncTool tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(),
TestUtil.fileSystem);
HiveSyncTool tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
tool.syncHoodieTable();
int fields = hiveClient.getTableSchema().size();
@@ -253,14 +232,13 @@ public class HiveSyncToolTest {
assertEquals("Hive Schema has evolved and should not be 3 more field", fields + 3,
hiveClient.getTableSchema().size());
assertEquals("Hive Schema has evolved - Field favorite_number has evolved from int to long",
"BIGINT", hiveClient.getTableSchema().get("favorite_number"));
assertEquals("Hive Schema has evolved - Field favorite_number has evolved from int to long", "BIGINT",
hiveClient.getTableSchema().get("favorite_number"));
assertTrue("Hive Schema has evolved - Field favorite_movie was added",
hiveClient.getTableSchema().containsKey("favorite_movie"));
// Sync should add the one partition
assertEquals("The one partition we wrote should be added to hive", 6,
hiveClient.scanTablePartitions().size());
assertEquals("The one partition we wrote should be added to hive", 6, hiveClient.scanTablePartitions().size());
assertEquals("The last commit that was sycned should be 101", commitTime2,
hiveClient.getLastCommitTimeSynced().get());
}
@@ -271,24 +249,22 @@ public class HiveSyncToolTest {
String commitTime = "100";
String deltaCommitTime = "101";
TestUtil.createMORDataset(commitTime, deltaCommitTime, 5);
HoodieHiveClient hiveClient = new HoodieHiveClient(TestUtil.hiveSyncConfig,
TestUtil.getHiveConf(), TestUtil.fileSystem);
HoodieHiveClient hiveClient =
new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
assertFalse("Table " + TestUtil.hiveSyncConfig.tableName + " should not exist initially",
hiveClient.doesTableExist());
// Lets do the sync
HiveSyncTool tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(),
TestUtil.fileSystem);
HiveSyncTool tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
tool.syncHoodieTable();
assertTrue("Table " + TestUtil.hiveSyncConfig.tableName + " should exist after sync completes",
hiveClient.doesTableExist());
assertEquals("Hive Schema should match the dataset schema + partition field",
hiveClient.getTableSchema().size(),
assertEquals("Hive Schema should match the dataset schema + partition field", hiveClient.getTableSchema().size(),
SchemaTestUtil.getSimpleSchema().getFields().size() + 1);
assertEquals("Table partitions should match the number of partitions we wrote", 5,
hiveClient.scanTablePartitions().size());
assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES",
deltaCommitTime, hiveClient.getLastCommitTimeSynced().get());
assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES", deltaCommitTime,
hiveClient.getLastCommitTimeSynced().get());
// Now lets create more parititions and these are the only ones which needs to be synced
DateTime dateTime = DateTime.now().plusDays(6);
@@ -300,50 +276,43 @@ public class HiveSyncToolTest {
// Lets do the sync
tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
tool.syncHoodieTable();
hiveClient = new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(),
TestUtil.fileSystem);
hiveClient = new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
assertEquals("Hive Schema should match the evolved dataset schema + partition field",
hiveClient.getTableSchema().size(),
SchemaTestUtil.getEvolvedSchema().getFields().size() + 1);
hiveClient.getTableSchema().size(), SchemaTestUtil.getEvolvedSchema().getFields().size() + 1);
// Sync should add the one partition
assertEquals("The 2 partitions we wrote should be added to hive", 6,
hiveClient.scanTablePartitions().size());
assertEquals("The 2 partitions we wrote should be added to hive", 6, hiveClient.scanTablePartitions().size());
assertEquals("The last commit that was sycned should be 103", deltaCommitTime2,
hiveClient.getLastCommitTimeSynced().get());
}
@Test
public void testSyncMergeOnReadRT()
throws Exception {
public void testSyncMergeOnReadRT() throws Exception {
TestUtil.hiveSyncConfig.useJdbc = this.useJdbc;
String commitTime = "100";
String deltaCommitTime = "101";
String roTablename = TestUtil.hiveSyncConfig.tableName;
TestUtil.hiveSyncConfig.tableName =
TestUtil.hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_REALTIME_TABLE;
TestUtil.hiveSyncConfig.tableName = TestUtil.hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_REALTIME_TABLE;
TestUtil.createMORDataset(commitTime, deltaCommitTime, 5);
HoodieHiveClient hiveClientRT = new HoodieHiveClient(TestUtil.hiveSyncConfig,
TestUtil.getHiveConf(), TestUtil.fileSystem);
HoodieHiveClient hiveClientRT =
new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
assertFalse("Table " + TestUtil.hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_REALTIME_TABLE
+ " should not exist initially", hiveClientRT.doesTableExist());
// Lets do the sync
HiveSyncTool tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(),
TestUtil.fileSystem);
HiveSyncTool tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
tool.syncHoodieTable();
assertTrue("Table " + TestUtil.hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_REALTIME_TABLE
+ " should exist after sync completes", hiveClientRT.doesTableExist());
assertEquals("Hive Schema should match the dataset schema + partition field",
hiveClientRT.getTableSchema().size(),
assertEquals("Hive Schema should match the dataset schema + partition field", hiveClientRT.getTableSchema().size(),
SchemaTestUtil.getSimpleSchema().getFields().size() + 1);
assertEquals("Table partitions should match the number of partitions we wrote", 5,
hiveClientRT.scanTablePartitions().size());
assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES",
deltaCommitTime, hiveClientRT.getLastCommitTimeSynced().get());
assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES", deltaCommitTime,
hiveClientRT.getLastCommitTimeSynced().get());
// Now lets create more parititions and these are the only ones which needs to be synced
DateTime dateTime = DateTime.now().plusDays(6);
@@ -355,23 +324,19 @@ public class HiveSyncToolTest {
// Lets do the sync
tool = new HiveSyncTool(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
tool.syncHoodieTable();
hiveClientRT = new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(),
TestUtil.fileSystem);
hiveClientRT = new HoodieHiveClient(TestUtil.hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
assertEquals("Hive Schema should match the evolved dataset schema + partition field",
hiveClientRT.getTableSchema().size(),
SchemaTestUtil.getEvolvedSchema().getFields().size() + 1);
hiveClientRT.getTableSchema().size(), SchemaTestUtil.getEvolvedSchema().getFields().size() + 1);
// Sync should add the one partition
assertEquals("The 2 partitions we wrote should be added to hive", 6,
hiveClientRT.scanTablePartitions().size());
assertEquals("The 2 partitions we wrote should be added to hive", 6, hiveClientRT.scanTablePartitions().size());
assertEquals("The last commit that was sycned should be 103", deltaCommitTime2,
hiveClientRT.getLastCommitTimeSynced().get());
TestUtil.hiveSyncConfig.tableName = roTablename;
}
@Test
public void testMultiPartitionKeySync()
throws Exception {
public void testMultiPartitionKeySync() throws Exception {
TestUtil.hiveSyncConfig.useJdbc = this.useJdbc;
String commitTime = "100";
TestUtil.createCOWDataset(commitTime, 5);
@@ -382,20 +347,17 @@ public class HiveSyncToolTest {
hiveSyncConfig.partitionFields = Lists.newArrayList("year", "month", "day");
TestUtil.getCreatedTablesSet().add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName);
HoodieHiveClient hiveClient = new HoodieHiveClient(hiveSyncConfig,
TestUtil.getHiveConf(), TestUtil.fileSystem);
assertFalse("Table " + hiveSyncConfig.tableName + " should not exist initially",
hiveClient.doesTableExist());
HoodieHiveClient hiveClient = new HoodieHiveClient(hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
assertFalse("Table " + hiveSyncConfig.tableName + " should not exist initially", hiveClient.doesTableExist());
// Lets do the sync
HiveSyncTool tool = new HiveSyncTool(hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);
tool.syncHoodieTable();
assertTrue("Table " + hiveSyncConfig.tableName + " should exist after sync completes",
hiveClient.doesTableExist());
assertEquals("Hive Schema should match the dataset schema + partition fields",
hiveClient.getTableSchema().size(), hiveClient.getDataSchema().getColumns().size() + 3);
assertTrue("Table " + hiveSyncConfig.tableName + " should exist after sync completes", hiveClient.doesTableExist());
assertEquals("Hive Schema should match the dataset schema + partition fields", hiveClient.getTableSchema().size(),
hiveClient.getDataSchema().getColumns().size() + 3);
assertEquals("Table partitions should match the number of partitions we wrote", 5,
hiveClient.scanTablePartitions().size());
assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES",
commitTime, hiveClient.getLastCommitTimeSynced().get());
assertEquals("The last commit that was sycned should be updated in the TBLPROPERTIES", commitTime,
hiveClient.getLastCommitTimeSynced().get());
}
}

View File

@@ -119,12 +119,10 @@ public class TestUtil {
static void clear() throws IOException {
fileSystem.delete(new Path(hiveSyncConfig.basePath), true);
HoodieTableMetaClient
.initTableType(configuration, hiveSyncConfig.basePath, HoodieTableType.COPY_ON_WRITE,
hiveSyncConfig.tableName, HoodieAvroPayload.class.getName());
HoodieTableMetaClient.initTableType(configuration, hiveSyncConfig.basePath, HoodieTableType.COPY_ON_WRITE,
hiveSyncConfig.tableName, HoodieAvroPayload.class.getName());
HoodieHiveClient client = new HoodieHiveClient(hiveSyncConfig, hiveServer.getHiveConf(),
fileSystem);
HoodieHiveClient client = new HoodieHiveClient(hiveSyncConfig, hiveServer.getHiveConf(), fileSystem);
for (String tableName : createdTablesSet) {
client.updateHiveSQL("drop table if exists " + tableName);
}
@@ -154,14 +152,12 @@ public class TestUtil {
throws IOException, InitializationError, URISyntaxException, InterruptedException {
Path path = new Path(hiveSyncConfig.basePath);
FileIOUtils.deleteDirectory(new File(hiveSyncConfig.basePath));
HoodieTableMetaClient
.initTableType(configuration, hiveSyncConfig.basePath, HoodieTableType.COPY_ON_WRITE,
hiveSyncConfig.tableName, HoodieAvroPayload.class.getName());
HoodieTableMetaClient.initTableType(configuration, hiveSyncConfig.basePath, HoodieTableType.COPY_ON_WRITE,
hiveSyncConfig.tableName, HoodieAvroPayload.class.getName());
boolean result = fileSystem.mkdirs(path);
checkResult(result);
DateTime dateTime = DateTime.now();
HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions, true, dateTime,
commitTime);
HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions, true, dateTime, commitTime);
createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName);
createCommitFile(commitMetadata, commitTime);
}
@@ -170,57 +166,51 @@ public class TestUtil {
throws IOException, InitializationError, URISyntaxException, InterruptedException {
Path path = new Path(hiveSyncConfig.basePath);
FileIOUtils.deleteDirectory(new File(hiveSyncConfig.basePath));
HoodieTableMetaClient
.initTableType(configuration, hiveSyncConfig.basePath, HoodieTableType.MERGE_ON_READ,
hiveSyncConfig.tableName, HoodieAvroPayload.class.getName());
HoodieTableMetaClient.initTableType(configuration, hiveSyncConfig.basePath, HoodieTableType.MERGE_ON_READ,
hiveSyncConfig.tableName, HoodieAvroPayload.class.getName());
boolean result = fileSystem.mkdirs(path);
checkResult(result);
DateTime dateTime = DateTime.now();
HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions, true, dateTime,
commitTime);
HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions, true, dateTime, commitTime);
createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName);
createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName
+ HiveSyncTool.SUFFIX_REALTIME_TABLE);
createdTablesSet
.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_REALTIME_TABLE);
HoodieCommitMetadata compactionMetadata = new HoodieCommitMetadata();
commitMetadata.getPartitionToWriteStats().forEach(
(key, value) -> value.stream().forEach(l -> compactionMetadata.addWriteStat(key, l)));
commitMetadata.getPartitionToWriteStats()
.forEach((key, value) -> value.stream().forEach(l -> compactionMetadata.addWriteStat(key, l)));
createCompactionCommitFile(compactionMetadata, commitTime);
// Write a delta commit
HoodieCommitMetadata deltaMetadata = createLogFiles(commitMetadata.getPartitionToWriteStats(),
true);
HoodieCommitMetadata deltaMetadata = createLogFiles(commitMetadata.getPartitionToWriteStats(), true);
createDeltaCommitFile(deltaMetadata, deltaCommitTime);
}
static void addCOWPartitions(int numberOfPartitions, boolean isParquetSchemaSimple,
DateTime startFrom, String commitTime)
throws IOException, URISyntaxException, InterruptedException {
HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions,
isParquetSchemaSimple, startFrom, commitTime);
static void addCOWPartitions(int numberOfPartitions, boolean isParquetSchemaSimple, DateTime startFrom,
String commitTime) throws IOException, URISyntaxException, InterruptedException {
HoodieCommitMetadata commitMetadata =
createPartitions(numberOfPartitions, isParquetSchemaSimple, startFrom, commitTime);
createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName);
createCommitFile(commitMetadata, commitTime);
}
static void addMORPartitions(int numberOfPartitions, boolean isParquetSchemaSimple,
boolean isLogSchemaSimple, DateTime startFrom, String commitTime, String deltaCommitTime)
static void addMORPartitions(int numberOfPartitions, boolean isParquetSchemaSimple, boolean isLogSchemaSimple,
DateTime startFrom, String commitTime, String deltaCommitTime)
throws IOException, URISyntaxException, InterruptedException {
HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions,
isParquetSchemaSimple, startFrom, commitTime);
HoodieCommitMetadata commitMetadata =
createPartitions(numberOfPartitions, isParquetSchemaSimple, startFrom, commitTime);
createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName);
createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName
+ HiveSyncTool.SUFFIX_REALTIME_TABLE);
createdTablesSet
.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_REALTIME_TABLE);
HoodieCommitMetadata compactionMetadata = new HoodieCommitMetadata();
commitMetadata.getPartitionToWriteStats().forEach(
(key, value) -> value.stream().forEach(l -> compactionMetadata.addWriteStat(key, l)));
commitMetadata.getPartitionToWriteStats()
.forEach((key, value) -> value.stream().forEach(l -> compactionMetadata.addWriteStat(key, l)));
createCompactionCommitFile(compactionMetadata, commitTime);
HoodieCommitMetadata deltaMetadata = createLogFiles(commitMetadata.getPartitionToWriteStats(),
isLogSchemaSimple);
HoodieCommitMetadata deltaMetadata = createLogFiles(commitMetadata.getPartitionToWriteStats(), isLogSchemaSimple);
createDeltaCommitFile(deltaMetadata, deltaCommitTime);
}
private static HoodieCommitMetadata createLogFiles(
Map<String, List<HoodieWriteStat>> partitionWriteStats, boolean isLogSchemaSimple)
throws InterruptedException, IOException, URISyntaxException {
private static HoodieCommitMetadata createLogFiles(Map<String, List<HoodieWriteStat>> partitionWriteStats,
boolean isLogSchemaSimple) throws InterruptedException, IOException, URISyntaxException {
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
for (Entry<String, List<HoodieWriteStat>> wEntry : partitionWriteStats.entrySet()) {
String partitionPath = wEntry.getKey();
@@ -237,9 +227,8 @@ public class TestUtil {
return commitMetadata;
}
private static HoodieCommitMetadata createPartitions(int numberOfPartitions,
boolean isParquetSchemaSimple, DateTime startFrom, String commitTime)
throws IOException, URISyntaxException, InterruptedException {
private static HoodieCommitMetadata createPartitions(int numberOfPartitions, boolean isParquetSchemaSimple,
DateTime startFrom, String commitTime) throws IOException, URISyntaxException, InterruptedException {
startFrom = startFrom.withTimeAtStartOfDay();
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
@@ -248,22 +237,20 @@ public class TestUtil {
Path partPath = new Path(hiveSyncConfig.basePath + "/" + partitionPath);
fileSystem.makeQualified(partPath);
fileSystem.mkdirs(partPath);
List<HoodieWriteStat> writeStats = createTestData(partPath, isParquetSchemaSimple,
commitTime);
List<HoodieWriteStat> writeStats = createTestData(partPath, isParquetSchemaSimple, commitTime);
startFrom = startFrom.minusDays(1);
writeStats.forEach(s -> commitMetadata.addWriteStat(partitionPath, s));
}
return commitMetadata;
}
private static List<HoodieWriteStat> createTestData(Path partPath, boolean isParquetSchemaSimple,
String commitTime) throws IOException, URISyntaxException, InterruptedException {
private static List<HoodieWriteStat> createTestData(Path partPath, boolean isParquetSchemaSimple, String commitTime)
throws IOException, URISyntaxException, InterruptedException {
List<HoodieWriteStat> writeStats = Lists.newArrayList();
for (int i = 0; i < 5; i++) {
// Create 5 files
String fileId = UUID.randomUUID().toString();
Path filePath = new Path(partPath.toString() + "/" + FSUtils.makeDataFileName(commitTime,
"1-0-1", fileId));
Path filePath = new Path(partPath.toString() + "/" + FSUtils.makeDataFileName(commitTime, "1-0-1", fileId));
generateParquetData(filePath, isParquetSchemaSimple);
HoodieWriteStat writeStat = new HoodieWriteStat();
writeStat.setFileId(fileId);
@@ -276,18 +263,15 @@ public class TestUtil {
@SuppressWarnings({"unchecked", "deprecation"})
private static void generateParquetData(Path filePath, boolean isParquetSchemaSimple)
throws IOException, URISyntaxException, InterruptedException {
Schema schema = (isParquetSchemaSimple ? SchemaTestUtil.getSimpleSchema()
: SchemaTestUtil.getEvolvedSchema());
Schema schema = (isParquetSchemaSimple ? SchemaTestUtil.getSimpleSchema() : SchemaTestUtil.getEvolvedSchema());
org.apache.parquet.schema.MessageType parquetSchema = new AvroSchemaConverter().convert(schema);
BloomFilter filter = new BloomFilter(1000, 0.0001);
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(parquetSchema, schema, filter);
ParquetWriter writer = new ParquetWriter(filePath, writeSupport, CompressionCodecName.GZIP,
120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE,
ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED,
ParquetWriter.DEFAULT_WRITER_VERSION, fileSystem.getConf());
ParquetWriter writer = new ParquetWriter(filePath, writeSupport, CompressionCodecName.GZIP, 120 * 1024 * 1024,
ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED,
ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED, ParquetWriter.DEFAULT_WRITER_VERSION, fileSystem.getConf());
List<IndexedRecord> testRecords = (isParquetSchemaSimple ? SchemaTestUtil
.generateTestRecords(0, 100)
List<IndexedRecord> testRecords = (isParquetSchemaSimple ? SchemaTestUtil.generateTestRecords(0, 100)
: SchemaTestUtil.generateEvolvedTestRecords(100, 100));
testRecords.forEach(s -> {
try {
@@ -301,13 +285,11 @@ public class TestUtil {
private static HoodieLogFile generateLogData(Path parquetFilePath, boolean isLogSchemaSimple)
throws IOException, InterruptedException, URISyntaxException {
Schema schema = (isLogSchemaSimple ? SchemaTestUtil.getSimpleSchema()
: SchemaTestUtil.getEvolvedSchema());
Schema schema = (isLogSchemaSimple ? SchemaTestUtil.getSimpleSchema() : SchemaTestUtil.getEvolvedSchema());
HoodieDataFile dataFile = new HoodieDataFile(fileSystem.getFileStatus(parquetFilePath));
// Write a log file for this parquet file
Writer logWriter = HoodieLogFormat.newWriterBuilder().onParentPath(parquetFilePath.getParent())
.withFileExtension(HoodieLogFile.DELTA_EXTENSION)
.withFileId(dataFile.getFileId())
.withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId(dataFile.getFileId())
.overBaseCommit(dataFile.getCommitTime()).withFs(fileSystem).build();
List<IndexedRecord> records = (isLogSchemaSimple ? SchemaTestUtil.generateTestRecords(0, 100)
: SchemaTestUtil.generateEvolvedTestRecords(100, 100));
@@ -326,37 +308,30 @@ public class TestUtil {
}
}
private static void createCommitFile(HoodieCommitMetadata commitMetadata, String commitTime)
private static void createCommitFile(HoodieCommitMetadata commitMetadata, String commitTime) throws IOException {
byte[] bytes = commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8);
Path fullPath = new Path(hiveSyncConfig.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/"
+ HoodieTimeline.makeCommitFileName(commitTime));
FSDataOutputStream fsout = fileSystem.create(fullPath, true);
fsout.write(bytes);
fsout.close();
}
/**
 * Serializes the given compaction metadata to JSON and writes it as a commit file for {@code commitTime}.
 *
 * <p>The target path is built exactly as in {@code createCommitFile}:
 * {@code <basePath>/<HoodieTableMetaClient.METAFOLDER_NAME>/} followed by
 * {@code HoodieTimeline.makeCommitFileName(commitTime)}. An existing file at that path is overwritten.
 *
 * @param commitMetadata metadata whose JSON form becomes the file content (UTF-8 encoded)
 * @param commitTime commit time used to derive the file name
 * @throws IOException if the file cannot be created or written
 */
private static void createCompactionCommitFile(HoodieCommitMetadata commitMetadata, String commitTime)
    throws IOException {
  byte[] bytes = commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8);
  Path fullPath = new Path(hiveSyncConfig.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/"
      + HoodieTimeline.makeCommitFileName(commitTime));
  // try-with-resources closes the stream even when write() throws, so no handle is leaked.
  try (FSDataOutputStream fsout = fileSystem.create(fullPath, true)) {
    fsout.write(bytes);
  }
}
private static void createDeltaCommitFile(HoodieCommitMetadata deltaCommitMetadata,
String deltaCommitTime) throws IOException {
private static void createDeltaCommitFile(HoodieCommitMetadata deltaCommitMetadata, String deltaCommitTime)
throws IOException {
byte[] bytes = deltaCommitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8);
Path fullPath = new Path(
hiveSyncConfig.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline
.makeDeltaFileName(
deltaCommitTime));
Path fullPath = new Path(hiveSyncConfig.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/"
+ HoodieTimeline.makeDeltaFileName(deltaCommitTime));
FSDataOutputStream fsout = fileSystem.create(fullPath, true);
fsout.write(bytes);
fsout.close();

View File

@@ -142,7 +142,7 @@ public class HiveTestService {
// 'new HiveConf()'. This is fixed by https://issues.apache.org/jira/browse/HIVE-6657,
// in Hive 0.14.
// As a workaround, the property is set in hive-site.xml in this module.
//conf.set(HiveConf.ConfVars.HIVE_SERVER2_AUTHENTICATION.varname, "NOSASL");
// conf.set(HiveConf.ConfVars.HIVE_SERVER2_AUTHENTICATION.varname, "NOSASL");
File localHiveDir = new File(localHiveLocation);
localHiveDir.mkdirs();
File metastoreDbDir = new File(localHiveDir, "metastore_db");
@@ -225,8 +225,7 @@ public class HiveTestService {
private final TTransportFactory parentTransFactory;
private final TTransportFactory childTransFactory;
/**
 * Captures the two transport factories this instance works with.
 *
 * @param parentTransFactory factory stored in {@code parentTransFactory}
 * @param childTransFactory factory stored in {@code childTransFactory}
 */
private ChainedTTransportFactory(TTransportFactory parentTransFactory, TTransportFactory childTransFactory) {
  this.childTransFactory = childTransFactory;
  this.parentTransFactory = parentTransFactory;
}
@@ -268,17 +267,15 @@ public class HiveTestService {
int minWorkerThreads = conf.getIntVar(HiveConf.ConfVars.METASTORESERVERMINTHREADS);
int maxWorkerThreads = conf.getIntVar(HiveConf.ConfVars.METASTORESERVERMAXTHREADS);
boolean tcpKeepAlive = conf.getBoolVar(HiveConf.ConfVars.METASTORE_TCP_KEEP_ALIVE);
boolean useFramedTransport = conf.getBoolVar(
HiveConf.ConfVars.METASTORE_USE_THRIFT_FRAMED_TRANSPORT);
boolean useFramedTransport = conf.getBoolVar(HiveConf.ConfVars.METASTORE_USE_THRIFT_FRAMED_TRANSPORT);
// don't support SASL yet
//boolean useSasl = conf.getBoolVar(HiveConf.ConfVars.METASTORE_USE_THRIFT_SASL);
// boolean useSasl = conf.getBoolVar(HiveConf.ConfVars.METASTORE_USE_THRIFT_SASL);
TServerTransport serverTransport;
if (forceBindIP != null) {
InetSocketAddress address = new InetSocketAddress(forceBindIP, port);
serverTransport =
tcpKeepAlive ? new TServerSocketKeepAlive(address) : new TServerSocket(address);
serverTransport = tcpKeepAlive ? new TServerSocketKeepAlive(address) : new TServerSocket(address);
} else {
serverTransport = tcpKeepAlive ? new TServerSocketKeepAlive(port) : new TServerSocket(port);
@@ -287,29 +284,24 @@ public class HiveTestService {
TProcessor processor;
TTransportFactory transFactory;
IHMSHandler handler = (IHMSHandler) HiveMetaStore
.newRetryingHMSHandler("new db based metaserver",
conf, true);
IHMSHandler handler = (IHMSHandler) HiveMetaStore.newRetryingHMSHandler("new db based metaserver", conf, true);
if (conf.getBoolVar(HiveConf.ConfVars.METASTORE_EXECUTE_SET_UGI)) {
transFactory =
useFramedTransport ? new ChainedTTransportFactory(new TFramedTransport.Factory(),
new TUGIContainingTransport.Factory()) : new TUGIContainingTransport.Factory();
transFactory = useFramedTransport
? new ChainedTTransportFactory(new TFramedTransport.Factory(), new TUGIContainingTransport.Factory())
: new TUGIContainingTransport.Factory();
processor = new TUGIBasedProcessor<IHMSHandler>(handler);
LOG.info("Starting DB backed MetaStore Server with SetUGI enabled");
} else {
transFactory =
useFramedTransport ? new TFramedTransport.Factory() : new TTransportFactory();
transFactory = useFramedTransport ? new TFramedTransport.Factory() : new TTransportFactory();
processor = new TSetIpAddressProcessor<IHMSHandler>(handler);
LOG.info("Starting DB backed MetaStore Server");
}
TThreadPoolServer.Args args = new TThreadPoolServer.Args(serverTransport).processor(processor)
.transportFactory(transFactory)
.protocolFactory(new TBinaryProtocol.Factory())
.minWorkerThreads(minWorkerThreads)
.maxWorkerThreads(maxWorkerThreads);
.transportFactory(transFactory).protocolFactory(new TBinaryProtocol.Factory())
.minWorkerThreads(minWorkerThreads).maxWorkerThreads(maxWorkerThreads);
final TServer tServer = new TThreadPoolServer(args);
executorService.submit(new Runnable() {