[HUDI-3383] Sync column comments while syncing a hive table (#4960)
Desc: Add a hive sync config(hoodie.datasource.hive_sync.sync_comment). This config defaults to false. While syncing data source to hudi, add column comments to source avro schema, and the sync_comment is true, syncing column comments to the hive table.
This commit is contained in:
@@ -26,7 +26,7 @@ import org.apache.spark.rdd.RDD
|
||||
import org.apache.spark.sql.avro.SchemaConverters
|
||||
import org.apache.spark.sql.catalyst.InternalRow
|
||||
import org.apache.spark.sql.catalyst.encoders.RowEncoder
|
||||
import org.apache.spark.sql.types.{DataType, StructType}
|
||||
import org.apache.spark.sql.types.{ArrayType, DataType, MapType, StructType}
|
||||
import org.apache.spark.sql.{Dataset, Row, SparkSession}
|
||||
|
||||
import scala.collection.JavaConversions._
|
||||
@@ -144,7 +144,7 @@ object AvroConversionUtils {
|
||||
def convertStructTypeToAvroSchema(structType: DataType,
|
||||
structName: String,
|
||||
recordNamespace: String): Schema = {
|
||||
getAvroSchemaWithDefaults(SchemaConverters.toAvroType(structType, nullable = false, structName, recordNamespace))
|
||||
getAvroSchemaWithDefaults(SchemaConverters.toAvroType(structType, nullable = false, structName, recordNamespace), structType)
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -154,13 +154,20 @@ object AvroConversionUtils {
|
||||
* @param schema input avro schema
|
||||
* @return Avro schema with null default set to nullable fields
|
||||
*/
|
||||
def getAvroSchemaWithDefaults(schema: Schema): Schema = {
|
||||
def getAvroSchemaWithDefaults(schema: Schema, dataType: DataType): Schema = {
|
||||
|
||||
schema.getType match {
|
||||
case Schema.Type.RECORD => {
|
||||
|
||||
val structType = dataType.asInstanceOf[StructType]
|
||||
val structFields = structType.fields
|
||||
val modifiedFields = schema.getFields.map(field => {
|
||||
val newSchema = getAvroSchemaWithDefaults(field.schema())
|
||||
val i: Int = structType.fieldIndex(field.name())
|
||||
val comment: String = if (structFields(i).metadata.contains("comment")) {
|
||||
structFields(i).metadata.getString("comment")
|
||||
} else {
|
||||
field.doc()
|
||||
}
|
||||
val newSchema = getAvroSchemaWithDefaults(field.schema(), structFields(i).dataType)
|
||||
field.schema().getType match {
|
||||
case Schema.Type.UNION => {
|
||||
val innerFields = newSchema.getTypes
|
||||
@@ -168,27 +175,27 @@ object AvroConversionUtils {
|
||||
if(containsNullSchema) {
|
||||
// Need to re shuffle the fields in list because to set null as default, null schema must be head in union schema
|
||||
val restructuredNewSchema = Schema.createUnion(List(Schema.create(Schema.Type.NULL)) ++ innerFields.filter(innerSchema => !(innerSchema.getType == Schema.Type.NULL)))
|
||||
new Schema.Field(field.name(), restructuredNewSchema, field.doc(), JsonProperties.NULL_VALUE)
|
||||
new Schema.Field(field.name(), restructuredNewSchema, comment, JsonProperties.NULL_VALUE)
|
||||
} else {
|
||||
new Schema.Field(field.name(), newSchema, field.doc(), field.defaultVal())
|
||||
new Schema.Field(field.name(), newSchema, comment, field.defaultVal())
|
||||
}
|
||||
}
|
||||
case _ => new Schema.Field(field.name(), newSchema, field.doc(), field.defaultVal())
|
||||
case _ => new Schema.Field(field.name(), newSchema, comment, field.defaultVal())
|
||||
}
|
||||
}).toList
|
||||
Schema.createRecord(schema.getName, schema.getDoc, schema.getNamespace, schema.isError, modifiedFields)
|
||||
}
|
||||
|
||||
case Schema.Type.UNION => {
|
||||
Schema.createUnion(schema.getTypes.map(innerSchema => getAvroSchemaWithDefaults(innerSchema)))
|
||||
Schema.createUnion(schema.getTypes.map(innerSchema => getAvroSchemaWithDefaults(innerSchema, dataType)))
|
||||
}
|
||||
|
||||
case Schema.Type.MAP => {
|
||||
Schema.createMap(getAvroSchemaWithDefaults(schema.getValueType))
|
||||
Schema.createMap(getAvroSchemaWithDefaults(schema.getValueType, dataType.asInstanceOf[MapType].valueType))
|
||||
}
|
||||
|
||||
case Schema.Type.ARRAY => {
|
||||
Schema.createArray(getAvroSchemaWithDefaults(schema.getElementType))
|
||||
Schema.createArray(getAvroSchemaWithDefaults(schema.getElementType, dataType.asInstanceOf[ArrayType].elementType))
|
||||
}
|
||||
|
||||
case _ => schema
|
||||
|
||||
27
hudi-common/src/test/resources/simple-test-doced.avsc
Normal file
27
hudi-common/src/test/resources/simple-test-doced.avsc
Normal file
@@ -0,0 +1,27 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
{
|
||||
"namespace": "example.avro",
|
||||
"type": "record",
|
||||
"name": "User",
|
||||
"fields": [
|
||||
{"name": "name", "type": "string","doc":"name_comment"},
|
||||
{"name": "favorite_number", "type": "int","doc":"favorite_number_comment"},
|
||||
{"name": "favorite_color", "type": "string"}
|
||||
]
|
||||
}
|
||||
@@ -316,6 +316,8 @@ public class DataSourceUtils {
|
||||
if (props.containsKey(HiveExternalCatalog.CREATED_SPARK_VERSION())) {
|
||||
hiveSyncConfig.sparkVersion = props.getString(HiveExternalCatalog.CREATED_SPARK_VERSION());
|
||||
}
|
||||
hiveSyncConfig.syncComment = Boolean.valueOf(props.getString(DataSourceWriteOptions.HIVE_SYNC_COMMENT().key(),
|
||||
DataSourceWriteOptions.HIVE_SYNC_COMMENT().defaultValue()));
|
||||
return hiveSyncConfig;
|
||||
}
|
||||
|
||||
|
||||
@@ -555,6 +555,11 @@ object DataSourceWriteOptions {
|
||||
.withDocumentation("Whether sync hive metastore bucket specification when using bucket index." +
|
||||
"The specification is 'CLUSTERED BY (trace_id) SORTED BY (trace_id ASC) INTO 65536 BUCKETS'")
|
||||
|
||||
val HIVE_SYNC_COMMENT: ConfigProperty[String] = ConfigProperty
|
||||
.key("hoodie.datasource.hive_sync.sync_comment")
|
||||
.defaultValue("false")
|
||||
.withDocumentation("Whether to sync the table column comments while syncing the table.")
|
||||
|
||||
// Async Compaction - Enabled by default for MOR
|
||||
val ASYNC_COMPACT_ENABLE: ConfigProperty[String] = ConfigProperty
|
||||
.key("hoodie.datasource.compaction.async.enable")
|
||||
|
||||
@@ -601,6 +601,7 @@ object HoodieSparkSqlWriter {
|
||||
hiveSyncConfig.serdeProperties = hoodieConfig.getString(HIVE_TABLE_SERDE_PROPERTIES)
|
||||
hiveSyncConfig.tableProperties = hoodieConfig.getString(HIVE_TABLE_PROPERTIES)
|
||||
hiveSyncConfig.sparkVersion = SPARK_VERSION
|
||||
hiveSyncConfig.syncComment = hoodieConfig.getStringOrDefault(HIVE_SYNC_COMMENT).toBoolean
|
||||
hiveSyncConfig
|
||||
}
|
||||
|
||||
|
||||
@@ -161,4 +161,220 @@ class TestAvroConversionUtils extends FunSuite with Matchers {
|
||||
|
||||
assert(avroSchema.equals(expectedAvroSchema))
|
||||
}
|
||||
|
||||
test("test convertStructTypeToAvroSchema with Nested StructField comment") {
|
||||
val mapType = DataTypes.createMapType(StringType, new StructType().add("mapKey", "string", false, "mapKeyComment").add("mapVal", "integer", true))
|
||||
val arrayType = ArrayType(new StructType().add("arrayKey", "string", false).add("arrayVal", "integer", true, "arrayValComment"))
|
||||
val innerStruct = new StructType().add("innerKey","string",false, "innerKeyComment").add("value", "long", true, "valueComment")
|
||||
|
||||
val struct = new StructType().add("key", "string", false).add("version", "string", true, "versionComment")
|
||||
.add("data1",innerStruct,false).add("data2",innerStruct,true)
|
||||
.add("nullableMap", mapType, true).add("map",mapType,false)
|
||||
.add("nullableArray", arrayType, true).add("array",arrayType,false)
|
||||
|
||||
val avroSchema = AvroConversionUtils.convertStructTypeToAvroSchema(struct, "SchemaName", "SchemaNS")
|
||||
|
||||
val expectedSchemaStr = s"""
|
||||
{
|
||||
"type": "record",
|
||||
"name": "SchemaName",
|
||||
"namespace": "SchemaNS",
|
||||
"fields": [
|
||||
{
|
||||
"name": "key",
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"name": "version",
|
||||
"type": [
|
||||
"null",
|
||||
"string"
|
||||
],
|
||||
"doc": "versionComment",
|
||||
"default": null
|
||||
},
|
||||
{
|
||||
"name": "data1",
|
||||
"type": {
|
||||
"type": "record",
|
||||
"name": "data1",
|
||||
"namespace": "SchemaNS.SchemaName",
|
||||
"fields": [
|
||||
{
|
||||
"name": "innerKey",
|
||||
"type": "string",
|
||||
"doc": "innerKeyComment"
|
||||
},
|
||||
{
|
||||
"name": "value",
|
||||
"type": [
|
||||
"null",
|
||||
"long"
|
||||
],
|
||||
"doc": "valueComment",
|
||||
"default": null
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "data2",
|
||||
"type": [
|
||||
"null",
|
||||
{
|
||||
"type": "record",
|
||||
"name": "data2",
|
||||
"namespace": "SchemaNS.SchemaName",
|
||||
"fields": [
|
||||
{
|
||||
"name": "innerKey",
|
||||
"type": "string",
|
||||
"doc": "innerKeyComment"
|
||||
},
|
||||
{
|
||||
"name": "value",
|
||||
"type": [
|
||||
"null",
|
||||
"long"
|
||||
],
|
||||
"doc": "valueComment",
|
||||
"default": null
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"default": null
|
||||
},
|
||||
{
|
||||
"name": "nullableMap",
|
||||
"type": [
|
||||
"null",
|
||||
{
|
||||
"type": "map",
|
||||
"values": [
|
||||
{
|
||||
"type": "record",
|
||||
"name": "nullableMap",
|
||||
"namespace": "SchemaNS.SchemaName",
|
||||
"fields": [
|
||||
{
|
||||
"name": "mapKey",
|
||||
"type": "string",
|
||||
"doc": "mapKeyComment"
|
||||
},
|
||||
{
|
||||
"name": "mapVal",
|
||||
"type": [
|
||||
"null",
|
||||
"int"
|
||||
],
|
||||
"default": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"null"
|
||||
]
|
||||
}
|
||||
],
|
||||
"default": null
|
||||
},
|
||||
{
|
||||
"name": "map",
|
||||
"type": {
|
||||
"type": "map",
|
||||
"values": [
|
||||
{
|
||||
"type": "record",
|
||||
"name": "map",
|
||||
"namespace": "SchemaNS.SchemaName",
|
||||
"fields": [
|
||||
{
|
||||
"name": "mapKey",
|
||||
"type": "string",
|
||||
"doc": "mapKeyComment"
|
||||
},
|
||||
{
|
||||
"name": "mapVal",
|
||||
"type": [
|
||||
"null",
|
||||
"int"
|
||||
],
|
||||
"default": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"null"
|
||||
]
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "nullableArray",
|
||||
"type": [
|
||||
"null",
|
||||
{
|
||||
"type": "array",
|
||||
"items": [
|
||||
{
|
||||
"type": "record",
|
||||
"name": "nullableArray",
|
||||
"namespace": "SchemaNS.SchemaName",
|
||||
"fields": [
|
||||
{
|
||||
"name": "arrayKey",
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"name": "arrayVal",
|
||||
"type": [
|
||||
"null",
|
||||
"int"
|
||||
],
|
||||
"doc": "arrayValComment",
|
||||
"default": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"null"
|
||||
]
|
||||
}
|
||||
],
|
||||
"default": null
|
||||
},
|
||||
{
|
||||
"name": "array",
|
||||
"type": {
|
||||
"type": "array",
|
||||
"items": [
|
||||
{
|
||||
"type": "record",
|
||||
"name": "array",
|
||||
"namespace": "SchemaNS.SchemaName",
|
||||
"fields": [
|
||||
{
|
||||
"name": "arrayKey",
|
||||
"type": "string"
|
||||
},
|
||||
{
|
||||
"name": "arrayVal",
|
||||
"type": [
|
||||
"null",
|
||||
"int"
|
||||
],
|
||||
"doc": "arrayValComment",
|
||||
"default": null
|
||||
}
|
||||
]
|
||||
},
|
||||
"null"
|
||||
]
|
||||
}
|
||||
}
|
||||
]
|
||||
}}
|
||||
"""
|
||||
|
||||
val expectedAvroSchema = new Schema.Parser().parse(expectedSchemaStr)
|
||||
|
||||
assert(avroSchema.equals(expectedAvroSchema))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -132,6 +132,9 @@ public class HiveSyncConfig implements Serializable {
|
||||
@Parameter(names = {"--spark-version"}, description = "The spark version", required = false)
|
||||
public String sparkVersion;
|
||||
|
||||
@Parameter(names = {"--sync-comment"}, description = "synchronize table comments to hive")
|
||||
public boolean syncComment = false;
|
||||
|
||||
// enhance the similar function in child class
|
||||
public static HiveSyncConfig copy(HiveSyncConfig cfg) {
|
||||
HiveSyncConfig newConfig = new HiveSyncConfig();
|
||||
@@ -159,6 +162,7 @@ public class HiveSyncConfig implements Serializable {
|
||||
newConfig.withOperationField = cfg.withOperationField;
|
||||
newConfig.isConditionalSync = cfg.isConditionalSync;
|
||||
newConfig.sparkVersion = cfg.sparkVersion;
|
||||
newConfig.syncComment = cfg.syncComment;
|
||||
return newConfig;
|
||||
}
|
||||
|
||||
@@ -193,6 +197,7 @@ public class HiveSyncConfig implements Serializable {
|
||||
+ ", sparkSchemaLengthThreshold=" + sparkSchemaLengthThreshold
|
||||
+ ", withOperationField=" + withOperationField
|
||||
+ ", isConditionalSync=" + isConditionalSync
|
||||
+ ", syncComment=" + syncComment
|
||||
+ '}';
|
||||
}
|
||||
|
||||
|
||||
@@ -19,9 +19,11 @@
|
||||
package org.apache.hudi.hive;
|
||||
|
||||
import com.beust.jcommander.JCommander;
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.hive.conf.HiveConf;
|
||||
import org.apache.hadoop.hive.metastore.api.FieldSchema;
|
||||
import org.apache.hadoop.hive.metastore.api.Partition;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.HoodieFileFormat;
|
||||
@@ -37,6 +39,7 @@ import org.apache.hudi.hive.util.Parquet2SparkSchemaUtils;
|
||||
import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent;
|
||||
import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent.PartitionEventType;
|
||||
import org.apache.hudi.sync.common.AbstractSyncTool;
|
||||
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.parquet.schema.GroupType;
|
||||
@@ -261,6 +264,19 @@ public class HiveSyncTool extends AbstractSyncTool {
|
||||
LOG.info("No Schema difference for " + tableName);
|
||||
}
|
||||
}
|
||||
|
||||
if (cfg.syncComment) {
|
||||
Schema avroSchemaWithoutMetadataFields = hoodieHiveClient.getAvroSchemaWithoutMetadataFields();
|
||||
Map<String, String> newComments = avroSchemaWithoutMetadataFields.getFields()
|
||||
.stream().collect(Collectors.toMap(Schema.Field::name, field -> StringUtils.isNullOrEmpty(field.doc()) ? "" : field.doc()));
|
||||
boolean allEmpty = newComments.values().stream().allMatch(StringUtils::isNullOrEmpty);
|
||||
if (!allEmpty) {
|
||||
List<FieldSchema> hiveSchema = hoodieHiveClient.getTableCommentUsingMetastoreClient(tableName);
|
||||
hoodieHiveClient.updateTableComments(tableName, hiveSchema, avroSchemaWithoutMetadataFields.getFields());
|
||||
} else {
|
||||
LOG.info(String.format("No comment %s need to add", tableName));
|
||||
}
|
||||
}
|
||||
return schemaChanged;
|
||||
}
|
||||
|
||||
|
||||
@@ -19,21 +19,27 @@
|
||||
package org.apache.hudi.hive;
|
||||
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.table.TableSchemaResolver;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.hive.util.HiveSchemaUtil;
|
||||
import org.apache.hudi.sync.common.AbstractSyncHoodieClient;
|
||||
import org.apache.hudi.common.util.collection.ImmutablePair;
|
||||
import org.apache.hudi.hive.ddl.DDLExecutor;
|
||||
import org.apache.hudi.hive.ddl.HMSDDLExecutor;
|
||||
import org.apache.hudi.hive.ddl.HiveQueryDDLExecutor;
|
||||
import org.apache.hudi.hive.ddl.HiveSyncMode;
|
||||
import org.apache.hudi.hive.ddl.JDBCExecutor;
|
||||
import org.apache.hudi.hive.util.HiveSchemaUtil;
|
||||
import org.apache.hudi.sync.common.AbstractSyncHoodieClient;
|
||||
import org.apache.hudi.sync.common.HoodieSyncException;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hive.conf.HiveConf;
|
||||
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
|
||||
import org.apache.hadoop.hive.metastore.api.FieldSchema;
|
||||
import org.apache.hadoop.hive.metastore.api.NoSuchObjectException;
|
||||
import org.apache.hadoop.hive.metastore.api.Partition;
|
||||
import org.apache.hadoop.hive.metastore.api.Table;
|
||||
@@ -46,7 +52,9 @@ import org.apache.thrift.TException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.apache.hudi.hadoop.utils.HoodieHiveUtils.GLOBALLY_CONSISTENT_READ_TIMESTAMP;
|
||||
|
||||
@@ -343,4 +351,43 @@ public class HoodieHiveClient extends AbstractSyncHoodieClient {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public Schema getAvroSchemaWithoutMetadataFields() {
|
||||
try {
|
||||
return new TableSchemaResolver(metaClient).getTableAvroSchemaWithoutMetadataFields();
|
||||
} catch (Exception e) {
|
||||
throw new HoodieSyncException("Failed to read avro schema", e);
|
||||
}
|
||||
}
|
||||
|
||||
public List<FieldSchema> getTableCommentUsingMetastoreClient(String tableName) {
|
||||
try {
|
||||
return client.getSchema(syncConfig.databaseName, tableName);
|
||||
} catch (Exception e) {
|
||||
throw new HoodieHiveSyncException("Failed to get table comments for : " + tableName, e);
|
||||
}
|
||||
}
|
||||
|
||||
public void updateTableComments(String tableName, List<FieldSchema> oldSchema, List<Schema.Field> newSchema) {
|
||||
Map<String,String> newComments = newSchema.stream().collect(Collectors.toMap(field -> field.name().toLowerCase(Locale.ROOT), field -> StringUtils.isNullOrEmpty(field.doc()) ? "" : field.doc()));
|
||||
updateTableComments(tableName,oldSchema,newComments);
|
||||
}
|
||||
|
||||
public void updateTableComments(String tableName, List<FieldSchema> oldSchema, Map<String,String> newComments) {
|
||||
Map<String,String> oldComments = oldSchema.stream().collect(Collectors.toMap(fieldSchema -> fieldSchema.getName().toLowerCase(Locale.ROOT),
|
||||
fieldSchema -> StringUtils.isNullOrEmpty(fieldSchema.getComment()) ? "" : fieldSchema.getComment()));
|
||||
Map<String,String> types = oldSchema.stream().collect(Collectors.toMap(FieldSchema::getName, FieldSchema::getType));
|
||||
Map<String, ImmutablePair<String,String>> alterComments = new HashMap<>();
|
||||
oldComments.forEach((name,comment) -> {
|
||||
String newComment = newComments.getOrDefault(name,"");
|
||||
if (!newComment.equals(comment)) {
|
||||
alterComments.put(name,new ImmutablePair<>(types.get(name),newComment));
|
||||
}
|
||||
});
|
||||
if (alterComments.size() > 0) {
|
||||
ddlExecutor.updateTableComments(tableName, alterComments);
|
||||
} else {
|
||||
LOG.info(String.format("No comment difference of %s ",tableName));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,6 +18,8 @@
|
||||
|
||||
package org.apache.hudi.hive.ddl;
|
||||
|
||||
import org.apache.hudi.common.util.collection.ImmutablePair;
|
||||
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
|
||||
import java.util.List;
|
||||
@@ -89,5 +91,13 @@ public interface DDLExecutor {
|
||||
*/
|
||||
public void dropPartitionsToTable(String tableName, List<String> partitionsToDrop);
|
||||
|
||||
/**
|
||||
* update table comments
|
||||
*
|
||||
* @param tableName
|
||||
* @param newSchema
|
||||
*/
|
||||
public void updateTableComments(String tableName, Map<String, ImmutablePair<String,String>> newSchema);
|
||||
|
||||
public void close();
|
||||
}
|
||||
|
||||
@@ -18,9 +18,9 @@
|
||||
|
||||
package org.apache.hudi.hive.ddl;
|
||||
|
||||
import org.apache.hadoop.security.UserGroupInformation;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.fs.StorageSchemes;
|
||||
import org.apache.hudi.common.util.collection.ImmutablePair;
|
||||
import org.apache.hudi.hive.HiveSyncConfig;
|
||||
import org.apache.hudi.hive.HoodieHiveSyncException;
|
||||
import org.apache.hudi.hive.PartitionValueExtractor;
|
||||
@@ -43,6 +43,7 @@ import org.apache.hadoop.hive.metastore.api.StorageDescriptor;
|
||||
import org.apache.hadoop.hive.metastore.api.Table;
|
||||
import org.apache.hadoop.hive.ql.metadata.Hive;
|
||||
import org.apache.hadoop.hive.ql.metadata.HiveException;
|
||||
import org.apache.hadoop.security.UserGroupInformation;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
@@ -247,6 +248,27 @@ public class HMSDDLExecutor implements DDLExecutor {
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void updateTableComments(String tableName, Map<String, ImmutablePair<String,String>> alterSchema) {
|
||||
try {
|
||||
Table table = client.getTable(syncConfig.databaseName, tableName);
|
||||
StorageDescriptor sd = new StorageDescriptor(table.getSd());
|
||||
for (FieldSchema fieldSchema : sd.getCols()) {
|
||||
if (alterSchema.containsKey(fieldSchema.getName())) {
|
||||
String comment = alterSchema.get(fieldSchema.getName()).getRight();
|
||||
fieldSchema.setComment(comment);
|
||||
}
|
||||
}
|
||||
table.setSd(sd);
|
||||
EnvironmentContext environmentContext = new EnvironmentContext();
|
||||
client.alter_table_with_environmentContext(syncConfig.databaseName, tableName, table, environmentContext);
|
||||
sd.clear();
|
||||
} catch (Exception e) {
|
||||
LOG.error("Failed to update table comments for " + tableName, e);
|
||||
throw new HoodieHiveSyncException("Failed to update table comments for " + tableName, e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
if (client != null) {
|
||||
|
||||
@@ -22,6 +22,7 @@ import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.fs.StorageSchemes;
|
||||
import org.apache.hudi.common.util.PartitionPathEncodeUtils;
|
||||
import org.apache.hudi.common.util.ValidationUtils;
|
||||
import org.apache.hudi.common.util.collection.ImmutablePair;
|
||||
import org.apache.hudi.hive.HiveSyncConfig;
|
||||
import org.apache.hudi.hive.HoodieHiveSyncException;
|
||||
import org.apache.hudi.hive.PartitionValueExtractor;
|
||||
@@ -128,6 +129,24 @@ public abstract class QueryBasedDDLExecutor implements DDLExecutor {
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void updateTableComments(String tableName, Map<String, ImmutablePair<String,String>> newSchema) {
|
||||
for (Map.Entry<String, ImmutablePair<String,String>> field : newSchema.entrySet()) {
|
||||
String name = field.getKey();
|
||||
StringBuilder sql = new StringBuilder();
|
||||
String type = field.getValue().getLeft();
|
||||
String comment = field.getValue().getRight();
|
||||
comment = comment.replace("'","");
|
||||
sql.append("ALTER TABLE ").append(HIVE_ESCAPE_CHARACTER)
|
||||
.append(config.databaseName).append(HIVE_ESCAPE_CHARACTER).append(".")
|
||||
.append(HIVE_ESCAPE_CHARACTER).append(tableName)
|
||||
.append(HIVE_ESCAPE_CHARACTER)
|
||||
.append(" CHANGE COLUMN `").append(name).append("` `").append(name)
|
||||
.append("` ").append(type).append(" comment '").append(comment).append("' ");
|
||||
runSQL(sql.toString());
|
||||
}
|
||||
}
|
||||
|
||||
private List<String> constructAddPartitions(String tableName, List<String> partitions) {
|
||||
if (config.batchSyncNum <= 0) {
|
||||
throw new HoodieHiveSyncException("batch-sync-num for sync hive table must be greater than 0, pls check your parameter");
|
||||
|
||||
@@ -25,6 +25,8 @@ import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.testutils.NetworkTestUtils;
|
||||
import org.apache.hudi.common.testutils.SchemaTestUtil;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.common.util.collection.ImmutablePair;
|
||||
import org.apache.hudi.hive.testutils.HiveTestUtil;
|
||||
import org.apache.hudi.hive.util.ConfigUtils;
|
||||
import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent;
|
||||
@@ -33,6 +35,7 @@ import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent.Parti
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.Schema.Field;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hive.metastore.api.FieldSchema;
|
||||
import org.apache.hadoop.hive.metastore.api.MetaException;
|
||||
import org.apache.hadoop.hive.metastore.api.Partition;
|
||||
import org.apache.hadoop.hive.ql.Driver;
|
||||
@@ -52,7 +55,9 @@ import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.apache.hudi.hive.testutils.HiveTestUtil.ddlExecutor;
|
||||
import static org.apache.hudi.hive.testutils.HiveTestUtil.fileSystem;
|
||||
@@ -524,6 +529,77 @@ public class TestHiveSyncTool {
|
||||
"The last commit that was synced should be 101");
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@MethodSource("syncMode")
|
||||
public void testUpdateTableComments(String syncMode) throws Exception {
|
||||
hiveSyncConfig.syncMode = syncMode;
|
||||
String commitTime = "100";
|
||||
HiveTestUtil.createCOWTableWithSchema(commitTime, "/simple-test.avsc");
|
||||
HiveSyncTool tool = new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem);
|
||||
tool.syncHoodieTable();
|
||||
HoodieHiveClient hiveClient =
|
||||
new HoodieHiveClient(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem);
|
||||
|
||||
Map<String, ImmutablePair<String,String>> alterCommentSchema = new HashMap<>();
|
||||
//generate commented schema field
|
||||
Schema schema = SchemaTestUtil.getSchemaFromResource(HiveTestUtil.class, "/simple-test.avsc");
|
||||
Schema commentedSchema = SchemaTestUtil.getSchemaFromResource(HiveTestUtil.class, "/simple-test-doced.avsc");
|
||||
Map<String, String> fieldsNameAndDoc = commentedSchema.getFields().stream().collect(Collectors.toMap(field -> field.name().toLowerCase(Locale.ROOT),
|
||||
field -> StringUtils.isNullOrEmpty(field.doc()) ? "" : field.doc()));
|
||||
for (Field field : schema.getFields()) {
|
||||
String name = field.name().toLowerCase(Locale.ROOT);
|
||||
String comment = fieldsNameAndDoc.get(name);
|
||||
if (fieldsNameAndDoc.containsKey(name) && !comment.equals(field.doc())) {
|
||||
alterCommentSchema.put(name, new ImmutablePair<>(field.schema().getType().name(),comment));
|
||||
}
|
||||
}
|
||||
|
||||
ddlExecutor.updateTableComments(hiveSyncConfig.tableName,alterCommentSchema);
|
||||
|
||||
List<FieldSchema> fieldSchemas = hiveClient.getTableCommentUsingMetastoreClient(hiveSyncConfig.tableName);
|
||||
int commentCnt = 0;
|
||||
for (FieldSchema fieldSchema : fieldSchemas) {
|
||||
if (!StringUtils.isNullOrEmpty(fieldSchema.getComment())) {
|
||||
commentCnt++;
|
||||
}
|
||||
}
|
||||
assertEquals(2, commentCnt, "hive schema field comment numbers should match the avro schema field doc numbers");
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@MethodSource("syncMode")
|
||||
public void testSyncWithCommentedSchema(String syncMode) throws Exception {
|
||||
hiveSyncConfig.syncMode = syncMode;
|
||||
hiveSyncConfig.syncComment = false;
|
||||
String commitTime = "100";
|
||||
HiveTestUtil.createCOWTableWithSchema(commitTime, "/simple-test-doced.avsc");
|
||||
|
||||
HiveSyncTool tool = new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem);
|
||||
tool.syncHoodieTable();
|
||||
HoodieHiveClient hiveClient =
|
||||
new HoodieHiveClient(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem);
|
||||
List<FieldSchema> fieldSchemas = hiveClient.getTableCommentUsingMetastoreClient(hiveSyncConfig.tableName);
|
||||
int commentCnt = 0;
|
||||
for (FieldSchema fieldSchema : fieldSchemas) {
|
||||
if (!StringUtils.isNullOrEmpty(fieldSchema.getComment())) {
|
||||
commentCnt++;
|
||||
}
|
||||
}
|
||||
assertEquals(0, commentCnt, "hive schema field comment numbers should match the avro schema field doc numbers");
|
||||
|
||||
hiveSyncConfig.syncComment = true;
|
||||
tool = new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem);
|
||||
tool.syncHoodieTable();
|
||||
fieldSchemas = hiveClient.getTableCommentUsingMetastoreClient(hiveSyncConfig.tableName);
|
||||
commentCnt = 0;
|
||||
for (FieldSchema fieldSchema : fieldSchemas) {
|
||||
if (!StringUtils.isNullOrEmpty(fieldSchema.getComment())) {
|
||||
commentCnt++;
|
||||
}
|
||||
}
|
||||
assertEquals(2, commentCnt, "hive schema field comment numbers should match the avro schema field doc numbers");
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@MethodSource("syncModeAndSchemaFromCommitMetadata")
|
||||
public void testSyncMergeOnRead(boolean useSchemaFromCommitMetadata, String syncMode) throws Exception {
|
||||
|
||||
Reference in New Issue
Block a user