1
0

[HUDI-3383] Sync column comments while syncing a hive table (#4960)

Desc: Add a hive sync config(hoodie.datasource.hive_sync.sync_comment). This config defaults to false.
While syncing data source to hudi, add column comments to source avro schema, and the sync_comment is true, syncing column comments to the hive table.
This commit is contained in:
MrSleeping123
2022-03-10 09:44:39 +08:00
committed by GitHub
parent 548000b0d6
commit 8859b48b2a
13 changed files with 467 additions and 14 deletions

View File

@@ -25,6 +25,8 @@ import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.testutils.NetworkTestUtils;
import org.apache.hudi.common.testutils.SchemaTestUtil;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.common.util.collection.ImmutablePair;
import org.apache.hudi.hive.testutils.HiveTestUtil;
import org.apache.hudi.hive.util.ConfigUtils;
import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent;
@@ -33,6 +35,7 @@ import org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent.Parti
import org.apache.avro.Schema;
import org.apache.avro.Schema.Field;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.metastore.api.FieldSchema;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.hadoop.hive.ql.Driver;
@@ -52,7 +55,9 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.stream.Collectors;
import static org.apache.hudi.hive.testutils.HiveTestUtil.ddlExecutor;
import static org.apache.hudi.hive.testutils.HiveTestUtil.fileSystem;
@@ -524,6 +529,77 @@ public class TestHiveSyncTool {
"The last commit that was synced should be 101");
}
@ParameterizedTest
@MethodSource("syncMode")
public void testUpdateTableComments(String syncMode) throws Exception {
hiveSyncConfig.syncMode = syncMode;
String commitTime = "100";
HiveTestUtil.createCOWTableWithSchema(commitTime, "/simple-test.avsc");
HiveSyncTool tool = new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem);
tool.syncHoodieTable();
HoodieHiveClient hiveClient =
new HoodieHiveClient(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem);
Map<String, ImmutablePair<String,String>> alterCommentSchema = new HashMap<>();
//generate commented schema field
Schema schema = SchemaTestUtil.getSchemaFromResource(HiveTestUtil.class, "/simple-test.avsc");
Schema commentedSchema = SchemaTestUtil.getSchemaFromResource(HiveTestUtil.class, "/simple-test-doced.avsc");
Map<String, String> fieldsNameAndDoc = commentedSchema.getFields().stream().collect(Collectors.toMap(field -> field.name().toLowerCase(Locale.ROOT),
field -> StringUtils.isNullOrEmpty(field.doc()) ? "" : field.doc()));
for (Field field : schema.getFields()) {
String name = field.name().toLowerCase(Locale.ROOT);
String comment = fieldsNameAndDoc.get(name);
if (fieldsNameAndDoc.containsKey(name) && !comment.equals(field.doc())) {
alterCommentSchema.put(name, new ImmutablePair<>(field.schema().getType().name(),comment));
}
}
ddlExecutor.updateTableComments(hiveSyncConfig.tableName,alterCommentSchema);
List<FieldSchema> fieldSchemas = hiveClient.getTableCommentUsingMetastoreClient(hiveSyncConfig.tableName);
int commentCnt = 0;
for (FieldSchema fieldSchema : fieldSchemas) {
if (!StringUtils.isNullOrEmpty(fieldSchema.getComment())) {
commentCnt++;
}
}
assertEquals(2, commentCnt, "hive schema field comment numbers should match the avro schema field doc numbers");
}
@ParameterizedTest
@MethodSource("syncMode")
public void testSyncWithCommentedSchema(String syncMode) throws Exception {
hiveSyncConfig.syncMode = syncMode;
hiveSyncConfig.syncComment = false;
String commitTime = "100";
HiveTestUtil.createCOWTableWithSchema(commitTime, "/simple-test-doced.avsc");
HiveSyncTool tool = new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem);
tool.syncHoodieTable();
HoodieHiveClient hiveClient =
new HoodieHiveClient(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem);
List<FieldSchema> fieldSchemas = hiveClient.getTableCommentUsingMetastoreClient(hiveSyncConfig.tableName);
int commentCnt = 0;
for (FieldSchema fieldSchema : fieldSchemas) {
if (!StringUtils.isNullOrEmpty(fieldSchema.getComment())) {
commentCnt++;
}
}
assertEquals(0, commentCnt, "hive schema field comment numbers should match the avro schema field doc numbers");
hiveSyncConfig.syncComment = true;
tool = new HiveSyncTool(hiveSyncConfig, HiveTestUtil.getHiveConf(), fileSystem);
tool.syncHoodieTable();
fieldSchemas = hiveClient.getTableCommentUsingMetastoreClient(hiveSyncConfig.tableName);
commentCnt = 0;
for (FieldSchema fieldSchema : fieldSchemas) {
if (!StringUtils.isNullOrEmpty(fieldSchema.getComment())) {
commentCnt++;
}
}
assertEquals(2, commentCnt, "hive schema field comment numbers should match the avro schema field doc numbers");
}
@ParameterizedTest
@MethodSource("syncModeAndSchemaFromCommitMetadata")
public void testSyncMergeOnRead(boolean useSchemaFromCommitMetadata, String syncMode) throws Exception {