1
0

[HUDI-624]: Split some of the code from PR for HUDI-479 (#1344)

This commit is contained in:
Suneel Marthi
2020-02-21 01:22:21 -05:00
committed by GitHub
parent 185ff646ad
commit 078d4825d9
31 changed files with 130 additions and 141 deletions

View File

@@ -18,15 +18,14 @@
package org.apache.hudi.hive;
import com.google.common.base.Objects;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import org.apache.parquet.schema.MessageType;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringJoiner;
/**
* Represents the schema difference between the storage schema and hive table schema.
@@ -43,9 +42,9 @@ public class SchemaDifference {
Map<String, String> updateColumnTypes, Map<String, String> addColumnTypes) {
this.storageSchema = storageSchema;
this.tableSchema = tableSchema;
this.deleteColumns = ImmutableList.copyOf(deleteColumns);
this.updateColumnTypes = ImmutableMap.copyOf(updateColumnTypes);
this.addColumnTypes = ImmutableMap.copyOf(addColumnTypes);
this.deleteColumns = Collections.unmodifiableList(deleteColumns);
this.updateColumnTypes = Collections.unmodifiableMap(updateColumnTypes);
this.addColumnTypes = Collections.unmodifiableMap(addColumnTypes);
}
public List<String> getDeleteColumns() {
@@ -60,12 +59,6 @@ public class SchemaDifference {
return addColumnTypes;
}
@Override
public String toString() {
return Objects.toStringHelper(this).add("deleteColumns", deleteColumns).add("updateColumnTypes", updateColumnTypes)
.add("addColumnTypes", addColumnTypes).toString();
}
public static Builder newBuilder(MessageType storageSchema, Map<String, String> tableSchema) {
return new Builder(storageSchema, tableSchema);
}
@@ -74,6 +67,17 @@ public class SchemaDifference {
return deleteColumns.isEmpty() && updateColumnTypes.isEmpty() && addColumnTypes.isEmpty();
}
@Override
public String toString() {
return new StringJoiner(", ", SchemaDifference.class.getSimpleName() + "[", "]")
.add("storageSchema=" + storageSchema)
.add("tableSchema=" + tableSchema)
.add("deleteColumns=" + deleteColumns)
.add("updateColumnTypes=" + updateColumnTypes)
.add("addColumnTypes=" + addColumnTypes)
.toString();
}
public static class Builder {
private final MessageType storageSchema;

View File

@@ -18,8 +18,6 @@
package org.apache.hudi.hive.util;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.model.HoodieLogFile;
@@ -42,6 +40,8 @@ import org.apache.parquet.schema.Type;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
@@ -67,7 +67,7 @@ public class SchemaUtil {
}
LOG.info("Getting schema difference for " + tableSchema + "\r\n\r\n" + newTableSchema);
SchemaDifference.Builder schemaDiffBuilder = SchemaDifference.newBuilder(storageSchema, tableSchema);
Set<String> tableColumns = Sets.newHashSet();
Set<String> tableColumns = new HashSet<>();
for (Map.Entry<String, String> field : tableSchema.entrySet()) {
String fieldName = field.getKey().toLowerCase();
@@ -140,7 +140,7 @@ public class SchemaUtil {
* @return : Hive Table schema read from parquet file MAP[String,String]
*/
public static Map<String, String> convertParquetSchemaToHiveSchema(MessageType messageType) throws IOException {
Map<String, String> schema = Maps.newLinkedHashMap();
Map<String, String> schema = new LinkedHashMap<>();
List<Type> parquetFields = messageType.getFields();
for (Type parquetType : parquetFields) {
StringBuilder result = new StringBuilder();

View File

@@ -24,7 +24,6 @@ import org.apache.hudi.hive.HoodieHiveClient.PartitionEvent;
import org.apache.hudi.hive.HoodieHiveClient.PartitionEvent.PartitionEventType;
import org.apache.hudi.hive.util.SchemaUtil;
import com.google.common.collect.Lists;
import org.apache.hadoop.hive.metastore.api.Partition;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.OriginalType;
@@ -345,7 +344,7 @@ public class TestHiveSyncTool {
HiveSyncConfig hiveSyncConfig = HiveSyncConfig.copy(TestUtil.hiveSyncConfig);
hiveSyncConfig.partitionValueExtractorClass = MultiPartKeysValueExtractor.class.getCanonicalName();
hiveSyncConfig.tableName = "multi_part_key";
hiveSyncConfig.partitionFields = Lists.newArrayList("year", "month", "day");
hiveSyncConfig.partitionFields = Arrays.asList("year", "month", "day");
TestUtil.getCreatedTablesSet().add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName);
HoodieHiveClient hiveClient = new HoodieHiveClient(hiveSyncConfig, TestUtil.getHiveConf(), TestUtil.fileSystem);

View File

@@ -43,9 +43,6 @@ import org.apache.hudi.common.util.FileIOUtils;
import org.apache.hudi.common.util.SchemaTestUtil;
import org.apache.hudi.hive.util.HiveTestService;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.conf.Configuration;
@@ -68,6 +65,10 @@ import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
@@ -86,7 +87,7 @@ public class TestUtil {
static HiveSyncConfig hiveSyncConfig;
private static DateTimeFormatter dtfOut;
static FileSystem fileSystem;
private static Set<String> createdTablesSet = Sets.newHashSet();
private static Set<String> createdTablesSet = new HashSet<>();
public static void setUp() throws IOException, InterruptedException {
if (dfsCluster == null) {
@@ -114,7 +115,7 @@ public class TestUtil {
hiveSyncConfig.basePath = "/tmp/hdfs/TestHiveSyncTool/";
hiveSyncConfig.assumeDatePartitioning = true;
hiveSyncConfig.usePreApacheInputFormat = false;
hiveSyncConfig.partitionFields = Lists.newArrayList("datestr");
hiveSyncConfig.partitionFields = Collections.singletonList("datestr");
dtfOut = DateTimeFormat.forPattern("yyyy/MM/dd");
@@ -249,7 +250,7 @@ public class TestUtil {
private static List<HoodieWriteStat> createTestData(Path partPath, boolean isParquetSchemaSimple, String commitTime)
throws IOException, URISyntaxException {
List<HoodieWriteStat> writeStats = Lists.newArrayList();
List<HoodieWriteStat> writeStats = new ArrayList<>();
for (int i = 0; i < 5; i++) {
// Create 5 files
String fileId = UUID.randomUUID().toString();
@@ -297,7 +298,7 @@ public class TestUtil {
.overBaseCommit(dataFile.getCommitTime()).withFs(fileSystem).build();
List<IndexedRecord> records = (isLogSchemaSimple ? SchemaTestUtil.generateTestRecords(0, 100)
: SchemaTestUtil.generateEvolvedTestRecords(100, 100));
Map<HeaderMetadataType, String> header = Maps.newHashMap();
Map<HeaderMetadataType, String> header = new HashMap<>(2);
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, dataFile.getCommitTime());
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header);