[HUDI-159]: Pom cleanup and removal of com.twitter.parquet
- Redo all classes based on org.parquet only - remove unused dependencies like parquet-hadoop, common-configuration2 - timeline-service does not build a fat jar anymore - Fix utilities and hadoop-mr bundles based on above
This commit is contained in:
committed by
Balaji Varadarajan
parent
6edf0b9def
commit
cd090871a1
@@ -36,9 +36,9 @@ import org.apache.hudi.hadoop.realtime.HoodieRealtimeInputFormat;
|
||||
import org.apache.hudi.hive.HoodieHiveClient.PartitionEvent;
|
||||
import org.apache.hudi.hive.HoodieHiveClient.PartitionEvent.PartitionEventType;
|
||||
import org.apache.hudi.hive.util.SchemaUtil;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import parquet.schema.MessageType;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
|
||||
|
||||
/**
|
||||
@@ -52,7 +52,7 @@ import parquet.schema.MessageType;
|
||||
@SuppressWarnings("WeakerAccess")
|
||||
public class HiveSyncTool {
|
||||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(HiveSyncTool.class);
|
||||
private static Logger LOG = LogManager.getLogger(HiveSyncTool.class);
|
||||
private final HoodieHiveClient hoodieHiveClient;
|
||||
public static final String SUFFIX_REALTIME_TABLE = "_rt";
|
||||
private final HiveSyncConfig cfg;
|
||||
|
||||
@@ -57,13 +57,13 @@ import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
import org.apache.hudi.exception.InvalidDatasetException;
|
||||
import org.apache.hudi.hive.util.SchemaUtil;
|
||||
import org.apache.parquet.format.converter.ParquetMetadataConverter;
|
||||
import org.apache.parquet.hadoop.ParquetFileReader;
|
||||
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
import org.apache.thrift.TException;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import parquet.format.converter.ParquetMetadataConverter;
|
||||
import parquet.hadoop.ParquetFileReader;
|
||||
import parquet.hadoop.metadata.ParquetMetadata;
|
||||
import parquet.schema.MessageType;
|
||||
|
||||
@SuppressWarnings("ConstantConditions")
|
||||
public class HoodieHiveClient {
|
||||
|
||||
@@ -25,7 +25,7 @@ import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Maps;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import parquet.schema.MessageType;
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
|
||||
/**
|
||||
* Represents the schema difference between the storage schema and hive table schema
|
||||
|
||||
@@ -36,14 +36,15 @@ import org.apache.hudi.common.table.log.block.HoodieLogBlock;
|
||||
import org.apache.hudi.hive.HiveSyncConfig;
|
||||
import org.apache.hudi.hive.HoodieHiveSyncException;
|
||||
import org.apache.hudi.hive.SchemaDifference;
|
||||
import org.apache.parquet.avro.AvroSchemaConverter;
|
||||
import org.apache.parquet.schema.DecimalMetadata;
|
||||
import org.apache.parquet.schema.GroupType;
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
import org.apache.parquet.schema.OriginalType;
|
||||
import org.apache.parquet.schema.PrimitiveType;
|
||||
import org.apache.parquet.schema.Type;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import parquet.schema.DecimalMetadata;
|
||||
import parquet.schema.GroupType;
|
||||
import parquet.schema.MessageType;
|
||||
import parquet.schema.OriginalType;
|
||||
import parquet.schema.PrimitiveType;
|
||||
import parquet.schema.Type;
|
||||
|
||||
/**
|
||||
* Schema Utilities
|
||||
@@ -439,6 +440,7 @@ public class SchemaUtil {
|
||||
|
||||
/**
|
||||
* Read the schema from the log file on path
|
||||
* @return
|
||||
*/
|
||||
@SuppressWarnings("OptionalUsedAsFieldOrParameterType")
|
||||
public static MessageType readSchemaFromLogFile(FileSystem fs, Path path) throws IOException {
|
||||
@@ -452,7 +454,7 @@ public class SchemaUtil {
|
||||
}
|
||||
reader.close();
|
||||
if (lastBlock != null) {
|
||||
return new parquet.avro.AvroSchemaConverter().convert(lastBlock.getSchema());
|
||||
return new AvroSchemaConverter().convert(lastBlock.getSchema());
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
@@ -32,12 +32,13 @@ import org.apache.hudi.common.util.SchemaTestUtil;
|
||||
import org.apache.hudi.hive.HoodieHiveClient.PartitionEvent;
|
||||
import org.apache.hudi.hive.HoodieHiveClient.PartitionEvent.PartitionEventType;
|
||||
import org.apache.hudi.hive.util.SchemaUtil;
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
import org.apache.parquet.schema.OriginalType;
|
||||
import org.apache.parquet.schema.PrimitiveType;
|
||||
import org.apache.parquet.schema.Types;
|
||||
import org.joda.time.DateTime;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import parquet.schema.MessageType;
|
||||
import parquet.schema.OriginalType;
|
||||
import parquet.schema.PrimitiveType;
|
||||
|
||||
@SuppressWarnings("ConstantConditions")
|
||||
public class HiveSyncToolTest {
|
||||
@@ -59,8 +60,8 @@ public class HiveSyncToolTest {
|
||||
@Test
|
||||
public void testSchemaConvertArray() throws IOException {
|
||||
// Testing the 3-level annotation structure
|
||||
MessageType schema = parquet.schema.Types.buildMessage().optionalGroup()
|
||||
.as(parquet.schema.OriginalType.LIST).repeatedGroup()
|
||||
MessageType schema = Types.buildMessage().optionalGroup()
|
||||
.as(OriginalType.LIST).repeatedGroup()
|
||||
.optional(PrimitiveType.PrimitiveTypeName.INT32).named("element")
|
||||
.named("list").named("int_list").named("ArrayOfInts");
|
||||
|
||||
@@ -68,8 +69,8 @@ public class HiveSyncToolTest {
|
||||
assertEquals("`int_list` ARRAY< int>", schemaString);
|
||||
|
||||
// A array of arrays
|
||||
schema = parquet.schema.Types.buildMessage().optionalGroup()
|
||||
.as(parquet.schema.OriginalType.LIST).repeatedGroup().requiredGroup()
|
||||
schema = Types.buildMessage().optionalGroup()
|
||||
.as(OriginalType.LIST).repeatedGroup().requiredGroup()
|
||||
.as(OriginalType.LIST).repeatedGroup()
|
||||
.required(PrimitiveType.PrimitiveTypeName.INT32).named("element").named("list")
|
||||
.named("element").named("list").named("int_list_list").named("ArrayOfArrayOfInts");
|
||||
@@ -78,8 +79,8 @@ public class HiveSyncToolTest {
|
||||
assertEquals("`int_list_list` ARRAY< ARRAY< int>>", schemaString);
|
||||
|
||||
// A list of integers
|
||||
schema = parquet.schema.Types.buildMessage().optionalGroup()
|
||||
.as(parquet.schema.OriginalType.LIST)
|
||||
schema = Types.buildMessage().optionalGroup()
|
||||
.as(OriginalType.LIST)
|
||||
.repeated(PrimitiveType.PrimitiveTypeName.INT32).named("element").named("int_list")
|
||||
.named("ArrayOfInts");
|
||||
|
||||
@@ -87,8 +88,8 @@ public class HiveSyncToolTest {
|
||||
assertEquals("`int_list` ARRAY< int>", schemaString);
|
||||
|
||||
// A list of structs with two fields
|
||||
schema = parquet.schema.Types.buildMessage().optionalGroup()
|
||||
.as(parquet.schema.OriginalType.LIST).repeatedGroup()
|
||||
schema = Types.buildMessage().optionalGroup()
|
||||
.as(OriginalType.LIST).repeatedGroup()
|
||||
.required(PrimitiveType.PrimitiveTypeName.BINARY).named("str")
|
||||
.required(PrimitiveType.PrimitiveTypeName.INT32).named("num").named("element")
|
||||
.named("tuple_list").named("ArrayOfTuples");
|
||||
@@ -99,8 +100,8 @@ public class HiveSyncToolTest {
|
||||
// A list of structs with a single field
|
||||
// For this case, since the inner group name is "array", we treat the
|
||||
// element type as a one-element struct.
|
||||
schema = parquet.schema.Types.buildMessage().optionalGroup()
|
||||
.as(parquet.schema.OriginalType.LIST).repeatedGroup()
|
||||
schema = Types.buildMessage().optionalGroup()
|
||||
.as(OriginalType.LIST).repeatedGroup()
|
||||
.required(PrimitiveType.PrimitiveTypeName.BINARY).named("str").named("array")
|
||||
.named("one_tuple_list").named("ArrayOfOneTuples");
|
||||
|
||||
@@ -110,8 +111,8 @@ public class HiveSyncToolTest {
|
||||
// A list of structs with a single field
|
||||
// For this case, since the inner group name ends with "_tuple", we also treat the
|
||||
// element type as a one-element struct.
|
||||
schema = parquet.schema.Types.buildMessage().optionalGroup()
|
||||
.as(parquet.schema.OriginalType.LIST).repeatedGroup()
|
||||
schema = Types.buildMessage().optionalGroup()
|
||||
.as(OriginalType.LIST).repeatedGroup()
|
||||
.required(PrimitiveType.PrimitiveTypeName.BINARY).named("str")
|
||||
.named("one_tuple_list_tuple").named("one_tuple_list").named("ArrayOfOneTuples2");
|
||||
|
||||
@@ -121,8 +122,8 @@ public class HiveSyncToolTest {
|
||||
// A list of structs with a single field
|
||||
// Unlike the above two cases, for this the element type is the type of the
|
||||
// only field in the struct.
|
||||
schema = parquet.schema.Types.buildMessage().optionalGroup()
|
||||
.as(parquet.schema.OriginalType.LIST).repeatedGroup()
|
||||
schema = Types.buildMessage().optionalGroup()
|
||||
.as(OriginalType.LIST).repeatedGroup()
|
||||
.required(PrimitiveType.PrimitiveTypeName.BINARY).named("str")
|
||||
.named("one_tuple_list").named("one_tuple_list").named("ArrayOfOneTuples3");
|
||||
|
||||
@@ -130,8 +131,8 @@ public class HiveSyncToolTest {
|
||||
assertEquals("`one_tuple_list` ARRAY< binary>", schemaString);
|
||||
|
||||
// A list of maps
|
||||
schema = parquet.schema.Types.buildMessage().optionalGroup()
|
||||
.as(parquet.schema.OriginalType.LIST).repeatedGroup().as(OriginalType.MAP)
|
||||
schema = Types.buildMessage().optionalGroup()
|
||||
.as(OriginalType.LIST).repeatedGroup().as(OriginalType.MAP)
|
||||
.repeatedGroup().as(OriginalType.MAP_KEY_VALUE)
|
||||
.required(PrimitiveType.PrimitiveTypeName.BINARY).as(OriginalType.UTF8)
|
||||
.named("string_key").required(PrimitiveType.PrimitiveTypeName.INT32)
|
||||
|
||||
@@ -41,6 +41,8 @@ import org.apache.hadoop.hive.metastore.api.MetaException;
|
||||
import org.apache.hadoop.hive.thrift.TUGIContainingTransport;
|
||||
import org.apache.hive.service.server.HiveServer2;
|
||||
import org.apache.hudi.common.model.HoodieTestUtils;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.thrift.TProcessor;
|
||||
import org.apache.thrift.protocol.TBinaryProtocol;
|
||||
import org.apache.thrift.server.TServer;
|
||||
@@ -52,12 +54,10 @@ import org.apache.thrift.transport.TSocket;
|
||||
import org.apache.thrift.transport.TTransport;
|
||||
import org.apache.thrift.transport.TTransportException;
|
||||
import org.apache.thrift.transport.TTransportFactory;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
public class HiveTestService {
|
||||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(HiveTestService.class);
|
||||
private static Logger LOG = LogManager.getLogger(HiveTestService.class);
|
||||
|
||||
private static final int CONNECTION_TIMEOUT = 30000;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user