From cd090871a18210ac311c02009b27bd7cfcf2bb44 Mon Sep 17 00:00:00 2001 From: vinoth chandar Date: Sun, 25 Aug 2019 05:34:51 -0700 Subject: [PATCH] [HUDI-159]: Pom cleanup and removal of com.twitter.parquet - Redo all classes based on org.parquet only - remove unuused dependencies like parquet-hadoop, common-configuration2 - timeline-service does not build a fat jar anymore - Fix utilities and hadoop-mr bundles based on above --- hudi-cli/pom.xml | 18 +- .../cli/commands/HoodieLogFileCommand.java | 4 +- hudi-client/pom.xml | 17 +- .../apache/hudi/io/HoodieAppendHandle.java | 2 +- .../TestHoodieCompactionStrategy.java | 4 +- hudi-common/pom.xml | 17 +- .../common/util/DefaultSizeEstimator.java | 2 - .../util/HoodieRecordSizeEstimator.java | 1 - .../common/util/ObjectSizeCalculator.java | 478 ++++++++++++++++++ .../apache/hudi/common/util/ParquetUtils.java | 3 +- .../util/collection/ExternalSpillableMap.java | 2 +- hudi-hadoop-mr/pom.xml | 22 - .../AbstractRealtimeRecordReader.java | 6 +- hudi-hive/pom.xml | 37 +- .../org/apache/hudi/hive/HiveSyncTool.java | 8 +- .../apache/hudi/hive/HoodieHiveClient.java | 8 +- .../apache/hudi/hive/SchemaDifference.java | 2 +- .../org/apache/hudi/hive/util/SchemaUtil.java | 16 +- .../apache/hudi/hive/HiveSyncToolTest.java | 39 +- .../hudi/hive/util/HiveTestService.java | 6 +- hudi-spark/pom.xml | 16 +- hudi-timeline-service/pom.xml | 59 --- hudi-utilities/pom.xml | 64 +-- packaging/hudi-hadoop-mr-bundle/pom.xml | 24 +- packaging/hudi-hive-bundle/pom.xml | 6 - packaging/hudi-presto-bundle/pom.xml | 6 - packaging/hudi-spark-bundle/pom.xml | 4 - packaging/hudi-utilities-bundle/pom.xml | 19 +- pom.xml | 36 +- 29 files changed, 600 insertions(+), 326 deletions(-) create mode 100644 hudi-common/src/main/java/org/apache/hudi/common/util/ObjectSizeCalculator.java diff --git a/hudi-cli/pom.xml b/hudi-cli/pom.xml index 3b66ff2f2..562adbbc9 100644 --- a/hudi-cli/pom.xml +++ b/hudi-cli/pom.xml @@ -166,6 +166,11 @@ log4j + + org.apache.parquet + parquet-avro + + org.apache.spark @@ -188,12 +193,6 @@ ${spring.shell.version} - - de.vandermeer - asciitable - 0.2.5 - - com.jakewharton.fliptables fliptables @@ -214,12 +213,5 @@ org.apache.hadoop hadoop-hdfs - - - junit - junit-dep - ${junit-dep.version} - test - diff --git a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieLogFileCommand.java b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieLogFileCommand.java index 9e82cd745..cc822353e 100644 --- a/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieLogFileCommand.java +++ b/hudi-cli/src/main/java/org/apache/hudi/cli/commands/HoodieLogFileCommand.java @@ -18,8 +18,8 @@ package org.apache.hudi.cli.commands; -import com.beust.jcommander.internal.Maps; import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.common.collect.Maps; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; @@ -50,12 +50,12 @@ import org.apache.hudi.common.util.Option; import org.apache.hudi.config.HoodieCompactionConfig; import org.apache.hudi.config.HoodieMemoryConfig; import org.apache.hudi.hive.util.SchemaUtil; +import org.apache.parquet.avro.AvroSchemaConverter; import org.springframework.shell.core.CommandMarker; import org.springframework.shell.core.annotation.CliAvailabilityIndicator; import org.springframework.shell.core.annotation.CliCommand; import org.springframework.shell.core.annotation.CliOption; import org.springframework.stereotype.Component; -import parquet.avro.AvroSchemaConverter; import scala.Tuple2; import scala.Tuple3; diff --git a/hudi-client/pom.xml b/hudi-client/pom.xml index 5e1512d8d..c6ed6ffb5 100644 --- a/hudi-client/pom.xml +++ b/hudi-client/pom.xml @@ -91,10 +91,6 @@ org.apache.parquet parquet-avro - - org.apache.parquet - parquet-hadoop - @@ -110,6 +106,12 @@ io.dropwizard.metrics metrics-graphite + + + com.rabbitmq + * + + io.dropwizard.metrics @@ -125,12 +127,7 @@ com.beust jcommander 1.48 - - - - org.htrace - htrace-core - 3.0.4 + test diff --git a/hudi-client/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java b/hudi-client/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java index 6dbcb8b19..34178e618 100644 --- a/hudi-client/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java +++ b/hudi-client/src/main/java/org/apache/hudi/io/HoodieAppendHandle.java @@ -18,7 +18,7 @@ package org.apache.hudi.io; -import com.beust.jcommander.internal.Maps; +import com.google.common.collect.Maps; import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; diff --git a/hudi-client/src/test/java/org/apache/hudi/io/strategy/TestHoodieCompactionStrategy.java b/hudi-client/src/test/java/org/apache/hudi/io/strategy/TestHoodieCompactionStrategy.java index c3e123c94..f075edffe 100644 --- a/hudi-client/src/test/java/org/apache/hudi/io/strategy/TestHoodieCompactionStrategy.java +++ b/hudi-client/src/test/java/org/apache/hudi/io/strategy/TestHoodieCompactionStrategy.java @@ -21,8 +21,8 @@ package org.apache.hudi.io.strategy; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; -import com.beust.jcommander.internal.Lists; import com.google.common.collect.ImmutableMap; +import com.google.common.collect.Lists; import com.google.common.collect.Maps; import java.text.SimpleDateFormat; import java.util.ArrayList; @@ -248,7 +248,7 @@ public class TestHoodieCompactionStrategy { private List createCompactionOperations(HoodieWriteConfig config, Map> sizesMap, Map keyToPartitionMap) { - List operations = Lists.newArrayList(sizesMap.size()); + List operations = new ArrayList<>(sizesMap.size()); sizesMap.forEach((k, v) -> { HoodieDataFile df = TestHoodieDataFile.newDataFile(k); diff --git a/hudi-common/pom.xml b/hudi-common/pom.xml index f418ab64d..db3ffe98d 100644 --- a/hudi-common/pom.xml +++ b/hudi-common/pom.xml @@ -111,13 +111,6 @@ parquet-avro - - - com.twitter.common - objectsize - 0.0.12 - - commons-codec @@ -154,7 +147,12 @@ org.apache.hadoop hadoop-common tests + test + + org.apache.hadoop + hadoop-hdfs + org.apache.hadoop hadoop-hdfs @@ -173,11 +171,6 @@ test - - com.esotericsoftware - kryo - test - com.esotericsoftware kryo-shaded diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/DefaultSizeEstimator.java b/hudi-common/src/main/java/org/apache/hudi/common/util/DefaultSizeEstimator.java index 4c8320416..883c4cfa5 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/DefaultSizeEstimator.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/DefaultSizeEstimator.java @@ -18,8 +18,6 @@ package org.apache.hudi.common.util; -import com.twitter.common.objectsize.ObjectSizeCalculator; - /** * Default implementation of size-estimator that uses Twitter's ObjectSizeCalculator * @param diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieRecordSizeEstimator.java b/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieRecordSizeEstimator.java index 3a29b9aa6..0cdd243db 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieRecordSizeEstimator.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/HoodieRecordSizeEstimator.java @@ -18,7 +18,6 @@ package org.apache.hudi.common.util; -import com.twitter.common.objectsize.ObjectSizeCalculator; import org.apache.avro.Schema; import org.apache.hudi.common.model.HoodieRecord; import org.apache.hudi.common.model.HoodieRecordPayload; diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/ObjectSizeCalculator.java b/hudi-common/src/main/java/org/apache/hudi/common/util/ObjectSizeCalculator.java new file mode 100644 index 000000000..8fdf73703 --- /dev/null +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ObjectSizeCalculator.java @@ -0,0 +1,478 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// COPIED FROM https://github.com/twitter/commons/blob/master/src/java/com/twitter/common/objectsize/ +// ObjectSizeCalculator.java +// ================================================================================================= +// Copyright 2011 Twitter, Inc. +// ------------------------------------------------------------------------------------------------- +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this work except in compliance with the License. +// You may obtain a copy of the License in the LICENSE file, or at: +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ================================================================================================= + +package org.apache.hudi.common.util; + +import com.google.common.annotations.VisibleForTesting; +import com.google.common.base.Preconditions; +import com.google.common.cache.CacheBuilder; +import com.google.common.cache.CacheLoader; +import com.google.common.cache.LoadingCache; +import com.google.common.collect.Sets; +import java.lang.management.ManagementFactory; +import java.lang.management.MemoryPoolMXBean; +import java.lang.reflect.Array; +import java.lang.reflect.Field; +import java.lang.reflect.Modifier; +import java.util.ArrayDeque; +import java.util.Arrays; +import java.util.Deque; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; + +/** + * Contains utility methods for calculating the memory usage of objects. It + * only works on the HotSpot JVM, and infers the actual memory layout (32 bit + * vs. 64 bit word size, compressed object pointers vs. uncompressed) from + * best available indicators. It can reliably detect a 32 bit vs. 64 bit JVM. + * It can only make an educated guess at whether compressed OOPs are used, + * though; specifically, it knows what the JVM's default choice of OOP + * compression would be based on HotSpot version and maximum heap sizes, but if + * the choice is explicitly overridden with the -XX:{+|-}UseCompressedOops command line + * switch, it can not detect + * this fact and will report incorrect sizes, as it will presume the default JVM + * behavior. + * + * @author Attila Szegedi + */ +public class ObjectSizeCalculator { + + /** + * Describes constant memory overheads for various constructs in a JVM implementation. + */ + public interface MemoryLayoutSpecification { + + /** + * Returns the fixed overhead of an array of any type or length in this JVM. + * + * @return the fixed overhead of an array. + */ + int getArrayHeaderSize(); + + /** + * Returns the fixed overhead of for any {@link Object} subclass in this JVM. + * + * @return the fixed overhead of any object. + */ + int getObjectHeaderSize(); + + /** + * Returns the quantum field size for a field owned by an object in this JVM. + * + * @return the quantum field size for an object. + */ + int getObjectPadding(); + + /** + * Returns the fixed size of an object reference in this JVM. + * + * @return the size of all object references. + */ + int getReferenceSize(); + + /** + * Returns the quantum field size for a field owned by one of an object's ancestor superclasses + * in this JVM. + * + * @return the quantum field size for a superclass field. + */ + int getSuperclassFieldPadding(); + } + + private static class CurrentLayout { + + private static final MemoryLayoutSpecification SPEC = + getEffectiveMemoryLayoutSpecification(); + } + + /** + * Given an object, returns the total allocated size, in bytes, of the object + * and all other objects reachable from it. Attempts to to detect the current JVM memory layout, + * but may fail with {@link UnsupportedOperationException}; + * + * @param obj the object; can be null. Passing in a {@link java.lang.Class} object doesn't do + * anything special, it measures the size of all objects + * reachable through it (which will include its class loader, and by + * extension, all other Class objects loaded by + * the same loader, and all the parent class loaders). It doesn't provide the + * size of the static fields in the JVM class that the Class object + * represents. + * @return the total allocated size of the object and all other objects it + * retains. + * @throws UnsupportedOperationException if the current vm memory layout cannot be detected. + */ + public static long getObjectSize(Object obj) throws UnsupportedOperationException { + return obj == null ? 0 : new ObjectSizeCalculator(CurrentLayout.SPEC).calculateObjectSize(obj); + } + + // Fixed object header size for arrays. + private final int arrayHeaderSize; + // Fixed object header size for non-array objects. + private final int objectHeaderSize; + // Padding for the object size - if the object size is not an exact multiple + // of this, it is padded to the next multiple. + private final int objectPadding; + // Size of reference (pointer) fields. + private final int referenceSize; + // Padding for the fields of superclass before fields of subclasses are + // added. + private final int superclassFieldPadding; + + private final LoadingCache, ClassSizeInfo> classSizeInfos = + CacheBuilder.newBuilder().build(new CacheLoader, ClassSizeInfo>() { + public ClassSizeInfo load(Class clazz) { + return new ClassSizeInfo(clazz); + } + }); + + + private final Set alreadyVisited = Sets.newIdentityHashSet(); + private final Deque pending = new ArrayDeque(16 * 1024); + private long size; + + /** + * Creates an object size calculator that can calculate object sizes for a given + * {@code memoryLayoutSpecification}. + * + * @param memoryLayoutSpecification a description of the JVM memory layout. + */ + public ObjectSizeCalculator(MemoryLayoutSpecification memoryLayoutSpecification) { + Preconditions.checkNotNull(memoryLayoutSpecification); + arrayHeaderSize = memoryLayoutSpecification.getArrayHeaderSize(); + objectHeaderSize = memoryLayoutSpecification.getObjectHeaderSize(); + objectPadding = memoryLayoutSpecification.getObjectPadding(); + referenceSize = memoryLayoutSpecification.getReferenceSize(); + superclassFieldPadding = memoryLayoutSpecification.getSuperclassFieldPadding(); + } + + /** + * Given an object, returns the total allocated size, in bytes, of the object + * and all other objects reachable from it. + * + * @param obj the object; can be null. Passing in a {@link java.lang.Class} object doesn't do + * anything special, it measures the size of all objects + * reachable through it (which will include its class loader, and by + * extension, all other Class objects loaded by + * the same loader, and all the parent class loaders). It doesn't provide the + * size of the static fields in the JVM class that the Class object + * represents. + * @return the total allocated size of the object and all other objects it + * retains. + */ + public synchronized long calculateObjectSize(Object obj) { + // Breadth-first traversal instead of naive depth-first with recursive + // implementation, so we don't blow the stack traversing long linked lists. + try { + for (; ; ) { + visit(obj); + if (pending.isEmpty()) { + return size; + } + obj = pending.removeFirst(); + } + } finally { + alreadyVisited.clear(); + pending.clear(); + size = 0; + } + } + + private void visit(Object obj) { + if (alreadyVisited.contains(obj)) { + return; + } + final Class clazz = obj.getClass(); + if (clazz == ArrayElementsVisitor.class) { + ((ArrayElementsVisitor) obj).visit(this); + } else { + alreadyVisited.add(obj); + if (clazz.isArray()) { + visitArray(obj); + } else { + classSizeInfos.getUnchecked(clazz).visit(obj, this); + } + } + } + + private void visitArray(Object array) { + final Class componentType = array.getClass().getComponentType(); + final int length = Array.getLength(array); + if (componentType.isPrimitive()) { + increaseByArraySize(length, getPrimitiveFieldSize(componentType)); + } else { + increaseByArraySize(length, referenceSize); + // If we didn't use an ArrayElementsVisitor, we would be enqueueing every + // element of the array here instead. For large arrays, it would + // tremendously enlarge the queue. In essence, we're compressing it into + // a small command object instead. This is different than immediately + // visiting the elements, as their visiting is scheduled for the end of + // the current queue. + switch (length) { + case 0: { + break; + } + case 1: { + enqueue(Array.get(array, 0)); + break; + } + default: { + enqueue(new ArrayElementsVisitor((Object[]) array)); + } + } + } + } + + private void increaseByArraySize(int length, long elementSize) { + increaseSize(roundTo(arrayHeaderSize + length * elementSize, objectPadding)); + } + + private static class ArrayElementsVisitor { + + private final Object[] array; + + ArrayElementsVisitor(Object[] array) { + this.array = array; + } + + public void visit(ObjectSizeCalculator calc) { + for (Object elem : array) { + if (elem != null) { + calc.visit(elem); + } + } + } + } + + void enqueue(Object obj) { + if (obj != null) { + pending.addLast(obj); + } + } + + void increaseSize(long objectSize) { + size += objectSize; + } + + @VisibleForTesting + static long roundTo(long x, int multiple) { + return ((x + multiple - 1) / multiple) * multiple; + } + + private class ClassSizeInfo { + + // Padded fields + header size + private final long objectSize; + // Only the fields size - used to calculate the subclasses' memory + // footprint. + private final long fieldsSize; + private final Field[] referenceFields; + + public ClassSizeInfo(Class clazz) { + long fieldsSize = 0; + final List referenceFields = new LinkedList(); + for (Field f : clazz.getDeclaredFields()) { + if (Modifier.isStatic(f.getModifiers())) { + continue; + } + final Class type = f.getType(); + if (type.isPrimitive()) { + fieldsSize += getPrimitiveFieldSize(type); + } else { + f.setAccessible(true); + referenceFields.add(f); + fieldsSize += referenceSize; + } + } + final Class superClass = clazz.getSuperclass(); + if (superClass != null) { + final ClassSizeInfo superClassInfo = classSizeInfos.getUnchecked(superClass); + fieldsSize += roundTo(superClassInfo.fieldsSize, superclassFieldPadding); + referenceFields.addAll(Arrays.asList(superClassInfo.referenceFields)); + } + this.fieldsSize = fieldsSize; + this.objectSize = roundTo(objectHeaderSize + fieldsSize, objectPadding); + this.referenceFields = referenceFields.toArray( + new Field[referenceFields.size()]); + } + + void visit(Object obj, ObjectSizeCalculator calc) { + calc.increaseSize(objectSize); + enqueueReferencedObjects(obj, calc); + } + + public void enqueueReferencedObjects(Object obj, ObjectSizeCalculator calc) { + for (Field f : referenceFields) { + try { + calc.enqueue(f.get(obj)); + } catch (IllegalAccessException e) { + final AssertionError ae = new AssertionError( + "Unexpected denial of access to " + f); + ae.initCause(e); + throw ae; + } + } + } + } + + private static long getPrimitiveFieldSize(Class type) { + if (type == boolean.class || type == byte.class) { + return 1; + } + if (type == char.class || type == short.class) { + return 2; + } + if (type == int.class || type == float.class) { + return 4; + } + if (type == long.class || type == double.class) { + return 8; + } + throw new AssertionError("Encountered unexpected primitive type " + + type.getName()); + } + + @VisibleForTesting + static MemoryLayoutSpecification getEffectiveMemoryLayoutSpecification() { + final String vmName = System.getProperty("java.vm.name"); + if (vmName == null || !(vmName.startsWith("Java HotSpot(TM) ") + || vmName.startsWith("OpenJDK") || vmName.startsWith("TwitterJDK"))) { + throw new UnsupportedOperationException( + "ObjectSizeCalculator only supported on HotSpot VM"); + } + + final String dataModel = System.getProperty("sun.arch.data.model"); + if ("32".equals(dataModel)) { + // Running with 32-bit data model + return new MemoryLayoutSpecification() { + @Override + public int getArrayHeaderSize() { + return 12; + } + + @Override + public int getObjectHeaderSize() { + return 8; + } + + @Override + public int getObjectPadding() { + return 8; + } + + @Override + public int getReferenceSize() { + return 4; + } + + @Override + public int getSuperclassFieldPadding() { + return 4; + } + }; + } else if (!"64".equals(dataModel)) { + throw new UnsupportedOperationException("Unrecognized value '" + + dataModel + "' of sun.arch.data.model system property"); + } + + final String strVmVersion = System.getProperty("java.vm.version"); + final int vmVersion = Integer.parseInt(strVmVersion.substring(0, + strVmVersion.indexOf('.'))); + if (vmVersion >= 17) { + long maxMemory = 0; + for (MemoryPoolMXBean mp : ManagementFactory.getMemoryPoolMXBeans()) { + maxMemory += mp.getUsage().getMax(); + } + if (maxMemory < 30L * 1024 * 1024 * 1024) { + // HotSpot 17.0 and above use compressed OOPs below 30GB of RAM total + // for all memory pools (yes, including code cache). + return new MemoryLayoutSpecification() { + @Override + public int getArrayHeaderSize() { + return 16; + } + + @Override + public int getObjectHeaderSize() { + return 12; + } + + @Override + public int getObjectPadding() { + return 8; + } + + @Override + public int getReferenceSize() { + return 4; + } + + @Override + public int getSuperclassFieldPadding() { + return 4; + } + }; + } + } + + // In other cases, it's a 64-bit uncompressed OOPs object model + return new MemoryLayoutSpecification() { + @Override + public int getArrayHeaderSize() { + return 24; + } + + @Override + public int getObjectHeaderSize() { + return 16; + } + + @Override + public int getObjectPadding() { + return 8; + } + + @Override + public int getReferenceSize() { + return 8; + } + + @Override + public int getSuperclassFieldPadding() { + return 8; + } + }; + } +} diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java index ecb80b63d..e4cb6606f 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/ParquetUtils.java @@ -27,7 +27,6 @@ import java.util.Set; import java.util.function.Function; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; -import org.apache.commons.collections.CollectionUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.hudi.avro.HoodieAvroWriteSupport; @@ -71,7 +70,7 @@ public class ParquetUtils { */ public static Set filterParquetRowKeys(Configuration configuration, Path filePath, Set filter) { Option filterFunction = Option.empty(); - if (CollectionUtils.isNotEmpty(filter)) { + if (filter != null && !filter.isEmpty()) { filterFunction = Option.of(new RecordKeysFilterFunction(filter)); } Configuration conf = new Configuration(configuration); diff --git a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/ExternalSpillableMap.java b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/ExternalSpillableMap.java index 2b8463780..7e6ecff1d 100644 --- a/hudi-common/src/main/java/org/apache/hudi/common/util/collection/ExternalSpillableMap.java +++ b/hudi-common/src/main/java/org/apache/hudi/common/util/collection/ExternalSpillableMap.java @@ -18,7 +18,6 @@ package org.apache.hudi.common.util.collection; -import com.twitter.common.objectsize.ObjectSizeCalculator; import java.io.IOException; import java.io.Serializable; import java.util.ArrayList; @@ -30,6 +29,7 @@ import java.util.List; import java.util.Map; import java.util.Set; import java.util.stream.Stream; +import org.apache.hudi.common.util.ObjectSizeCalculator; import org.apache.hudi.common.util.SizeEstimator; import org.apache.hudi.exception.HoodieIOException; import org.apache.log4j.LogManager; diff --git a/hudi-hadoop-mr/pom.xml b/hudi-hadoop-mr/pom.xml index 66f2fe65a..25574e7af 100644 --- a/hudi-hadoop-mr/pom.xml +++ b/hudi-hadoop-mr/pom.xml @@ -49,28 +49,12 @@ parquet-avro - - - com.twitter - parquet-avro - - - com.twitter - parquet-hadoop-bundle - - commons-logging commons-logging - - com.twitter.common - objectsize - 0.0.12 - - org.apache.hadoop @@ -113,12 +97,6 @@ test - - com.esotericsoftware - kryo - test - - junit junit diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java index d6cd4e1a5..a82b0bc3d 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/AbstractRealtimeRecordReader.java @@ -53,9 +53,9 @@ import org.apache.hudi.common.util.LogReaderUtils; import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.exception.HoodieException; import org.apache.hudi.exception.HoodieIOException; -import parquet.avro.AvroSchemaConverter; -import parquet.hadoop.ParquetFileReader; -import parquet.schema.MessageType; +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.schema.MessageType; /** * Record Reader implementation to merge fresh avro data with base parquet data, to support real diff --git a/hudi-hive/pom.xml b/hudi-hive/pom.xml index 758860d34..8fdcf5869 100644 --- a/hudi-hive/pom.xml +++ b/hudi-hive/pom.xml @@ -46,17 +46,12 @@ - org.slf4j - slf4j-api - - - org.slf4j - slf4j-log4j12 + log4j + log4j - - com.twitter + org.apache.parquet parquet-avro @@ -65,12 +60,6 @@ guava - - org.apache.thrift - libthrift - ${thrift.version} - - joda-time joda-time @@ -95,16 +84,6 @@ jcommander - - - org.apache.httpcomponents - httpcore - - - org.apache.httpcomponents - httpclient - - org.apache.hadoop @@ -175,15 +154,15 @@ - org.mockito - mockito-all + org.apache.thrift + libthrift + ${thrift.version} test - com.esotericsoftware.kryo - kryo - 2.21 + org.mockito + mockito-all test diff --git a/hudi-hive/src/main/java/org/apache/hudi/hive/HiveSyncTool.java b/hudi-hive/src/main/java/org/apache/hudi/hive/HiveSyncTool.java index a70106419..6e8e82a23 100644 --- a/hudi-hive/src/main/java/org/apache/hudi/hive/HiveSyncTool.java +++ b/hudi-hive/src/main/java/org/apache/hudi/hive/HiveSyncTool.java @@ -36,9 +36,9 @@ import org.apache.hudi.hadoop.realtime.HoodieRealtimeInputFormat; import org.apache.hudi.hive.HoodieHiveClient.PartitionEvent; import org.apache.hudi.hive.HoodieHiveClient.PartitionEvent.PartitionEventType; import org.apache.hudi.hive.util.SchemaUtil; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import parquet.schema.MessageType; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.parquet.schema.MessageType; /** @@ -52,7 +52,7 @@ import parquet.schema.MessageType; @SuppressWarnings("WeakerAccess") public class HiveSyncTool { - private static final Logger LOG = LoggerFactory.getLogger(HiveSyncTool.class); + private static Logger LOG = LogManager.getLogger(HiveSyncTool.class); private final HoodieHiveClient hoodieHiveClient; public static final String SUFFIX_REALTIME_TABLE = "_rt"; private final HiveSyncConfig cfg; diff --git a/hudi-hive/src/main/java/org/apache/hudi/hive/HoodieHiveClient.java b/hudi-hive/src/main/java/org/apache/hudi/hive/HoodieHiveClient.java index 90dded031..4a478016e 100644 --- a/hudi-hive/src/main/java/org/apache/hudi/hive/HoodieHiveClient.java +++ b/hudi-hive/src/main/java/org/apache/hudi/hive/HoodieHiveClient.java @@ -57,13 +57,13 @@ import org.apache.hudi.common.util.collection.Pair; import org.apache.hudi.exception.HoodieIOException; import org.apache.hudi.exception.InvalidDatasetException; import org.apache.hudi.hive.util.SchemaUtil; +import org.apache.parquet.format.converter.ParquetMetadataConverter; +import org.apache.parquet.hadoop.ParquetFileReader; +import org.apache.parquet.hadoop.metadata.ParquetMetadata; +import org.apache.parquet.schema.MessageType; import org.apache.thrift.TException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import parquet.format.converter.ParquetMetadataConverter; -import parquet.hadoop.ParquetFileReader; -import parquet.hadoop.metadata.ParquetMetadata; -import parquet.schema.MessageType; @SuppressWarnings("ConstantConditions") public class HoodieHiveClient { diff --git a/hudi-hive/src/main/java/org/apache/hudi/hive/SchemaDifference.java b/hudi-hive/src/main/java/org/apache/hudi/hive/SchemaDifference.java index 8a7cee95d..c1f2291b3 100644 --- a/hudi-hive/src/main/java/org/apache/hudi/hive/SchemaDifference.java +++ b/hudi-hive/src/main/java/org/apache/hudi/hive/SchemaDifference.java @@ -25,7 +25,7 @@ import com.google.common.collect.Lists; import com.google.common.collect.Maps; import java.util.List; import java.util.Map; -import parquet.schema.MessageType; +import org.apache.parquet.schema.MessageType; /** * Represents the schema difference between the storage schema and hive table schema diff --git a/hudi-hive/src/main/java/org/apache/hudi/hive/util/SchemaUtil.java b/hudi-hive/src/main/java/org/apache/hudi/hive/util/SchemaUtil.java index 69ea8d7f5..9e2784ee3 100644 --- a/hudi-hive/src/main/java/org/apache/hudi/hive/util/SchemaUtil.java +++ b/hudi-hive/src/main/java/org/apache/hudi/hive/util/SchemaUtil.java @@ -36,14 +36,15 @@ import org.apache.hudi.common.table.log.block.HoodieLogBlock; import org.apache.hudi.hive.HiveSyncConfig; import org.apache.hudi.hive.HoodieHiveSyncException; import org.apache.hudi.hive.SchemaDifference; +import org.apache.parquet.avro.AvroSchemaConverter; +import org.apache.parquet.schema.DecimalMetadata; +import org.apache.parquet.schema.GroupType; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Type; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import parquet.schema.DecimalMetadata; -import parquet.schema.GroupType; -import parquet.schema.MessageType; -import parquet.schema.OriginalType; -import parquet.schema.PrimitiveType; -import parquet.schema.Type; /** * Schema Utilities @@ -439,6 +440,7 @@ public class SchemaUtil { /** * Read the schema from the log file on path + * @return */ @SuppressWarnings("OptionalUsedAsFieldOrParameterType") public static MessageType readSchemaFromLogFile(FileSystem fs, Path path) throws IOException { @@ -452,7 +454,7 @@ public class SchemaUtil { } reader.close(); if (lastBlock != null) { - return new parquet.avro.AvroSchemaConverter().convert(lastBlock.getSchema()); + return new AvroSchemaConverter().convert(lastBlock.getSchema()); } return null; } diff --git a/hudi-hive/src/test/java/org/apache/hudi/hive/HiveSyncToolTest.java b/hudi-hive/src/test/java/org/apache/hudi/hive/HiveSyncToolTest.java index fe6e196be..2947d8bb7 100644 --- a/hudi-hive/src/test/java/org/apache/hudi/hive/HiveSyncToolTest.java +++ b/hudi-hive/src/test/java/org/apache/hudi/hive/HiveSyncToolTest.java @@ -32,12 +32,13 @@ import org.apache.hudi.common.util.SchemaTestUtil; import org.apache.hudi.hive.HoodieHiveClient.PartitionEvent; import org.apache.hudi.hive.HoodieHiveClient.PartitionEvent.PartitionEventType; import org.apache.hudi.hive.util.SchemaUtil; +import org.apache.parquet.schema.MessageType; +import org.apache.parquet.schema.OriginalType; +import org.apache.parquet.schema.PrimitiveType; +import org.apache.parquet.schema.Types; import org.joda.time.DateTime; import org.junit.Before; import org.junit.Test; -import parquet.schema.MessageType; -import parquet.schema.OriginalType; -import parquet.schema.PrimitiveType; @SuppressWarnings("ConstantConditions") public class HiveSyncToolTest { @@ -59,8 +60,8 @@ public class HiveSyncToolTest { @Test public void testSchemaConvertArray() throws IOException { // Testing the 3-level annotation structure - MessageType schema = parquet.schema.Types.buildMessage().optionalGroup() - .as(parquet.schema.OriginalType.LIST).repeatedGroup() + MessageType schema = Types.buildMessage().optionalGroup() + .as(OriginalType.LIST).repeatedGroup() .optional(PrimitiveType.PrimitiveTypeName.INT32).named("element") .named("list").named("int_list").named("ArrayOfInts"); @@ -68,8 +69,8 @@ public class HiveSyncToolTest { assertEquals("`int_list` ARRAY< int>", schemaString); // A array of arrays - schema = parquet.schema.Types.buildMessage().optionalGroup() - .as(parquet.schema.OriginalType.LIST).repeatedGroup().requiredGroup() + schema = Types.buildMessage().optionalGroup() + .as(OriginalType.LIST).repeatedGroup().requiredGroup() .as(OriginalType.LIST).repeatedGroup() .required(PrimitiveType.PrimitiveTypeName.INT32).named("element").named("list") .named("element").named("list").named("int_list_list").named("ArrayOfArrayOfInts"); @@ -78,8 +79,8 @@ public class HiveSyncToolTest { assertEquals("`int_list_list` ARRAY< ARRAY< int>>", schemaString); // A list of integers - schema = parquet.schema.Types.buildMessage().optionalGroup() - .as(parquet.schema.OriginalType.LIST) + schema = Types.buildMessage().optionalGroup() + .as(OriginalType.LIST) .repeated(PrimitiveType.PrimitiveTypeName.INT32).named("element").named("int_list") .named("ArrayOfInts"); @@ -87,8 +88,8 @@ public class HiveSyncToolTest { assertEquals("`int_list` ARRAY< int>", schemaString); // A list of structs with two fields - schema = parquet.schema.Types.buildMessage().optionalGroup() - .as(parquet.schema.OriginalType.LIST).repeatedGroup() + schema = Types.buildMessage().optionalGroup() + .as(OriginalType.LIST).repeatedGroup() .required(PrimitiveType.PrimitiveTypeName.BINARY).named("str") .required(PrimitiveType.PrimitiveTypeName.INT32).named("num").named("element") .named("tuple_list").named("ArrayOfTuples"); @@ -99,8 +100,8 @@ public class HiveSyncToolTest { // A list of structs with a single field // For this case, since the inner group name is "array", we treat the // element type as a one-element struct. - schema = parquet.schema.Types.buildMessage().optionalGroup() - .as(parquet.schema.OriginalType.LIST).repeatedGroup() + schema = Types.buildMessage().optionalGroup() + .as(OriginalType.LIST).repeatedGroup() .required(PrimitiveType.PrimitiveTypeName.BINARY).named("str").named("array") .named("one_tuple_list").named("ArrayOfOneTuples"); @@ -110,8 +111,8 @@ public class HiveSyncToolTest { // A list of structs with a single field // For this case, since the inner group name ends with "_tuple", we also treat the // element type as a one-element struct. - schema = parquet.schema.Types.buildMessage().optionalGroup() - .as(parquet.schema.OriginalType.LIST).repeatedGroup() + schema = Types.buildMessage().optionalGroup() + .as(OriginalType.LIST).repeatedGroup() .required(PrimitiveType.PrimitiveTypeName.BINARY).named("str") .named("one_tuple_list_tuple").named("one_tuple_list").named("ArrayOfOneTuples2"); @@ -121,8 +122,8 @@ public class HiveSyncToolTest { // A list of structs with a single field // Unlike the above two cases, for this the element type is the type of the // only field in the struct. - schema = parquet.schema.Types.buildMessage().optionalGroup() - .as(parquet.schema.OriginalType.LIST).repeatedGroup() + schema = Types.buildMessage().optionalGroup() + .as(OriginalType.LIST).repeatedGroup() .required(PrimitiveType.PrimitiveTypeName.BINARY).named("str") .named("one_tuple_list").named("one_tuple_list").named("ArrayOfOneTuples3"); @@ -130,8 +131,8 @@ public class HiveSyncToolTest { assertEquals("`one_tuple_list` ARRAY< binary>", schemaString); // A list of maps - schema = parquet.schema.Types.buildMessage().optionalGroup() - .as(parquet.schema.OriginalType.LIST).repeatedGroup().as(OriginalType.MAP) + schema = Types.buildMessage().optionalGroup() + .as(OriginalType.LIST).repeatedGroup().as(OriginalType.MAP) .repeatedGroup().as(OriginalType.MAP_KEY_VALUE) .required(PrimitiveType.PrimitiveTypeName.BINARY).as(OriginalType.UTF8) .named("string_key").required(PrimitiveType.PrimitiveTypeName.INT32) diff --git a/hudi-hive/src/test/java/org/apache/hudi/hive/util/HiveTestService.java b/hudi-hive/src/test/java/org/apache/hudi/hive/util/HiveTestService.java index 74e8d79d0..487200e49 100644 --- a/hudi-hive/src/test/java/org/apache/hudi/hive/util/HiveTestService.java +++ b/hudi-hive/src/test/java/org/apache/hudi/hive/util/HiveTestService.java @@ -41,6 +41,8 @@ import org.apache.hadoop.hive.metastore.api.MetaException; import org.apache.hadoop.hive.thrift.TUGIContainingTransport; import org.apache.hive.service.server.HiveServer2; import org.apache.hudi.common.model.HoodieTestUtils; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; import org.apache.thrift.TProcessor; import org.apache.thrift.protocol.TBinaryProtocol; import org.apache.thrift.server.TServer; @@ -52,12 +54,10 @@ import org.apache.thrift.transport.TSocket; import org.apache.thrift.transport.TTransport; import org.apache.thrift.transport.TTransportException; import org.apache.thrift.transport.TTransportFactory; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; public class HiveTestService { - private static final Logger LOG = LoggerFactory.getLogger(HiveTestService.class); + private static Logger LOG = LogManager.getLogger(HiveTestService.class); private static final int CONNECTION_TIMEOUT = 30000; diff --git a/hudi-spark/pom.xml b/hudi-spark/pom.xml index 60686f65a..b39508087 100644 --- a/hudi-spark/pom.xml +++ b/hudi-spark/pom.xml @@ -203,6 +203,12 @@ avro + + + org.apache.parquet + parquet-avro + + org.apache.spark @@ -218,12 +224,7 @@ com.databricks spark-avro_2.11 4.0.0 - - - - - org.apache.commons - commons-configuration2 + provided @@ -293,8 +294,7 @@ junit - junit-dep - ${junit-dep.version} + junit test diff --git a/hudi-timeline-service/pom.xml b/hudi-timeline-service/pom.xml index 353a40f6f..af5e42161 100644 --- a/hudi-timeline-service/pom.xml +++ b/hudi-timeline-service/pom.xml @@ -55,59 +55,6 @@ org.apache.rat apache-rat-plugin - - org.apache.maven.plugins - maven-dependency-plugin - - - copy-dependencies - prepare-package - - copy-dependencies - - - ${project.build.directory}/lib - true - true - true - - - - - - org.apache.maven.plugins - maven-shade-plugin - 2.4 - - true - - - *:* - - META-INF/*.SF - META-INF/*.DSA - META-INF/*.RSA - - - - - - - package - - shade - - - - - - org.apache.hudi.timeline.service.TimelineService - - - - - - @@ -244,11 +191,5 @@ test - - com.esotericsoftware - kryo - test - - diff --git a/hudi-utilities/pom.xml b/hudi-utilities/pom.xml index 612afdb63..ce8327687 100644 --- a/hudi-utilities/pom.xml +++ b/hudi-utilities/pom.xml @@ -76,18 +76,20 @@ + org.eclipse.jetty.aggregate jetty-all - 7.6.0.v20120127 + test org.eclipse.jetty jetty-server - 7.6.0.v20120127 + ${jetty.version} + test @@ -129,10 +131,6 @@ log4j log4j - - org.slf4j - slf4j-api - @@ -151,10 +149,6 @@ org.apache.parquet parquet-avro - - org.apache.parquet - parquet-hadoop - @@ -167,6 +161,7 @@ + org.apache.spark spark-sql_2.11 @@ -177,6 +172,13 @@ + + + com.databricks + spark-avro_2.11 + provided + + org.apache.spark spark-streaming_2.11 @@ -194,24 +196,6 @@ metrics-core - - io.javalin - javalin - 2.4.0 - - - org.eclipse.jetty - * - - - - - - com.yammer.metrics - metrics-core - 2.2.0 - - org.antlr @@ -252,24 +236,6 @@ 3.0.0 - - - commons-codec - commons-codec - - - commons-dbcp - commons-dbcp - - - commons-lang - commons-lang - - - commons-pool - commons-pool - - org.apache.httpcomponents @@ -295,11 +261,13 @@ org.apache.hadoop hadoop-hdfs tests + test org.apache.hadoop hadoop-common tests + test org.mortbay.jetty @@ -330,6 +298,10 @@ javax.servlet servlet-api + + org.eclipse.jetty.orbit + javax.servlet + diff --git a/packaging/hudi-hadoop-mr-bundle/pom.xml b/packaging/hudi-hadoop-mr-bundle/pom.xml index 1b68296b7..81b5e3d5a 100644 --- a/packaging/hudi-hadoop-mr-bundle/pom.xml +++ b/packaging/hudi-hadoop-mr-bundle/pom.xml @@ -25,6 +25,7 @@ 4.0.0 hudi-hadoop-mr-bundle + jar @@ -58,16 +59,6 @@ parquet-avro - - - com.twitter - parquet-avro - - - com.twitter - parquet-hadoop-bundle - - commons-logging @@ -82,12 +73,6 @@ commons-codec - - com.twitter.common - objectsize - 0.0.12 - - org.apache.hadoop @@ -239,15 +224,16 @@ org.apache.hudi:hudi-common org.apache.hudi:hudi-hadoop-mr - com.twitter:parquet-avro - com.twitter:parquet-hadoop-bundle - com.twitter.common:objectsize commons-logging:commons-logging commons-io:commons-io + commons-lang:commons-lang + commons-pool:commons-pool + commons-codec:commons-codec com.esotericsoftware:kryo-shaded org.objenesis:objenesis com.esotericsoftware:minlog commons-codec:commons-codec + org.apache.parquet:parquet-avro diff --git a/packaging/hudi-hive-bundle/pom.xml b/packaging/hudi-hive-bundle/pom.xml index 29594f4fe..72f00dba7 100644 --- a/packaging/hudi-hive-bundle/pom.xml +++ b/packaging/hudi-hive-bundle/pom.xml @@ -62,12 +62,6 @@ slf4j-log4j12 - - - com.twitter - parquet-avro - - org.apache.thrift diff --git a/packaging/hudi-presto-bundle/pom.xml b/packaging/hudi-presto-bundle/pom.xml index 54b966d32..fbaba749c 100644 --- a/packaging/hudi-presto-bundle/pom.xml +++ b/packaging/hudi-presto-bundle/pom.xml @@ -50,12 +50,6 @@ slf4j-log4j12 - - - com.twitter - parquet-avro - - org.apache.thrift diff --git a/packaging/hudi-spark-bundle/pom.xml b/packaging/hudi-spark-bundle/pom.xml index f46b94b17..bcd5beaff 100644 --- a/packaging/hudi-spark-bundle/pom.xml +++ b/packaging/hudi-spark-bundle/pom.xml @@ -296,10 +296,6 @@ commons-dbcp commons-dbcp - - org.apache.commons - commons-configuration2 - diff --git a/packaging/hudi-utilities-bundle/pom.xml b/packaging/hudi-utilities-bundle/pom.xml index 530b12fba..e002c11ab 100644 --- a/packaging/hudi-utilities-bundle/pom.xml +++ b/packaging/hudi-utilities-bundle/pom.xml @@ -75,11 +75,22 @@ org.apache.hudi:hudi-hadoop-mr org.apache.hudi:hudi-timeline-service com.beust:jcommander + + io.javalin:javalin + org.jetbrains.kotlin:* + org.eclipse.jetty:* + org.eclipse.jetty.websocket:* + org.rocksdb:rocksdbjni + org.apache.httpcomponents:httpclient + org.apache.httpcomponents:fluent-hc + org.antlr:stringtemplate + commons-io:commons-io + commons-logging:commons-logging + org.apache.parquet:parquet-avro + com.twitter:bijection-avro_2.11 com.twitter:bijection-core_2.11 org.apache.parquet:parquet-avro - com.twitter:parquet-avro - com.twitter.common:objectsize io.confluent:kafka-avro-serializer io.confluent:common-config io.confluent:common-utils @@ -263,10 +274,6 @@ org.apache.parquet parquet-avro - - org.apache.parquet - parquet-hadoop - diff --git a/pom.xml b/pom.xml index 45b10b600..097cb87bf 100644 --- a/pom.xml +++ b/pom.xml @@ -147,6 +147,7 @@ 3.0.1 file://${project.basedir}/src/test/resources/log4j-surefire.properties 0.12.0 + 7.6.0.v20120127 1.2.3 1.9.13 ${project.basedir} @@ -524,7 +525,7 @@ org.eclipse.jetty.aggregate jetty-all test - 7.6.0.v20120127 + ${jetty.version} @@ -545,34 +546,6 @@ parquet-avro ${parquet.version} - - org.apache.parquet - parquet-hadoop - ${parquet.version} - - - org.apache.parquet - parquet-hive-bundle - ${parquet.version} - - - - - - com.twitter - parquet-hadoop-bundle - 1.6.0 - - - com.twitter - parquet-hive-bundle - 1.6.0 - - - com.twitter - parquet-avro - 1.6.0 - @@ -675,11 +648,6 @@ commons-pool 1.4 - - org.apache.commons - commons-configuration2 - 2.1.1 -