From 8d0426826472f5b861b6bd7c43d8935917431467 Mon Sep 17 00:00:00 2001 From: Udit Mehrotra Date: Wed, 12 Aug 2020 17:51:31 -0700 Subject: [PATCH] [HUDI-1174] Changes for bootstrapped tables to work with presto (#1944) The purpose of this pull request is to implement changes required on Hudi side to get Bootstrapped tables integrated with Presto. The testing was done against presto 0.232 and the following changes were identified to make it work: Annotation UseRecordReaderFromInputFormat is required on HoodieParquetInputFormat as well, because the reading for bootstrapped tables needs to happen through record reader to be able to perform the merge. On presto side, this annotation is already handled. We need to internally maintain VIRTUAL_COLUMN_NAMES because presto's internal hive version hive-apache-1.2.2 has VirtualColumn as a class, versus the one we depend on in hudi which is an enum. Dependency changes in hudi-presto-bundle to avoid runtime exceptions. --- .../hudi/hadoop/HoodieParquetInputFormat.java | 6 +- .../hudi/hadoop/utils/HoodieHiveUtils.java | 5 ++ .../apache/hudi/hadoop/TestAnnotation.java | 18 +++--- packaging/hudi-presto-bundle/pom.xml | 58 +++++++++++++++++++ pom.xml | 2 + 5 files changed, 79 insertions(+), 10 deletions(-) diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java index 74a557e7e..8b89949e8 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java @@ -37,7 +37,6 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat; import org.apache.hadoop.hive.ql.io.sarg.ConvertAstToSearchArg; -import org.apache.hadoop.hive.ql.metadata.VirtualColumn; import org.apache.hadoop.hive.ql.plan.TableScanDesc; import 
org.apache.hadoop.io.ArrayWritable; import org.apache.hadoop.io.NullWritable; @@ -63,6 +62,7 @@ import java.util.stream.IntStream; * that does not correspond to a hoodie table then they are passed in as is (as what FileInputFormat.listStatus() * would do). The JobConf could have paths from multipe Hoodie/Non-Hoodie tables */ +@UseRecordReaderFromInputFormat @UseFileSplitsFromInputFormat public class HoodieParquetInputFormat extends MapredParquetInputFormat implements Configurable { @@ -179,7 +179,7 @@ public class HoodieParquetInputFormat extends MapredParquetInputFormat implement // clearOutExistingPredicate(job); // } if (split instanceof BootstrapBaseFileSplit) { - BootstrapBaseFileSplit eSplit = (BootstrapBaseFileSplit)split; + BootstrapBaseFileSplit eSplit = (BootstrapBaseFileSplit) split; String[] rawColNames = HoodieColumnProjectionUtils.getReadColumnNames(job); List rawColIds = HoodieColumnProjectionUtils.getReadColumnIDs(job); List> projectedColsWithIndex = @@ -191,7 +191,7 @@ public class HoodieParquetInputFormat extends MapredParquetInputFormat implement .collect(Collectors.toList()); List> externalColsProjected = projectedColsWithIndex.stream() .filter(idxWithName -> !HoodieRecord.HOODIE_META_COLUMNS.contains(idxWithName.getValue()) - && !VirtualColumn.VIRTUAL_COLUMN_NAMES.contains(idxWithName.getValue())) + && !HoodieHiveUtils.VIRTUAL_COLUMN_NAMES.contains(idxWithName.getValue())) .collect(Collectors.toList()); // This always matches hive table description diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieHiveUtils.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieHiveUtils.java index 02fb9d06d..8dbe08075 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieHiveUtils.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieHiveUtils.java @@ -20,6 +20,7 @@ package org.apache.hudi.hadoop.utils; import org.apache.hadoop.fs.Path; import 
org.apache.hadoop.mapreduce.JobContext; +import org.apache.hudi.common.util.CollectionUtils; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -27,6 +28,7 @@ import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; @@ -38,6 +40,9 @@ public class HoodieHiveUtils { public static final String HOODIE_CONSUME_MODE_PATTERN = "hoodie.%s.consume.mode"; public static final String HOODIE_START_COMMIT_PATTERN = "hoodie.%s.consume.start.timestamp"; public static final String HOODIE_MAX_COMMIT_PATTERN = "hoodie.%s.consume.max.commits"; + public static final Set VIRTUAL_COLUMN_NAMES = CollectionUtils.createImmutableSet( + "INPUT__FILE__NAME", "BLOCK__OFFSET__INSIDE__FILE", "ROW__OFFSET__INSIDE__BLOCK", "RAW__DATA__SIZE", + "ROW__ID", "GROUPING__ID"); /* * Boolean property to stop incremental reader when there is a pending compaction. * This is needed to prevent certain race conditions with RO views of MOR tables. only applicable for RO views. 
diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestAnnotation.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestAnnotation.java index 1f74c7a2a..7a4f260ea 100644 --- a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestAnnotation.java +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestAnnotation.java @@ -29,14 +29,19 @@ public class TestAnnotation { @Test public void testHoodieParquetInputFormatAnnotation() { assertTrue(HoodieParquetInputFormat.class.isAnnotationPresent(UseFileSplitsFromInputFormat.class)); + assertTrue(HoodieParquetInputFormat.class.isAnnotationPresent(UseRecordReaderFromInputFormat.class)); Annotation[] annotations = HoodieParquetInputFormat.class.getAnnotations(); - boolean found = false; + boolean foundFileSplitsAnnotation = false; + boolean foundRecordReaderAnnotation = false; for (Annotation annotation : annotations) { - if ("UseFileSplitsFromInputFormat".equals(annotation.annotationType().getSimpleName())) { - found = true; + if (UseFileSplitsFromInputFormat.class.getSimpleName().equals(annotation.annotationType().getSimpleName())) { + foundFileSplitsAnnotation = true; + } else if (UseRecordReaderFromInputFormat.class.getSimpleName().equals(annotation.annotationType().getSimpleName())) { + foundRecordReaderAnnotation = true; } } - assertTrue(found); + assertTrue(foundFileSplitsAnnotation); + assertTrue(foundRecordReaderAnnotation); } @Test @@ -47,10 +52,9 @@ public class TestAnnotation { boolean foundFileSplitsAnnotation = false; boolean foundRecordReaderAnnotation = false; for (Annotation annotation : annotations) { - if ("UseFileSplitsFromInputFormat".equals(annotation.annotationType().getSimpleName())) { + if (UseFileSplitsFromInputFormat.class.getSimpleName().equals(annotation.annotationType().getSimpleName())) { foundFileSplitsAnnotation = true; - } - if ("UseRecordReaderFromInputFormat".equals(annotation.annotationType().getSimpleName())) { + } else if 
(UseRecordReaderFromInputFormat.class.getSimpleName().equals(annotation.annotationType().getSimpleName())) { foundRecordReaderAnnotation = true; } } diff --git a/packaging/hudi-presto-bundle/pom.xml b/packaging/hudi-presto-bundle/pom.xml index e49a1eb8d..95f01bc21 100644 --- a/packaging/hudi-presto-bundle/pom.xml +++ b/packaging/hudi-presto-bundle/pom.xml @@ -76,7 +76,12 @@ org.apache.hbase:hbase-common org.apache.hbase:hbase-protocol org.apache.hbase:hbase-server + org.apache.hbase:hbase-annotations org.apache.htrace:htrace-core + com.yammer.metrics:metrics-core + com.google.guava:guava + commons-lang:commons-lang + com.google.protobuf:protobuf-java @@ -105,6 +110,22 @@ org.apache.htrace. org.apache.hudi.org.apache.htrace. + + com.yammer.metrics. + org.apache.hudi.com.yammer.metrics. + + + com.google.common. + ${presto.bundle.bootstrap.shade.prefix}com.google.common. + + + org.apache.commons.lang. + ${presto.bundle.bootstrap.shade.prefix}org.apache.commons.lang. + + + com.google.protobuf. + ${presto.bundle.bootstrap.shade.prefix}com.google.protobuf. + false @@ -159,5 +180,42 @@ avro compile + + + + com.google.guava + guava + 12.0.1 + ${presto.bundle.bootstrap.scope} + + + + + commons-lang + commons-lang + 2.6 + ${presto.bundle.bootstrap.scope} + + + + + com.google.protobuf + protobuf-java + 2.5.0 + ${presto.bundle.bootstrap.scope} + + + + + presto-shade-unbundle-bootstrap + + provided + + + + diff --git a/pom.xml b/pom.xml index cce96d2ce..b4d684699 100644 --- a/pom.xml +++ b/pom.xml @@ -126,6 +126,8 @@ -Xmx2g 0.8.5 + compile + org.apache.hudi.