diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java index 74a557e7e..8b89949e8 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/HoodieParquetInputFormat.java @@ -37,7 +37,6 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat; import org.apache.hadoop.hive.ql.io.sarg.ConvertAstToSearchArg; -import org.apache.hadoop.hive.ql.metadata.VirtualColumn; import org.apache.hadoop.hive.ql.plan.TableScanDesc; import org.apache.hadoop.io.ArrayWritable; import org.apache.hadoop.io.NullWritable; @@ -63,6 +62,7 @@ import java.util.stream.IntStream; * that does not correspond to a hoodie table then they are passed in as is (as what FileInputFormat.listStatus() * would do). The JobConf could have paths from multipe Hoodie/Non-Hoodie tables */ +@UseRecordReaderFromInputFormat @UseFileSplitsFromInputFormat public class HoodieParquetInputFormat extends MapredParquetInputFormat implements Configurable { @@ -179,7 +179,7 @@ public class HoodieParquetInputFormat extends MapredParquetInputFormat implement // clearOutExistingPredicate(job); // } if (split instanceof BootstrapBaseFileSplit) { - BootstrapBaseFileSplit eSplit = (BootstrapBaseFileSplit)split; + BootstrapBaseFileSplit eSplit = (BootstrapBaseFileSplit) split; String[] rawColNames = HoodieColumnProjectionUtils.getReadColumnNames(job); List rawColIds = HoodieColumnProjectionUtils.getReadColumnIDs(job); List> projectedColsWithIndex = @@ -191,7 +191,7 @@ public class HoodieParquetInputFormat extends MapredParquetInputFormat implement .collect(Collectors.toList()); List> externalColsProjected = projectedColsWithIndex.stream() .filter(idxWithName -> !HoodieRecord.HOODIE_META_COLUMNS.contains(idxWithName.getValue()) - && !VirtualColumn.VIRTUAL_COLUMN_NAMES.contains(idxWithName.getValue())) + && !HoodieHiveUtils.VIRTUAL_COLUMN_NAMES.contains(idxWithName.getValue())) .collect(Collectors.toList()); // This always matches hive table description diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieHiveUtils.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieHiveUtils.java index 02fb9d06d..8dbe08075 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieHiveUtils.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/utils/HoodieHiveUtils.java @@ -20,6 +20,7 @@ package org.apache.hudi.hadoop.utils; import org.apache.hadoop.fs.Path; import org.apache.hadoop.mapreduce.JobContext; +import org.apache.hudi.common.util.CollectionUtils; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -27,6 +28,7 @@ import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; @@ -38,6 +40,9 @@ public class HoodieHiveUtils { public static final String HOODIE_CONSUME_MODE_PATTERN = "hoodie.%s.consume.mode"; public static final String HOODIE_START_COMMIT_PATTERN = "hoodie.%s.consume.start.timestamp"; public static final String HOODIE_MAX_COMMIT_PATTERN = "hoodie.%s.consume.max.commits"; + public static final Set VIRTUAL_COLUMN_NAMES = CollectionUtils.createImmutableSet( + "INPUT__FILE__NAME", "BLOCK__OFFSET__INSIDE__FILE", "ROW__OFFSET__INSIDE__BLOCK", "RAW__DATA__SIZE", + "ROW__ID", "GROUPING__ID"); /* * Boolean property to stop incremental reader when there is a pending compaction. * This is needed to prevent certain race conditions with RO views of MOR tables. only applicable for RO views. diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestAnnotation.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestAnnotation.java index 1f74c7a2a..7a4f260ea 100644 --- a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestAnnotation.java +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestAnnotation.java @@ -29,14 +29,19 @@ public class TestAnnotation { @Test public void testHoodieParquetInputFormatAnnotation() { assertTrue(HoodieParquetInputFormat.class.isAnnotationPresent(UseFileSplitsFromInputFormat.class)); + assertTrue(HoodieParquetInputFormat.class.isAnnotationPresent(UseRecordReaderFromInputFormat.class)); Annotation[] annotations = HoodieParquetInputFormat.class.getAnnotations(); - boolean found = false; + boolean foundFileSplitsAnnotation = false; + boolean foundRecordReaderAnnotation = false; for (Annotation annotation : annotations) { - if ("UseFileSplitsFromInputFormat".equals(annotation.annotationType().getSimpleName())) { - found = true; + if (UseFileSplitsFromInputFormat.class.getSimpleName().equals(annotation.annotationType().getSimpleName())) { + foundFileSplitsAnnotation = true; + } else if (UseRecordReaderFromInputFormat.class.getSimpleName().equals(annotation.annotationType().getSimpleName())) { + foundRecordReaderAnnotation = true; } } - assertTrue(found); + assertTrue(foundFileSplitsAnnotation); + assertTrue(foundRecordReaderAnnotation); } @Test @@ -47,10 +52,9 @@ public class TestAnnotation { boolean foundFileSplitsAnnotation = false; boolean foundRecordReaderAnnotation = false; for (Annotation annotation : annotations) { - if ("UseFileSplitsFromInputFormat".equals(annotation.annotationType().getSimpleName())) { + if (UseFileSplitsFromInputFormat.class.getSimpleName().equals(annotation.annotationType().getSimpleName())) { foundFileSplitsAnnotation = true; - } - if ("UseRecordReaderFromInputFormat".equals(annotation.annotationType().getSimpleName())) { + } else if (UseRecordReaderFromInputFormat.class.getSimpleName().equals(annotation.annotationType().getSimpleName())) { foundRecordReaderAnnotation = true; } } diff --git a/packaging/hudi-presto-bundle/pom.xml b/packaging/hudi-presto-bundle/pom.xml index e49a1eb8d..95f01bc21 100644 --- a/packaging/hudi-presto-bundle/pom.xml +++ b/packaging/hudi-presto-bundle/pom.xml @@ -76,7 +76,12 @@ org.apache.hbase:hbase-common org.apache.hbase:hbase-protocol org.apache.hbase:hbase-server + org.apache.hbase:hbase-annotations org.apache.htrace:htrace-core + com.yammer.metrics:metrics-core + com.google.guava:guava + commons-lang:commons-lang + com.google.protobuf:protobuf-java @@ -105,6 +110,22 @@ org.apache.htrace. org.apache.hudi.org.apache.htrace. + + com.yammer.metrics. + org.apache.hudi.com.yammer.metrics. + + + com.google.common. + ${presto.bundle.bootstrap.shade.prefix}com.google.common. + + + org.apache.commons.lang. + ${presto.bundle.bootstrap.shade.prefix}org.apache.commons.lang. + + + com.google.protobuf. + ${presto.bundle.bootstrap.shade.prefix}com.google.protobuf. + false @@ -159,5 +180,42 @@ avro compile + + + + com.google.guava + guava + 12.0.1 + ${presto.bundle.bootstrap.scope} + + + + + commons-lang + commons-lang + 2.6 + ${presto.bundle.bootstrap.scope} + + + + + com.google.protobuf + protobuf-java + 2.5.0 + ${presto.bundle.bootstrap.scope} + + + + + presto-shade-unbundle-bootstrap + + provided + + + + diff --git a/pom.xml b/pom.xml index cce96d2ce..b4d684699 100644 --- a/pom.xml +++ b/pom.xml @@ -126,6 +126,8 @@ -Xmx2g 0.8.5 + compile + org.apache.hudi.