From 736a9408549f66014bfdfdc72832be11aa9fd39b Mon Sep 17 00:00:00 2001 From: liujinhui <965147871@qq.com> Date: Thu, 29 Oct 2020 11:29:50 +0800 Subject: [PATCH] [HUDI-1274] Make hive synchronization supports hourly partition (#2122) --- ...ashEncodedHourPartitionValueExtractor.java | 67 +++++++++++++++++++ .../hive/TestPartitionValueExtractor.java | 38 +++++++++++ 2 files changed, 105 insertions(+) create mode 100644 hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/SlashEncodedHourPartitionValueExtractor.java create mode 100644 hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/TestPartitionValueExtractor.java diff --git a/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/SlashEncodedHourPartitionValueExtractor.java b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/SlashEncodedHourPartitionValueExtractor.java new file mode 100644 index 000000000..dcb2c6d76 --- /dev/null +++ b/hudi-sync/hudi-hive-sync/src/main/java/org/apache/hudi/hive/SlashEncodedHourPartitionValueExtractor.java @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.hive; + +import org.joda.time.DateTime; +import org.joda.time.format.DateTimeFormat; +import org.joda.time.format.DateTimeFormatter; + +import java.util.Collections; +import java.util.List; + +/** + * HDFS Path contain hive partition values for the keys it is partitioned on. This mapping is not straight forward and + * requires a pluggable implementation to extract the partition value from HDFS path. + *

+ * This implementation extracts datestr=yyyy-mm-dd-HH from path of type /yyyy/mm/dd/HH + */ +public class SlashEncodedHourPartitionValueExtractor implements PartitionValueExtractor { + + private static final long serialVersionUID = 1L; + private transient DateTimeFormatter dtfOut; + + public SlashEncodedHourPartitionValueExtractor() { + this.dtfOut = DateTimeFormat.forPattern("yyyy-MM-dd-HH"); + } + + private DateTimeFormatter getDtfOut() { + if (dtfOut == null) { + dtfOut = DateTimeFormat.forPattern("yyyy-MM-dd-HH"); + } + return dtfOut; + } + + @Override + public List extractPartitionValuesInPath(String partitionPath) { + // partition path is expected to be in this format yyyy/mm/dd/HH + String[] splits = partitionPath.split("/"); + if (splits.length != 4) { + throw new IllegalArgumentException("Partition path " + partitionPath + " is not in the form yyyy/mm/dd/HH"); + } + //Hive style partitions need to contain '=' + int year = Integer.parseInt(splits[0].contains("=") ? splits[0].split("=")[1] : splits[0]); + int mm = Integer.parseInt(splits[1].contains("=") ? splits[1].split("=")[1] : splits[1]); + int dd = Integer.parseInt(splits[2].contains("=") ? splits[2].split("=")[1] : splits[2]); + int hh = Integer.parseInt(splits[3].contains("=") ? splits[3].split("=")[1] : splits[3]); + + DateTime dateTime = new DateTime(year, mm, dd, hh, 0); + + return Collections.singletonList(getDtfOut().print(dateTime)); + } +} diff --git a/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/TestPartitionValueExtractor.java b/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/TestPartitionValueExtractor.java new file mode 100644 index 000000000..a248e49f4 --- /dev/null +++ b/hudi-sync/hudi-hive-sync/src/test/java/org/apache/hudi/hive/TestPartitionValueExtractor.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hudi.hive; + +import org.junit.jupiter.api.Test; +import java.util.ArrayList; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class TestPartitionValueExtractor { + @Test + public void testHourPartition() { + SlashEncodedHourPartitionValueExtractor hourPartition = new SlashEncodedHourPartitionValueExtractor(); + List list = new ArrayList<>(); + list.add("2020-12-20-01"); + assertEquals(hourPartition.extractPartitionValuesInPath("2020/12/20/01"), list); + assertThrows(IllegalArgumentException.class, () -> hourPartition.extractPartitionValuesInPath("2020/12/20")); + assertEquals(hourPartition.extractPartitionValuesInPath("update_time=2020/12/20/01"), list); + } +} \ No newline at end of file