From 51599af2818562b6cea9bd01bd81af363209a2d2 Mon Sep 17 00:00:00 2001 From: Rahil C <32500120+rahil-c@users.noreply.github.com> Date: Wed, 27 Jul 2022 14:58:29 -0700 Subject: [PATCH] [HUDI-4126] Disable file splits for Bootstrap real time queries (via InputFormat) (#6219) Co-authored-by: Udit Mehrotra Co-authored-by: Raymond Xu <2701446+xushiyan@users.noreply.github.com> --- .../hadoop/realtime/HoodieRealtimePath.java | 2 +- ...TestHoodieCopyOnWriteTableInputFormat.java | 60 ++++++++++++++++ ...TestHoodieMergeOnReadTableInputFormat.java | 68 +++++++++++++++++++ 3 files changed, 129 insertions(+), 1 deletion(-) create mode 100644 hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieCopyOnWriteTableInputFormat.java create mode 100644 hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieMergeOnReadTableInputFormat.java diff --git a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimePath.java b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimePath.java index bba44d5c6..1f1dd1b92 100644 --- a/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimePath.java +++ b/hudi-hadoop-mr/src/main/java/org/apache/hudi/hadoop/realtime/HoodieRealtimePath.java @@ -89,7 +89,7 @@ public class HoodieRealtimePath extends Path { } public boolean isSplitable() { - return !toString().isEmpty(); + return !toString().isEmpty() && !includeBootstrapFilePath(); } public PathWithBootstrapFileStatus getPathWithBootstrapFileStatus() { diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieCopyOnWriteTableInputFormat.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieCopyOnWriteTableInputFormat.java new file mode 100644 index 000000000..902778ed1 --- /dev/null +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/TestHoodieCopyOnWriteTableInputFormat.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor 
license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.hudi.hadoop; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.IOException; +import java.net.URI; +import java.nio.file.Files; + +import static org.junit.jupiter.api.Assertions.assertFalse; + +/** Test asserting that {@link HoodieCopyOnWriteTableInputFormat} reports a {@link PathWithBootstrapFileStatus} as not splitable. */ public class TestHoodieCopyOnWriteTableInputFormat { + + /* Directory under which the test creates its temporary source and target files. */ @TempDir + java.nio.file.Path tempDir; + private FileSystem fs; + + /** Obtains the FileSystem for tempDir's URI using a fresh, default Configuration. */ @BeforeEach + void setUp() throws IOException { + fs = FileSystem.get(tempDir.toUri(), new Configuration()); + } + + /** Closes the FileSystem obtained in setUp. NOTE(review): FileSystem.get may hand back a shared cached instance; confirm closing it here cannot affect other tests in the same JVM. */ @AfterEach + void tearDown() throws IOException { + fs.close(); + } + + /** Creates two temp parquet-named files, wraps the target path together with the source file's FileStatus, and expects isSplitable to return false for it. */ @Test + void pathNotSplitableForBootstrapScenario() throws IOException { + URI source = Files.createTempFile(tempDir, "source", ".parquet").toUri(); + URI target = Files.createTempFile(tempDir, "target", ".parquet").toUri(); + PathWithBootstrapFileStatus path = new PathWithBootstrapFileStatus(new Path(target), fs.getFileStatus(new Path(source))); + HoodieCopyOnWriteTableInputFormat cowInputFormat = new HoodieCopyOnWriteTableInputFormat(); + assertFalse(cowInputFormat.isSplitable(fs, 
path), "Path for bootstrap should not be splitable."); + } +} diff --git a/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieMergeOnReadTableInputFormat.java b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieMergeOnReadTableInputFormat.java new file mode 100644 index 000000000..d44f5fbf6 --- /dev/null +++ b/hudi-hadoop-mr/src/test/java/org/apache/hudi/hadoop/realtime/TestHoodieMergeOnReadTableInputFormat.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.apache.hudi.hadoop.realtime; + +import org.apache.hudi.common.util.Option; +import org.apache.hudi.hadoop.PathWithBootstrapFileStatus; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.io.TempDir; + +import java.io.IOException; +import java.net.URI; +import java.nio.file.Files; +import java.util.Collections; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** Test asserting that {@link HoodieMergeOnReadTableInputFormat} treats a {@link HoodieRealtimePath} as splitable until a bootstrap file status is attached to it. */ public class TestHoodieMergeOnReadTableInputFormat { + + /* Directory under which the test creates its temporary source and target files. */ @TempDir + java.nio.file.Path tempDir; + private FileSystem fs; + + /** Obtains the FileSystem for tempDir's URI using a fresh, default Configuration. */ @BeforeEach + void setUp() throws IOException { + fs = FileSystem.get(tempDir.toUri(), new Configuration()); + } + + /** Closes the FileSystem obtained in setUp. NOTE(review): FileSystem.get may hand back a shared cached instance; confirm closing it here cannot affect other tests in the same JVM. */ @AfterEach + void tearDown() throws IOException { + fs.close(); + } + + /** Checks isSplitable on one HoodieRealtimePath twice: first as constructed (expected true), then after setPathWithBootstrapFileStatus is called on it (expected false). */ @Test + void pathNotSplitableForBootstrapScenario() throws IOException { + URI source = Files.createTempFile(tempDir, "source", ".parquet").toUri(); + URI target = Files.createTempFile(tempDir, "target", ".parquet").toUri(); + /* Realtime path built with an empty list and Option.empty(); no bootstrap file status has been set on it yet. */ HoodieRealtimePath rtPath = new HoodieRealtimePath(new Path("foo"), "bar", target.toString(), Collections.emptyList(), "000", false, Option.empty()); + assertTrue(new HoodieMergeOnReadTableInputFormat().isSplitable(fs, rtPath)); + + /* Attach the source file's FileStatus to the same rtPath; isSplitable is now expected to flip to false. */ PathWithBootstrapFileStatus path = new PathWithBootstrapFileStatus(new Path(target), fs.getFileStatus(new Path(source))); + rtPath.setPathWithBootstrapFileStatus(path); + assertFalse(new HoodieMergeOnReadTableInputFormat().isSplitable(fs, rtPath), "Path for bootstrap should not be splitable."); + } +}