[HUDI-430] Adding InlineFileSystem to support embedding any file format as an InlineFile (#1176)
* Adding InlineFileSystem to support embedding any file format (parquet, hfile, etc). Supports reading the embedded file using respective readers.
This commit is contained in:
committed by
GitHub
parent
04449f33fe
commit
ac73bdcdc3
@@ -0,0 +1,98 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.common.inline.fs;
|
||||
|
||||
import org.apache.hadoop.fs.Path;
|
||||
|
||||
/**
|
||||
* Utils to parse InLineFileSystem paths.
|
||||
* Inline FS format:
|
||||
* "inlinefs://<path_to_outer_file>/<outer_file_scheme>/?start_offset=start_offset>&length=<length>"
|
||||
* Eg: "inlinefs://<path_to_outer_file>/s3a/?start_offset=20&length=40"
|
||||
*/
|
||||
public class InLineFSUtils {
|
||||
private static final String START_OFFSET_STR = "start_offset";
|
||||
private static final String LENGTH_STR = "length";
|
||||
private static final String EQUALS_STR = "=";
|
||||
|
||||
/**
|
||||
* Fetch inline file path from outer path.
|
||||
* Eg
|
||||
* Input:
|
||||
* Path = s3a://file1, origScheme: file, startOffset = 20, length = 40
|
||||
* Output: "inlinefs:/file1/s3a/?start_offset=20&length=40"
|
||||
*
|
||||
* @param outerPath
|
||||
* @param origScheme
|
||||
* @param inLineStartOffset
|
||||
* @param inLineLength
|
||||
* @return
|
||||
*/
|
||||
public static Path getInlineFilePath(Path outerPath, String origScheme, long inLineStartOffset, long inLineLength) {
|
||||
String subPath = outerPath.toString().substring(outerPath.toString().indexOf(":") + 1);
|
||||
return new Path(
|
||||
InLineFileSystem.SCHEME + "://" + subPath + "/" + origScheme
|
||||
+ "/" + "?" + START_OFFSET_STR + EQUALS_STR + inLineStartOffset
|
||||
+ "&" + LENGTH_STR + EQUALS_STR + inLineLength
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Inline file format
|
||||
* "inlinefs://<path_to_outer_file>/<outer_file_scheme>/?start_offset=start_offset>&length=<length>"
|
||||
* Outer File format
|
||||
* "<outer_file_scheme>://<path_to_outer_file>"
|
||||
* <p>
|
||||
* Eg input : "inlinefs://file1/sa3/?start_offset=20&length=40".
|
||||
* Output : "sa3://file1"
|
||||
*
|
||||
* @param inlinePath inline file system path
|
||||
* @return
|
||||
*/
|
||||
public static Path getOuterfilePathFromInlinePath(Path inlinePath) {
|
||||
String scheme = inlinePath.getParent().getName();
|
||||
Path basePath = inlinePath.getParent().getParent();
|
||||
return new Path(basePath.toString().replaceFirst(InLineFileSystem.SCHEME, scheme));
|
||||
}
|
||||
|
||||
/**
|
||||
* Eg input : "inlinefs://file1/s3a/?start_offset=20&length=40".
|
||||
* output: 20
|
||||
*
|
||||
* @param inlinePath
|
||||
* @return
|
||||
*/
|
||||
public static int startOffset(Path inlinePath) {
|
||||
String[] slices = inlinePath.toString().split("[?&=]");
|
||||
return Integer.parseInt(slices[slices.length - 3]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Eg input : "inlinefs:/file1/s3a/?start_offset=20&length=40".
|
||||
* Output: 40
|
||||
*
|
||||
* @param inlinePath
|
||||
* @return
|
||||
*/
|
||||
public static int length(Path inlinePath) {
|
||||
String[] slices = inlinePath.toString().split("[?&=]");
|
||||
return Integer.parseInt(slices[slices.length - 1]);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,133 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.common.inline.fs;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.fs.permission.FsPermission;
|
||||
import org.apache.hadoop.util.Progressable;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
|
||||
/**
|
||||
* Enables reading any inline file at a given offset and length. This {@link FileSystem} is used only in read path and does not support
|
||||
* any write apis.
|
||||
* <p>
|
||||
* - Reading an inlined file at a given offset, length, read it out as if it were an independent file of that length
|
||||
* - Inlined path is of the form "inlinefs:///path/to/outer/file/<outer_file_scheme>/?start_offset=<start_offset>&length=<length>
|
||||
* <p>
|
||||
* TODO: The reader/writer may try to use relative paths based on the inlinepath and it may not work. Need to handle
|
||||
* this gracefully eg. the parquet summary metadata reading. TODO: If this shows promise, also support directly writing
|
||||
* the inlined file to the underneath file without buffer
|
||||
*/
|
||||
public class InLineFileSystem extends FileSystem {
|
||||
|
||||
public static final String SCHEME = "inlinefs";
|
||||
private Configuration conf = null;
|
||||
|
||||
@Override
|
||||
public void initialize(URI name, Configuration conf) throws IOException {
|
||||
super.initialize(name, conf);
|
||||
this.conf = conf;
|
||||
}
|
||||
|
||||
@Override
|
||||
public URI getUri() {
|
||||
return URI.create(getScheme());
|
||||
}
|
||||
|
||||
public String getScheme() {
|
||||
return SCHEME;
|
||||
}
|
||||
|
||||
@Override
|
||||
public FSDataInputStream open(Path inlinePath, int bufferSize) throws IOException {
|
||||
Path outerPath = InLineFSUtils.getOuterfilePathFromInlinePath(inlinePath);
|
||||
FileSystem outerFs = outerPath.getFileSystem(conf);
|
||||
FSDataInputStream outerStream = outerFs.open(outerPath, bufferSize);
|
||||
return new InLineFsDataInputStream(InLineFSUtils.startOffset(inlinePath), outerStream, InLineFSUtils.length(inlinePath));
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean exists(Path f) {
|
||||
try {
|
||||
return getFileStatus(f) != null;
|
||||
} catch (Exception e) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public FileStatus getFileStatus(Path inlinePath) throws IOException {
|
||||
Path outerPath = InLineFSUtils.getOuterfilePathFromInlinePath(inlinePath);
|
||||
FileSystem outerFs = outerPath.getFileSystem(conf);
|
||||
FileStatus status = outerFs.getFileStatus(outerPath);
|
||||
FileStatus toReturn = new FileStatus(InLineFSUtils.length(inlinePath), status.isDirectory(), status.getReplication(), status.getBlockSize(),
|
||||
status.getModificationTime(), status.getAccessTime(), status.getPermission(), status.getOwner(),
|
||||
status.getGroup(), inlinePath);
|
||||
return toReturn;
|
||||
}
|
||||
|
||||
@Override
|
||||
public FSDataOutputStream create(Path path, FsPermission fsPermission, boolean b, int i, short i1, long l,
|
||||
Progressable progressable) throws IOException {
|
||||
throw new UnsupportedOperationException("Can't rename files");
|
||||
}
|
||||
|
||||
@Override
|
||||
public FSDataOutputStream append(Path path, int i, Progressable progressable) throws IOException {
|
||||
throw new UnsupportedOperationException("Can't rename files");
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean rename(Path path, Path path1) throws IOException {
|
||||
throw new UnsupportedOperationException("Can't rename files");
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean delete(Path path, boolean b) throws IOException {
|
||||
throw new UnsupportedOperationException("Can't delete files");
|
||||
}
|
||||
|
||||
@Override
|
||||
public FileStatus[] listStatus(Path inlinePath) throws IOException {
|
||||
return new FileStatus[] {getFileStatus(inlinePath)};
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setWorkingDirectory(Path path) {
|
||||
throw new UnsupportedOperationException("Can't set working directory");
|
||||
}
|
||||
|
||||
@Override
|
||||
public Path getWorkingDirectory() {
|
||||
throw new UnsupportedOperationException("Can't get working directory");
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean mkdirs(Path path, FsPermission fsPermission) throws IOException {
|
||||
throw new UnsupportedOperationException("Can't set working directory");
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,114 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.common.inline.fs;
|
||||
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.ReadOption;
|
||||
import org.apache.hadoop.io.ByteBufferPool;
|
||||
|
||||
import java.io.FileDescriptor;
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.EnumSet;
|
||||
|
||||
/**
|
||||
* Inline {@link FSDataInputStream}. A startOffset that is passed in is assumed to be the start of the InputStream.
|
||||
* All operations are handled having the {@code startOffset} as starting point.
|
||||
*/
|
||||
public class InLineFsDataInputStream extends FSDataInputStream {
|
||||
|
||||
private final int startOffset;
|
||||
private final FSDataInputStream outerStream;
|
||||
private final int length;
|
||||
|
||||
public InLineFsDataInputStream(int startOffset, FSDataInputStream outerStream, int length) {
|
||||
super(outerStream.getWrappedStream());
|
||||
this.startOffset = startOffset;
|
||||
this.outerStream = outerStream;
|
||||
this.length = length;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void seek(long desired) throws IOException {
|
||||
outerStream.seek(startOffset + desired);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getPos() throws IOException {
|
||||
return outerStream.getPos() - startOffset;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read(long position, byte[] buffer, int offset, int length) throws IOException {
|
||||
return outerStream.read(startOffset + position, buffer, offset, length);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void readFully(long position, byte[] buffer, int offset, int length) throws IOException {
|
||||
outerStream.readFully(startOffset + position, buffer, offset, length);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void readFully(long position, byte[] buffer)
|
||||
throws IOException {
|
||||
outerStream.readFully(startOffset + position, buffer, 0, buffer.length);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean seekToNewSource(long targetPos) throws IOException {
|
||||
boolean toReturn = outerStream.seekToNewSource(startOffset + targetPos);
|
||||
return toReturn;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read(ByteBuffer buf) throws IOException {
|
||||
return outerStream.read(buf);
|
||||
}
|
||||
|
||||
@Override
|
||||
public FileDescriptor getFileDescriptor() throws IOException {
|
||||
return outerStream.getFileDescriptor();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setReadahead(Long readahead) throws IOException, UnsupportedOperationException {
|
||||
outerStream.setReadahead(readahead);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setDropBehind(Boolean dropBehind) throws IOException, UnsupportedOperationException {
|
||||
outerStream.setDropBehind(dropBehind);
|
||||
}
|
||||
|
||||
@Override
|
||||
public ByteBuffer read(ByteBufferPool bufferPool, int maxLength, EnumSet<ReadOption> opts)
|
||||
throws IOException, UnsupportedOperationException {
|
||||
return outerStream.read(bufferPool, maxLength, opts);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void releaseBuffer(ByteBuffer buffer) {
|
||||
outerStream.releaseBuffer(buffer);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void unbuffer() {
|
||||
outerStream.unbuffer();
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,138 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.common.inline.fs;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.fs.permission.FsPermission;
|
||||
import org.apache.hadoop.util.Progressable;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
import java.net.URISyntaxException;
|
||||
|
||||
/**
|
||||
* A FileSystem which stores all content in memory and returns a byte[] when {@link #getFileAsBytes()} is called
|
||||
* This FileSystem is used only in write path. Does not support any read apis except {@link #getFileAsBytes()}.
|
||||
*/
|
||||
public class InMemoryFileSystem extends FileSystem {
|
||||
|
||||
// TODO: this needs to be per path to support num_cores > 1, and we should release the buffer once done
|
||||
private ByteArrayOutputStream bos;
|
||||
private Configuration conf = null;
|
||||
public static final String SCHEME = "inmemfs";
|
||||
private URI uri;
|
||||
|
||||
InMemoryFileSystem() {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void initialize(URI name, Configuration conf) throws IOException {
|
||||
super.initialize(name, conf);
|
||||
this.conf = conf;
|
||||
this.uri = name;
|
||||
}
|
||||
|
||||
@Override
|
||||
public URI getUri() {
|
||||
return uri;
|
||||
}
|
||||
|
||||
public String getScheme() {
|
||||
return SCHEME;
|
||||
}
|
||||
|
||||
@Override
|
||||
public FSDataInputStream open(Path inlinePath, int bufferSize) {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public FSDataOutputStream create(Path path, FsPermission fsPermission, boolean b, int i, short i1, long l,
|
||||
Progressable progressable) throws IOException {
|
||||
bos = new ByteArrayOutputStream();
|
||||
try {
|
||||
this.uri = new URI(path.toString());
|
||||
} catch (URISyntaxException e) {
|
||||
throw new IllegalArgumentException("Path not parsable as URI " + path);
|
||||
}
|
||||
return new FSDataOutputStream(bos, new Statistics(getScheme()));
|
||||
}
|
||||
|
||||
public byte[] getFileAsBytes() {
|
||||
return bos.toByteArray();
|
||||
}
|
||||
|
||||
@Override
|
||||
public FSDataOutputStream append(Path path, int i, Progressable progressable) throws IOException {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean rename(Path path, Path path1) throws IOException {
|
||||
throw new UnsupportedOperationException("Can't rename files");
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean delete(Path path, boolean b) throws IOException {
|
||||
throw new UnsupportedOperationException("Can't delete files");
|
||||
}
|
||||
|
||||
@Override
|
||||
public FileStatus[] listStatus(Path inlinePath) throws FileNotFoundException, IOException {
|
||||
throw new UnsupportedOperationException("No support for listStatus");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setWorkingDirectory(Path path) {
|
||||
throw new UnsupportedOperationException("Can't set working directory");
|
||||
}
|
||||
|
||||
@Override
|
||||
public Path getWorkingDirectory() {
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean mkdirs(Path path, FsPermission fsPermission) throws IOException {
|
||||
throw new UnsupportedOperationException("Can't mkdir");
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean exists(Path f) throws IOException {
|
||||
throw new UnsupportedOperationException("Can't check for exists");
|
||||
}
|
||||
|
||||
@Override
|
||||
public FileStatus getFileStatus(Path inlinePath) throws IOException {
|
||||
throw new UnsupportedOperationException("No support for getFileStatus");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
super.close();
|
||||
bos.close();
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user