From d6f94b998dc3b98308b86c4a6cd11e2a250a8913 Mon Sep 17 00:00:00 2001 From: Yash Sharma Date: Fri, 24 Mar 2017 14:42:10 +1100 Subject: [PATCH] Hoodie operability with S3 --- README.md | 1 + docs/s3_filesystem.md | 47 +++++++++++++++++ .../io/storage/HoodieWrapperFileSystem.java | 3 +- .../src/test/java/HoodieClientExample.java | 51 +++++++++++++++---- 4 files changed, 91 insertions(+), 11 deletions(-) create mode 100644 docs/s3_filesystem.md diff --git a/README.md b/README.md index 89edb4b60..7d298971a 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,4 @@ +# Hoodie Hoodie manages storage of large analytical datasets on [HDFS](http://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html) and serve them out via two types of tables * **Read Optimized Table** - Provides excellent query performance via purely columnar storage (e.g. [Parquet](https://parquet.apache.org/)) diff --git a/docs/s3_filesystem.md b/docs/s3_filesystem.md new file mode 100644 index 000000000..c1bdd2fb1 --- /dev/null +++ b/docs/s3_filesystem.md @@ -0,0 +1,47 @@ +--- +title: S3 Filesystem (experimental) +keywords: sql hive s3 spark presto +sidebar: mydoc_sidebar +permalink: s3_hoodie.html +toc: false +summary: On this page, we go over how to configure Hoodie with the S3 filesystem. +--- +Hoodie works with HDFS by default. Experimental work on Hoodie-S3 compatibility is in progress. + +## S3 configs + +Add the required configs to your core-site.xml, from where Hoodie can fetch them. Replace the `fs.defaultFS` value with the URI of your S3 bucket, and Hoodie should be able to read from and write to the bucket.
+ +``` +<property> + <name>fs.defaultFS</name> + <value>s3://ysharma</value> +</property> + +<property> + <name>fs.s3.impl</name> + <value>org.apache.hadoop.fs.s3native.NativeS3FileSystem</value> +</property> + +<property> + <name>fs.s3.awsAccessKeyId</name> + <value>AWS_KEY</value> +</property> + +<property> + <name>fs.s3.awsSecretAccessKey</name> + <value>AWS_SECRET</value> +</property> + +<property> + <name>fs.s3n.awsAccessKeyId</name> + <value>AWS_KEY</value> +</property> + +<property> + <name>fs.s3n.awsSecretAccessKey</name> + <value>AWS_SECRET</value> +</property> +``` + + diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieWrapperFileSystem.java b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieWrapperFileSystem.java index 64034b4d2..d413fc5c3 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieWrapperFileSystem.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieWrapperFileSystem.java @@ -49,9 +49,10 @@ public class HoodieWrapperFileSystem extends FileSystem { public static final String HOODIE_SCHEME_PREFIX = "hoodie-"; static { - SUPPORT_SCHEMES = new HashSet<>(2); + SUPPORT_SCHEMES = new HashSet<>(); SUPPORT_SCHEMES.add("file"); SUPPORT_SCHEMES.add("hdfs"); + SUPPORT_SCHEMES.add("s3"); } private ConcurrentMap openStreams = diff --git a/hoodie-client/src/test/java/HoodieClientExample.java b/hoodie-client/src/test/java/HoodieClientExample.java index eb7e56f70..711b4bb04 100644 --- a/hoodie-client/src/test/java/HoodieClientExample.java +++ b/hoodie-client/src/test/java/HoodieClientExample.java @@ -15,14 +15,20 @@ */ import com.uber.hoodie.HoodieWriteClient; -import com.uber.hoodie.common.table.HoodieTableMetaClient; -import com.uber.hoodie.common.util.FSUtils; -import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.common.HoodieTestDataGenerator; import com.uber.hoodie.common.model.HoodieRecord; +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.config.HoodieIndexConfig; +import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.index.HoodieIndex; - +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import
org.apache.commons.cli.BasicParser; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.Options; +import org.apache.commons.cli.ParseException; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; import org.apache.spark.SparkConf; @@ -41,8 +47,34 @@ public class HoodieClientExample { private static Logger logger = LogManager.getLogger(HoodieClientExample.class); + private static final String DEFAULT_TABLE_PATH = "file:///tmp/hoodie/sample-table"; + private static final String DEFAULT_TABLE_NAME = "sample-table"; + public static void main(String[] args) throws Exception { - String tablePath = args.length == 1 ? args[0] : "file:///tmp/hoodie/sample-table"; + Options options = new Options(); + Option path = new Option("p", "table-path", true, "input table path"); + path.setRequired(false); + options.addOption(path); + + Option name = new Option("n", "table-name", true, "input table name"); + name.setRequired(false); + options.addOption(name); + + CommandLineParser parser = new BasicParser(); + HelpFormatter formatter = new HelpFormatter(); + CommandLine cmd; + + try { + cmd = parser.parse(options, args); + } catch (ParseException e) { + System.out.println(e.getMessage()); + formatter.printHelp("HoodieClientExample", options); + System.exit(1); + return; + } + + String inputTablePath = cmd.getOptionValue("table-path", DEFAULT_TABLE_PATH); + String inputTableName = cmd.getOptionValue("table-name", DEFAULT_TABLE_NAME); HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); @@ -54,16 +86,15 @@ public class HoodieClientExample { // generate some records to be loaded in. 
HoodieWriteConfig cfg = - HoodieWriteConfig.newBuilder().withPath(tablePath) + HoodieWriteConfig.newBuilder().withPath(inputTablePath) .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) - .forTable("sample-table").withIndexConfig( + .forTable(inputTableName).withIndexConfig( HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) .build(); Properties properties = new Properties(); - properties.put(HoodieWriteConfig.TABLE_NAME, "sample-table"); + properties.put(HoodieWriteConfig.TABLE_NAME, inputTableName); HoodieTableMetaClient - .initializePathAsHoodieDataset(FSUtils.getFs(), tablePath, - properties); + .initializePathAsHoodieDataset(FSUtils.getFs(), inputTablePath, properties); HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); /**