1
0

Multi FS Support

- Reviving PR 191, to create the FileSystem off the actual path
 - Streamline all filesystem access to HoodieTableMetaClient
 - Hadoop Conf from Spark Context serialized & passed to executor code too
 - Pick up env vars prefixed with HOODIE_ENV_ into Configuration object
 - Cleanup usage of FSUtils.getFS, piggybacking off HoodieTableMetaClient.getFS
 - Adding s3a to supported schemes & support escaping "." in env vars
 - Tests use HoodieTestUtils.getDefaultHadoopConf
This commit is contained in:
Vinoth Chandar
2017-12-10 23:31:54 -08:00
committed by vinoth chandar
parent 44839b88c6
commit 0cd186c899
78 changed files with 851 additions and 535 deletions

View File

@@ -77,7 +77,7 @@ public class MercifulJsonConverter {
switch (schema.getType()) {
case BOOLEAN:
if (value instanceof Boolean) {
return (Boolean) value;
return value;
}
break;
case DOUBLE:

View File

@@ -0,0 +1,56 @@
/*
* Copyright (c) 2017 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*
*/
package com.uber.hoodie.common;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import org.apache.hadoop.conf.Configuration;
/**
 * Wraps a Hadoop {@link Configuration} so that it can be Java-serialized
 * (e.g. shipped from a Spark driver to executor code). {@code Configuration}
 * is Writable but not {@link Serializable}, so serialization is delegated to
 * its {@code write}/{@code readFields} methods.
 */
public class SerializableConfiguration implements Serializable {

  // transient: persisted manually via writeObject/readObject below
  private transient Configuration configuration;

  public SerializableConfiguration(Configuration configuration) {
    this.configuration = configuration;
  }

  /**
   * @return the wrapped (mutable) Hadoop configuration
   */
  public Configuration get() {
    return configuration;
  }

  private void writeObject(ObjectOutputStream out) throws IOException {
    out.defaultWriteObject();
    configuration.write(out);
  }

  private void readObject(ObjectInputStream in) throws IOException {
    // loadDefaults=false: every entry is restored from the stream, so the
    // default resource files must not be re-applied on top.
    configuration = new Configuration(false);
    configuration.readFields(in);
  }

  @Override
  public String toString() {
    StringBuilder str = new StringBuilder();
    configuration.iterator().forEachRemaining(e ->
        str.append(String.format("%s => %s \n", e.getKey(), e.getValue())));
    // Bug fix: previously returned configuration.toString(), silently
    // discarding the formatted key/value listing built above.
    return str.toString();
  }
}

View File

@@ -17,5 +17,5 @@
package com.uber.hoodie.common.model;
public enum ActionType {
commit, savepoint, compaction, clean, rollback;
commit, savepoint, compaction, clean, rollback
}

View File

@@ -16,6 +16,7 @@
package com.uber.hoodie.common.table;
import com.uber.hoodie.common.SerializableConfiguration;
import com.uber.hoodie.common.model.HoodieTableType;
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
import com.uber.hoodie.common.table.timeline.HoodieArchivedTimeline;
@@ -26,6 +27,7 @@ import java.io.IOException;
import java.io.Serializable;
import java.util.Objects;
import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@@ -52,24 +54,28 @@ public class HoodieTableMetaClient implements Serializable {
private String basePath;
private transient FileSystem fs;
private String metaPath;
private SerializableConfiguration hadoopConf;
private HoodieTableType tableType;
private HoodieTableConfig tableConfig;
private HoodieActiveTimeline activeTimeline;
private HoodieArchivedTimeline archivedTimeline;
public HoodieTableMetaClient(FileSystem fs, String basePath) throws DatasetNotFoundException {
public HoodieTableMetaClient(Configuration conf, String basePath)
throws DatasetNotFoundException {
// Do not load any timeline by default
this(fs, basePath, false);
this(conf, basePath, false);
}
public HoodieTableMetaClient(FileSystem fs, String basePath, boolean loadActiveTimelineOnLoad)
public HoodieTableMetaClient(Configuration conf, String basePath,
boolean loadActiveTimelineOnLoad)
throws DatasetNotFoundException {
log.info("Loading HoodieTableMetaClient from " + basePath);
this.basePath = basePath;
this.fs = fs;
this.hadoopConf = new SerializableConfiguration(conf);
Path basePathDir = new Path(this.basePath);
this.metaPath = basePath + File.separator + METAFOLDER_NAME;
Path metaPathDir = new Path(this.metaPath);
this.fs = getFs();
DatasetNotFoundException.checkValidDataset(fs, basePathDir, metaPathDir);
this.tableConfig = new HoodieTableConfig(fs, metaPath);
this.tableType = tableConfig.getTableType();
@@ -96,7 +102,7 @@ public class HoodieTableMetaClient implements Serializable {
private void readObject(java.io.ObjectInputStream in)
throws IOException, ClassNotFoundException {
in.defaultReadObject();
this.fs = FSUtils.getFs();
fs = null; // will be lazily inited
}
private void writeObject(java.io.ObjectOutputStream out)
@@ -136,9 +142,16 @@ public class HoodieTableMetaClient implements Serializable {
* Get the FS implementation for this table
*/
public FileSystem getFs() {
// fs is transient and nulled out on deserialization; lazily (re)create it
// from the serializable Hadoop conf using the meta path's scheme.
if (fs == null) {
fs = FSUtils.getFs(metaPath, hadoopConf.get());
}
return fs;
}
/** @return the Hadoop configuration backing this meta client (survives serialization). */
public Configuration getHadoopConf() {
return hadoopConf.get();
}
/**
* Get the active instants as a timeline
*
@@ -146,7 +159,7 @@ public class HoodieTableMetaClient implements Serializable {
*/
public synchronized HoodieActiveTimeline getActiveTimeline() {
if (activeTimeline == null) {
activeTimeline = new HoodieActiveTimeline(fs, metaPath);
activeTimeline = new HoodieActiveTimeline(this);
}
return activeTimeline;
}
@@ -159,7 +172,7 @@ public class HoodieTableMetaClient implements Serializable {
*/
public synchronized HoodieArchivedTimeline getArchivedTimeline() {
if (archivedTimeline == null) {
archivedTimeline = new HoodieArchivedTimeline(fs, metaPath);
archivedTimeline = new HoodieArchivedTimeline(this);
}
return archivedTimeline;
}
@@ -196,7 +209,7 @@ public class HoodieTableMetaClient implements Serializable {
fs.mkdirs(metaPathDir);
}
HoodieTableConfig.createHoodieProperties(fs, metaPathDir, props);
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), basePath);
log.info("Finished initializing Table of type " + metaClient.getTableConfig().getTableType()
+ " from " + basePath);
return metaClient;

View File

@@ -83,7 +83,7 @@ public class HoodieCompactedLogRecordScanner implements
Schema readerSchema, String latestInstantTime) {
this.readerSchema = readerSchema;
this.latestInstantTime = latestInstantTime;
this.hoodieTableMetaClient = new HoodieTableMetaClient(fs, basePath);
this.hoodieTableMetaClient = new HoodieTableMetaClient(fs.getConf(), basePath);
// load class from the payload fully qualified class name
this.payloadClassFQN = this.hoodieTableMetaClient.getTableConfig().getPayloadClass();

View File

@@ -150,7 +150,7 @@ public interface HoodieLogFormat {
public Writer build() throws IOException, InterruptedException {
log.info("Building HoodieLogFormat Writer");
if (fs == null) {
fs = FSUtils.getFs();
throw new IllegalArgumentException("fs is not specified");
}
if (logFileId == null) {
throw new IllegalArgumentException("FileID is not specified");

View File

@@ -58,7 +58,7 @@ public abstract class HoodieLogBlock {
*/
public enum LogMetadataType {
INSTANT_TIME,
TARGET_INSTANT_TIME;
TARGET_INSTANT_TIME
}
public HoodieLogBlock(Map<LogMetadataType, String> logMetadata) {

View File

@@ -37,7 +37,6 @@ import java.util.stream.Stream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
@@ -54,11 +53,8 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
public static final SimpleDateFormat COMMIT_FORMATTER = new SimpleDateFormat("yyyyMMddHHmmss");
private final transient static Logger log = LogManager.getLogger(HoodieActiveTimeline.class);
private String metaPath;
private transient FileSystem fs;
private HoodieTableMetaClient metaClient;
/**
* Returns next commit time in the {@link #COMMIT_FORMATTER} format.
@@ -67,12 +63,14 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
return HoodieActiveTimeline.COMMIT_FORMATTER.format(new Date());
}
protected HoodieActiveTimeline(FileSystem fs, String metaPath, String[] includedExtensions) {
protected HoodieActiveTimeline(HoodieTableMetaClient metaClient, String[] includedExtensions) {
// Filter all the files in the metapath and include only the extensions passed and
// convert them into HoodieInstant
try {
this.instants =
Arrays.stream(HoodieTableMetaClient.scanFiles(fs, new Path(metaPath), path -> {
Arrays.stream(
HoodieTableMetaClient
.scanFiles(metaClient.getFs(), new Path(metaClient.getMetaPath()), path -> {
// Include only the meta files with extensions that needs to be included
String extension = FSUtils.getFileExtension(path.getName());
return Arrays.stream(includedExtensions).anyMatch(Predicate.isEqual(extension));
@@ -85,14 +83,13 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
} catch (IOException e) {
throw new HoodieIOException("Failed to scan metadata", e);
}
this.fs = fs;
this.metaPath = metaPath;
this.metaClient = metaClient;
// multiple casts will make this lambda serializable - http://docs.oracle.com/javase/specs/jls/se8/html/jls-15.html#jls-15.16
this.details = (Function<HoodieInstant, Optional<byte[]>> & Serializable) this::getInstantDetails;
}
public HoodieActiveTimeline(FileSystem fs, String metaPath) {
this(fs, metaPath,
public HoodieActiveTimeline(HoodieTableMetaClient metaClient) {
this(metaClient,
new String[]{COMMIT_EXTENSION, INFLIGHT_COMMIT_EXTENSION, DELTA_COMMIT_EXTENSION,
INFLIGHT_DELTA_COMMIT_EXTENSION, SAVEPOINT_EXTENSION, INFLIGHT_SAVEPOINT_EXTENSION,
CLEAN_EXTENSION, INFLIGHT_CLEAN_EXTENSION});
@@ -114,7 +111,6 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
private void readObject(java.io.ObjectInputStream in)
throws IOException, ClassNotFoundException {
in.defaultReadObject();
this.fs = FSUtils.getFs();
}
/**
@@ -214,9 +210,9 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
public void deleteInflight(HoodieInstant instant) {
log.info("Deleting in-flight " + instant);
Path inFlightCommitFilePath = new Path(metaPath, instant.getFileName());
Path inFlightCommitFilePath = new Path(metaClient.getMetaPath(), instant.getFileName());
try {
boolean result = fs.delete(inFlightCommitFilePath, false);
boolean result = metaClient.getFs().delete(inFlightCommitFilePath, false);
if (result) {
log.info("Removed in-flight " + instant);
} else {
@@ -230,18 +226,18 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
@Override
public Optional<byte[]> getInstantDetails(HoodieInstant instant) {
Path detailPath = new Path(metaPath, instant.getFileName());
Path detailPath = new Path(metaClient.getMetaPath(), instant.getFileName());
return readDataFromPath(detailPath);
}
protected void moveInflightToComplete(HoodieInstant inflight, HoodieInstant completed,
Optional<byte[]> data) {
Path commitFilePath = new Path(metaPath, completed.getFileName());
Path commitFilePath = new Path(metaClient.getMetaPath(), completed.getFileName());
try {
// open a new file and write the commit metadata in
Path inflightCommitFile = new Path(metaPath, inflight.getFileName());
Path inflightCommitFile = new Path(metaClient.getMetaPath(), inflight.getFileName());
createFileInMetaPath(inflight.getFileName(), data);
boolean success = fs.rename(inflightCommitFile, commitFilePath);
boolean success = metaClient.getFs().rename(inflightCommitFile, commitFilePath);
if (!success) {
throw new HoodieIOException(
"Could not rename " + inflightCommitFile + " to " + commitFilePath);
@@ -252,11 +248,11 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
}
protected void moveCompleteToInflight(HoodieInstant completed, HoodieInstant inflight) {
Path inFlightCommitFilePath = new Path(metaPath, inflight.getFileName());
Path inFlightCommitFilePath = new Path(metaClient.getMetaPath(), inflight.getFileName());
try {
if (!fs.exists(inFlightCommitFilePath)) {
Path commitFilePath = new Path(metaPath, completed.getFileName());
boolean success = fs.rename(commitFilePath, inFlightCommitFilePath);
if (!metaClient.getFs().exists(inFlightCommitFilePath)) {
Path commitFilePath = new Path(metaClient.getMetaPath(), completed.getFileName());
boolean success = metaClient.getFs().rename(commitFilePath, inFlightCommitFilePath);
if (!success) {
throw new HoodieIOException(
"Could not rename " + commitFilePath + " to " + inFlightCommitFilePath);
@@ -272,15 +268,15 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
}
protected void createFileInMetaPath(String filename, Optional<byte[]> content) {
Path fullPath = new Path(metaPath, filename);
Path fullPath = new Path(metaClient.getMetaPath(), filename);
try {
if (!content.isPresent()) {
if (fs.createNewFile(fullPath)) {
if (metaClient.getFs().createNewFile(fullPath)) {
log.info("Created a new file in meta path: " + fullPath);
return;
}
} else {
FSDataOutputStream fsout = fs.create(fullPath, true);
FSDataOutputStream fsout = metaClient.getFs().create(fullPath, true);
fsout.write(content.get());
fsout.close();
return;
@@ -292,7 +288,7 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
}
protected Optional<byte[]> readDataFromPath(Path detailPath) {
try (FSDataInputStream is = fs.open(detailPath)) {
try (FSDataInputStream is = metaClient.getFs().open(detailPath)) {
return Optional.of(IOUtils.toByteArray(is));
} catch (IOException e) {
throw new HoodieIOException("Could not read commit details from " + detailPath, e);
@@ -300,6 +296,6 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
}
public HoodieActiveTimeline reload() {
return new HoodieActiveTimeline(fs, metaPath);
return new HoodieActiveTimeline(metaClient);
}
}

View File

@@ -16,8 +16,8 @@
package com.uber.hoodie.common.table.timeline;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.exception.HoodieIOException;
import java.io.IOException;
import java.io.Serializable;
@@ -27,7 +27,6 @@ import java.util.Map;
import java.util.Optional;
import java.util.function.Function;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
@@ -44,17 +43,17 @@ import org.apache.log4j.Logger;
public class HoodieArchivedTimeline extends HoodieDefaultTimeline {
private static final String HOODIE_COMMIT_ARCHIVE_LOG_FILE = "commits";
private transient FileSystem fs;
private String metaPath;
private HoodieTableMetaClient metaClient;
private Map<String, byte[]> readCommits = new HashMap<>();
private final transient static Logger log = LogManager.getLogger(HoodieArchivedTimeline.class);
public HoodieArchivedTimeline(FileSystem fs, String metaPath) {
public HoodieArchivedTimeline(HoodieTableMetaClient metaClient) {
// Read back the commits to make sure
Path archiveLogPath = getArchiveLogPath(metaPath);
Path archiveLogPath = getArchiveLogPath(metaClient.getMetaPath());
try (SequenceFile.Reader reader =
new SequenceFile.Reader(fs.getConf(), SequenceFile.Reader.file(archiveLogPath))) {
new SequenceFile.Reader(metaClient.getHadoopConf(),
SequenceFile.Reader.file(archiveLogPath))) {
Text key = new Text();
Text val = new Text();
while (reader.next(key, val)) {
@@ -71,8 +70,7 @@ public class HoodieArchivedTimeline extends HoodieDefaultTimeline {
}
// multiple casts will make this lambda serializable - http://docs.oracle.com/javase/specs/jls/se8/html/jls-15.html#jls-15.16
this.details = (Function<HoodieInstant, Optional<byte[]>> & Serializable) this::getInstantDetails;
this.fs = fs;
this.metaPath = metaPath;
this.metaClient = metaClient;
}
/**
@@ -91,7 +89,6 @@ public class HoodieArchivedTimeline extends HoodieDefaultTimeline {
private void readObject(java.io.ObjectInputStream in)
throws IOException, ClassNotFoundException {
in.defaultReadObject();
this.fs = FSUtils.getFs();
}
@@ -105,7 +102,7 @@ public class HoodieArchivedTimeline extends HoodieDefaultTimeline {
}
public HoodieArchivedTimeline reload() {
return new HoodieArchivedTimeline(fs, metaPath);
return new HoodieArchivedTimeline(metaClient);
}
}

View File

@@ -40,7 +40,6 @@ import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
/**
@@ -57,7 +56,6 @@ public class HoodieTableFileSystemView implements TableFileSystemView,
TableFileSystemView.RealtimeView, Serializable {
protected HoodieTableMetaClient metaClient;
protected transient FileSystem fs;
// This is the commits that will be visible for all views extending this view
protected HoodieTimeline visibleActiveTimeline;
@@ -72,7 +70,6 @@ public class HoodieTableFileSystemView implements TableFileSystemView,
public HoodieTableFileSystemView(HoodieTableMetaClient metaClient,
HoodieTimeline visibleActiveTimeline) {
this.metaClient = metaClient;
this.fs = metaClient.getFs();
this.visibleActiveTimeline = visibleActiveTimeline;
this.fileGroupMap = new HashMap<>();
this.partitionToFileGroupsMap = new HashMap<>();
@@ -98,7 +95,6 @@ public class HoodieTableFileSystemView implements TableFileSystemView,
private void readObject(java.io.ObjectInputStream in)
throws IOException, ClassNotFoundException {
in.defaultReadObject();
this.fs = FSUtils.getFs();
}
private void writeObject(java.io.ObjectOutputStream out)
@@ -255,8 +251,8 @@ public class HoodieTableFileSystemView implements TableFileSystemView,
try {
// Create the path if it does not exist already
Path partitionPath = new Path(metaClient.getBasePath(), partitionPathStr);
FSUtils.createPathIfNotExists(fs, partitionPath);
FileStatus[] statuses = fs.listStatus(partitionPath);
FSUtils.createPathIfNotExists(metaClient.getFs(), partitionPath);
FileStatus[] statuses = metaClient.getFs().listStatus(partitionPath);
List<HoodieFileGroup> fileGroups = addFilesToView(statuses);
return fileGroups.stream();
} catch (IOException e) {

View File

@@ -94,7 +94,7 @@ public class AvroUtils {
public static HoodieCleanMetadata convertCleanMetadata(String startCleanTime,
Optional<Long> durationInMs, List<HoodieCleanStat> cleanStats) {
ImmutableMap.Builder<String, HoodieCleanPartitionMetadata> partitionMetadataBuilder =
ImmutableMap.<String, HoodieCleanPartitionMetadata>builder();
ImmutableMap.builder();
int totalDeleted = 0;
String earliestCommitToRetain = null;
for (HoodieCleanStat stat : cleanStats) {
@@ -116,7 +116,7 @@ public class AvroUtils {
public static HoodieRollbackMetadata convertRollbackMetadata(String startRollbackTime,
Optional<Long> durationInMs, List<String> commits, List<HoodieRollbackStat> stats) {
ImmutableMap.Builder<String, HoodieRollbackPartitionMetadata> partitionMetadataBuilder =
ImmutableMap.<String, HoodieRollbackPartitionMetadata>builder();
ImmutableMap.builder();
int totalDeleted = 0;
for (HoodieRollbackStat stat : stats) {
HoodieRollbackPartitionMetadata metadata =
@@ -132,7 +132,7 @@ public class AvroUtils {
public static HoodieSavepointMetadata convertSavepointMetadata(String user, String comment,
Map<String, List<String>> latestFiles) {
ImmutableMap.Builder<String, HoodieSavepointPartitionMetadata> partitionMetadataBuilder =
ImmutableMap.<String, HoodieSavepointPartitionMetadata>builder();
ImmutableMap.builder();
for (Map.Entry<String, List<String>> stat : latestFiles.entrySet()) {
HoodieSavepointPartitionMetadata metadata =
new HoodieSavepointPartitionMetadata(stat.getKey(), stat.getValue());

View File

@@ -29,6 +29,7 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.Map.Entry;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -55,6 +56,7 @@ public class FSUtils {
private static final int MAX_ATTEMPTS_RECOVER_LEASE = 10;
private static final long MIN_CLEAN_TO_KEEP = 10;
private static final long MIN_ROLLBACK_TO_KEEP = 10;
private static final String HOODIE_ENV_PROPS_PREFIX = "HOODIE_ENV_";
private static FileSystem fs;
/**
@@ -65,17 +67,32 @@ public class FSUtils {
FSUtils.fs = fs;
}
public static Configuration prepareHadoopConf(Configuration conf) {
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
public static FileSystem getFs() {
// look for all properties, prefixed to be picked up
for (Entry<String, String> prop : System.getenv().entrySet()) {
if (prop.getKey().startsWith(HOODIE_ENV_PROPS_PREFIX)) {
LOG.info("Picking up value for hoodie env var :" + prop.getKey());
conf.set(prop.getKey()
.replace(HOODIE_ENV_PROPS_PREFIX, "")
.replaceAll("_DOT_", "."),
prop.getValue());
}
}
return conf;
}
public static FileSystem getFs(String path, Configuration conf) {
if (fs != null) {
return fs;
}
Configuration conf = new Configuration();
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
FileSystem fs;
conf = prepareHadoopConf(conf);
try {
fs = FileSystem.get(conf);
fs = new Path(path).getFileSystem(conf);
} catch (IOException e) {
throw new HoodieIOException("Failed to get instance of " + FileSystem.class.getName(),
e);
@@ -83,7 +100,6 @@ public class FSUtils {
LOG.info(
String.format("Hadoop Configuration: fs.defaultFS: [%s], Config:[%s], FileSystem: [%s]",
conf.getRaw("fs.defaultFS"), conf.toString(), fs.toString()));
return fs;
}

View File

@@ -51,10 +51,11 @@ public class ParquetUtils {
* Read the rowKey list from the given parquet file.
*
* @param filePath The parquet file path.
* @param configuration configuration to build fs object
*/
public static Set<String> readRowKeysFromParquet(Path filePath) {
Configuration conf = new Configuration();
conf.addResource(getFs().getConf());
public static Set<String> readRowKeysFromParquet(Configuration configuration, Path filePath) {
Configuration conf = new Configuration(configuration);
conf.addResource(getFs(filePath.toString(), conf).getConf());
Schema readSchema = HoodieAvroUtils.getRecordKeySchema();
AvroReadSupport.setAvroReadSchema(conf, readSchema);
AvroReadSupport.setRequestedProjection(conf, readSchema);
@@ -84,19 +85,12 @@ public class ParquetUtils {
return rowKeys;
}
/**
* Read the metadata from a parquet file
*/
public static ParquetMetadata readMetadata(Path parquetFilePath) {
return readMetadata(new Configuration(), parquetFilePath);
}
public static ParquetMetadata readMetadata(Configuration conf, Path parquetFilePath) {
ParquetMetadata footer;
try {
// TODO(vc): Should we use the parallel reading version here?
footer = ParquetFileReader.readFooter(getFs().getConf(), parquetFilePath);
footer = ParquetFileReader
.readFooter(getFs(parquetFilePath.toString(), conf).getConf(), parquetFilePath);
} catch (IOException e) {
throw new HoodieIOException("Failed to read footer for parquet " + parquetFilePath,
e);
@@ -108,14 +102,15 @@ public class ParquetUtils {
/**
* Get the schema of the given parquet file.
*/
public static MessageType readSchema(Path parquetFilePath) {
return readMetadata(parquetFilePath).getFileMetaData().getSchema();
public static MessageType readSchema(Configuration configuration, Path parquetFilePath) {
return readMetadata(configuration, parquetFilePath).getFileMetaData().getSchema();
}
private static List<String> readParquetFooter(Path parquetFilePath, String... footerNames) {
private static List<String> readParquetFooter(Configuration configuration, Path parquetFilePath,
String... footerNames) {
List<String> footerVals = new ArrayList<>();
ParquetMetadata footer = readMetadata(parquetFilePath);
ParquetMetadata footer = readMetadata(configuration, parquetFilePath);
Map<String, String> metadata = footer.getFileMetaData().getKeyValueMetaData();
for (String footerName : footerNames) {
if (metadata.containsKey(footerName)) {
@@ -128,21 +123,22 @@ public class ParquetUtils {
return footerVals;
}
public static Schema readAvroSchema(Path parquetFilePath) {
return new AvroSchemaConverter().convert(readSchema(parquetFilePath));
public static Schema readAvroSchema(Configuration configuration, Path parquetFilePath) {
return new AvroSchemaConverter().convert(readSchema(configuration, parquetFilePath));
}
/**
* Read out the bloom filter from the parquet file meta data.
*/
public static BloomFilter readBloomFilterFromParquetMetadata(Path parquetFilePath) {
String footerVal = readParquetFooter(parquetFilePath,
public static BloomFilter readBloomFilterFromParquetMetadata(Configuration configuration,
Path parquetFilePath) {
String footerVal = readParquetFooter(configuration, parquetFilePath,
HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY).get(0);
return new BloomFilter(footerVal);
}
public static String[] readMinMaxRecordKeys(Path parquetFilePath) {
List<String> minMaxKeys = readParquetFooter(parquetFilePath,
public static String[] readMinMaxRecordKeys(Configuration configuration, Path parquetFilePath) {
List<String> minMaxKeys = readParquetFooter(configuration, parquetFilePath,
HoodieAvroWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER,
HoodieAvroWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER);
if (minMaxKeys.size() != 2) {
@@ -156,11 +152,11 @@ public class ParquetUtils {
/**
* NOTE: This literally reads the entire file contents, thus should be used with caution.
*/
public static List<GenericRecord> readAvroRecords(Path filePath) {
public static List<GenericRecord> readAvroRecords(Configuration configuration, Path filePath) {
ParquetReader reader = null;
List<GenericRecord> records = new ArrayList<>();
try {
reader = AvroParquetReader.builder(filePath).build();
reader = AvroParquetReader.builder(filePath).withConf(configuration).build();
Object obj = reader.read();
while (obj != null) {
if (obj instanceof GenericRecord) {

View File

@@ -19,6 +19,7 @@ package com.uber.hoodie.common.minicluster;
import com.google.common.base.Preconditions;
import com.google.common.io.Files;
import com.uber.hoodie.common.model.HoodieTestUtils;
import java.io.File;
import java.io.IOException;
import org.apache.commons.io.FileUtils;
@@ -54,7 +55,7 @@ public class HdfsTestService {
private MiniDFSCluster miniDfsCluster;
public HdfsTestService() {
hadoopConf = new Configuration();
hadoopConf = HoodieTestUtils.getDefaultHadoopConf();
workDir = Files.createTempDir().getAbsolutePath();
}
@@ -67,7 +68,7 @@ public class HdfsTestService {
.checkState(workDir != null, "The work dir must be set before starting cluster.");
if (hadoopConf == null) {
hadoopConf = new Configuration();
hadoopConf = HoodieTestUtils.getDefaultHadoopConf();
}
// If clean, then remove the work dir so we can start fresh.

View File

@@ -58,6 +58,7 @@ import java.util.stream.Stream;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
@@ -69,18 +70,23 @@ import org.junit.rules.TemporaryFolder;
public class HoodieTestUtils {
public static FileSystem fs = FSUtils.getFs();
public static FileSystem fs;
public static final String TEST_EXTENSION = ".test";
public static final String RAW_TRIPS_TEST_NAME = "raw_trips";
public static final int DEFAULT_TASK_PARTITIONID = 1;
public static final String[] DEFAULT_PARTITION_PATHS = {"2016/03/15", "2015/03/16", "2015/03/17"};
private static Random rand = new Random(46474747);
public static void resetFS() {
HoodieTestUtils.fs = FSUtils.getFs();
public static void resetFS(String basePath) {
HoodieTestUtils.fs = FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf());
}
public static Configuration getDefaultHadoopConf() {
return new Configuration();
}
public static HoodieTableMetaClient init(String basePath) throws IOException {
fs = FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf());
return initTableType(basePath, HoodieTableType.COPY_ON_WRITE);
}
@@ -211,7 +217,7 @@ public class HoodieTestUtils {
Path commitFile =
new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline
.makeCleanerFileName(commitTime));
FileSystem fs = FSUtils.getFs();
FileSystem fs = FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf());
FSDataOutputStream os = fs.create(commitFile, true);
try {
HoodieCleanStat cleanStats = new HoodieCleanStat(

View File

@@ -67,7 +67,7 @@ public class HoodieLogFormatTest {
private FileSystem fs;
private Path partitionPath;
private String basePath;
private static String basePath;
@BeforeClass
public static void setUpClass() throws IOException, InterruptedException {
@@ -78,7 +78,7 @@ public class HoodieLogFormatTest {
@AfterClass
public static void tearDownClass() {
MiniClusterUtil.shutdown();
HoodieTestUtils.resetFS();
HoodieTestUtils.resetFS(basePath);
}
@Before
@@ -343,7 +343,7 @@ public class HoodieLogFormatTest {
writer.close();
// Append some arbitrary byte[] to the end of the log (mimics a partially written commit)
fs = FileSystem.get(fs.getConf());
fs = FSUtils.getFs(fs.getUri().toString(), fs.getConf());
FSDataOutputStream outputStream = fs.append(writer.getLogFile().getPath());
// create a block with
outputStream.write(HoodieLogFormat.MAGIC);
@@ -533,7 +533,7 @@ public class HoodieLogFormatTest {
writer.close();
// Append some arbitrary byte[] to the end of the log (mimics a partially written commit)
fs = FileSystem.get(fs.getConf());
fs = FSUtils.getFs(fs.getUri().toString(), fs.getConf());
FSDataOutputStream outputStream = fs.append(writer.getLogFile().getPath());
// create a block with
outputStream.write(HoodieLogFormat.MAGIC);

View File

@@ -74,7 +74,7 @@ public class HoodieActiveTimelineTest {
HoodieInstant instant5 =
new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, "9");
timeline = new HoodieActiveTimeline(HoodieTestUtils.fs, metaClient.getMetaPath());
timeline = new HoodieActiveTimeline(metaClient);
timeline.saveAsComplete(instant1, Optional.empty());
timeline.saveAsComplete(instant2, Optional.empty());
timeline.saveAsComplete(instant3, Optional.empty());
@@ -98,7 +98,7 @@ public class HoodieActiveTimelineTest {
@Test
public void testTimelineOperationsBasic() throws Exception {
timeline = new HoodieActiveTimeline(HoodieTestUtils.fs, metaClient.getMetaPath());
timeline = new HoodieActiveTimeline(metaClient);
assertTrue(timeline.empty());
assertEquals("", 0, timeline.countInstants());
assertEquals("", Optional.empty(), timeline.firstInstant());

View File

@@ -69,7 +69,7 @@ public class HoodieTableFileSystemViewTest {
}
private void refreshFsView(FileStatus[] statuses) {
metaClient = new HoodieTableMetaClient(HoodieTestUtils.fs, basePath, true);
metaClient = new HoodieTableMetaClient(HoodieTestUtils.fs.getConf(), basePath, true);
if (statuses != null) {
fsView = new HoodieTableFileSystemView(metaClient,
metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(),

View File

@@ -16,15 +16,24 @@
package com.uber.hoodie.common.util;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import com.uber.hoodie.common.model.HoodieTestUtils;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.UUID;
import org.apache.hadoop.conf.Configuration;
import org.junit.Rule;
import org.junit.Test;
import org.junit.contrib.java.lang.system.EnvironmentVariables;
public class TestFSUtils {
@Rule
public final EnvironmentVariables environmentVariables
= new EnvironmentVariables();
@Test
public void testMakeDataFileName() {
String commitTime = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date());
@@ -59,4 +68,15 @@ public class TestFSUtils {
String fullFileName = FSUtils.makeDataFileName(commitTime, taskPartitionId, fileName);
assertTrue(FSUtils.getFileId(fullFileName).equals(fileName));
}
@Test
public void testEnvVarVariablesPickedup() {
// The HOODIE_ENV_ prefix is stripped and "_DOT_" expands to "." in the key.
environmentVariables.set("HOODIE_ENV_fs_DOT_key1", "value1");
Configuration conf = FSUtils.prepareHadoopConf(HoodieTestUtils.getDefaultHadoopConf());
assertEquals("value1", conf.get("fs.key1"));
// Explicit conf.set() calls still override/extend env-derived values.
conf.set("fs.key1", "value11");
conf.set("fs.key2", "value2");
assertEquals("value11", conf.get("fs.key1"));
assertEquals("value2", conf.get("fs.key2"));
}
}

View File

@@ -22,6 +22,7 @@ import static org.junit.Assert.assertTrue;
import com.uber.hoodie.avro.HoodieAvroWriteSupport;
import com.uber.hoodie.common.BloomFilter;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieTestUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
@@ -79,12 +80,15 @@ public class TestParquetUtils {
// Read and verify
List<String> rowKeysInFile = new ArrayList<>(
ParquetUtils.readRowKeysFromParquet(new Path(filePath)));
ParquetUtils
.readRowKeysFromParquet(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath)));
Collections.sort(rowKeysInFile);
Collections.sort(rowKeys);
assertEquals("Did not read back the expected list of keys", rowKeys, rowKeysInFile);
BloomFilter filterInFile = ParquetUtils.readBloomFilterFromParquetMetadata(new Path(filePath));
BloomFilter filterInFile = ParquetUtils
.readBloomFilterFromParquetMetadata(HoodieTestUtils.getDefaultHadoopConf(),
new Path(filePath));
for (String rowKey : rowKeys) {
assertTrue("key should be found in bloom filter", filterInFile.mightContain(rowKey));
}