diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodieCLI.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodieCLI.java index 27f0ab516..15cce03e2 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodieCLI.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/HoodieCLI.java @@ -17,6 +17,7 @@ package com.uber.hoodie.cli; import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.util.FSUtils; import java.io.IOException; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -36,7 +37,7 @@ public class HoodieCLI { public static boolean initConf() { if (HoodieCLI.conf == null) { - HoodieCLI.conf = new Configuration(); + HoodieCLI.conf = FSUtils.prepareHadoopConf(new Configuration()); return true; } return false; diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/ArchivedCommitsCommand.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/ArchivedCommitsCommand.java index e1b03b271..d4945c012 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/ArchivedCommitsCommand.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/ArchivedCommitsCommand.java @@ -54,12 +54,14 @@ public class ArchivedCommitsCommand implements CommandMarker { System.out .println("===============> Showing only " + limit + " archived commits <==============="); - FileStatus[] fsStatuses = FSUtils.getFs().globStatus( - new Path(HoodieCLI.tableMetadata.getBasePath() + "/.hoodie/.commits_.archive*")); + String basePath = HoodieCLI.tableMetadata.getBasePath(); + FileStatus[] fsStatuses = FSUtils.getFs(basePath, HoodieCLI.conf) + .globStatus(new Path(basePath + "/.hoodie/.commits_.archive*")); List allCommits = new ArrayList<>(); for (FileStatus fs : fsStatuses) { //read the archived file - HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(), + HoodieLogFormat.Reader reader = HoodieLogFormat + .newReader(FSUtils.getFs(basePath, HoodieCLI.conf), new 
HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema(), false); List readRecords = new ArrayList<>(); diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/CleansCommand.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/CleansCommand.java index 1b5a9602d..38638866b 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/CleansCommand.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/CleansCommand.java @@ -77,7 +77,7 @@ public class CleansCommand implements CommandMarker { @CliCommand(value = "cleans refresh", help = "Refresh the commits") public String refreshCleans() throws IOException { HoodieTableMetaClient metadata = - new HoodieTableMetaClient(HoodieCLI.fs, HoodieCLI.tableMetadata.getBasePath()); + new HoodieTableMetaClient(HoodieCLI.conf, HoodieCLI.tableMetadata.getBasePath()); HoodieCLI.setTableMetadata(metadata); return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed."; } diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/CommitsCommand.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/CommitsCommand.java index d6446a2c4..d7b6594d1 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/CommitsCommand.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/CommitsCommand.java @@ -96,7 +96,7 @@ public class CommitsCommand implements CommandMarker { @CliCommand(value = "commits refresh", help = "Refresh the commits") public String refreshCommits() throws IOException { HoodieTableMetaClient metadata = - new HoodieTableMetaClient(HoodieCLI.fs, HoodieCLI.tableMetadata.getBasePath()); + new HoodieTableMetaClient(HoodieCLI.conf, HoodieCLI.tableMetadata.getBasePath()); HoodieCLI.setTableMetadata(metadata); return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed."; } @@ -224,14 +224,13 @@ public class CommitsCommand implements CommandMarker { public String compareCommits( @CliOption(key = 
{"path"}, help = "Path of the dataset to compare to") final String path) throws Exception { - HoodieTableMetaClient target = new HoodieTableMetaClient(HoodieCLI.fs, path); + + HoodieTableMetaClient target = new HoodieTableMetaClient(HoodieCLI.conf, path); HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsTimeline() .filterCompletedInstants(); - ; HoodieTableMetaClient source = HoodieCLI.tableMetadata; HoodieTimeline sourceTimeline = source.getActiveTimeline().getCommitsTimeline() .filterCompletedInstants(); - ; String targetLatestCommit = targetTimeline.getInstants().iterator().hasNext() ? "0" : targetTimeline.lastInstant().get().getTimestamp(); @@ -266,7 +265,7 @@ public class CommitsCommand implements CommandMarker { public String syncCommits( @CliOption(key = {"path"}, help = "Path of the dataset to compare to") final String path) throws Exception { - HoodieCLI.syncTableMetadata = new HoodieTableMetaClient(HoodieCLI.fs, path); + HoodieCLI.syncTableMetadata = new HoodieTableMetaClient(HoodieCLI.conf, path); HoodieCLI.state = HoodieCLI.CLIState.SYNC; return "Load sync state between " + HoodieCLI.tableMetadata.getTableConfig().getTableName() + " and " + HoodieCLI.syncTableMetadata.getTableConfig().getTableName(); diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/DatasetsCommand.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/DatasetsCommand.java index fc1f22a3a..de59d0669 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/DatasetsCommand.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/DatasetsCommand.java @@ -33,7 +33,7 @@ public class DatasetsCommand implements CommandMarker { final String path) throws IOException { boolean initialized = HoodieCLI.initConf(); HoodieCLI.initFS(initialized); - HoodieCLI.setTableMetadata(new HoodieTableMetaClient(HoodieCLI.fs, path)); + HoodieCLI.setTableMetadata(new HoodieTableMetaClient(HoodieCLI.conf, path)); HoodieCLI.state = 
HoodieCLI.CLIState.DATASET; return "Metadata for table " + HoodieCLI.tableMetadata.getTableConfig().getTableName() + " loaded"; diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/SavepointsCommand.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/SavepointsCommand.java index 4f5b2c9a3..db267d6a6 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/SavepointsCommand.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/SavepointsCommand.java @@ -138,7 +138,7 @@ public class SavepointsCommand implements CommandMarker { @CliCommand(value = "savepoints refresh", help = "Refresh the savepoints") public String refreshMetaClient() throws IOException { HoodieTableMetaClient metadata = - new HoodieTableMetaClient(HoodieCLI.fs, HoodieCLI.tableMetadata.getBasePath()); + new HoodieTableMetaClient(HoodieCLI.conf, HoodieCLI.tableMetadata.getBasePath()); HoodieCLI.setTableMetadata(metadata); return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed."; } diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/SparkMain.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/SparkMain.java index aba2d9da8..4f8f2f856 100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/SparkMain.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/commands/SparkMain.java @@ -97,8 +97,8 @@ public class SparkMain { String repairedOutputPath, String basePath) throws Exception { - DedupeSparkJob job = new DedupeSparkJob(basePath, - duplicatedPartitionPath, repairedOutputPath, new SQLContext(jsc), FSUtils.getFs()); + DedupeSparkJob job = new DedupeSparkJob(basePath, duplicatedPartitionPath, repairedOutputPath, + new SQLContext(jsc), FSUtils.getFs(basePath, jsc.hadoopConfiguration())); job.fixDuplicates(true); return 0; } diff --git a/hoodie-cli/src/main/java/com/uber/hoodie/cli/utils/SparkUtil.java b/hoodie-cli/src/main/java/com/uber/hoodie/cli/utils/SparkUtil.java index 4b4ab2a2d..d85626d35 
100644 --- a/hoodie-cli/src/main/java/com/uber/hoodie/cli/utils/SparkUtil.java +++ b/hoodie-cli/src/main/java/com/uber/hoodie/cli/utils/SparkUtil.java @@ -18,6 +18,7 @@ package com.uber.hoodie.cli.utils; import com.uber.hoodie.HoodieWriteClient; import com.uber.hoodie.cli.commands.SparkMain; +import com.uber.hoodie.common.util.FSUtils; import java.io.File; import java.net.URISyntaxException; import org.apache.log4j.Logger; @@ -66,6 +67,7 @@ public class SparkUtil { sparkConf = HoodieWriteClient.registerClasses(sparkConf); JavaSparkContext jsc = new JavaSparkContext(sparkConf); jsc.hadoopConfiguration().setBoolean("parquet.enable.summary-metadata", false); + FSUtils.prepareHadoopConf(jsc.hadoopConfiguration()); return jsc; } } diff --git a/hoodie-cli/src/main/scala/com/uber/hoodie/cli/DedupeSparkJob.scala b/hoodie-cli/src/main/scala/com/uber/hoodie/cli/DedupeSparkJob.scala index 82c97e0a4..0058043ed 100644 --- a/hoodie-cli/src/main/scala/com/uber/hoodie/cli/DedupeSparkJob.scala +++ b/hoodie-cli/src/main/scala/com/uber/hoodie/cli/DedupeSparkJob.scala @@ -75,7 +75,7 @@ class DedupeSparkJob(basePath: String, val tmpTableName = s"htbl_${System.currentTimeMillis()}" val dedupeTblName = s"${tmpTableName}_dupeKeys" - val metadata = new HoodieTableMetaClient(fs, basePath) + val metadata = new HoodieTableMetaClient(fs.getConf, basePath) val allFiles = fs.listStatus(new org.apache.hadoop.fs.Path(s"${basePath}/${duplicatedPartitionPath}")) val fsView = new HoodieTableFileSystemView(metadata, metadata.getActiveTimeline.getCommitTimeline.filterCompletedInstants(), allFiles) @@ -127,7 +127,7 @@ class DedupeSparkJob(basePath: String, def fixDuplicates(dryRun: Boolean = true) = { - val metadata = new HoodieTableMetaClient(fs, basePath) + val metadata = new HoodieTableMetaClient(fs.getConf, basePath) val allFiles = fs.listStatus(new Path(s"${basePath}/${duplicatedPartitionPath}")) val fsView = new HoodieTableFileSystemView(metadata, 
metadata.getActiveTimeline.getCommitTimeline.filterCompletedInstants(), allFiles) diff --git a/hoodie-cli/src/main/scala/com/uber/hoodie/cli/SparkHelpers.scala b/hoodie-cli/src/main/scala/com/uber/hoodie/cli/SparkHelpers.scala index 3fc18c8e4..8e1eccfee 100644 --- a/hoodie-cli/src/main/scala/com/uber/hoodie/cli/SparkHelpers.scala +++ b/hoodie-cli/src/main/scala/com/uber/hoodie/cli/SparkHelpers.scala @@ -38,7 +38,7 @@ import scala.collection.mutable._ object SparkHelpers { @throws[Exception] def skipKeysAndWriteNewFile(commitTime: String, fs: FileSystem, sourceFile: Path, destinationFile: Path, keysToSkip: Set[String]) { - val sourceRecords = ParquetUtils.readAvroRecords(sourceFile) + val sourceRecords = ParquetUtils.readAvroRecords(fs.getConf, sourceFile) val schema: Schema = sourceRecords.get(0).getSchema val filter: BloomFilter = new BloomFilter(HoodieIndexConfig.DEFAULT_BLOOM_FILTER_NUM_ENTRIES.toInt, HoodieIndexConfig.DEFAULT_BLOOM_FILTER_FPP.toDouble) val writeSupport: HoodieAvroWriteSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter) @@ -47,6 +47,7 @@ object SparkHelpers { for (rec <- sourceRecords) { val key: String = rec.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString if (!keysToSkip.contains(key)) { + writer.writeAvro(key, rec) } } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/HoodieReadClient.java b/hoodie-client/src/main/java/com/uber/hoodie/HoodieReadClient.java index 6c329c330..eb9ad62e4 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/HoodieReadClient.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/HoodieReadClient.java @@ -67,10 +67,10 @@ public class HoodieReadClient implements Serializable { */ public HoodieReadClient(JavaSparkContext jsc, String basePath) { this.jsc = jsc; - this.fs = FSUtils.getFs(); + this.fs = FSUtils.getFs(basePath, jsc.hadoopConfiguration()); // Create a Hoodie table which encapsulated the commits and files visible this.hoodieTable = HoodieTable - 
.getHoodieTable(new HoodieTableMetaClient(fs, basePath, true), null); + .getHoodieTable(new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath, true), null); this.commitTimeline = hoodieTable.getCommitTimeline().filterCompletedInstants(); this.index = new HoodieBloomIndex(HoodieWriteConfig.newBuilder().withPath(basePath).build(), jsc); @@ -129,8 +129,8 @@ public class HoodieReadClient implements Serializable { JavaPairRDD keyRowRDD = originalDF.javaRDD() .mapToPair(row -> { HoodieKey key = new HoodieKey( - row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD), - row.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD)); + row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD), + row.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD)); return new Tuple2<>(key, row); }); diff --git a/hoodie-client/src/main/java/com/uber/hoodie/HoodieWriteClient.java b/hoodie-client/src/main/java/com/uber/hoodie/HoodieWriteClient.java index 0de261c8f..cfd20b4e1 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/HoodieWriteClient.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/HoodieWriteClient.java @@ -53,18 +53,6 @@ import com.uber.hoodie.table.HoodieTable; import com.uber.hoodie.table.UserDefinedBulkInsertPartitioner; import com.uber.hoodie.table.WorkloadProfile; import com.uber.hoodie.table.WorkloadStat; -import org.apache.hadoop.fs.FileSystem; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.apache.spark.Partitioner; -import org.apache.spark.SparkConf; -import org.apache.spark.api.java.JavaRDD; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.PairFunction; -import org.apache.spark.storage.StorageLevel; -import scala.Option; -import scala.Tuple2; - import java.io.IOException; import java.io.Serializable; import java.nio.charset.StandardCharsets; @@ -76,6 +64,17 @@ import java.util.List; import java.util.Map; import java.util.Optional; import java.util.stream.Collectors; +import 
org.apache.hadoop.fs.FileSystem; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.Partitioner; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.PairFunction; +import org.apache.spark.storage.StorageLevel; +import scala.Option; +import scala.Tuple2; /** * Hoodie Write Client helps you build datasets on HDFS [insert()] and then perform efficient @@ -112,7 +111,7 @@ public class HoodieWriteClient implements Seriali */ public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig, boolean rollbackInFlight) { - this.fs = FSUtils.getFs(); + this.fs = FSUtils.getFs(clientConfig.getBasePath(), jsc.hadoopConfiguration()); this.jsc = jsc; this.config = clientConfig; this.index = HoodieIndex.createIndex(config, jsc); @@ -133,8 +132,9 @@ public class HoodieWriteClient implements Seriali */ public JavaRDD> filterExists(JavaRDD> hoodieRecords) { // Create a Hoodie table which encapsulated the commits and files visible - HoodieTable table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config); + HoodieTable table = HoodieTable.getHoodieTable( + new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), + config); JavaRDD> recordsWithLocation = index.tagLocation(hoodieRecords, table); return recordsWithLocation.filter(v1 -> !v1.isCurrentLocationKnown()); @@ -146,8 +146,9 @@ public class HoodieWriteClient implements Seriali public JavaRDD upsert(JavaRDD> records, final String commitTime) { writeContext = metrics.getCommitCtx(); // Create a Hoodie table which encapsulated the commits and files visible - HoodieTable table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config); + HoodieTable table = HoodieTable.getHoodieTable( + new HoodieTableMetaClient(jsc.hadoopConfiguration(), 
config.getBasePath(), true), + config); try { // De-dupe/merge if needed @@ -180,8 +181,9 @@ public class HoodieWriteClient implements Seriali public JavaRDD insert(JavaRDD> records, final String commitTime) { writeContext = metrics.getCommitCtx(); // Create a Hoodie table which encapsulated the commits and files visible - HoodieTable table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config); + HoodieTable table = HoodieTable.getHoodieTable( + new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), + config); try { // De-dupe/merge if needed JavaRDD> dedupedRecords = @@ -237,8 +239,9 @@ public class HoodieWriteClient implements Seriali Option bulkInsertPartitioner) { writeContext = metrics.getCommitCtx(); // Create a Hoodie table which encapsulated the commits and files visible - HoodieTable table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config); + HoodieTable table = HoodieTable.getHoodieTable( + new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), + config); try { // De-dupe/merge if needed @@ -306,8 +309,7 @@ public class HoodieWriteClient implements Seriali * file instead of using HoodieCommitMetadata */ private void saveWorkloadProfileMetadataToInflight(WorkloadProfile profile, - HoodieTable table, - String commitTime) throws HoodieCommitException { + HoodieTable table, String commitTime) throws HoodieCommitException { try { HoodieCommitMetadata metadata = new HoodieCommitMetadata(); profile.getPartitionPaths().stream().forEach(path -> { @@ -409,8 +411,9 @@ public class HoodieWriteClient implements Seriali logger.info("Commiting " + commitTime); // Create a Hoodie table which encapsulated the commits and files visible - HoodieTable table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config); + HoodieTable table = HoodieTable.getHoodieTable( + new 
HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), + config); HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); @@ -485,8 +488,9 @@ public class HoodieWriteClient implements Seriali * @return true if the savepoint was created successfully */ public boolean savepoint(String user, String comment) { - HoodieTable table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config); + HoodieTable table = HoodieTable.getHoodieTable( + new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), + config); if (table.getCompletedCommitTimeline().empty()) { throw new HoodieSavepointException("Could not savepoint. Commit timeline is empty"); } @@ -512,8 +516,9 @@ public class HoodieWriteClient implements Seriali * @return true if the savepoint was created successfully */ public boolean savepoint(String commitTime, String user, String comment) { - HoodieTable table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config); + HoodieTable table = HoodieTable.getHoodieTable( + new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), + config); Optional cleanInstant = table.getCompletedCleanTimeline().lastInstant(); HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, @@ -575,8 +580,9 @@ public class HoodieWriteClient implements Seriali * @return true if the savepoint was deleted successfully */ public void deleteSavepoint(String savepointTime) { - HoodieTable table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config); + HoodieTable table = HoodieTable.getHoodieTable( + new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), + config); HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); HoodieInstant savePoint = @@ -602,8 +608,9 @@ public class HoodieWriteClient implements Seriali * @return true if the 
savepoint was rollecback to successfully */ public boolean rollbackToSavepoint(String savepointTime) { - HoodieTable table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config); + HoodieTable table = HoodieTable.getHoodieTable( + new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), + config); HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); HoodieTimeline commitTimeline = table.getCommitsTimeline(); @@ -653,8 +660,9 @@ public class HoodieWriteClient implements Seriali String startRollbackTime = HoodieActiveTimeline.COMMIT_FORMATTER.format(new Date()); // Create a Hoodie table which encapsulated the commits and files visible - HoodieTable table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config); + HoodieTable table = HoodieTable.getHoodieTable( + new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), + config); HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); HoodieTimeline inflightTimeline = table.getInflightCommitTimeline(); HoodieTimeline commitTimeline = table.getCompletedCommitTimeline(); @@ -761,8 +769,9 @@ public class HoodieWriteClient implements Seriali final Timer.Context context = metrics.getCleanCtx(); // Create a Hoodie table which encapsulated the commits and files visible - HoodieTable table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config); + HoodieTable table = HoodieTable.getHoodieTable( + new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), + true), config); List cleanStats = table.clean(jsc); if (cleanStats.isEmpty()) { @@ -810,8 +819,9 @@ public class HoodieWriteClient implements Seriali public void startCommitWithTime(String commitTime) { logger.info("Generate a new commit time " + commitTime); - HoodieTable table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), 
true), config); + HoodieTable table = HoodieTable.getHoodieTable( + new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), + config); HoodieActiveTimeline activeTimeline = table.getActiveTimeline(); String commitActionType = table.getCommitActionType(); activeTimeline.createInflight( @@ -827,8 +837,9 @@ public class HoodieWriteClient implements Seriali */ private void compact(String compactionCommitTime) throws IOException { // Create a Hoodie table which encapsulated the commits and files visible - HoodieTable table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config); + HoodieTable table = HoodieTable.getHoodieTable( + new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), + config); Optional compactionMetadata = table.compact(jsc, compactionCommitTime); if (compactionMetadata.isPresent()) { logger.info("Compacted successfully on commit " + compactionCommitTime); @@ -876,8 +887,9 @@ public class HoodieWriteClient implements Seriali * Cleanup all inflight commits */ private void rollbackInflightCommits() { - HoodieTable table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config); + HoodieTable table = HoodieTable.getHoodieTable( + new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), + config); HoodieTimeline inflightTimeline = table.getCommitsTimeline().filterInflights(); List commits = inflightTimeline.getInstants().map(HoodieInstant::getTimestamp) .collect(Collectors.toList()); diff --git a/hoodie-client/src/main/java/com/uber/hoodie/func/LazyInsertIterable.java b/hoodie-client/src/main/java/com/uber/hoodie/func/LazyInsertIterable.java index cc038f21a..8b49897e7 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/func/LazyInsertIterable.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/func/LazyInsertIterable.java @@ -72,7 +72,8 @@ public class LazyInsertIterable extends 
HoodieIOHandle.cleanupTmpFilesFromCurrentCommit(hoodieConfig, commitTime, record.getPartitionPath(), - TaskContext.getPartitionId()); + TaskContext.getPartitionId(), + hoodieTable); partitionsCleaned.add(record.getPartitionPath()); } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/index/bloom/HoodieBloomIndex.java b/hoodie-client/src/main/java/com/uber/hoodie/index/bloom/HoodieBloomIndex.java index 5a0d69002..bf452bc8f 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/index/bloom/HoodieBloomIndex.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/index/bloom/HoodieBloomIndex.java @@ -155,7 +155,8 @@ public class HoodieBloomIndex extends HoodieIndex // Step 3: Obtain a RDD, for each incoming record, that already exists, with the file id, that contains it. int parallelism = autoComputeParallelism(recordsPerPartition, partitionToFileInfo, partitionRecordKeyPairRDD); - return findMatchingFilesForRecordKeys(partitionToFileInfo, partitionRecordKeyPairRDD, + return findMatchingFilesForRecordKeys(hoodieTable, partitionToFileInfo, + partitionRecordKeyPairRDD, parallelism); } @@ -257,7 +258,8 @@ public class HoodieBloomIndex extends HoodieIndex .mapToPair(ft -> { try { String[] minMaxKeys = ParquetUtils - .readMinMaxRecordKeys(ft._2().getFileStatus().getPath()); + .readMinMaxRecordKeys(hoodieTable.getHadoopConf(), + ft._2().getFileStatus().getPath()); return new Tuple2<>(ft._1(), new BloomIndexFileInfo(ft._2().getFileName(), minMaxKeys[0], minMaxKeys[1])); } catch (MetadataNotFoundException me) { @@ -358,7 +360,7 @@ public class HoodieBloomIndex extends HoodieIndex * Make sure the parallelism is atleast the groupby parallelism for tagging location */ @VisibleForTesting - JavaPairRDD findMatchingFilesForRecordKeys( + JavaPairRDD findMatchingFilesForRecordKeys(HoodieTable hoodieTable, final Map> partitionToFileIndexInfo, JavaPairRDD partitionRecordKeyPairRDD, int totalSubpartitions) { @@ -372,7 +374,8 @@ public class HoodieBloomIndex extends HoodieIndex 
.sortByKey(true, joinParallelism); return fileSortedTripletRDD - .mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(config.getBasePath()), true) + .mapPartitionsWithIndex( + new HoodieBloomIndexCheckFunction(hoodieTable, config.getBasePath()), true) .flatMap(indexLookupResults -> indexLookupResults.iterator()) .filter(lookupResult -> lookupResult.getMatchingRecordKeys().size() > 0) .flatMapToPair(lookupResult -> { diff --git a/hoodie-client/src/main/java/com/uber/hoodie/index/bloom/HoodieBloomIndexCheckFunction.java b/hoodie-client/src/main/java/com/uber/hoodie/index/bloom/HoodieBloomIndexCheckFunction.java index 0d562ae86..cf5a12007 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/index/bloom/HoodieBloomIndexCheckFunction.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/index/bloom/HoodieBloomIndexCheckFunction.java @@ -24,10 +24,12 @@ import com.uber.hoodie.common.util.ParquetUtils; import com.uber.hoodie.exception.HoodieException; import com.uber.hoodie.exception.HoodieIndexException; import com.uber.hoodie.func.LazyIterableIterator; +import com.uber.hoodie.table.HoodieTable; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.Set; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -45,20 +47,24 @@ public class HoodieBloomIndexCheckFunction implements private final String basePath; - public HoodieBloomIndexCheckFunction(String basePath) { + private final HoodieTable table; + + public HoodieBloomIndexCheckFunction(HoodieTable table, String basePath) { + this.table = table; this.basePath = basePath; } /** * Given a list of row keys and one file, return only row keys existing in that file. 
*/ - public static List checkCandidatesAgainstFile(List candidateRecordKeys, + public static List checkCandidatesAgainstFile(Configuration configuration, + List candidateRecordKeys, Path filePath) throws HoodieIndexException { List foundRecordKeys = new ArrayList<>(); try { // Load all rowKeys from the file, to double-confirm if (!candidateRecordKeys.isEmpty()) { - Set fileRowKeys = ParquetUtils.readRowKeysFromParquet(filePath); + Set fileRowKeys = ParquetUtils.readRowKeysFromParquet(configuration, filePath); logger.info("Loading " + fileRowKeys.size() + " row keys from " + filePath); if (logger.isDebugEnabled()) { logger.debug("Keys from " + filePath + " => " + fileRowKeys); @@ -107,7 +113,8 @@ public class HoodieBloomIndexCheckFunction implements private void initState(String fileName, String partitionPath) throws HoodieIndexException { try { Path filePath = new Path(basePath + "/" + partitionPath + "/" + fileName); - bloomFilter = ParquetUtils.readBloomFilterFromParquetMetadata(filePath); + bloomFilter = ParquetUtils + .readBloomFilterFromParquetMetadata(table.getHadoopConf(), filePath); candidateRecordKeys = new ArrayList<>(); currentFile = fileName; currentParitionPath = partitionPath; @@ -154,7 +161,7 @@ public class HoodieBloomIndexCheckFunction implements .debug("#The candidate row keys for " + filePath + " => " + candidateRecordKeys); } ret.add(new IndexLookupResult(currentFile, - checkCandidatesAgainstFile(candidateRecordKeys, filePath))); + checkCandidatesAgainstFile(table.getHadoopConf(), candidateRecordKeys, filePath))); initState(fileName, partitionPath); if (bloomFilter.mightContain(recordKey)) { @@ -177,7 +184,7 @@ public class HoodieBloomIndexCheckFunction implements logger.debug("#The candidate row keys for " + filePath + " => " + candidateRecordKeys); } ret.add(new IndexLookupResult(currentFile, - checkCandidatesAgainstFile(candidateRecordKeys, filePath))); + checkCandidatesAgainstFile(table.getHadoopConf(), candidateRecordKeys, filePath))); } } 
catch (Throwable e) { diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCleanHelper.java b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCleanHelper.java index 086b87f89..85028ba29 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCleanHelper.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCleanHelper.java @@ -33,7 +33,6 @@ import java.util.Iterator; import java.util.List; import java.util.Optional; import java.util.stream.Collectors; -import org.apache.hadoop.fs.FileSystem; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -51,14 +50,12 @@ public class HoodieCleanHelper> { private final HoodieTimeline commitTimeline; private HoodieTable hoodieTable; private HoodieWriteConfig config; - private FileSystem fs; public HoodieCleanHelper(HoodieTable hoodieTable, HoodieWriteConfig config) { this.hoodieTable = hoodieTable; this.fileSystemView = hoodieTable.getCompletedFileSystemView(); this.commitTimeline = hoodieTable.getCompletedCommitTimeline(); this.config = config; - this.fs = hoodieTable.getFs(); } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCommitArchiveLog.java b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCommitArchiveLog.java index 3cb697c7a..7704517c6 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCommitArchiveLog.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieCommitArchiveLog.java @@ -126,7 +126,8 @@ public class HoodieCommitArchiveLog { int minCommitsToKeep = config.getMinCommitsToKeep(); HoodieTable table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config); + .getHoodieTable(new HoodieTableMetaClient(fs.getConf(), config.getBasePath(), true), + config); // GroupBy each action and limit each action timeline to maxCommitsToKeep HoodieTimeline cleanAndRollbackTimeline = table.getActiveTimeline() @@ -165,7 +166,7 @@ public class HoodieCommitArchiveLog { private boolean 
deleteArchivedInstants(List archivedInstants) { log.info("Deleting instants " + archivedInstants); HoodieTableMetaClient metaClient = - new HoodieTableMetaClient(fs, config.getBasePath(), true); + new HoodieTableMetaClient(fs.getConf(), config.getBasePath(), true); boolean success = true; for (HoodieInstant archivedInstant : archivedInstants) { @@ -188,7 +189,7 @@ public class HoodieCommitArchiveLog { try { HoodieTableMetaClient metaClient = - new HoodieTableMetaClient(fs, config.getBasePath(), true); + new HoodieTableMetaClient(fs.getConf(), config.getBasePath(), true); HoodieTimeline commitTimeline = metaClient.getActiveTimeline().getAllCommitsTimeline().filterCompletedInstants(); diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieIOHandle.java b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieIOHandle.java index 53ec545fc..332c9a2d3 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieIOHandle.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/HoodieIOHandle.java @@ -47,7 +47,7 @@ public abstract class HoodieIOHandle { HoodieTable hoodieTable) { this.commitTime = commitTime; this.config = config; - this.fs = FSUtils.getFs(); + this.fs = hoodieTable.getMetaClient().getFs(); this.hoodieTable = hoodieTable; this.hoodieTimeline = hoodieTable.getCompletedCommitTimeline(); this.fileSystemView = hoodieTable.getROFileSystemView(); @@ -73,8 +73,9 @@ public abstract class HoodieIOHandle { public static void cleanupTmpFilesFromCurrentCommit(HoodieWriteConfig config, String commitTime, String partitionPath, - int taskPartitionId) { - FileSystem fs = FSUtils.getFs(); + int taskPartitionId, + HoodieTable hoodieTable) { + FileSystem fs = hoodieTable.getMetaClient().getFs(); try { FileStatus[] prevFailedFiles = fs.globStatus(new Path(String .format("%s/%s/%s", config.getBasePath(), partitionPath, diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/compact/HoodieRealtimeTableCompactor.java 
b/hoodie-client/src/main/java/com/uber/hoodie/io/compact/HoodieRealtimeTableCompactor.java index 92107c0cc..fda6b5a26 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/compact/HoodieRealtimeTableCompactor.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/compact/HoodieRealtimeTableCompactor.java @@ -16,6 +16,8 @@ package com.uber.hoodie.io.compact; +import static java.util.stream.Collectors.toList; + import com.google.common.base.Preconditions; import com.google.common.collect.Lists; import com.google.common.collect.Sets; @@ -34,13 +36,6 @@ import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.exception.HoodieCompactionException; import com.uber.hoodie.table.HoodieCopyOnWriteTable; import com.uber.hoodie.table.HoodieTable; -import org.apache.avro.Schema; -import org.apache.hadoop.fs.FileSystem; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.FlatMapFunction; - import java.io.IOException; import java.nio.charset.StandardCharsets; import java.util.Collection; @@ -49,8 +44,12 @@ import java.util.List; import java.util.Optional; import java.util.stream.Collectors; import java.util.stream.StreamSupport; - -import static java.util.stream.Collectors.toList; +import org.apache.avro.Schema; +import org.apache.hadoop.fs.FileSystem; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.FlatMapFunction; /** * HoodieRealtimeTableCompactor compacts a hoodie table with merge on read storage. 
Computes all @@ -73,7 +72,6 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor { .getTableType().name()); // TODO - rollback any compactions in flight - HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); log.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommitTime); List partitionPaths = @@ -102,15 +100,9 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor { log.info("After filtering, Compacting " + operations + " files"); List updateStatusMap = jsc.parallelize(operations, operations.size()) - .map(s -> executeCompaction(metaClient, config, s, compactionCommitTime)) - .flatMap(new FlatMapFunction, HoodieWriteStat>() { - @Override - public Iterator call( - List hoodieWriteStats) - throws Exception { - return hoodieWriteStats.iterator(); - } - }).collect(); + .map(s -> executeCompaction(hoodieTable, config, s, compactionCommitTime)) + .flatMap(writeStatList -> writeStatList.iterator()) + .collect(); HoodieCommitMetadata metadata = new HoodieCommitMetadata(true); for (HoodieWriteStat stat : updateStatusMap) { @@ -134,10 +126,11 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor { return true; } - private List executeCompaction(HoodieTableMetaClient metaClient, - HoodieWriteConfig config, CompactionOperation operation, String commitTime) + + private List executeCompaction(HoodieTable hoodieTable, + HoodieWriteConfig config, CompactionOperation operation, String commitTime) throws IOException { - FileSystem fs = FSUtils.getFs(); + FileSystem fs = hoodieTable.getMetaClient().getFs(); Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema())); @@ -147,7 +140,7 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor { // Reads the entire avro file. Always only specific blocks should be read from the avro file (failure recover). 
// Load all the delta commits since the last compaction commit and get all the blocks to be loaded and load it using CompositeAvroLogReader // Since a DeltaCommit is not defined yet, reading all the records. revisit this soon. - + HoodieTableMetaClient metaClient = hoodieTable.getMetaClient(); String maxInstantTime = metaClient.getActiveTimeline() .getTimelineOfActions( Sets.newHashSet(HoodieTimeline.COMMIT_ACTION, @@ -162,8 +155,7 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor { } // Compacting is very similar to applying updates to existing file - HoodieCopyOnWriteTable table = - new HoodieCopyOnWriteTable(config, metaClient); + HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metaClient); Iterator> result = table .handleUpdate(commitTime, operation.getFileId(), scanner.iterator()); Iterable> resultIterable = () -> result; diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieParquetWriter.java b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieParquetWriter.java index 2e207fd7f..48bdfda59 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieParquetWriter.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieParquetWriter.java @@ -19,6 +19,7 @@ package com.uber.hoodie.io.storage; import com.uber.hoodie.avro.HoodieAvroWriteSupport; import com.uber.hoodie.common.model.HoodieRecord; import com.uber.hoodie.common.model.HoodieRecordPayload; +import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.common.util.HoodieAvroUtils; import java.io.IOException; import java.util.concurrent.atomic.AtomicLong; @@ -26,7 +27,6 @@ import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.parquet.hadoop.ParquetFileWriter; import 
org.apache.parquet.hadoop.ParquetWriter; @@ -51,9 +51,9 @@ public class HoodieParquetWriter HoodieStorageWriter newParquetStorageWriter( - String commitTime, Path path, HoodieWriteConfig config, Schema schema) throws IOException { + String commitTime, Path path, HoodieWriteConfig config, Schema schema, + HoodieTable hoodieTable) throws IOException { BloomFilter filter = new BloomFilter(config.getBloomFilterNumEntries(), config.getBloomFilterFPP()); HoodieAvroWriteSupport writeSupport = @@ -50,7 +50,7 @@ public class HoodieStorageWriterFactory { HoodieParquetConfig parquetConfig = new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP, config.getParquetBlockSize(), config.getParquetPageSize(), - config.getParquetMaxFileSize(), FSUtils.getFs().getConf()); + config.getParquetMaxFileSize(), hoodieTable.getHadoopConf()); return new HoodieParquetWriter<>(commitTime, path, parquetConfig, schema); } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieWrapperFileSystem.java b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieWrapperFileSystem.java index 87a628447..a9d2a2298 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieWrapperFileSystem.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/io/storage/HoodieWrapperFileSystem.java @@ -16,7 +16,8 @@ package com.uber.hoodie.io.storage; -import java.io.FileNotFoundException; +import com.uber.hoodie.common.util.FSUtils; +import com.uber.hoodie.exception.HoodieIOException; import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; @@ -33,7 +34,6 @@ import org.apache.hadoop.fs.ContentSummary; import org.apache.hadoop.fs.CreateFlag; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileAlreadyExistsException; import org.apache.hadoop.fs.FileChecksum; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; @@ -41,17 +41,14 @@ import 
org.apache.hadoop.fs.FsServerDefaults; import org.apache.hadoop.fs.FsStatus; import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.Options; -import org.apache.hadoop.fs.ParentNotDirectoryException; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.PathFilter; import org.apache.hadoop.fs.RemoteIterator; -import org.apache.hadoop.fs.UnsupportedFileSystemException; import org.apache.hadoop.fs.XAttrSetFlag; import org.apache.hadoop.fs.permission.AclEntry; import org.apache.hadoop.fs.permission.AclStatus; import org.apache.hadoop.fs.permission.FsAction; import org.apache.hadoop.fs.permission.FsPermission; -import org.apache.hadoop.security.AccessControlException; import org.apache.hadoop.security.Credentials; import org.apache.hadoop.security.token.Token; import org.apache.hadoop.util.Progressable; @@ -70,6 +67,8 @@ public class HoodieWrapperFileSystem extends FileSystem { SUPPORT_SCHEMES.add("file"); SUPPORT_SCHEMES.add("hdfs"); SUPPORT_SCHEMES.add("s3"); + SUPPORT_SCHEMES.add("s3a"); + // Hoodie currently relies on underlying object store being fully // consistent so only regional buckets should be used. 
@@ -85,7 +84,12 @@ public class HoodieWrapperFileSystem extends FileSystem { @Override public void initialize(URI uri, Configuration conf) throws IOException { // Get the default filesystem to decorate - fileSystem = FileSystem.get(conf); + Path path = new Path(uri); + // Remove 'hoodie-' prefix from path + if (path.toString().startsWith(HOODIE_SCHEME_PREFIX)) { + path = new Path(path.toString().replace(HOODIE_SCHEME_PREFIX, "")); + } + this.fileSystem = FSUtils.getFs(path.toString(), conf); // Do not need to explicitly initialize the default filesystem, its done already in the above FileSystem.get // fileSystem.initialize(FileSystem.getDefaultUri(conf), conf); // fileSystem.setConf(conf); @@ -219,7 +223,7 @@ public class HoodieWrapperFileSystem extends FileSystem { } @Override - public FileStatus[] listStatus(Path f) throws FileNotFoundException, IOException { + public FileStatus[] listStatus(Path f) throws IOException { return fileSystem.listStatus(convertToDefaultPath(f)); } @@ -415,19 +419,19 @@ public class HoodieWrapperFileSystem extends FileSystem { @Override public FileStatus[] listStatus(Path f, PathFilter filter) - throws FileNotFoundException, IOException { + throws IOException { return fileSystem.listStatus(convertToDefaultPath(f), filter); } @Override public FileStatus[] listStatus(Path[] files) - throws FileNotFoundException, IOException { + throws IOException { return fileSystem.listStatus(convertDefaults(files)); } @Override public FileStatus[] listStatus(Path[] files, PathFilter filter) - throws FileNotFoundException, IOException { + throws IOException { return fileSystem.listStatus(convertDefaults(files), filter); } @@ -444,13 +448,13 @@ public class HoodieWrapperFileSystem extends FileSystem { @Override public RemoteIterator listLocatedStatus(Path f) - throws FileNotFoundException, IOException { + throws IOException { return fileSystem.listLocatedStatus(convertToDefaultPath(f)); } @Override public RemoteIterator listFiles(Path f, boolean 
recursive) - throws FileNotFoundException, IOException { + throws IOException { return fileSystem.listFiles(convertToDefaultPath(f), recursive); } @@ -571,21 +575,21 @@ public class HoodieWrapperFileSystem extends FileSystem { @Override public void access(Path path, FsAction mode) - throws AccessControlException, FileNotFoundException, IOException { + throws IOException { fileSystem.access(convertToDefaultPath(path), mode); } @Override public void createSymlink(Path target, Path link, boolean createParent) - throws AccessControlException, FileAlreadyExistsException, FileNotFoundException, - ParentNotDirectoryException, UnsupportedFileSystemException, IOException { + throws + IOException { fileSystem .createSymlink(convertToDefaultPath(target), convertToDefaultPath(link), createParent); } @Override public FileStatus getFileLinkStatus(Path f) - throws AccessControlException, FileNotFoundException, UnsupportedFileSystemException, + throws IOException { return fileSystem.getFileLinkStatus(convertToDefaultPath(f)); } @@ -759,8 +763,12 @@ public class HoodieWrapperFileSystem extends FileSystem { } public static Path convertToHoodiePath(Path file, Configuration conf) { - String scheme = FileSystem.getDefaultUri(conf).getScheme(); - return convertPathWithScheme(file, getHoodieScheme(scheme)); + try { + String scheme = FSUtils.getFs(file.toString(), conf).getScheme(); + return convertPathWithScheme(file, getHoodieScheme(scheme)); + } catch (HoodieIOException e) { + throw e; + } } private Path convertToDefaultPath(Path oldPath) { diff --git a/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieCopyOnWriteTable.java b/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieCopyOnWriteTable.java index a509411d7..3e4b1dc86 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieCopyOnWriteTable.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieCopyOnWriteTable.java @@ -52,7 +52,6 @@ import java.util.Set; import java.util.stream.Collectors; 
import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -413,10 +412,10 @@ public class HoodieCopyOnWriteTable extends Hoodi throw new HoodieUpsertException("Error in finding the old file path at commit " + commitTime + " at fileLoc: " + fileLoc); } else { - Configuration conf = FSUtils.getFs().getConf(); - AvroReadSupport.setAvroReadSchema(conf, upsertHandle.getSchema()); + AvroReadSupport.setAvroReadSchema(getHadoopConf(), upsertHandle.getSchema()); ParquetReader reader = - AvroParquetReader.builder(upsertHandle.getOldFilePath()).withConf(conf).build(); + AvroParquetReader.builder(upsertHandle.getOldFilePath()).withConf(getHadoopConf()) + .build(); try { IndexedRecord record; while ((record = reader.read()) != null) { @@ -500,8 +499,9 @@ public class HoodieCopyOnWriteTable extends Hoodi @Override public List clean(JavaSparkContext jsc) { try { + FileSystem fs = getMetaClient().getFs(); List partitionsToClean = - FSUtils.getAllPartitionPaths(getFs(), getMetaClient().getBasePath(), + FSUtils.getAllPartitionPaths(fs, getMetaClient().getBasePath(), config.shouldAssumeDatePartitioning()); logger.info("Partitions to clean up : " + partitionsToClean + ", with policy " + config .getCleanerPolicy()); @@ -522,7 +522,7 @@ public class HoodieCopyOnWriteTable extends Hoodi protected Map deleteCleanedFiles(String partitionPath, List commits) throws IOException { logger.info("Cleaning path " + partitionPath); - FileSystem fs = FSUtils.getFs(); + FileSystem fs = getMetaClient().getFs(); FileStatus[] toBeDeleted = fs.listStatus(new Path(config.getBasePath(), partitionPath), path -> { if (!path.toString().contains(".parquet")) { @@ -558,7 +558,7 @@ public class HoodieCopyOnWriteTable extends Hoodi // delete all the data files for all these commits logger.info("Clean out all parquet 
files generated for commits: " + commits); List stats = jsc.parallelize( - FSUtils.getAllPartitionPaths(FSUtils.getFs(), this.getMetaClient().getBasePath(), + FSUtils.getAllPartitionPaths(metaClient.getFs(), getMetaClient().getBasePath(), config.shouldAssumeDatePartitioning())) .map((Function) partitionPath -> { // Scan all partitions files with this commit time @@ -618,7 +618,7 @@ public class HoodieCopyOnWriteTable extends Hoodi .parallelize(partitionsToClean, cleanerParallelism) .flatMapToPair(getFilesToDeleteFunc(this, config)) .repartition(cleanerParallelism) // repartition to remove skews - .mapPartitionsToPair(deleteFilesFunc(this, config)) + .mapPartitionsToPair(deleteFilesFunc(this)) .reduceByKey( // merge partition level clean stats below (Function2) (e1, e2) -> e1 @@ -646,17 +646,17 @@ public class HoodieCopyOnWriteTable extends Hoodi }).collect(Collectors.toList()); } - private PairFlatMapFunction>, String, PartitionCleanStat> deleteFilesFunc( - HoodieTable table, HoodieWriteConfig config) { + private static PairFlatMapFunction>, String, PartitionCleanStat> deleteFilesFunc( + HoodieTable table) { return (PairFlatMapFunction>, String, PartitionCleanStat>) iter -> { - HoodieCleanHelper cleaner = new HoodieCleanHelper(table, config); Map partitionCleanStatMap = new HashMap<>(); + FileSystem fs = table.getMetaClient().getFs(); while (iter.hasNext()) { Tuple2 partitionDelFileTuple = iter.next(); String partitionPath = partitionDelFileTuple._1(); String deletePathStr = partitionDelFileTuple._2(); - Boolean deletedFileResult = deleteFileAndGetResult(deletePathStr); + Boolean deletedFileResult = deleteFileAndGetResult(fs, deletePathStr); if (!partitionCleanStatMap.containsKey(partitionPath)) { partitionCleanStatMap.put(partitionPath, new PartitionCleanStat(partitionPath)); @@ -682,10 +682,11 @@ public class HoodieCopyOnWriteTable extends Hoodi }; } - private Boolean deleteFileAndGetResult(String deletePathStr) throws IOException { + private static Boolean 
deleteFileAndGetResult(FileSystem fs, String deletePathStr) + throws IOException { Path deletePath = new Path(deletePathStr); logger.debug("Working on delete path :" + deletePath); - boolean deleteResult = getFs().delete(deletePath, false); + boolean deleteResult = fs.delete(deletePath, false); if (deleteResult) { logger.debug("Cleaned file at path :" + deletePath); } diff --git a/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieMergeOnReadTable.java b/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieMergeOnReadTable.java index 19461b025..839aa1840 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieMergeOnReadTable.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieMergeOnReadTable.java @@ -32,19 +32,11 @@ import com.uber.hoodie.common.table.log.block.HoodieCommandBlock; import com.uber.hoodie.common.table.log.block.HoodieLogBlock; import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline; import com.uber.hoodie.common.table.timeline.HoodieInstant; -import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.exception.HoodieCompactionException; import com.uber.hoodie.exception.HoodieRollbackException; import com.uber.hoodie.io.HoodieAppendHandle; import com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.Path; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.Function; - import java.io.IOException; import java.io.UncheckedIOException; import java.util.Arrays; @@ -55,6 +47,12 @@ import java.util.List; import java.util.Map; import java.util.Optional; import java.util.stream.Collectors; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.Path; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import 
org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.api.java.function.Function; /** @@ -74,8 +72,7 @@ public class HoodieMergeOnReadTable extends private static Logger logger = LogManager.getLogger(HoodieMergeOnReadTable.class); - public HoodieMergeOnReadTable(HoodieWriteConfig config, - HoodieTableMetaClient metaClient) { + public HoodieMergeOnReadTable(HoodieWriteConfig config, HoodieTableMetaClient metaClient) { super(config, metaClient); } @@ -195,7 +192,7 @@ public class HoodieMergeOnReadTable extends .onParentPath( new Path(this.getMetaClient().getBasePath(), partitionPath)) .withFileId(wStat.getFileId()).overBaseCommit(wStat.getPrevCommit()) - .withFs(FSUtils.getFs()) + .withFs(getMetaClient().getFs()) .withFileExtension(HoodieLogFile.DELTA_EXTENSION).build(); Long numRollbackBlocks = 0L; // generate metadata @@ -215,7 +212,8 @@ public class HoodieMergeOnReadTable extends numRollbackBlocks++; } filesToNumBlocksRollback - .put(FSUtils.getFs().getFileStatus(writer.getLogFile().getPath()), + .put(getMetaClient().getFs() + .getFileStatus(writer.getLogFile().getPath()), numRollbackBlocks); } catch (IOException | InterruptedException io) { throw new HoodieRollbackException( diff --git a/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieTable.java b/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieTable.java index 76ab92d99..3140e34ee 100644 --- a/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieTable.java +++ b/hoodie-client/src/main/java/com/uber/hoodie/table/HoodieTable.java @@ -34,12 +34,6 @@ import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.exception.HoodieCommitException; import com.uber.hoodie.exception.HoodieException; import com.uber.hoodie.exception.HoodieSavepointException; -import org.apache.hadoop.fs.FileSystem; -import org.apache.log4j.LogManager; -import org.apache.log4j.Logger; -import org.apache.spark.Partitioner; -import org.apache.spark.api.java.JavaSparkContext; - import 
java.io.IOException; import java.io.Serializable; import java.util.Iterator; @@ -47,6 +41,11 @@ import java.util.List; import java.util.Optional; import java.util.stream.Collectors; import java.util.stream.Stream; +import org.apache.hadoop.conf.Configuration; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.Partitioner; +import org.apache.spark.api.java.JavaSparkContext; /** * Abstract implementation of a HoodieTable @@ -55,6 +54,7 @@ public abstract class HoodieTable implements Seri protected final HoodieWriteConfig config; protected final HoodieTableMetaClient metaClient; + private static Logger logger = LogManager.getLogger(HoodieTable.class); protected HoodieTable(HoodieWriteConfig config, HoodieTableMetaClient metaClient) { @@ -87,8 +87,8 @@ public abstract class HoodieTable implements Seri return metaClient; } - public FileSystem getFs() { - return metaClient.getFs(); + public Configuration getHadoopConf() { + return metaClient.getHadoopConf(); } /** diff --git a/hoodie-client/src/test/java/HoodieClientExample.java b/hoodie-client/src/test/java/HoodieClientExample.java index ef31fea1c..f1fc056e1 100644 --- a/hoodie-client/src/test/java/HoodieClientExample.java +++ b/hoodie-client/src/test/java/HoodieClientExample.java @@ -81,7 +81,7 @@ public class HoodieClientExample { // initialize the table, if not done already Path path = new Path(tablePath); - FileSystem fs = FSUtils.getFs(); + FileSystem fs = FSUtils.getFs(tablePath, jsc.hadoopConfiguration()); if (!fs.exists(path)) { HoodieTableMetaClient .initTableType(fs, tablePath, HoodieTableType.valueOf(tableType), tableName, diff --git a/hoodie-client/src/test/java/com/uber/hoodie/TestHoodieClientOnCopyOnWriteStorage.java b/hoodie-client/src/test/java/com/uber/hoodie/TestHoodieClientOnCopyOnWriteStorage.java index f6995762d..e3adf8d9c 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/TestHoodieClientOnCopyOnWriteStorage.java +++ 
b/hoodie-client/src/test/java/com/uber/hoodie/TestHoodieClientOnCopyOnWriteStorage.java @@ -84,6 +84,7 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { private transient JavaSparkContext jsc = null; private transient SQLContext sqlContext; + private transient FileSystem fs; private String basePath = null; private transient HoodieTestDataGenerator dataGen = null; private String[] partitionPaths = {"2016/01/01", "2016/02/02", "2016/06/02"}; @@ -100,6 +101,7 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { TemporaryFolder folder = new TemporaryFolder(); folder.create(); basePath = folder.getRoot().getAbsolutePath(); + fs = FSUtils.getFs(basePath.toString(), jsc.hadoopConfiguration()); HoodieTestUtils.init(basePath); dataGen = new HoodieTestDataGenerator(); } @@ -212,7 +214,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { HoodieWriteConfig cfg = getConfig(); HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); HoodieIndex index = HoodieIndex.createIndex(cfg, jsc); - FileSystem fs = FSUtils.getFs(); /** * Write 1 (only inserts) @@ -230,8 +231,9 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { assertPartitionMetadata(HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, fs); // verify that there is a commit - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); - HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()) + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), + basePath); + HoodieTimeline timeline = new HoodieActiveTimeline(metaClient) .getCommitTimeline(); assertEquals("Expecting a single commit.", 1, @@ -242,7 +244,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { records.size(), HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count()); // Should have 100 records in table (check using 
Index), all in locations marked at commit - HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig()); + HoodieTable table = HoodieTable + .getHoodieTable(metaClient, getConfig()); List taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table) .collect(); @@ -268,13 +271,13 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { assertNoWriteErrors(statuses); // verify there are now 2 commits - timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline(); + timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline(); assertEquals("Expecting two commits.", timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), 2); assertEquals("Latest commit should be 004", timeline.lastInstant().get().getTimestamp(), newCommitTime); - metaClient = new HoodieTableMetaClient(fs, basePath); + metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); table = HoodieTable.getHoodieTable(metaClient, getConfig()); // Index should be able to locate all updates in correct locations. 
@@ -305,7 +308,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { HoodieWriteConfig cfg = getConfig(); HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); HoodieIndex index = HoodieIndex.createIndex(cfg, jsc); - FileSystem fs = FSUtils.getFs(); /** * Write 1 (inserts and deletes) @@ -326,8 +328,9 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { assertNoWriteErrors(statuses); // verify that there is a commit - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); - HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()) + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), + basePath); + HoodieTimeline timeline = new HoodieActiveTimeline(metaClient) .getCommitTimeline(); assertEquals("Expecting a single commit.", 1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants()); @@ -336,7 +339,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { assertEquals("Must contain 200 records", fewRecordsForInsert.size(), HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count()); // Should have 100 records in table (check using Index), all in locations marked at commit - HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig()); + HoodieTable table = HoodieTable + .getHoodieTable(metaClient, getConfig()); List taggedRecords = index .tagLocation(jsc.parallelize(fewRecordsForInsert, 1), table).collect(); @@ -359,7 +363,7 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { assertNoWriteErrors(statuses); // verify there are now 2 commits - timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline(); + timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline(); assertEquals("Expecting two commits.", timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), 2); assertEquals("Latest 
commit should be 004", timeline.lastInstant().get().getTimestamp(), @@ -393,7 +397,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(1) .build()).build(); HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); - FileSystem fs = FSUtils.getFs(); HoodieTestDataGenerator .writePartitionMetadata(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, basePath); @@ -451,8 +454,10 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { List partitionPaths = FSUtils .getAllPartitionPaths(fs, cfg.getBasePath(), getConfig().shouldAssumeDatePartitioning()); - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); - HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig()); + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), + basePath); + HoodieTable table = HoodieTable + .getHoodieTable(metaClient, getConfig()); final TableFileSystemView.ReadOptimizedView view = table.getROFileSystemView(); List dataFiles = partitionPaths.stream().flatMap(s -> { return view.getAllDataFiles(s).filter(f -> f.getCommitTime().equals("002")); @@ -472,7 +477,7 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { // Verify there are no errors assertNoWriteErrors(statuses); - metaClient = new HoodieTableMetaClient(fs, basePath); + metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); table = HoodieTable.getHoodieTable(metaClient, getConfig()); final TableFileSystemView.ReadOptimizedView view1 = table.getROFileSystemView(); dataFiles = partitionPaths.stream().flatMap(s -> { @@ -490,7 +495,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(1) .build()).build(); HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); - FileSystem fs = 
FSUtils.getFs(); HoodieTestDataGenerator .writePartitionMetadata(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, basePath); @@ -531,7 +535,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { assertNoWriteErrors(statuses); List partitionPaths = FSUtils .getAllPartitionPaths(fs, cfg.getBasePath(), getConfig().shouldAssumeDatePartitioning()); - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), + basePath); HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig()); final TableFileSystemView.ReadOptimizedView view1 = table.getROFileSystemView(); @@ -551,7 +556,7 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { // Verify there are no errors assertNoWriteErrors(statuses); - metaClient = new HoodieTableMetaClient(fs, basePath); + metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); table = HoodieTable.getHoodieTable(metaClient, getConfig()); final TableFileSystemView.ReadOptimizedView view2 = table.getROFileSystemView(); @@ -573,7 +578,7 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { table.getCompletedSavepointTimeline().getInstants().findFirst().get(); client.rollbackToSavepoint(savepoint.getTimestamp()); - metaClient = new HoodieTableMetaClient(fs, basePath); + metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); table = HoodieTable.getHoodieTable(metaClient, getConfig()); final TableFileSystemView.ReadOptimizedView view3 = table.getROFileSystemView(); dataFiles = partitionPaths.stream().flatMap(s -> { @@ -602,7 +607,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { .retainFileVersions(maxVersions).build()).build(); HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); HoodieIndex index = HoodieIndex.createIndex(cfg, jsc); - FileSystem fs = FSUtils.getFs(); /** * do 
a big insert @@ -618,8 +622,9 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { assertNoWriteErrors(statuses); // verify that there is a commit - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); - HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()) + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), + basePath); + HoodieTimeline timeline = new HoodieActiveTimeline(metaClient) .getCommitTimeline(); assertEquals("Expecting a single commit.", 1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants()); @@ -647,7 +652,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { // Verify there are no errors assertNoWriteErrors(statuses); - HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), + basePath); table = HoodieTable.getHoodieTable(metadata, getConfig()); timeline = table.getCommitsTimeline(); @@ -702,7 +708,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { .retainCommits(maxCommits).build()).build(); HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); HoodieIndex index = HoodieIndex.createIndex(cfg, jsc); - FileSystem fs = FSUtils.getFs(); /** * do a big insert @@ -718,13 +723,15 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { assertNoWriteErrors(statuses); // verify that there is a commit - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); - HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()) + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), + basePath); + HoodieTimeline timeline = new HoodieActiveTimeline(metaClient) .getCommitTimeline(); assertEquals("Expecting a single commit.", 1, timeline.findInstantsAfter("000", 
Integer.MAX_VALUE).countInstants()); // Should have 100 records in table (check using Index), all in locations marked at commit - HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig()); + HoodieTable table = HoodieTable + .getHoodieTable(metaClient, getConfig()); assertFalse(table.getCompletedCommitTimeline().empty()); String commitTime = @@ -747,7 +754,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { // Verify there are no errors assertNoWriteErrors(statuses); - HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), + basePath); HoodieTable table1 = HoodieTable.getHoodieTable(metadata, cfg); HoodieTimeline activeTimeline = table1.getCompletedCommitTimeline(); Optional @@ -788,8 +796,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { String commitTime2 = "20160502020601"; String commitTime3 = "20160506030611"; new File(basePath + "/.hoodie").mkdirs(); - HoodieTestDataGenerator.writePartitionMetadata(FSUtils.getFs(), - new String[]{"2016/05/01", "2016/05/02", "2016/05/06"}, + HoodieTestDataGenerator + .writePartitionMetadata(fs, new String[]{"2016/05/01", "2016/05/02", "2016/05/06"}, basePath); // Only first two have commit files @@ -878,9 +886,9 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { String commitTime2 = "20160502020601"; String commitTime3 = "20160506030611"; new File(basePath + "/.hoodie").mkdirs(); - HoodieTestDataGenerator.writePartitionMetadata(FSUtils.getFs(), - new String[]{"2016/05/01", "2016/05/02", "2016/05/06"}, - basePath); + HoodieTestDataGenerator + .writePartitionMetadata(fs, new String[]{"2016/05/01", "2016/05/02", "2016/05/06"}, + basePath); // One good commit HoodieTestUtils.createCommitFiles(basePath, commitTime1); @@ -956,8 +964,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { @Test public 
void testSmallInsertHandlingForUpserts() throws Exception { - - FileSystem fs = FSUtils.getFs(); final String TEST_PARTITION_PATH = "2016/09/26"; final int INSERT_SPLIT_LIMIT = 100; // setup the small file handling params @@ -982,7 +988,7 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { assertEquals("Just 1 file needs to be added.", 1, statuses.size()); String file1 = statuses.get(0).getFileId(); assertEquals("file should contain 100 records", - ParquetUtils.readRowKeysFromParquet(new Path(basePath, + ParquetUtils.readRowKeysFromParquet(jsc.hadoopConfiguration(), new Path(basePath, TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1))).size(), 100); @@ -1006,9 +1012,9 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { Path newFile = new Path(basePath, TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1)); assertEquals("file should contain 140 records", - ParquetUtils.readRowKeysFromParquet(newFile).size(), 140); + ParquetUtils.readRowKeysFromParquet(jsc.hadoopConfiguration(), newFile).size(), 140); - List records = ParquetUtils.readAvroRecords(newFile); + List records = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), newFile); for (GenericRecord record : records) { String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); assertEquals("only expect commit2", commitTime2, @@ -1030,7 +1036,7 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { assertNoWriteErrors(statuses); assertEquals("2 files needs to be committed.", 2, statuses.size()); - HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); HoodieTable table = HoodieTable.getHoodieTable(metadata, config); TableFileSystemView.ReadOptimizedView fileSystemView = table.getROFileSystemView(); List files = fileSystemView @@ -1040,7 +1046,7 @@ 
public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { for (HoodieDataFile file : files) { if (file.getFileName().contains(file1)) { assertEquals("Existing file should be expanded", commitTime3, file.getCommitTime()); - records = ParquetUtils.readAvroRecords(new Path(file.getPath())); + records = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), new Path(file.getPath())); for (GenericRecord record : records) { String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); String recordCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); @@ -1057,7 +1063,7 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { keys2.size()); } else { assertEquals("New file must be written for commit 3", commitTime3, file.getCommitTime()); - records = ParquetUtils.readAvroRecords(new Path(file.getPath())); + records = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), new Path(file.getPath())); for (GenericRecord record : records) { String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); assertEquals("only expect commit3", commitTime3, @@ -1091,12 +1097,12 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { List statuses = client.insert(insertRecordsRDD1, commitTime1).collect(); assertNoWriteErrors(statuses); - assertPartitionMetadata(new String[]{TEST_PARTITION_PATH}, FSUtils.getFs()); + assertPartitionMetadata(new String[]{TEST_PARTITION_PATH}, fs); assertEquals("Just 1 file needs to be added.", 1, statuses.size()); String file1 = statuses.get(0).getFileId(); assertEquals("file should contain 100 records", - ParquetUtils.readRowKeysFromParquet(new Path(basePath, + ParquetUtils.readRowKeysFromParquet(jsc.hadoopConfiguration(), new Path(basePath, TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1))).size(), 100); @@ -1116,9 +1122,9 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { 
Path newFile = new Path(basePath, TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1)); assertEquals("file should contain 140 records", - ParquetUtils.readRowKeysFromParquet(newFile).size(), 140); + ParquetUtils.readRowKeysFromParquet(jsc.hadoopConfiguration(), newFile).size(), 140); - List records = ParquetUtils.readAvroRecords(newFile); + List records = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), newFile); for (GenericRecord record : records) { String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString(); String recCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString(); @@ -1137,8 +1143,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { assertNoWriteErrors(statuses); assertEquals("2 files needs to be committed.", 2, statuses.size()); - FileSystem fs = FSUtils.getFs(); - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), + basePath); HoodieTable table = HoodieTable.getHoodieTable(metaClient, config); List files = table.getROFileSystemView().getLatestDataFilesBeforeOrOn(TEST_PARTITION_PATH, commitTime3) @@ -1148,7 +1154,7 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { int totalInserts = 0; for (HoodieDataFile file : files) { assertEquals("All files must be at commit 3", commitTime3, file.getCommitTime()); - records = ParquetUtils.readAvroRecords(new Path(file.getPath())); + records = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), new Path(file.getPath())); totalInserts += records.size(); } assertEquals("Total number of records must add up", totalInserts, @@ -1169,7 +1175,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { String file1P0C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "000"); String file1P1C0 = HoodieTestUtils.createNewDataFile(basePath, 
partitionPaths[1], "000"); HoodieTable table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), + .getHoodieTable( + new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config); List hoodieCleanStatsOne = table.clean(jsc); @@ -1183,7 +1190,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { // make next commit, with 1 insert & 1 update per partition HoodieTestUtils.createCommitFiles(basePath, "001"); table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), + .getHoodieTable( + new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath, true), config); String file2P0C1 = HoodieTestUtils @@ -1206,7 +1214,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { // make next commit, with 2 updates to existing files, and 1 insert HoodieTestUtils.createCommitFiles(basePath, "002"); table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), + .getHoodieTable( + new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config); HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file1P0C0); // update @@ -1260,8 +1269,7 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { // make 1 compaction commit HoodieTestUtils.createCompactionCommitFiles(basePath, "001"); - HoodieTable table = HoodieTable - .getHoodieTable(metaClient, config); + HoodieTable table = HoodieTable.getHoodieTable(metaClient, config); List hoodieCleanStats = table.clean(jsc); assertEquals("Must clean three files, one parquet and 2 log files", 3, getCleanStat(hoodieCleanStats, partitionPaths[0]).getSuccessDeleteFiles().size()); @@ -1286,9 +1294,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { String file1P0C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], 
"000"); String file1P1C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "000"); - HoodieTable table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), - config); + HoodieTable table = HoodieTable.getHoodieTable( + new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config); List hoodieCleanStatsOne = table.clean(jsc); assertEquals("Must not clean any files", 0, @@ -1300,9 +1307,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { // make next commit, with 1 insert & 1 update per partition HoodieTestUtils.createCommitFiles(basePath, "001"); - table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), - config); + table = HoodieTable.getHoodieTable( + new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config); String file2P0C1 = HoodieTestUtils .createNewDataFile(basePath, partitionPaths[0], "001"); // insert @@ -1324,7 +1330,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { // make next commit, with 2 updates to existing files, and 1 insert HoodieTestUtils.createCommitFiles(basePath, "002"); table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), + .getHoodieTable( + new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config); HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file1P0C0); // update @@ -1341,7 +1348,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { // make next commit, with 2 updates to existing files, and 1 insert HoodieTestUtils.createCommitFiles(basePath, "003"); table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), + .getHoodieTable( + new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config); 
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file1P0C0); // update @@ -1384,7 +1392,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { HoodieTestUtils.createCommitFiles(basePath, "000"); HoodieTable table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), + .getHoodieTable( + new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config); List hoodieCleanStatsOne = table.clean(jsc); @@ -1449,7 +1458,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { updateAllFilesInPartition(filesP2C0, partitionPaths[2], "003"); HoodieTable table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true), + .getHoodieTable( + new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config); List hoodieCleanStats = table.clean(jsc); @@ -1476,8 +1486,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable { HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).build(); HoodieWriteClient client = new HoodieWriteClient(jsc, cfg); - FileSystem fs = FSUtils.getFs(); - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), + basePath); HoodieTable table = HoodieTable.getHoodieTable(metaClient, cfg); String commitTime = "000"; diff --git a/hoodie-client/src/test/java/com/uber/hoodie/TestMultiFS.java b/hoodie-client/src/test/java/com/uber/hoodie/TestMultiFS.java new file mode 100644 index 000000000..1bdc15d25 --- /dev/null +++ b/hoodie-client/src/test/java/com/uber/hoodie/TestMultiFS.java @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2017 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.uber.hoodie; + + +import static org.junit.Assert.assertEquals; + +import com.uber.hoodie.common.HoodieClientTestUtils; +import com.uber.hoodie.common.HoodieTestDataGenerator; +import com.uber.hoodie.common.minicluster.HdfsTestService; +import com.uber.hoodie.common.model.HoodieAvroPayload; +import com.uber.hoodie.common.model.HoodieRecord; +import com.uber.hoodie.common.model.HoodieTableType; +import com.uber.hoodie.common.model.HoodieTestUtils; +import com.uber.hoodie.common.table.HoodieTableMetaClient; +import com.uber.hoodie.common.table.HoodieTimeline; +import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline; +import com.uber.hoodie.common.util.FSUtils; +import com.uber.hoodie.config.HoodieIndexConfig; +import com.uber.hoodie.config.HoodieWriteConfig; +import com.uber.hoodie.index.HoodieIndex; +import java.io.Serializable; +import java.util.List; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hdfs.DistributedFileSystem; +import org.apache.hadoop.hdfs.MiniDFSCluster; +import org.apache.log4j.LogManager; +import org.apache.log4j.Logger; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; +import org.apache.spark.sql.Dataset; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SQLContext; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class TestMultiFS implements Serializable { + + private static String dfsBasePath; + private static 
HdfsTestService hdfsTestService; + private static MiniDFSCluster dfsCluster; + private static DistributedFileSystem dfs; + private static Logger logger = LogManager.getLogger(TestMultiFS.class); + private String tablePath = "file:///tmp/hoodie/sample-table"; + private String tableName = "hoodie_rt"; + private String tableType = HoodieTableType.COPY_ON_WRITE.name(); + private static JavaSparkContext jsc; + private static SQLContext sqlContext; + + @Before + public void initClass() throws Exception { + hdfsTestService = new HdfsTestService(); + dfsCluster = hdfsTestService.start(true); + + // Create a temp folder as the base path + dfs = dfsCluster.getFileSystem(); + dfsBasePath = dfs.getWorkingDirectory().toString(); + dfs.mkdirs(new Path(dfsBasePath)); + + SparkConf sparkConf = new SparkConf().setAppName("hoodie-client-example"); + sparkConf.setMaster("local[1]"); + sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); + sparkConf.set("spark.kryoserializer.buffer.max", "512m"); + jsc = new JavaSparkContext(sparkConf); + sqlContext = new SQLContext(jsc); + } + + @After + public void cleanupClass() throws Exception { + if (hdfsTestService != null) { + hdfsTestService.stop(); + } + if (jsc != null) { + jsc.stop(); + } + FSUtils.setFs(null); + } + + @Test + public void readLocalWriteHDFS() throws Exception { + + // Generator of some records to be loaded in. 
+ HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(); + + // Initialize table and filesystem + FileSystem hdfs = FSUtils.getFs(dfsBasePath, jsc.hadoopConfiguration()); + HoodieTableMetaClient + .initTableType(hdfs, dfsBasePath, HoodieTableType.valueOf(tableType), tableName, + HoodieAvroPayload.class.getName()); + + //Create write client to write some records in + HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(dfsBasePath) + .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) + .forTable(tableName).withIndexConfig( + HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) + .build(); + HoodieWriteClient hdfsWriteClient = new HoodieWriteClient(jsc, cfg); + + // Write generated data to hdfs (only inserts) + String readCommitTime = hdfsWriteClient.startCommit(); + logger.info("Starting commit " + readCommitTime); + List records = dataGen.generateInserts(readCommitTime, 100); + JavaRDD writeRecords = jsc.parallelize(records, 1); + hdfsWriteClient.upsert(writeRecords, readCommitTime); + + // Read from hdfs + FileSystem fs = FSUtils.getFs(dfsBasePath, HoodieTestUtils.getDefaultHadoopConf()); + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), dfsBasePath); + HoodieTimeline timeline = new HoodieActiveTimeline(metaClient) + .getCommitTimeline(); + Dataset readRecords = HoodieClientTestUtils + .readCommit(dfsBasePath, sqlContext, timeline, readCommitTime); + assertEquals("Should contain 100 records", readRecords.count(), records.size()); + + // Write to local + FileSystem local = FSUtils.getFs(tablePath, jsc.hadoopConfiguration()); + HoodieTableMetaClient + .initTableType(local, tablePath, HoodieTableType.valueOf(tableType), tableName, + HoodieAvroPayload.class.getName()); + HoodieWriteConfig localConfig = HoodieWriteConfig.newBuilder().withPath(tablePath) + .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) + 
.forTable(tableName).withIndexConfig( + HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()) + .build(); + HoodieWriteClient localWriteClient = new HoodieWriteClient(jsc, localConfig); + + String writeCommitTime = localWriteClient.startCommit(); + logger.info("Starting write commit " + writeCommitTime); + List localRecords = dataGen.generateInserts(writeCommitTime, 100); + JavaRDD localWriteRecords = jsc.parallelize(localRecords, 1); + logger.info("Writing to path: " + tablePath); + localWriteClient.upsert(localWriteRecords, writeCommitTime); + + logger.info("Reading from path: " + tablePath); + fs = FSUtils.getFs(tablePath, HoodieTestUtils.getDefaultHadoopConf()); + metaClient = new HoodieTableMetaClient(fs.getConf(), tablePath); + timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline(); + Dataset localReadRecords = HoodieClientTestUtils + .readCommit(tablePath, sqlContext, timeline, writeCommitTime); + assertEquals("Should contain 100 records", localReadRecords.count(), localRecords.size()); + } +} diff --git a/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieClientTestUtils.java b/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieClientTestUtils.java index c2db12d5e..295d41525 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieClientTestUtils.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieClientTestUtils.java @@ -174,7 +174,7 @@ public class HoodieClientTestUtils { List filteredPaths = new ArrayList<>(); try { HoodieTable hoodieTable = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(fs, basePath, true), null); + .getHoodieTable(new HoodieTableMetaClient(fs.getConf(), basePath, true), null); for (String path : paths) { TableFileSystemView.ReadOptimizedView fileSystemView = new HoodieTableFileSystemView( hoodieTable.getMetaClient(), diff --git a/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieMergeOnReadTestUtils.java 
b/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieMergeOnReadTestUtils.java index ec3d5728c..c7e3e9156 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieMergeOnReadTestUtils.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieMergeOnReadTestUtils.java @@ -18,7 +18,7 @@ package com.uber.hoodie.common; import static com.uber.hoodie.common.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA; -import com.uber.hoodie.common.util.FSUtils; +import com.uber.hoodie.common.model.HoodieTestUtils; import com.uber.hoodie.common.util.HoodieAvroUtils; import com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat; import java.io.IOException; @@ -42,12 +42,13 @@ import org.apache.hadoop.mapred.RecordReader; */ public class HoodieMergeOnReadTestUtils { - public static List getRecordsUsingInputFormat(List inputPaths) + public static List getRecordsUsingInputFormat(List inputPaths, + String basePath) throws IOException { JobConf jobConf = new JobConf(); Schema schema = HoodieAvroUtils.addMetadataFields(Schema.parse(TRIP_EXAMPLE_SCHEMA)); HoodieRealtimeInputFormat inputFormat = new HoodieRealtimeInputFormat(); - setPropsForInputFormat(inputFormat, jobConf, schema); + setPropsForInputFormat(inputFormat, jobConf, schema, basePath); return inputPaths.stream().map(path -> { setInputPath(jobConf, path); List records = new ArrayList<>(); @@ -76,12 +77,12 @@ public class HoodieMergeOnReadTestUtils { } private static void setPropsForInputFormat(HoodieRealtimeInputFormat inputFormat, JobConf jobConf, - Schema schema) { + Schema schema, String basePath) { List fields = schema.getFields(); String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(",")); String postions = fields.stream().map(f -> String.valueOf(f.pos())) .collect(Collectors.joining(",")); - Configuration conf = FSUtils.getFs().getConf(); + Configuration conf = HoodieTestUtils.getDefaultHadoopConf(); jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, 
names); jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions); jobConf.set("partition_columns", "datestr"); diff --git a/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieTestDataGenerator.java b/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieTestDataGenerator.java index c197e6b51..beb76df65 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieTestDataGenerator.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/common/HoodieTestDataGenerator.java @@ -20,6 +20,7 @@ import com.uber.hoodie.common.model.HoodieCommitMetadata; import com.uber.hoodie.common.model.HoodieKey; import com.uber.hoodie.common.model.HoodiePartitionMetadata; import com.uber.hoodie.common.model.HoodieRecord; +import com.uber.hoodie.common.model.HoodieTestUtils; import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.util.FSUtils; @@ -193,7 +194,7 @@ public class HoodieTestDataGenerator { Path commitFile = new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline .makeCommitFileName(commitTime)); - FileSystem fs = FSUtils.getFs(); + FileSystem fs = FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf()); FSDataOutputStream os = fs.create(commitFile, true); HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); try { @@ -209,7 +210,7 @@ public class HoodieTestDataGenerator { Path commitFile = new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline .makeSavePointFileName(commitTime)); - FileSystem fs = FSUtils.getFs(); + FileSystem fs = FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf()); FSDataOutputStream os = fs.create(commitFile, true); HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata(); try { diff --git a/hoodie-client/src/test/java/com/uber/hoodie/func/TestUpdateMapFunction.java b/hoodie-client/src/test/java/com/uber/hoodie/func/TestUpdateMapFunction.java 
index 8433c0366..ef4a86833 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/func/TestUpdateMapFunction.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/func/TestUpdateMapFunction.java @@ -55,8 +55,9 @@ public class TestUpdateMapFunction { public void testSchemaEvolutionOnUpdate() throws Exception { // Create a bunch of records with a old version of schema HoodieWriteConfig config = makeHoodieClientConfig("/exampleSchema.txt"); - HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); - HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata); + HoodieTableMetaClient metaClient = new HoodieTableMetaClient( + HoodieTestUtils.getDefaultHadoopConf(), basePath); + HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metaClient); String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}"; @@ -80,16 +81,16 @@ public class TestUpdateMapFunction { Iterator> insertResult = table.handleInsert("100", records.iterator()); Path commitFile = new Path(config.getBasePath() + "/.hoodie/" + HoodieTimeline.makeCommitFileName("100")); - FSUtils.getFs().create(commitFile); + FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf()).create(commitFile); // Now try an update with an evolved schema // Evolved schema does not have guarantee on preserving the original field ordering config = makeHoodieClientConfig("/exampleEvolvedSchema.txt"); - metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); + metaClient = new HoodieTableMetaClient(HoodieTestUtils.getDefaultHadoopConf(), basePath); String fileId = insertResult.next().get(0).getFileId(); System.out.println(fileId); - table = new HoodieCopyOnWriteTable(config, metadata); + table = new HoodieCopyOnWriteTable(config, metaClient); // New content with values for the newly added field recordStr1 = 
"{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12,\"added_field\":1}"; diff --git a/hoodie-client/src/test/java/com/uber/hoodie/index/bloom/TestHoodieBloomIndex.java b/hoodie-client/src/test/java/com/uber/hoodie/index/bloom/TestHoodieBloomIndex.java index 8d4cc2558..92dcae96b 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/index/bloom/TestHoodieBloomIndex.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/index/bloom/TestHoodieBloomIndex.java @@ -54,7 +54,6 @@ import java.util.stream.Collectors; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.commons.io.IOUtils; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.parquet.avro.AvroSchemaConverter; @@ -73,12 +72,11 @@ public class TestHoodieBloomIndex { private JavaSparkContext jsc = null; private String basePath = null; - private transient final FileSystem fs; + private transient FileSystem fs; private String schemaStr; private Schema schema; public TestHoodieBloomIndex() throws Exception { - fs = FSUtils.getFs(); } @Before @@ -89,6 +87,7 @@ public class TestHoodieBloomIndex { TemporaryFolder folder = new TemporaryFolder(); folder.create(); basePath = folder.getRoot().getAbsolutePath(); + fs = FSUtils.getFs(basePath, jsc.hadoopConfiguration()); HoodieTestUtils.init(basePath); // We have some records to be tagged (two different partitions) schemaStr = IOUtils.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8"); @@ -120,8 +119,6 @@ public class TestHoodieBloomIndex { .parallelize(Arrays.asList(record1, record2, record3, record4)); // Load to memory - HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); - Map> map = recordRDD .mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey())) .groupByKey().collectAsMap(); @@ -174,7 +171,7 @@ public 
class TestHoodieBloomIndex { Arrays.asList(record2, record3, record4), schema, null, false); List partitions = Arrays.asList("2016/01/21", "2016/04/01", "2015/03/12"); - HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); HoodieTable table = HoodieTable.getHoodieTable(metadata, config); List> filesList = index.loadInvolvedFiles(partitions, table); // Still 0, as no valid commit @@ -291,7 +288,8 @@ public class TestHoodieBloomIndex { List uuids = Arrays.asList(record1.getRecordKey(), record2.getRecordKey(), record3.getRecordKey(), record4.getRecordKey()); - List results = HoodieBloomIndexCheckFunction.checkCandidatesAgainstFile(uuids, + List results = HoodieBloomIndexCheckFunction + .checkCandidatesAgainstFile(jsc.hadoopConfiguration(), uuids, new Path(basePath + "/2016/01/31/" + filename)); assertEquals(results.size(), 2); assertTrue(results.get(0).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0") @@ -308,7 +306,7 @@ public class TestHoodieBloomIndex { // We have some records to be tagged (two different partitions) JavaRDD recordRDD = jsc.emptyRDD(); // Also create the metadata and config - HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); HoodieTable table = HoodieTable.getHoodieTable(metadata, config); @@ -348,7 +346,7 @@ public class TestHoodieBloomIndex { .parallelize(Arrays.asList(record1, record2, record3, record4)); // Also create the metadata and config - HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); HoodieTable table = 
HoodieTable.getHoodieTable(metadata, config); @@ -367,7 +365,7 @@ public class TestHoodieBloomIndex { String filename3 = writeParquetFile("2015/01/31", Arrays.asList(record4), schema, null, true); // We do the tag again - metadata = new HoodieTableMetaClient(fs, basePath); + metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); table = HoodieTable.getHoodieTable(metadata, config); taggedRecordRDD = bloomIndex.tagLocation(recordRDD, table); @@ -409,7 +407,7 @@ public class TestHoodieBloomIndex { JavaRDD keysRDD = jsc.parallelize(Arrays.asList(key1, key2, key3, key4)); // Also create the metadata and config - HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); HoodieTable table = HoodieTable.getHoodieTable(metadata, config); @@ -429,7 +427,7 @@ public class TestHoodieBloomIndex { String filename3 = writeParquetFile("2015/01/31", Arrays.asList(record4), schema, null, true); // We do the tag again - metadata = new HoodieTableMetaClient(fs, basePath); + metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); table = HoodieTable.getHoodieTable(metadata, config); taggedRecordRDD = bloomIndex.fetchRecordLocation(keysRDD, table); @@ -476,7 +474,7 @@ public class TestHoodieBloomIndex { // We do the tag JavaRDD recordRDD = jsc.parallelize(Arrays.asList(record1, record2)); - HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build(); HoodieTable table = HoodieTable.getHoodieTable(metadata, config); @@ -515,7 +513,7 @@ public class TestHoodieBloomIndex { String commitTime = FSUtils.getCommitTime(filename); HoodieParquetConfig config = new 
HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP, ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024, - new Configuration()); + HoodieTestUtils.getDefaultHadoopConf()); HoodieParquetWriter writer = new HoodieParquetWriter( commitTime, new Path(basePath + "/" + partitionPath + "/" + filename), diff --git a/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCommitArchiveLog.java b/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCommitArchiveLog.java index 5106e8eea..430a0a591 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCommitArchiveLog.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCommitArchiveLog.java @@ -55,7 +55,7 @@ public class TestHoodieCommitArchiveLog { folder.create(); basePath = folder.getRoot().getAbsolutePath(); HoodieTestUtils.init(basePath); - fs = FSUtils.getFs(); + fs = FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf()); } @Test @@ -82,7 +82,7 @@ public class TestHoodieCommitArchiveLog { HoodieTestDataGenerator.createCommitFile(basePath, "104"); HoodieTestDataGenerator.createCommitFile(basePath, "105"); - HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs.getConf(), basePath); HoodieTimeline timeline = metadata.getActiveTimeline().getCommitsTimeline().filterCompletedInstants(); @@ -112,8 +112,8 @@ public class TestHoodieCommitArchiveLog { originalCommits.removeAll(timeline.getInstants().collect(Collectors.toList())); //read the file - HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(), - new HoodieLogFile(new Path(basePath + "/.hoodie/.commits_.archive.1")), + HoodieLogFormat.Reader reader = HoodieLogFormat + .newReader(fs, new HoodieLogFile(new Path(basePath + "/.hoodie/.commits_.archive.1")), HoodieArchivedMetaEntry.getClassSchema(), false); int archivedRecordsCount = 0; @@ -147,7 +147,7 @@ public class 
TestHoodieCommitArchiveLog { .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) .forTable("test-trip-table").withCompactionConfig( HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 5).build()).build(); - HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs.getConf(), basePath); HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, fs); HoodieTestDataGenerator.createCommitFile(basePath, "100"); HoodieTestDataGenerator.createCommitFile(basePath, "101"); @@ -173,7 +173,7 @@ public class TestHoodieCommitArchiveLog { .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) .forTable("test-trip-table").withCompactionConfig( HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 5).build()).build(); - HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs.getConf(), basePath); HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, fs); HoodieTestDataGenerator.createCommitFile(basePath, "100"); HoodieTestDataGenerator.createCommitFile(basePath, "101"); @@ -206,7 +206,7 @@ public class TestHoodieCommitArchiveLog { .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2) .forTable("test-trip-table").withCompactionConfig( HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 5).build()).build(); - HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs.getConf(), basePath); HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, fs); HoodieTestDataGenerator.createCommitFile(basePath, "100"); HoodieTestDataGenerator.createCommitFile(basePath, "101"); diff --git a/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCompactor.java b/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCompactor.java 
index c842bf592..a6e385f2a 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCompactor.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/io/TestHoodieCompactor.java @@ -16,6 +16,9 @@ package com.uber.hoodie.io; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + import com.uber.hoodie.HoodieWriteClient; import com.uber.hoodie.WriteStatus; import com.uber.hoodie.common.HoodieClientTestUtils; @@ -28,7 +31,6 @@ import com.uber.hoodie.common.model.HoodieTestUtils; import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline; -import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.config.HoodieCompactionConfig; import com.uber.hoodie.config.HoodieIndexConfig; import com.uber.hoodie.config.HoodieStorageConfig; @@ -38,7 +40,10 @@ import com.uber.hoodie.index.bloom.HoodieBloomIndex; import com.uber.hoodie.io.compact.HoodieCompactor; import com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor; import com.uber.hoodie.table.HoodieTable; -import org.apache.hadoop.fs.FileSystem; +import java.io.File; +import java.io.IOException; +import java.util.List; +import java.util.stream.Collectors; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.junit.After; @@ -46,14 +51,6 @@ import org.junit.Before; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import java.io.File; -import java.io.IOException; -import java.util.List; -import java.util.stream.Collectors; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - public class TestHoodieCompactor { private transient JavaSparkContext jsc = null; @@ -104,18 +101,19 @@ public class TestHoodieCompactor { @Test(expected = IllegalArgumentException.class) public void testCompactionOnCopyOnWriteFail() throws Exception { 
HoodieTestUtils.initTableType(basePath, HoodieTableType.COPY_ON_WRITE); - - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(FSUtils.getFs(), basePath); + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), + basePath); HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig()); - compactor.compact(jsc, getConfig(), table, HoodieActiveTimeline.createNewCommitTime()); } @Test public void testCompactionEmpty() throws Exception { - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(FSUtils.getFs(), basePath); + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), + basePath); HoodieWriteConfig config = getConfig(); - HoodieTable table = HoodieTable.getHoodieTable(metaClient, config); + HoodieTable table = HoodieTable + .getHoodieTable(metaClient, config); HoodieWriteClient writeClient = new HoodieWriteClient(jsc, config); String newCommitTime = writeClient.startCommit(); @@ -132,7 +130,6 @@ public class TestHoodieCompactor { @Test public void testLogFileCountsAfterCompaction() throws Exception { - FileSystem fs = FSUtils.getFs(); // insert 100 records HoodieWriteConfig config = getConfig(); HoodieWriteClient writeClient = new HoodieWriteClient(jsc, config); @@ -144,7 +141,8 @@ public class TestHoodieCompactor { List statuses = writeClient.insert(recordsRDD, newCommitTime).collect(); // Update all the 100 records - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), + basePath); HoodieTable table = HoodieTable.getHoodieTable(metaClient, config); newCommitTime = "101"; @@ -161,7 +159,7 @@ public class TestHoodieCompactor { updatedRecords); // Verify that all data file has one log file - metaClient = new HoodieTableMetaClient(fs, basePath); + metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); table = 
HoodieTable.getHoodieTable(metaClient, config); for (String partitionPath : dataGen.getPartitionPaths()) { List groupedLogFiles = @@ -174,14 +172,14 @@ public class TestHoodieCompactor { } // Do a compaction - metaClient = new HoodieTableMetaClient(fs, basePath); + metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); table = HoodieTable.getHoodieTable(metaClient, config); HoodieCommitMetadata result = compactor.compact(jsc, getConfig(), table, HoodieActiveTimeline.createNewCommitTime()); // Verify that recently written compacted data file has no log file - metaClient = new HoodieTableMetaClient(fs, basePath); + metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); table = HoodieTable.getHoodieTable(metaClient, config); HoodieActiveTimeline timeline = metaClient.getActiveTimeline(); diff --git a/hoodie-client/src/test/java/com/uber/hoodie/table/TestCopyOnWriteTable.java b/hoodie-client/src/test/java/com/uber/hoodie/table/TestCopyOnWriteTable.java index a98b76838..61303c3fb 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/table/TestCopyOnWriteTable.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/table/TestCopyOnWriteTable.java @@ -89,7 +89,8 @@ public class TestCopyOnWriteTable { String commitTime = HoodieTestUtils.makeNewCommitTime(); HoodieWriteConfig config = makeHoodieClientConfig(); - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(FSUtils.getFs(), basePath); + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), + basePath); HoodieTable table = HoodieTable.getHoodieTable(metaClient, config); HoodieCreateHandle io = new HoodieCreateHandle(config, commitTime, table, partitionPath); @@ -115,7 +116,7 @@ public class TestCopyOnWriteTable { // Prepare the AvroParquetIO HoodieWriteConfig config = makeHoodieClientConfig(); String firstCommitTime = HoodieTestUtils.makeNewCommitTime(); - HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), 
basePath); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); String partitionPath = "/2016/01/31"; HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata); @@ -154,7 +155,8 @@ public class TestCopyOnWriteTable { // Read out the bloom filter and make sure filter can answer record exist or not Path parquetFilePath = new Path(parquetFile.getAbsolutePath()); - BloomFilter filter = ParquetUtils.readBloomFilterFromParquetMetadata(parquetFilePath); + BloomFilter filter = ParquetUtils + .readBloomFilterFromParquetMetadata(jsc.hadoopConfiguration(), parquetFilePath); for (HoodieRecord record : records) { assertTrue(filter.mightContain(record.getRecordKey())); } @@ -163,7 +165,8 @@ public class TestCopyOnWriteTable { + FSUtils.getCommitTime(parquetFile.getName()) + ".commit").createNewFile(); // Read the parquet file, check the record content - List fileRecords = ParquetUtils.readAvroRecords(parquetFilePath); + List fileRecords = ParquetUtils + .readAvroRecords(jsc.hadoopConfiguration(), parquetFilePath); GenericRecord newRecord; int index = 0; for (GenericRecord record : fileRecords) { @@ -188,7 +191,7 @@ public class TestCopyOnWriteTable { Thread.sleep(1000); String newCommitTime = HoodieTestUtils.makeNewCommitTime(); - metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); + metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); table = new HoodieCopyOnWriteTable(config, metadata); Iterator> iter = table .handleUpdate(newCommitTime, updatedRecord1.getCurrentLocation().getFileId(), @@ -211,7 +214,7 @@ public class TestCopyOnWriteTable { // Check whether the record has been updated Path updatedParquetFilePath = new Path(updatedParquetFile.getAbsolutePath()); BloomFilter updatedFilter = ParquetUtils - .readBloomFilterFromParquetMetadata(updatedParquetFilePath); + .readBloomFilterFromParquetMetadata(jsc.hadoopConfiguration(), updatedParquetFilePath); for (HoodieRecord record : 
records) { // No change to the _row_key assertTrue(updatedFilter.mightContain(record.getRecordKey())); @@ -261,7 +264,7 @@ public class TestCopyOnWriteTable { HoodieWriteConfig config = makeHoodieClientConfigBuilder() .withWriteStatusClass(MetadataMergeWriteStatus.class).build(); String firstCommitTime = HoodieTestUtils.makeNewCommitTime(); - HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata); @@ -298,8 +301,8 @@ public class TestCopyOnWriteTable { public void testInsertWithPartialFailures() throws Exception { HoodieWriteConfig config = makeHoodieClientConfig(); String commitTime = HoodieTestUtils.makeNewCommitTime(); - FileSystem fs = FSUtils.getFs(); - HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath); + FileSystem fs = FSUtils.getFs(basePath, jsc.hadoopConfiguration()); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata); // Write a few records, and get atleast one file @@ -340,7 +343,7 @@ public class TestCopyOnWriteTable { public void testInsertRecords() throws Exception { HoodieWriteConfig config = makeHoodieClientConfig(); String commitTime = HoodieTestUtils.makeNewCommitTime(); - HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata); // Case 1: @@ -389,7 +392,7 @@ public class TestCopyOnWriteTable { HoodieStorageConfig.newBuilder().limitFileSize(64 * 1024).parquetBlockSize(64 * 1024) .parquetPageSize(64 * 1024).build()).build(); String commitTime = HoodieTestUtils.makeNewCommitTime(); - 
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata); List records = new ArrayList<>(); @@ -437,7 +440,7 @@ public class TestCopyOnWriteTable { HoodieClientTestUtils.fakeCommitFile(basePath, "001"); HoodieClientTestUtils.fakeDataFile(basePath, TEST_PARTITION_PATH, "001", "file1", fileSize); - HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath); HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata); HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator( diff --git a/hoodie-client/src/test/java/com/uber/hoodie/table/TestMergeOnReadTable.java b/hoodie-client/src/test/java/com/uber/hoodie/table/TestMergeOnReadTable.java index 42649159c..5465879f0 100644 --- a/hoodie-client/src/test/java/com/uber/hoodie/table/TestMergeOnReadTable.java +++ b/hoodie-client/src/test/java/com/uber/hoodie/table/TestMergeOnReadTable.java @@ -19,6 +19,11 @@ package com.uber.hoodie.table; +import static com.uber.hoodie.common.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + import com.uber.hoodie.HoodieWriteClient; import com.uber.hoodie.WriteStatus; import com.uber.hoodie.common.HoodieClientTestUtils; @@ -45,6 +50,14 @@ import com.uber.hoodie.config.HoodieWriteConfig; import com.uber.hoodie.index.HoodieIndex; import com.uber.hoodie.io.compact.HoodieCompactor; import com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor; +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import 
java.util.stream.Collectors; +import java.util.stream.Stream; import org.apache.avro.generic.GenericRecord; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; @@ -61,25 +74,11 @@ import org.junit.BeforeClass; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import java.io.File; -import java.io.IOException; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.stream.Collectors; -import java.util.stream.Stream; - -import static com.uber.hoodie.common.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; -import static org.junit.Assert.assertTrue; - public class TestMergeOnReadTable { private transient JavaSparkContext jsc = null; private transient SQLContext sqlContext; - private String basePath = null; + private static String basePath = null; private HoodieCompactor compactor; private FileSystem fs; @@ -94,12 +93,11 @@ public class TestMergeOnReadTable { if (hdfsTestService != null) { hdfsTestService.stop(); dfsCluster.shutdown(); - ; } FSUtils.setFs(null); // Need to closeAll to clear FileSystem.Cache, required because DFS and LocalFS used in the same JVM FileSystem.closeAll(); - HoodieTestUtils.resetFS(); + HoodieTestUtils.resetFS(basePath); } @BeforeClass @@ -113,30 +111,28 @@ public class TestMergeOnReadTable { dfs = dfsCluster.getFileSystem(); } FSUtils.setFs(dfs); - HoodieTestUtils.resetFS(); + HoodieTestUtils.resetFS(basePath); } @Before public void init() throws IOException { - this.fs = FSUtils.getFs(); - // Initialize a local spark env jsc = new JavaSparkContext( HoodieClientTestUtils.getSparkConfForTest("TestHoodieMergeOnReadTable")); - jsc.hadoopConfiguration().addResource(FSUtils.getFs().getConf()); // Create a temp folder as the base path TemporaryFolder folder = new TemporaryFolder(); folder.create(); basePath = folder.getRoot().getAbsolutePath(); + fs = 
FSUtils.getFs(basePath, jsc.hadoopConfiguration()); + jsc.hadoopConfiguration().addResource(fs.getConf()); + dfs.mkdirs(new Path(basePath)); FSUtils.setFs(dfs); HoodieTestUtils.initTableType(basePath, HoodieTableType.MERGE_ON_READ); + sqlContext = new SQLContext(jsc); // SQLContext stuff compactor = new HoodieRealtimeTableCompactor(); - - //SQLContext stuff - sqlContext = new SQLContext(jsc); } @After @@ -167,7 +163,8 @@ public class TestMergeOnReadTable { List statuses = client.upsert(writeRecords, newCommitTime).collect(); assertNoWriteErrors(statuses); - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), + cfg.getBasePath()); HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg); Optional deltaCommit = @@ -209,7 +206,7 @@ public class TestMergeOnReadTable { statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); // Verify there are no errors assertNoWriteErrors(statuses); - metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); + metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath()); deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant(); assertTrue(deltaCommit.isPresent()); assertEquals("Latest Delta commit should be 004", "004", deltaCommit.get().getTimestamp()); @@ -229,8 +226,9 @@ public class TestMergeOnReadTable { assertTrue(dataFilesToRead.findAny().isPresent()); // verify that there is a commit - table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(fs, cfg.getBasePath(), true), getConfig(false)); + table = HoodieTable.getHoodieTable( + new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath(), true), + getConfig(false)); HoodieTimeline timeline = table.getCommitTimeline().filterCompletedInstants(); assertEquals("Expecting a single commit.", 1, timeline.findInstantsAfter("000", 
Integer.MAX_VALUE).countInstants()); @@ -284,7 +282,8 @@ public class TestMergeOnReadTable { List statuses = client.upsert(writeRecords, newCommitTime).collect(); assertNoWriteErrors(statuses); - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), + cfg.getBasePath()); HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg); Optional deltaCommit = @@ -332,7 +331,7 @@ public class TestMergeOnReadTable { // Verify there are no errors assertNoWriteErrors(statuses); - metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); + metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath()); deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant(); assertTrue(deltaCommit.isPresent()); assertEquals("Latest Delta commit should be 004", "004", deltaCommit.get().getTimestamp()); @@ -349,7 +348,7 @@ public class TestMergeOnReadTable { List dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()) .collect(Collectors.toList()); List recordsRead = HoodieMergeOnReadTestUtils - .getRecordsUsingInputFormat(dataFiles); + .getRecordsUsingInputFormat(dataFiles, basePath); //Wrote 40 records and deleted 20 records, so remaining 40-20 = 20 assertEquals("Must contain 20 records", 20, recordsRead.size()); } @@ -377,9 +376,8 @@ public class TestMergeOnReadTable { //verify there are no errors assertNoWriteErrors(statuses); - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); - HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg); - + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), + cfg.getBasePath()); Optional commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant(); assertTrue(commit.isPresent()); @@ -403,8 +401,9 @@ public class TestMergeOnReadTable { //rollback a COW commit when 
TableType is MOR client.rollback(newCommitTime); - metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); - hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg); + metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath()); + HoodieTable hoodieTable = HoodieTable + .getHoodieTable(metaClient, cfg); FileStatus[] allFiles = HoodieTestUtils .listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath()); HoodieTableFileSystemView roView = new HoodieTableFileSystemView(metaClient, @@ -440,7 +439,8 @@ public class TestMergeOnReadTable { List statuses = client.upsert(writeRecords, newCommitTime).collect(); assertNoWriteErrors(statuses); - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), + cfg.getBasePath()); HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg); Optional deltaCommit = @@ -476,7 +476,7 @@ public class TestMergeOnReadTable { statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect(); // Verify there are no errors assertNoWriteErrors(statuses); - metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); + metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath()); deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant(); assertTrue(deltaCommit.isPresent()); assertEquals("Latest Delta commit should be 002", "002", deltaCommit.get().getTimestamp()); @@ -487,19 +487,19 @@ public class TestMergeOnReadTable { List dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()) .collect(Collectors.toList()); List recordsRead = HoodieMergeOnReadTestUtils - .getRecordsUsingInputFormat(dataFiles); + .getRecordsUsingInputFormat(dataFiles, basePath); assertEquals(recordsRead.size(), 200); // Test delta commit rollback client.rollback(newCommitTime); - metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); + 
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath()); hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg); roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), allFiles); dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()).collect(Collectors.toList()); - recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles); + recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles, basePath); assertEquals(recordsRead.size(), 200); @@ -516,13 +516,14 @@ public class TestMergeOnReadTable { assertNoWriteErrors(statuses); HoodieCompactor compactor = new HoodieRealtimeTableCompactor(); - metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); - HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig(true)); + metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath()); + HoodieTable table = HoodieTable + .getHoodieTable(metaClient, getConfig(true)); compactor.compact(jsc, getConfig(true), table, HoodieActiveTimeline.createNewCommitTime()); allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath()); - metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); + metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath()); hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg); roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCommitsTimeline(), allFiles); @@ -541,7 +542,7 @@ public class TestMergeOnReadTable { client.rollback(compactedCommitTime); allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath()); - metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath()); + metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath()); hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg); roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCommitsTimeline(), 
allFiles); diff --git a/hoodie-common/pom.xml b/hoodie-common/pom.xml index db973dfa7..181fed895 100644 --- a/hoodie-common/pom.xml +++ b/hoodie-common/pom.xml @@ -140,5 +140,11 @@ + + com.github.stefanbirkner + system-rules + 1.16.0 + test + diff --git a/hoodie-common/src/main/java/com/uber/hoodie/avro/MercifulJsonConverter.java b/hoodie-common/src/main/java/com/uber/hoodie/avro/MercifulJsonConverter.java index b775d9068..b39d64174 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/avro/MercifulJsonConverter.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/avro/MercifulJsonConverter.java @@ -77,7 +77,7 @@ public class MercifulJsonConverter { switch (schema.getType()) { case BOOLEAN: if (value instanceof Boolean) { - return (Boolean) value; + return value; } break; case DOUBLE: diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/SerializableConfiguration.java b/hoodie-common/src/main/java/com/uber/hoodie/common/SerializableConfiguration.java new file mode 100644 index 000000000..01a1a6c09 --- /dev/null +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/SerializableConfiguration.java @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2017 Uber Technologies, Inc. (hoodie-dev-group@uber.com) + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * + */ + +package com.uber.hoodie.common; + +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.Serializable; +import org.apache.hadoop.conf.Configuration; + +public class SerializableConfiguration implements Serializable { + + private transient Configuration configuration; + + public SerializableConfiguration(Configuration configuration) { + this.configuration = configuration; + } + + public Configuration get() { + return configuration; + } + + private void writeObject(ObjectOutputStream out) throws IOException { + out.defaultWriteObject(); + configuration.write(out); + } + + private void readObject(ObjectInputStream in) throws IOException { + configuration = new Configuration(false); + configuration.readFields(in); + } + + @Override + public String toString() { + StringBuilder str = new StringBuilder(); + configuration.iterator().forEachRemaining(e -> + str.append(String.format("%s => %s \n", e.getKey(), e.getValue()))); + return str.toString(); + } +} diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/model/ActionType.java b/hoodie-common/src/main/java/com/uber/hoodie/common/model/ActionType.java index 4a4427696..96e5b97d0 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/model/ActionType.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/model/ActionType.java @@ -17,5 +17,5 @@ package com.uber.hoodie.common.model; public enum ActionType { - commit, savepoint, compaction, clean, rollback; + commit, savepoint, compaction, clean, rollback } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTableMetaClient.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTableMetaClient.java index d012d9799..758d7b5db 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTableMetaClient.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/HoodieTableMetaClient.java @@ -16,6 +16,7 @@
package com.uber.hoodie.common.table; +import com.uber.hoodie.common.SerializableConfiguration; import com.uber.hoodie.common.model.HoodieTableType; import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline; import com.uber.hoodie.common.table.timeline.HoodieArchivedTimeline; @@ -26,6 +27,7 @@ import java.io.IOException; import java.io.Serializable; import java.util.Objects; import java.util.Properties; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -52,24 +54,28 @@ public class HoodieTableMetaClient implements Serializable { private String basePath; private transient FileSystem fs; private String metaPath; + private SerializableConfiguration hadoopConf; private HoodieTableType tableType; private HoodieTableConfig tableConfig; private HoodieActiveTimeline activeTimeline; private HoodieArchivedTimeline archivedTimeline; - public HoodieTableMetaClient(FileSystem fs, String basePath) throws DatasetNotFoundException { + public HoodieTableMetaClient(Configuration conf, String basePath) + throws DatasetNotFoundException { // Do not load any timeline by default - this(fs, basePath, false); + this(conf, basePath, false); } - public HoodieTableMetaClient(FileSystem fs, String basePath, boolean loadActiveTimelineOnLoad) + public HoodieTableMetaClient(Configuration conf, String basePath, + boolean loadActiveTimelineOnLoad) throws DatasetNotFoundException { log.info("Loading HoodieTableMetaClient from " + basePath); this.basePath = basePath; - this.fs = fs; + this.hadoopConf = new SerializableConfiguration(conf); Path basePathDir = new Path(this.basePath); this.metaPath = basePath + File.separator + METAFOLDER_NAME; Path metaPathDir = new Path(this.metaPath); + this.fs = getFs(); DatasetNotFoundException.checkValidDataset(fs, basePathDir, metaPathDir); this.tableConfig = new HoodieTableConfig(fs, metaPath); this.tableType = tableConfig.getTableType(); @@ 
-96,7 +102,7 @@ public class HoodieTableMetaClient implements Serializable { private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException { in.defaultReadObject(); - this.fs = FSUtils.getFs(); + fs = null; // will be lazily inited } private void writeObject(java.io.ObjectOutputStream out) @@ -136,9 +142,16 @@ public class HoodieTableMetaClient implements Serializable { * Get the FS implementation for this table */ public FileSystem getFs() { + if (fs == null) { + fs = FSUtils.getFs(metaPath, hadoopConf.get()); + } return fs; } + public Configuration getHadoopConf() { + return hadoopConf.get(); + } + /** * Get the active instants as a timeline * @@ -146,7 +159,7 @@ public class HoodieTableMetaClient implements Serializable { */ public synchronized HoodieActiveTimeline getActiveTimeline() { if (activeTimeline == null) { - activeTimeline = new HoodieActiveTimeline(fs, metaPath); + activeTimeline = new HoodieActiveTimeline(this); } return activeTimeline; } @@ -159,7 +172,7 @@ public class HoodieTableMetaClient implements Serializable { */ public synchronized HoodieArchivedTimeline getArchivedTimeline() { if (archivedTimeline == null) { - archivedTimeline = new HoodieArchivedTimeline(fs, metaPath); + archivedTimeline = new HoodieArchivedTimeline(this); } return archivedTimeline; } @@ -196,7 +209,7 @@ public class HoodieTableMetaClient implements Serializable { fs.mkdirs(metaPathDir); } HoodieTableConfig.createHoodieProperties(fs, metaPathDir, props); - HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath); + HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), basePath); log.info("Finished initializing Table of type " + metaClient.getTableConfig().getTableType() + " from " + basePath); return metaClient; diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/HoodieCompactedLogRecordScanner.java 
b/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/HoodieCompactedLogRecordScanner.java index 667199233..140d5d4fa 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/HoodieCompactedLogRecordScanner.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/HoodieCompactedLogRecordScanner.java @@ -83,7 +83,7 @@ public class HoodieCompactedLogRecordScanner implements Schema readerSchema, String latestInstantTime) { this.readerSchema = readerSchema; this.latestInstantTime = latestInstantTime; - this.hoodieTableMetaClient = new HoodieTableMetaClient(fs, basePath); + this.hoodieTableMetaClient = new HoodieTableMetaClient(fs.getConf(), basePath); // load class from the payload fully qualified class name this.payloadClassFQN = this.hoodieTableMetaClient.getTableConfig().getPayloadClass(); diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/HoodieLogFormat.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/HoodieLogFormat.java index 2979bb619..58734b165 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/HoodieLogFormat.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/HoodieLogFormat.java @@ -150,7 +150,7 @@ public interface HoodieLogFormat { public Writer build() throws IOException, InterruptedException { log.info("Building HoodieLogFormat Writer"); if (fs == null) { - fs = FSUtils.getFs(); + throw new IllegalArgumentException("fs is not specified"); } if (logFileId == null) { throw new IllegalArgumentException("FileID is not specified"); diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/block/HoodieLogBlock.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/block/HoodieLogBlock.java index 817016100..39049b25b 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/block/HoodieLogBlock.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/log/block/HoodieLogBlock.java @@ 
-58,7 +58,7 @@ public abstract class HoodieLogBlock { */ public enum LogMetadataType { INSTANT_TIME, - TARGET_INSTANT_TIME; + TARGET_INSTANT_TIME } public HoodieLogBlock(Map logMetadata) { diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieActiveTimeline.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieActiveTimeline.java index ae004991c..4c9f5c523 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieActiveTimeline.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieActiveTimeline.java @@ -37,7 +37,6 @@ import java.util.stream.Stream; import org.apache.commons.io.IOUtils; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.log4j.LogManager; import org.apache.log4j.Logger; @@ -54,11 +53,8 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline { public static final SimpleDateFormat COMMIT_FORMATTER = new SimpleDateFormat("yyyyMMddHHmmss"); - private final transient static Logger log = LogManager.getLogger(HoodieActiveTimeline.class); - private String metaPath; - private transient FileSystem fs; - + private HoodieTableMetaClient metaClient; /** * Returns next commit time in the {@link #COMMIT_FORMATTER} format. 
@@ -67,12 +63,14 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline { return HoodieActiveTimeline.COMMIT_FORMATTER.format(new Date()); } - protected HoodieActiveTimeline(FileSystem fs, String metaPath, String[] includedExtensions) { + protected HoodieActiveTimeline(HoodieTableMetaClient metaClient, String[] includedExtensions) { // Filter all the filter in the metapath and include only the extensions passed and // convert them into HoodieInstant try { this.instants = - Arrays.stream(HoodieTableMetaClient.scanFiles(fs, new Path(metaPath), path -> { + Arrays.stream( + HoodieTableMetaClient + .scanFiles(metaClient.getFs(), new Path(metaClient.getMetaPath()), path -> { // Include only the meta files with extensions that needs to be included String extension = FSUtils.getFileExtension(path.getName()); return Arrays.stream(includedExtensions).anyMatch(Predicate.isEqual(extension)); @@ -85,14 +83,13 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline { } catch (IOException e) { throw new HoodieIOException("Failed to scan metadata", e); } - this.fs = fs; - this.metaPath = metaPath; + this.metaClient = metaClient; // multiple casts will make this lambda serializable - http://docs.oracle.com/javase/specs/jls/se8/html/jls-15.html#jls-15.16 this.details = (Function> & Serializable) this::getInstantDetails; } - public HoodieActiveTimeline(FileSystem fs, String metaPath) { - this(fs, metaPath, + public HoodieActiveTimeline(HoodieTableMetaClient metaClient) { + this(metaClient, new String[]{COMMIT_EXTENSION, INFLIGHT_COMMIT_EXTENSION, DELTA_COMMIT_EXTENSION, INFLIGHT_DELTA_COMMIT_EXTENSION, SAVEPOINT_EXTENSION, INFLIGHT_SAVEPOINT_EXTENSION, CLEAN_EXTENSION, INFLIGHT_CLEAN_EXTENSION}); @@ -114,7 +111,6 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline { private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException { in.defaultReadObject(); - this.fs = FSUtils.getFs(); } /** @@ -214,9 +210,9 @@ 
public class HoodieActiveTimeline extends HoodieDefaultTimeline { public void deleteInflight(HoodieInstant instant) { log.info("Deleting in-flight " + instant); - Path inFlightCommitFilePath = new Path(metaPath, instant.getFileName()); + Path inFlightCommitFilePath = new Path(metaClient.getMetaPath(), instant.getFileName()); try { - boolean result = fs.delete(inFlightCommitFilePath, false); + boolean result = metaClient.getFs().delete(inFlightCommitFilePath, false); if (result) { log.info("Removed in-flight " + instant); } else { @@ -230,18 +226,18 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline { @Override public Optional getInstantDetails(HoodieInstant instant) { - Path detailPath = new Path(metaPath, instant.getFileName()); + Path detailPath = new Path(metaClient.getMetaPath(), instant.getFileName()); return readDataFromPath(detailPath); } protected void moveInflightToComplete(HoodieInstant inflight, HoodieInstant completed, Optional data) { - Path commitFilePath = new Path(metaPath, completed.getFileName()); + Path commitFilePath = new Path(metaClient.getMetaPath(), completed.getFileName()); try { // open a new file and write the commit metadata in - Path inflightCommitFile = new Path(metaPath, inflight.getFileName()); + Path inflightCommitFile = new Path(metaClient.getMetaPath(), inflight.getFileName()); createFileInMetaPath(inflight.getFileName(), data); - boolean success = fs.rename(inflightCommitFile, commitFilePath); + boolean success = metaClient.getFs().rename(inflightCommitFile, commitFilePath); if (!success) { throw new HoodieIOException( "Could not rename " + inflightCommitFile + " to " + commitFilePath); @@ -252,11 +248,11 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline { } protected void moveCompleteToInflight(HoodieInstant completed, HoodieInstant inflight) { - Path inFlightCommitFilePath = new Path(metaPath, inflight.getFileName()); + Path inFlightCommitFilePath = new Path(metaClient.getMetaPath(), 
inflight.getFileName()); try { - if (!fs.exists(inFlightCommitFilePath)) { - Path commitFilePath = new Path(metaPath, completed.getFileName()); - boolean success = fs.rename(commitFilePath, inFlightCommitFilePath); + if (!metaClient.getFs().exists(inFlightCommitFilePath)) { + Path commitFilePath = new Path(metaClient.getMetaPath(), completed.getFileName()); + boolean success = metaClient.getFs().rename(commitFilePath, inFlightCommitFilePath); if (!success) { throw new HoodieIOException( "Could not rename " + commitFilePath + " to " + inFlightCommitFilePath); @@ -272,15 +268,15 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline { } protected void createFileInMetaPath(String filename, Optional content) { - Path fullPath = new Path(metaPath, filename); + Path fullPath = new Path(metaClient.getMetaPath(), filename); try { if (!content.isPresent()) { - if (fs.createNewFile(fullPath)) { + if (metaClient.getFs().createNewFile(fullPath)) { log.info("Created a new file in meta path: " + fullPath); return; } } else { - FSDataOutputStream fsout = fs.create(fullPath, true); + FSDataOutputStream fsout = metaClient.getFs().create(fullPath, true); fsout.write(content.get()); fsout.close(); return; @@ -292,7 +288,7 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline { } protected Optional readDataFromPath(Path detailPath) { - try (FSDataInputStream is = fs.open(detailPath)) { + try (FSDataInputStream is = metaClient.getFs().open(detailPath)) { return Optional.of(IOUtils.toByteArray(is)); } catch (IOException e) { throw new HoodieIOException("Could not read commit details from " + detailPath, e); @@ -300,6 +296,6 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline { } public HoodieActiveTimeline reload() { - return new HoodieActiveTimeline(fs, metaPath); + return new HoodieActiveTimeline(metaClient); } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieArchivedTimeline.java 
b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieArchivedTimeline.java index 37e5e9414..bc04873af 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieArchivedTimeline.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/timeline/HoodieArchivedTimeline.java @@ -16,8 +16,8 @@ package com.uber.hoodie.common.table.timeline; +import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.HoodieTimeline; -import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.exception.HoodieIOException; import java.io.IOException; import java.io.Serializable; @@ -27,7 +27,6 @@ import java.util.Map; import java.util.Optional; import java.util.function.Function; import java.util.stream.Collectors; -import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; @@ -44,17 +43,17 @@ import org.apache.log4j.Logger; public class HoodieArchivedTimeline extends HoodieDefaultTimeline { private static final String HOODIE_COMMIT_ARCHIVE_LOG_FILE = "commits"; - private transient FileSystem fs; - private String metaPath; + private HoodieTableMetaClient metaClient; private Map readCommits = new HashMap<>(); private final transient static Logger log = LogManager.getLogger(HoodieArchivedTimeline.class); - public HoodieArchivedTimeline(FileSystem fs, String metaPath) { + public HoodieArchivedTimeline(HoodieTableMetaClient metaClient) { // Read back the commits to make sure - Path archiveLogPath = getArchiveLogPath(metaPath); + Path archiveLogPath = getArchiveLogPath(metaClient.getMetaPath()); try (SequenceFile.Reader reader = - new SequenceFile.Reader(fs.getConf(), SequenceFile.Reader.file(archiveLogPath))) { + new SequenceFile.Reader(metaClient.getHadoopConf(), + SequenceFile.Reader.file(archiveLogPath))) { Text key = new Text(); Text val = new Text(); while (reader.next(key, val)) { @@ -71,8 +70,7 @@ 
public class HoodieArchivedTimeline extends HoodieDefaultTimeline { } // multiple casts will make this lambda serializable - http://docs.oracle.com/javase/specs/jls/se8/html/jls-15.html#jls-15.16 this.details = (Function> & Serializable) this::getInstantDetails; - this.fs = fs; - this.metaPath = metaPath; + this.metaClient = metaClient; } /** @@ -91,7 +89,6 @@ public class HoodieArchivedTimeline extends HoodieDefaultTimeline { private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException { in.defaultReadObject(); - this.fs = FSUtils.getFs(); } @@ -105,7 +102,7 @@ public class HoodieArchivedTimeline extends HoodieDefaultTimeline { } public HoodieArchivedTimeline reload() { - return new HoodieArchivedTimeline(fs, metaPath); + return new HoodieArchivedTimeline(metaClient); } } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/table/view/HoodieTableFileSystemView.java b/hoodie-common/src/main/java/com/uber/hoodie/common/table/view/HoodieTableFileSystemView.java index afd2c89dc..60df7f471 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/table/view/HoodieTableFileSystemView.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/table/view/HoodieTableFileSystemView.java @@ -40,7 +40,6 @@ import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.commons.lang3.tuple.Pair; import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; /** @@ -57,7 +56,6 @@ public class HoodieTableFileSystemView implements TableFileSystemView, TableFileSystemView.RealtimeView, Serializable { protected HoodieTableMetaClient metaClient; - protected transient FileSystem fs; // This is the commits that will be visible for all views extending this view protected HoodieTimeline visibleActiveTimeline; @@ -72,7 +70,6 @@ public class HoodieTableFileSystemView implements TableFileSystemView, public HoodieTableFileSystemView(HoodieTableMetaClient 
metaClient, HoodieTimeline visibleActiveTimeline) { this.metaClient = metaClient; - this.fs = metaClient.getFs(); this.visibleActiveTimeline = visibleActiveTimeline; this.fileGroupMap = new HashMap<>(); this.partitionToFileGroupsMap = new HashMap<>(); @@ -98,7 +95,6 @@ public class HoodieTableFileSystemView implements TableFileSystemView, private void readObject(java.io.ObjectInputStream in) throws IOException, ClassNotFoundException { in.defaultReadObject(); - this.fs = FSUtils.getFs(); } private void writeObject(java.io.ObjectOutputStream out) @@ -255,8 +251,8 @@ public class HoodieTableFileSystemView implements TableFileSystemView, try { // Create the path if it does not exist already Path partitionPath = new Path(metaClient.getBasePath(), partitionPathStr); - FSUtils.createPathIfNotExists(fs, partitionPath); - FileStatus[] statuses = fs.listStatus(partitionPath); + FSUtils.createPathIfNotExists(metaClient.getFs(), partitionPath); + FileStatus[] statuses = metaClient.getFs().listStatus(partitionPath); List fileGroups = addFilesToView(statuses); return fileGroups.stream(); } catch (IOException e) { diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/util/AvroUtils.java b/hoodie-common/src/main/java/com/uber/hoodie/common/util/AvroUtils.java index ae0dbd3f0..633960d0a 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/util/AvroUtils.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/util/AvroUtils.java @@ -94,7 +94,7 @@ public class AvroUtils { public static HoodieCleanMetadata convertCleanMetadata(String startCleanTime, Optional durationInMs, List cleanStats) { ImmutableMap.Builder partitionMetadataBuilder = - ImmutableMap.builder(); + ImmutableMap.builder(); int totalDeleted = 0; String earliestCommitToRetain = null; for (HoodieCleanStat stat : cleanStats) { @@ -116,7 +116,7 @@ public class AvroUtils { public static HoodieRollbackMetadata convertRollbackMetadata(String startRollbackTime, Optional durationInMs, List commits, 
List stats) { ImmutableMap.Builder partitionMetadataBuilder = - ImmutableMap.builder(); + ImmutableMap.builder(); int totalDeleted = 0; for (HoodieRollbackStat stat : stats) { HoodieRollbackPartitionMetadata metadata = @@ -132,7 +132,7 @@ public class AvroUtils { public static HoodieSavepointMetadata convertSavepointMetadata(String user, String comment, Map> latestFiles) { ImmutableMap.Builder partitionMetadataBuilder = - ImmutableMap.builder(); + ImmutableMap.builder(); for (Map.Entry> stat : latestFiles.entrySet()) { HoodieSavepointPartitionMetadata metadata = new HoodieSavepointPartitionMetadata(stat.getKey(), stat.getValue()); diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/util/FSUtils.java b/hoodie-common/src/main/java/com/uber/hoodie/common/util/FSUtils.java index daecf6237..d788cdd44 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/util/FSUtils.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/util/FSUtils.java @@ -29,6 +29,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; import java.util.List; +import java.util.Map.Entry; import java.util.Optional; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -55,6 +56,7 @@ public class FSUtils { private static final int MAX_ATTEMPTS_RECOVER_LEASE = 10; private static final long MIN_CLEAN_TO_KEEP = 10; private static final long MIN_ROLLBACK_TO_KEEP = 10; + private static final String HOODIE_ENV_PROPS_PREFIX = "HOODIE_ENV_"; private static FileSystem fs; /** @@ -65,17 +67,32 @@ public class FSUtils { FSUtils.fs = fs; } + public static Configuration prepareHadoopConf(Configuration conf) { + conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); + conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); - public static FileSystem getFs() { + // look for all properties, prefixed to be picked up + for (Entry prop : System.getenv().entrySet()) { + if 
(prop.getKey().startsWith(HOODIE_ENV_PROPS_PREFIX)) { + LOG.info("Picking up value for hoodie env var :" + prop.getKey()); + conf.set(prop.getKey() + .replace(HOODIE_ENV_PROPS_PREFIX, "") + .replaceAll("_DOT_", "."), + prop.getValue()); + } + } + return conf; + } + + + public static FileSystem getFs(String path, Configuration conf) { if (fs != null) { return fs; } - Configuration conf = new Configuration(); - conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName()); - conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName()); FileSystem fs; + conf = prepareHadoopConf(conf); try { - fs = FileSystem.get(conf); + fs = new Path(path).getFileSystem(conf); } catch (IOException e) { throw new HoodieIOException("Failed to get instance of " + FileSystem.class.getName(), e); @@ -83,7 +100,6 @@ public class FSUtils { LOG.info( String.format("Hadoop Configuration: fs.defaultFS: [%s], Config:[%s], FileSystem: [%s]", conf.getRaw("fs.defaultFS"), conf.toString(), fs.toString())); - return fs; } diff --git a/hoodie-common/src/main/java/com/uber/hoodie/common/util/ParquetUtils.java b/hoodie-common/src/main/java/com/uber/hoodie/common/util/ParquetUtils.java index a4a683350..ecbaad1c9 100644 --- a/hoodie-common/src/main/java/com/uber/hoodie/common/util/ParquetUtils.java +++ b/hoodie-common/src/main/java/com/uber/hoodie/common/util/ParquetUtils.java @@ -51,10 +51,11 @@ public class ParquetUtils { * Read the rowKey list from the given parquet file. * * @param filePath The parquet file path. 
+ * @param configuration configuration to build fs object */ - public static Set readRowKeysFromParquet(Path filePath) { - Configuration conf = new Configuration(); - conf.addResource(getFs().getConf()); + public static Set readRowKeysFromParquet(Configuration configuration, Path filePath) { + Configuration conf = new Configuration(configuration); + conf.addResource(getFs(filePath.toString(), conf).getConf()); Schema readSchema = HoodieAvroUtils.getRecordKeySchema(); AvroReadSupport.setAvroReadSchema(conf, readSchema); AvroReadSupport.setRequestedProjection(conf, readSchema); @@ -84,19 +85,12 @@ public class ParquetUtils { return rowKeys; } - - /** - * Read the metadata from a parquet file - */ - public static ParquetMetadata readMetadata(Path parquetFilePath) { - return readMetadata(new Configuration(), parquetFilePath); - } - public static ParquetMetadata readMetadata(Configuration conf, Path parquetFilePath) { ParquetMetadata footer; try { // TODO(vc): Should we use the parallel reading version here? - footer = ParquetFileReader.readFooter(getFs().getConf(), parquetFilePath); + footer = ParquetFileReader + .readFooter(getFs(parquetFilePath.toString(), conf).getConf(), parquetFilePath); } catch (IOException e) { throw new HoodieIOException("Failed to read footer for parquet " + parquetFilePath, e); @@ -108,14 +102,15 @@ public class ParquetUtils { /** * Get the schema of the given parquet file. */ - public static MessageType readSchema(Path parquetFilePath) { - return readMetadata(parquetFilePath).getFileMetaData().getSchema(); + public static MessageType readSchema(Configuration configuration, Path parquetFilePath) { + return readMetadata(configuration, parquetFilePath).getFileMetaData().getSchema(); } - private static List readParquetFooter(Path parquetFilePath, String... footerNames) { + private static List readParquetFooter(Configuration configuration, Path parquetFilePath, + String... 
footerNames) { List footerVals = new ArrayList<>(); - ParquetMetadata footer = readMetadata(parquetFilePath); + ParquetMetadata footer = readMetadata(configuration, parquetFilePath); Map metadata = footer.getFileMetaData().getKeyValueMetaData(); for (String footerName : footerNames) { if (metadata.containsKey(footerName)) { @@ -128,21 +123,22 @@ public class ParquetUtils { return footerVals; } - public static Schema readAvroSchema(Path parquetFilePath) { - return new AvroSchemaConverter().convert(readSchema(parquetFilePath)); + public static Schema readAvroSchema(Configuration configuration, Path parquetFilePath) { + return new AvroSchemaConverter().convert(readSchema(configuration, parquetFilePath)); } /** * Read out the bloom filter from the parquet file meta data. */ - public static BloomFilter readBloomFilterFromParquetMetadata(Path parquetFilePath) { - String footerVal = readParquetFooter(parquetFilePath, + public static BloomFilter readBloomFilterFromParquetMetadata(Configuration configuration, + Path parquetFilePath) { + String footerVal = readParquetFooter(configuration, parquetFilePath, HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY).get(0); return new BloomFilter(footerVal); } - public static String[] readMinMaxRecordKeys(Path parquetFilePath) { - List minMaxKeys = readParquetFooter(parquetFilePath, + public static String[] readMinMaxRecordKeys(Configuration configuration, Path parquetFilePath) { + List minMaxKeys = readParquetFooter(configuration, parquetFilePath, HoodieAvroWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER, HoodieAvroWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER); if (minMaxKeys.size() != 2) { @@ -156,11 +152,11 @@ public class ParquetUtils { /** * NOTE: This literally reads the entire file contents, thus should be used with caution. 
*/ - public static List readAvroRecords(Path filePath) { + public static List readAvroRecords(Configuration configuration, Path filePath) { ParquetReader reader = null; List records = new ArrayList<>(); try { - reader = AvroParquetReader.builder(filePath).build(); + reader = AvroParquetReader.builder(filePath).withConf(configuration).build(); Object obj = reader.read(); while (obj != null) { if (obj instanceof GenericRecord) { diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/minicluster/HdfsTestService.java b/hoodie-common/src/test/java/com/uber/hoodie/common/minicluster/HdfsTestService.java index 0b3a6bdeb..74cc1104b 100644 --- a/hoodie-common/src/test/java/com/uber/hoodie/common/minicluster/HdfsTestService.java +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/minicluster/HdfsTestService.java @@ -19,6 +19,7 @@ package com.uber.hoodie.common.minicluster; import com.google.common.base.Preconditions; import com.google.common.io.Files; +import com.uber.hoodie.common.model.HoodieTestUtils; import java.io.File; import java.io.IOException; import org.apache.commons.io.FileUtils; @@ -54,7 +55,7 @@ public class HdfsTestService { private MiniDFSCluster miniDfsCluster; public HdfsTestService() { - hadoopConf = new Configuration(); + hadoopConf = HoodieTestUtils.getDefaultHadoopConf(); workDir = Files.createTempDir().getAbsolutePath(); } @@ -67,7 +68,7 @@ public class HdfsTestService { .checkState(workDir != null, "The work dir must be set before starting cluster."); if (hadoopConf == null) { - hadoopConf = new Configuration(); + hadoopConf = HoodieTestUtils.getDefaultHadoopConf(); } // If clean, then remove the work dir so we can start fresh. 
diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/model/HoodieTestUtils.java b/hoodie-common/src/test/java/com/uber/hoodie/common/model/HoodieTestUtils.java index 9489e57ab..7017bbf91 100644 --- a/hoodie-common/src/test/java/com/uber/hoodie/common/model/HoodieTestUtils.java +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/model/HoodieTestUtils.java @@ -58,6 +58,7 @@ import java.util.stream.Stream; import org.apache.avro.Schema; import org.apache.avro.generic.GenericRecord; import org.apache.avro.generic.IndexedRecord; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; @@ -69,18 +70,23 @@ import org.junit.rules.TemporaryFolder; public class HoodieTestUtils { - public static FileSystem fs = FSUtils.getFs(); + public static FileSystem fs; public static final String TEST_EXTENSION = ".test"; public static final String RAW_TRIPS_TEST_NAME = "raw_trips"; public static final int DEFAULT_TASK_PARTITIONID = 1; public static final String[] DEFAULT_PARTITION_PATHS = {"2016/03/15", "2015/03/16", "2015/03/17"}; private static Random rand = new Random(46474747); - public static void resetFS() { - HoodieTestUtils.fs = FSUtils.getFs(); + public static void resetFS(String basePath) { + HoodieTestUtils.fs = FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf()); + } + + public static Configuration getDefaultHadoopConf() { + return new Configuration(); } public static HoodieTableMetaClient init(String basePath) throws IOException { + fs = FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf()); return initTableType(basePath, HoodieTableType.COPY_ON_WRITE); } @@ -211,7 +217,7 @@ public class HoodieTestUtils { Path commitFile = new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline .makeCleanerFileName(commitTime)); - FileSystem fs = FSUtils.getFs(); + FileSystem fs = FSUtils.getFs(basePath, 
HoodieTestUtils.getDefaultHadoopConf()); FSDataOutputStream os = fs.create(commitFile, true); try { HoodieCleanStat cleanStats = new HoodieCleanStat( diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/table/log/HoodieLogFormatTest.java b/hoodie-common/src/test/java/com/uber/hoodie/common/table/log/HoodieLogFormatTest.java index 8fc7fb46f..084cc1f12 100644 --- a/hoodie-common/src/test/java/com/uber/hoodie/common/table/log/HoodieLogFormatTest.java +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/table/log/HoodieLogFormatTest.java @@ -67,7 +67,7 @@ public class HoodieLogFormatTest { private FileSystem fs; private Path partitionPath; - private String basePath; + private static String basePath; @BeforeClass public static void setUpClass() throws IOException, InterruptedException { @@ -78,7 +78,7 @@ public class HoodieLogFormatTest { @AfterClass public static void tearDownClass() { MiniClusterUtil.shutdown(); - HoodieTestUtils.resetFS(); + HoodieTestUtils.resetFS(basePath); } @Before @@ -343,7 +343,7 @@ public class HoodieLogFormatTest { writer.close(); // Append some arbit byte[] to thee end of the log (mimics a partially written commit) - fs = FileSystem.get(fs.getConf()); + fs = FSUtils.getFs(fs.getUri().toString(), fs.getConf()); FSDataOutputStream outputStream = fs.append(writer.getLogFile().getPath()); // create a block with outputStream.write(HoodieLogFormat.MAGIC); @@ -533,7 +533,7 @@ public class HoodieLogFormatTest { writer.close(); // Append some arbit byte[] to thee end of the log (mimics a partially written commit) - fs = FileSystem.get(fs.getConf()); + fs = FSUtils.getFs(fs.getUri().toString(), fs.getConf()); FSDataOutputStream outputStream = fs.append(writer.getLogFile().getPath()); // create a block with outputStream.write(HoodieLogFormat.MAGIC); diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/table/string/HoodieActiveTimelineTest.java 
b/hoodie-common/src/test/java/com/uber/hoodie/common/table/string/HoodieActiveTimelineTest.java index 22285a6c5..189be698d 100644 --- a/hoodie-common/src/test/java/com/uber/hoodie/common/table/string/HoodieActiveTimelineTest.java +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/table/string/HoodieActiveTimelineTest.java @@ -74,7 +74,7 @@ public class HoodieActiveTimelineTest { HoodieInstant instant5 = new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, "9"); - timeline = new HoodieActiveTimeline(HoodieTestUtils.fs, metaClient.getMetaPath()); + timeline = new HoodieActiveTimeline(metaClient); timeline.saveAsComplete(instant1, Optional.empty()); timeline.saveAsComplete(instant2, Optional.empty()); timeline.saveAsComplete(instant3, Optional.empty()); @@ -98,7 +98,7 @@ public class HoodieActiveTimelineTest { @Test public void testTimelineOperationsBasic() throws Exception { - timeline = new HoodieActiveTimeline(HoodieTestUtils.fs, metaClient.getMetaPath()); + timeline = new HoodieActiveTimeline(metaClient); assertTrue(timeline.empty()); assertEquals("", 0, timeline.countInstants()); assertEquals("", Optional.empty(), timeline.firstInstant()); diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/table/view/HoodieTableFileSystemViewTest.java b/hoodie-common/src/test/java/com/uber/hoodie/common/table/view/HoodieTableFileSystemViewTest.java index 992b86416..68d32215a 100644 --- a/hoodie-common/src/test/java/com/uber/hoodie/common/table/view/HoodieTableFileSystemViewTest.java +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/table/view/HoodieTableFileSystemViewTest.java @@ -69,7 +69,7 @@ public class HoodieTableFileSystemViewTest { } private void refreshFsView(FileStatus[] statuses) { - metaClient = new HoodieTableMetaClient(HoodieTestUtils.fs, basePath, true); + metaClient = new HoodieTableMetaClient(HoodieTestUtils.fs.getConf(), basePath, true); if (statuses != null) { fsView = new HoodieTableFileSystemView(metaClient, 
metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(), diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestFSUtils.java b/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestFSUtils.java index edcc1509b..43b0ebdf3 100644 --- a/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestFSUtils.java +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestFSUtils.java @@ -16,15 +16,24 @@ package com.uber.hoodie.common.util; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; +import com.uber.hoodie.common.model.HoodieTestUtils; import java.text.SimpleDateFormat; import java.util.Date; import java.util.UUID; +import org.apache.hadoop.conf.Configuration; +import org.junit.Rule; import org.junit.Test; +import org.junit.contrib.java.lang.system.EnvironmentVariables; public class TestFSUtils { + @Rule + public final EnvironmentVariables environmentVariables + = new EnvironmentVariables(); + @Test public void testMakeDataFileName() { String commitTime = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date()); @@ -59,4 +68,15 @@ public class TestFSUtils { String fullFileName = FSUtils.makeDataFileName(commitTime, taskPartitionId, fileName); assertTrue(FSUtils.getFileId(fullFileName).equals(fileName)); } + + @Test + public void testEnvVarVariablesPickedup() { + environmentVariables.set("HOODIE_ENV_fs_DOT_key1", "value1"); + Configuration conf = FSUtils.prepareHadoopConf(HoodieTestUtils.getDefaultHadoopConf()); + assertEquals("value1", conf.get("fs.key1")); + conf.set("fs.key1", "value11"); + conf.set("fs.key2", "value2"); + assertEquals("value11", conf.get("fs.key1")); + assertEquals("value2", conf.get("fs.key2")); + } } diff --git a/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestParquetUtils.java b/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestParquetUtils.java index 266cb1158..fd0a4475a 100644 --- 
a/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestParquetUtils.java +++ b/hoodie-common/src/test/java/com/uber/hoodie/common/util/TestParquetUtils.java @@ -22,6 +22,7 @@ import static org.junit.Assert.assertTrue; import com.uber.hoodie.avro.HoodieAvroWriteSupport; import com.uber.hoodie.common.BloomFilter; import com.uber.hoodie.common.model.HoodieRecord; +import com.uber.hoodie.common.model.HoodieTestUtils; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; @@ -79,12 +80,15 @@ public class TestParquetUtils { // Read and verify List rowKeysInFile = new ArrayList<>( - ParquetUtils.readRowKeysFromParquet(new Path(filePath))); + ParquetUtils + .readRowKeysFromParquet(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath))); Collections.sort(rowKeysInFile); Collections.sort(rowKeys); assertEquals("Did not read back the expected list of keys", rowKeys, rowKeysInFile); - BloomFilter filterInFile = ParquetUtils.readBloomFilterFromParquetMetadata(new Path(filePath)); + BloomFilter filterInFile = ParquetUtils + .readBloomFilterFromParquetMetadata(HoodieTestUtils.getDefaultHadoopConf(), + new Path(filePath)); for (String rowKey : rowKeys) { assertTrue("key should be found in bloom filter", filterInFile.mightContain(rowKey)); } diff --git a/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/HoodieInputFormat.java b/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/HoodieInputFormat.java index ee4f5a954..302373e1b 100644 --- a/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/HoodieInputFormat.java +++ b/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/HoodieInputFormat.java @@ -295,6 +295,6 @@ public class HoodieInputFormat extends MapredParquetInputFormat } Path baseDir = HoodieHiveUtil.getNthParent(dataPath, levels); LOG.info("Reading hoodie metadata from path " + baseDir.toString()); - return new HoodieTableMetaClient(fs, baseDir.toString()); + return new HoodieTableMetaClient(fs.getConf(), 
baseDir.toString()); } } diff --git a/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/HoodieROTablePathFilter.java b/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/HoodieROTablePathFilter.java index c8ffbcc9a..2c01ed666 100644 --- a/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/HoodieROTablePathFilter.java +++ b/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/HoodieROTablePathFilter.java @@ -19,6 +19,7 @@ import com.uber.hoodie.common.model.HoodieDataFile; import com.uber.hoodie.common.model.HoodiePartitionMetadata; import com.uber.hoodie.common.table.HoodieTableMetaClient; import com.uber.hoodie.common.table.view.HoodieTableFileSystemView; +import com.uber.hoodie.common.util.FSUtils; import com.uber.hoodie.exception.DatasetNotFoundException; import com.uber.hoodie.exception.HoodieException; import java.io.Serializable; @@ -86,7 +87,7 @@ public class HoodieROTablePathFilter implements PathFilter, Serializable { } Path folder = null; try { - FileSystem fs = path.getFileSystem(new Configuration()); + FileSystem fs = path.getFileSystem(FSUtils.prepareHadoopConf(new Configuration())); if (fs.isDirectory(path)) { return true; } @@ -123,7 +124,7 @@ public class HoodieROTablePathFilter implements PathFilter, Serializable { if (baseDir != null) { try { HoodieTableMetaClient metaClient = - new HoodieTableMetaClient(fs, baseDir.toString()); + new HoodieTableMetaClient(fs.getConf(), baseDir.toString()); HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, metaClient.getActiveTimeline().getCommitTimeline() .filterCompletedInstants(), diff --git a/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/realtime/HoodieRealtimeRecordReader.java b/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/realtime/HoodieRealtimeRecordReader.java index 00ef57e42..35ef39eda 100644 --- a/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/realtime/HoodieRealtimeRecordReader.java +++ 
b/hoodie-hadoop-mr/src/main/java/com/uber/hoodie/hadoop/realtime/HoodieRealtimeRecordReader.java @@ -84,7 +84,7 @@ public class HoodieRealtimeRecordReader implements RecordReader " + job.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR)); try { baseFileSchema = readSchema(jobConf, split.getPath()); - readAndCompactLog(); + readAndCompactLog(jobConf); } catch (IOException e) { throw new HoodieIOException( "Could not create HoodieRealtimeRecordReader on path " + this.split.getPath(), e); @@ -110,7 +110,7 @@ public class HoodieRealtimeRecordReader implements RecordReader projectionFields = orderFields( jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR), @@ -123,7 +123,8 @@ public class HoodieRealtimeRecordReader implements RecordReader records = new ArrayList<>(); for (int i = 0; i < numberOfRecords; i++) { records.add(SchemaTestUtil.generateAvroRecordFromJson(schema, i, newCommit, "fileid0")); @@ -123,8 +129,8 @@ public class HoodieRealtimeRecordReaderTest { //create a RecordReader to be used by HoodieRealtimeRecordReader RecordReader reader = new MapredParquetInputFormat(). - getRecordReader(new FileSplit(split.getPath(), 0, - FSUtils.getFs().getLength(split.getPath()), (String[]) null), jobConf, null); + getRecordReader(new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), + (String[]) null), jobConf, null); JobConf jobConf = new JobConf(); List fields = schema.getFields(); String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(",")); @@ -183,8 +189,8 @@ public class HoodieRealtimeRecordReaderTest { //create a RecordReader to be used by HoodieRealtimeRecordReader RecordReader reader = new MapredParquetInputFormat(). 
- getRecordReader(new FileSplit(split.getPath(), 0, - FSUtils.getFs().getLength(split.getPath()), (String[]) null), jobConf, null); + getRecordReader(new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), + (String[]) null), jobConf, null); JobConf jobConf = new JobConf(); List fields = schema.getFields(); diff --git a/hoodie-hive/src/main/java/com/uber/hoodie/hive/HiveSyncTool.java b/hoodie-hive/src/main/java/com/uber/hoodie/hive/HiveSyncTool.java index 088e24a9c..472001ede 100644 --- a/hoodie-hive/src/main/java/com/uber/hoodie/hive/HiveSyncTool.java +++ b/hoodie-hive/src/main/java/com/uber/hoodie/hive/HiveSyncTool.java @@ -30,6 +30,7 @@ import java.util.List; import java.util.Map; import java.util.Optional; import java.util.stream.Collectors; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.metastore.api.Partition; @@ -183,7 +184,7 @@ public class HiveSyncTool { cmd.usage(); System.exit(1); } - FileSystem fs = FSUtils.getFs(); + FileSystem fs = FSUtils.getFs(cfg.basePath, new Configuration()); HiveConf hiveConf = new HiveConf(); hiveConf.addResource(fs.getConf()); new HiveSyncTool(cfg, hiveConf, fs).syncHoodieTable(); diff --git a/hoodie-hive/src/main/java/com/uber/hoodie/hive/HoodieHiveClient.java b/hoodie-hive/src/main/java/com/uber/hoodie/hive/HoodieHiveClient.java index dede5e5f5..95d1d5821 100644 --- a/hoodie-hive/src/main/java/com/uber/hoodie/hive/HoodieHiveClient.java +++ b/hoodie-hive/src/main/java/com/uber/hoodie/hive/HoodieHiveClient.java @@ -91,7 +91,7 @@ public class HoodieHiveClient { HoodieHiveClient(HiveSyncConfig cfg, HiveConf configuration, FileSystem fs) { this.syncConfig = cfg; this.fs = fs; - this.metaClient = new HoodieTableMetaClient(fs, cfg.basePath, true); + this.metaClient = new HoodieTableMetaClient(fs.getConf(), cfg.basePath, true); this.tableType = metaClient.getTableType(); LOG.info("Creating hive connection " + 
cfg.jdbcUrl); diff --git a/hoodie-hive/src/test/java/com/uber/hoodie/hive/util/HiveTestService.java b/hoodie-hive/src/test/java/com/uber/hoodie/hive/util/HiveTestService.java index 26ed1b0f9..7b9172ead 100644 --- a/hoodie-hive/src/test/java/com/uber/hoodie/hive/util/HiveTestService.java +++ b/hoodie-hive/src/test/java/com/uber/hoodie/hive/util/HiveTestService.java @@ -20,6 +20,7 @@ package com.uber.hoodie.hive.util; import com.google.common.base.Preconditions; import com.google.common.collect.Maps; import com.google.common.io.Files; +import com.uber.hoodie.common.model.HoodieTestUtils; import java.io.File; import java.io.IOException; import java.net.InetSocketAddress; @@ -87,7 +88,7 @@ public class HiveTestService { .checkState(workDir != null, "The work dir must be set before starting cluster."); if (hadoopConf == null) { - hadoopConf = new Configuration(); + hadoopConf = HoodieTestUtils.getDefaultHadoopConf(); } String localHiveLocation = getHiveLocation(workDir); diff --git a/hoodie-spark/src/main/java/com/uber/hoodie/HoodieDataSourceHelpers.java b/hoodie-spark/src/main/java/com/uber/hoodie/HoodieDataSourceHelpers.java index b74d91707..996786a7d 100644 --- a/hoodie-spark/src/main/java/com/uber/hoodie/HoodieDataSourceHelpers.java +++ b/hoodie-spark/src/main/java/com/uber/hoodie/HoodieDataSourceHelpers.java @@ -67,7 +67,7 @@ public class HoodieDataSourceHelpers { */ public static HoodieTimeline allCompletedCommitsCompactions(FileSystem fs, String basePath) { HoodieTable table = HoodieTable - .getHoodieTable(new HoodieTableMetaClient(fs, basePath, true), null); + .getHoodieTable(new HoodieTableMetaClient(fs.getConf(), basePath, true), null); if (table.getMetaClient().getTableType().equals(HoodieTableType.MERGE_ON_READ)) { return table.getActiveTimeline().getTimelineOfActions( Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION, diff --git a/hoodie-spark/src/main/scala/com/uber/hoodie/IncrementalRelation.scala 
b/hoodie-spark/src/main/scala/com/uber/hoodie/IncrementalRelation.scala index 4aca81f17..cf0a2c1aa 100644 --- a/hoodie-spark/src/main/scala/com/uber/hoodie/IncrementalRelation.scala +++ b/hoodie-spark/src/main/scala/com/uber/hoodie/IncrementalRelation.scala @@ -47,7 +47,7 @@ class IncrementalRelation(val sqlContext: SQLContext, private val log = LogManager.getLogger(classOf[IncrementalRelation]) val fs = new Path(basePath).getFileSystem(sqlContext.sparkContext.hadoopConfiguration) - val metaClient = new HoodieTableMetaClient(fs, basePath, true) + val metaClient = new HoodieTableMetaClient(sqlContext.sparkContext.hadoopConfiguration, basePath, true) // MOR datasets not supported yet if (metaClient.getTableType.equals(HoodieTableType.MERGE_ON_READ)) { throw new HoodieException("Incremental view not implemented yet, for merge-on-read datasets") @@ -72,7 +72,8 @@ class IncrementalRelation(val sqlContext: SQLContext, val latestMeta = HoodieCommitMetadata .fromBytes(commitTimeline.getInstantDetails(commitsToReturn.last).get) val metaFilePath = latestMeta.getFileIdAndFullPaths(basePath).values().iterator().next() - AvroConversionUtils.convertAvroSchemaToStructType(ParquetUtils.readAvroSchema(new Path(metaFilePath))) + AvroConversionUtils.convertAvroSchemaToStructType(ParquetUtils.readAvroSchema( + sqlContext.sparkContext.hadoopConfiguration, new Path(metaFilePath))) } override def schema: StructType = latestSchema diff --git a/hoodie-spark/src/test/scala/DataSourceTest.scala b/hoodie-spark/src/test/scala/DataSourceTest.scala index b9fed1cfe..1bdc92777 100644 --- a/hoodie-spark/src/test/scala/DataSourceTest.scala +++ b/hoodie-spark/src/test/scala/DataSourceTest.scala @@ -57,7 +57,7 @@ class DataSourceTest extends AssertionsForJUnit { val folder = new TemporaryFolder folder.create basePath = folder.getRoot.getAbsolutePath - fs = FSUtils.getFs + fs = FSUtils.getFs(basePath, spark.sparkContext.hadoopConfiguration) } @Test def testCopyOnWriteStorage() { diff --git 
a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HDFSParquetImporter.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HDFSParquetImporter.java index 27f264974..1d3a2b1c1 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HDFSParquetImporter.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HDFSParquetImporter.java @@ -56,21 +56,18 @@ import org.apache.spark.Accumulator; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; -import org.apache.spark.api.java.function.Function; -import org.apache.spark.api.java.function.VoidFunction; import scala.Tuple2; public class HDFSParquetImporter implements Serializable { private static volatile Logger logger = LogManager.getLogger(HDFSParquetImporter.class); private final Config cfg; - private final transient FileSystem fs; + private transient FileSystem fs; public static final SimpleDateFormat PARTITION_FORMATTER = new SimpleDateFormat("yyyy/MM/dd"); public HDFSParquetImporter( Config cfg) throws IOException { this.cfg = cfg; - fs = FSUtils.getFs(); } public static class FormatValidator implements IValueValidator { @@ -203,6 +200,7 @@ public class HDFSParquetImporter implements Serializable { } public int dataImport(JavaSparkContext jsc, int retry) throws Exception { + this.fs = FSUtils.getFs(cfg.targetPath, jsc.hadoopConfiguration()); int ret = -1; try { // Verify that targetPath is not present. @@ -251,43 +249,36 @@ public class HDFSParquetImporter implements Serializable { GenericRecord.class, job.getConfiguration()) // To reduce large number of tasks. .coalesce(16 * cfg.parallelism) - .map(new Function, HoodieRecord>() { - @Override - public HoodieRecord call(Tuple2 entry) - throws Exception { - GenericRecord genericRecord = entry._2(); - Object partitionField = genericRecord.get(cfg.partitionKey); - if (partitionField == null) { - throw new HoodieIOException( - "partition key is missing. 
:" + cfg.partitionKey); - } - Object rowField = genericRecord.get(cfg.rowKey); - if (rowField == null) { - throw new HoodieIOException( - "row field is missing. :" + cfg.rowKey); - } - long ts = (long) ((Double) partitionField * 1000l); - String partitionPath = PARTITION_FORMATTER.format(new Date(ts)); - return new HoodieRecord( - new HoodieKey((String) rowField, partitionPath), - new HoodieJsonPayload(genericRecord.toString())); - } - } + .map(entry -> { + GenericRecord genericRecord = ((Tuple2) entry)._2(); + Object partitionField = genericRecord.get(cfg.partitionKey); + if (partitionField == null) { + throw new HoodieIOException( + "partition key is missing. :" + cfg.partitionKey); + } + Object rowField = genericRecord.get(cfg.rowKey); + if (rowField == null) { + throw new HoodieIOException( + "row field is missing. :" + cfg.rowKey); + } + long ts = (long) ((Double) partitionField * 1000l); + String partitionPath = PARTITION_FORMATTER.format(new Date(ts)); + return new HoodieRecord<>( + new HoodieKey((String) rowField, partitionPath), + new HoodieJsonPayload(genericRecord.toString())); + } ); // Get commit time. 
String commitTime = client.startCommit(); JavaRDD writeResponse = client.bulkInsert(hoodieRecords, commitTime); Accumulator errors = jsc.accumulator(0); - writeResponse.foreach(new VoidFunction() { - @Override - public void call(WriteStatus writeStatus) throws Exception { + writeResponse.foreach(writeStatus -> { if (writeStatus.hasErrors()) { errors.add(1); logger.error(String.format("Error processing records :writeStatus:%s", writeStatus.getStat().toString())); } - } }); if (errors.value() == 0) { logger.info(String diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HiveIncrementalPuller.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HiveIncrementalPuller.java index 5bcba8cb8..a6fbc3f36 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HiveIncrementalPuller.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HiveIncrementalPuller.java @@ -291,7 +291,7 @@ public class HiveIncrementalPuller { if (!fs.exists(new Path(targetDataPath)) || !fs.exists(new Path(targetDataPath + "/.hoodie"))) { return "0"; } - HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, targetDataPath); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs.getConf(), targetDataPath); Optional lastCommit = metadata.getActiveTimeline().getCommitsTimeline() @@ -331,7 +331,7 @@ public class HiveIncrementalPuller { private String getLastCommitTimePulled(FileSystem fs, String sourceTableLocation) throws IOException { - HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, sourceTableLocation); + HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs.getConf(), sourceTableLocation); List commitsToSync = metadata.getActiveTimeline().getCommitsTimeline() .filterCompletedInstants() .findInstantsAfter(config.fromCommitTime, config.maxCommits).getInstants() diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieSnapshotCopier.java 
b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieSnapshotCopier.java index 3d199e0f5..41ef77276 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieSnapshotCopier.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/HoodieSnapshotCopier.java @@ -20,6 +20,7 @@ package com.uber.hoodie.utilities; import com.beust.jcommander.JCommander; import com.beust.jcommander.Parameter; +import com.uber.hoodie.common.SerializableConfiguration; import com.uber.hoodie.common.model.HoodieDataFile; import com.uber.hoodie.common.model.HoodiePartitionMetadata; import com.uber.hoodie.common.table.HoodieTableConfig; @@ -70,8 +71,10 @@ public class HoodieSnapshotCopier implements Serializable { public void snapshot(JavaSparkContext jsc, String baseDir, final String outputDir, final boolean shouldAssumeDatePartitioning) throws IOException { - FileSystem fs = FSUtils.getFs(); - final HoodieTableMetaClient tableMetadata = new HoodieTableMetaClient(fs, baseDir); + FileSystem fs = FSUtils.getFs(baseDir, jsc.hadoopConfiguration()); + final SerializableConfiguration serConf = new SerializableConfiguration( + jsc.hadoopConfiguration()); + final HoodieTableMetaClient tableMetadata = new HoodieTableMetaClient(fs.getConf(), baseDir); final TableFileSystemView.ReadOptimizedView fsView = new HoodieTableFileSystemView( tableMetadata, tableMetadata.getActiveTimeline().getCommitsTimeline() @@ -104,7 +107,7 @@ public class HoodieSnapshotCopier implements Serializable { jsc.parallelize(partitions, partitions.size()) .flatMap(partition -> { // Only take latest version files <= latestCommit. 
- FileSystem fs1 = FSUtils.getFs(); + FileSystem fs1 = FSUtils.getFs(baseDir, serConf.get()); List> filePaths = new ArrayList<>(); Stream dataFiles = fsView .getLatestDataFilesBeforeOrOn(partition, latestCommitTimestamp); @@ -123,13 +126,13 @@ public class HoodieSnapshotCopier implements Serializable { String partition = tuple._1(); Path sourceFilePath = new Path(tuple._2()); Path toPartitionPath = new Path(outputDir, partition); - FileSystem fs1 = FSUtils.getFs(); + FileSystem ifs = FSUtils.getFs(baseDir, serConf.get()); - if (!fs1.exists(toPartitionPath)) { - fs1.mkdirs(toPartitionPath); + if (!ifs.exists(toPartitionPath)) { + ifs.mkdirs(toPartitionPath); } - FileUtil.copy(fs1, sourceFilePath, fs1, - new Path(toPartitionPath, sourceFilePath.getName()), false, fs1.getConf()); + FileUtil.copy(ifs, sourceFilePath, ifs, + new Path(toPartitionPath, sourceFilePath.getName()), false, ifs.getConf()); }); // Also copy the .commit files diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/HoodieDeltaStreamer.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/HoodieDeltaStreamer.java index 2baafd036..6a63c33fc 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/HoodieDeltaStreamer.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/deltastreamer/HoodieDeltaStreamer.java @@ -115,10 +115,11 @@ public class HoodieDeltaStreamer implements Serializable { public HoodieDeltaStreamer(Config cfg) throws IOException { this.cfg = cfg; - this.fs = FSUtils.getFs(); + this.jssc = getSparkContext(); + this.fs = FSUtils.getFs(cfg.targetBasePath, jssc.hadoopConfiguration()); if (fs.exists(new Path(cfg.targetBasePath))) { - HoodieTableMetaClient meta = new HoodieTableMetaClient(fs, cfg.targetBasePath); + HoodieTableMetaClient meta = new HoodieTableMetaClient(fs.getConf(), cfg.targetBasePath); this.commitTimelineOpt = Optional .of(meta.getActiveTimeline().getCommitsTimeline() 
.filterCompletedInstants()); @@ -129,8 +130,6 @@ public class HoodieDeltaStreamer implements Serializable { //TODO(vc) Should these be passed from outside? initSchemaProvider(); initKeyGenerator(); - this.jssc = getSparkContext(); - initSource(); } @@ -203,7 +202,9 @@ public class HoodieDeltaStreamer implements Serializable { Properties properties = new Properties(); properties.put(HoodieWriteConfig.TABLE_NAME, cfg.targetTableName); HoodieTableMetaClient - .initializePathAsHoodieDataset(FSUtils.getFs(), cfg.targetBasePath, properties); + .initializePathAsHoodieDataset( + FSUtils.getFs(cfg.targetBasePath, jssc.hadoopConfiguration()), cfg.targetBasePath, + properties); } log.info("Checkpoint to resume from : " + resumeCheckpointStr); diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/schema/FilebasedSchemaProvider.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/schema/FilebasedSchemaProvider.java index f6ea67f01..44745b093 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/schema/FilebasedSchemaProvider.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/schema/FilebasedSchemaProvider.java @@ -25,6 +25,7 @@ import java.io.IOException; import java.util.Arrays; import org.apache.avro.Schema; import org.apache.commons.configuration.PropertiesConfiguration; +import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; @@ -50,7 +51,7 @@ public class FilebasedSchemaProvider extends SchemaProvider { public FilebasedSchemaProvider(PropertiesConfiguration config) { super(config); - this.fs = FSUtils.getFs(); + this.fs = FSUtils.getFs(config.getBasePath(), new Configuration()); DataSourceUtils.checkRequiredProperties(config, Arrays.asList(Config.SOURCE_SCHEMA_FILE_PROP, Config.TARGET_SCHEMA_FILE_PROP)); diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/DFSSource.java 
b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/DFSSource.java index 128a449a4..f9c9a9a1d 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/DFSSource.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/DFSSource.java @@ -65,7 +65,7 @@ public class DFSSource extends Source { public DFSSource(PropertiesConfiguration config, JavaSparkContext sparkContext, SourceDataFormat dataFormat, SchemaProvider schemaProvider) { super(config, sparkContext, dataFormat, schemaProvider); - this.fs = FSUtils.getFs(); + this.fs = FSUtils.getFs(config.getBasePath(), sparkContext.hadoopConfiguration()); DataSourceUtils.checkRequiredProperties(config, Arrays.asList(Config.ROOT_INPUT_PATH_PROP)); } diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/HiveIncrPullSource.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/HiveIncrPullSource.java index 08c919366..aeecb9db0 100644 --- a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/HiveIncrPullSource.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/HiveIncrPullSource.java @@ -72,7 +72,7 @@ public class HiveIncrPullSource extends Source { public HiveIncrPullSource(PropertiesConfiguration config, JavaSparkContext sparkContext, SourceDataFormat dataFormat, SchemaProvider schemaProvider) { super(config, sparkContext, dataFormat, schemaProvider); - this.fs = FSUtils.getFs(); + this.fs = FSUtils.getFs(config.getBasePath(), sparkContext.hadoopConfiguration()); DataSourceUtils.checkRequiredProperties(config, Arrays.asList(Config.ROOT_INPUT_PATH_PROP)); this.incrPullRootPath = config.getString(Config.ROOT_INPUT_PATH_PROP); } diff --git a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/KafkaSource.java b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/KafkaSource.java index 2f2941e5d..36c43c596 100644 --- 
a/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/KafkaSource.java +++ b/hoodie-utilities/src/main/java/com/uber/hoodie/utilities/sources/KafkaSource.java @@ -48,7 +48,6 @@ import org.apache.spark.streaming.kafka.KafkaCluster; import org.apache.spark.streaming.kafka.KafkaUtils; import org.apache.spark.streaming.kafka.OffsetRange; import scala.Predef; -import scala.Tuple2; import scala.collection.JavaConverters; import scala.collection.immutable.Map; import scala.collection.immutable.Set; @@ -134,16 +133,16 @@ public class KafkaSource extends Source { public static Map toScalaMap(HashMap m) { return JavaConverters.mapAsScalaMapConverter(m).asScala().toMap( - Predef.>conforms() + Predef.conforms() ); } public static Set toScalaSet(HashSet s) { - return JavaConverters.asScalaSetConverter(s).asScala().toSet(); + return JavaConverters.asScalaSetConverter(s).asScala().toSet(); } public static java.util.Map toJavaMap(Map m) { - return JavaConverters.mapAsJavaMapConverter(m).asJava(); + return JavaConverters.mapAsJavaMapConverter(m).asJava(); } } diff --git a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/TestHDFSParquetImporter.java b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/TestHDFSParquetImporter.java index af0a52330..c1eaa86aa 100644 --- a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/TestHDFSParquetImporter.java +++ b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/TestHDFSParquetImporter.java @@ -24,6 +24,7 @@ import com.uber.hoodie.HoodieReadClient; import com.uber.hoodie.HoodieWriteClient; import com.uber.hoodie.common.HoodieTestDataGenerator; import com.uber.hoodie.common.minicluster.HdfsTestService; +import com.uber.hoodie.common.model.HoodieTestUtils; import com.uber.hoodie.common.table.HoodieTimeline; import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline; import com.uber.hoodie.common.util.FSUtils; @@ -38,7 +39,6 @@ import java.util.Map.Entry; import java.util.concurrent.TimeUnit; import 
java.util.concurrent.atomic.AtomicInteger; import org.apache.avro.generic.GenericRecord; -import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataOutputStream; import org.apache.hadoop.fs.LocatedFileStatus; import org.apache.hadoop.fs.Path; @@ -174,7 +174,7 @@ public class TestHDFSParquetImporter implements Serializable { ParquetWriter writer = AvroParquetWriter .builder(srcFile) .withSchema(HoodieTestDataGenerator.avroSchema) - .withConf(new Configuration()) + .withConf(HoodieTestUtils.getDefaultHadoopConf()) .build(); for (GenericRecord record : records) { writer.write(record); diff --git a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/TestHoodieSnapshotCopier.java b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/TestHoodieSnapshotCopier.java index 6f9acc489..db670673f 100644 --- a/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/TestHoodieSnapshotCopier.java +++ b/hoodie-utilities/src/test/java/com/uber/hoodie/utilities/TestHoodieSnapshotCopier.java @@ -44,17 +44,22 @@ public class TestHoodieSnapshotCopier { @Before public void init() throws IOException { - // Prepare directories - TemporaryFolder folder = new TemporaryFolder(); - folder.create(); - rootPath = folder.getRoot().getAbsolutePath(); - basePath = rootPath + "/" + HoodieTestUtils.RAW_TRIPS_TEST_NAME; - HoodieTestUtils.init(basePath); - outputPath = rootPath + "/output"; - fs = FSUtils.getFs(); - // Start a local Spark job - SparkConf conf = new SparkConf().setAppName("snapshot-test-job").setMaster("local[2]"); - jsc = new JavaSparkContext(conf); + try { + // Prepare directories + TemporaryFolder folder = new TemporaryFolder(); + folder.create(); + rootPath = "file://" + folder.getRoot().getAbsolutePath(); + basePath = rootPath + "/" + HoodieTestUtils.RAW_TRIPS_TEST_NAME; + outputPath = rootPath + "/output"; + + fs = FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf()); + HoodieTestUtils.init(basePath); + // Start a local Spark job + 
SparkConf conf = new SparkConf().setAppName("snapshot-test-job").setMaster("local[2]"); + jsc = new JavaSparkContext(conf); + } catch (Exception e) { + e.printStackTrace(); + } } @Test