Multi FS Support
- Reviving PR 191: create the FileSystem off the actual path
- Streamline all filesystem access through HoodieTableMetaClient
- Serialize the Hadoop Conf from the Spark Context and pass it to executor code as well
- Pick up env vars prefixed with HOODIE_ENV_ into the Configuration object
- Clean up usage of FSUtils.getFS, piggybacking off HoodieTableMetaClient.getFS
- Add s3a to supported schemes & support escaping "." in env vars
- Tests use HoodieTestUtils.getDefaultHadoopConf
Committed by: vinoth chandar
Parent: 44839b88c6
Commit: 0cd186c899
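Two points in the commit message (the HOODIE_ENV_ pickup and the "." escaping) are not visible in the diff itself, since FSUtils.prepareHadoopConf is only called below, not shown. A minimal sketch of what such a helper could look like, assuming a `_DOT_` escape token for dots in variable names; the real method may differ:

```java
import java.util.Map;
import org.apache.hadoop.conf.Configuration;

/**
 * Illustrative sketch only: copy environment variables prefixed with
 * HOODIE_ENV_ into a Hadoop Configuration, mapping "_DOT_" back to "."
 * (dots are not legal in env var names). The prefix handling shown here
 * is an assumption, not the verified FSUtils.prepareHadoopConf body.
 */
public class HoodieEnvConfSketch {

  private static final String HOODIE_ENV_PROPS_PREFIX = "HOODIE_ENV_";

  public static Configuration prepareHadoopConf(Configuration conf) {
    for (Map.Entry<String, String> entry : System.getenv().entrySet()) {
      if (entry.getKey().startsWith(HOODIE_ENV_PROPS_PREFIX)) {
        // e.g. HOODIE_ENV_fs_DOT_s3a_DOT_access_DOT_key -> fs.s3a.access.key
        String key = entry.getKey()
            .replace(HOODIE_ENV_PROPS_PREFIX, "")
            .replaceAll("_DOT_", ".");
        conf.set(key, entry.getValue());
      }
    }
    return conf;
  }

  public static void main(String[] args) {
    Configuration conf = prepareHadoopConf(new Configuration());
    System.out.println("fs.s3a.access.key = " + conf.get("fs.s3a.access.key"));
  }
}
```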
@@ -17,6 +17,7 @@
package com.uber.hoodie.cli;

import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.util.FSUtils;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

@@ -36,7 +37,7 @@ public class HoodieCLI {
public static boolean initConf() {
if (HoodieCLI.conf == null) {
HoodieCLI.conf = new Configuration();
HoodieCLI.conf = FSUtils.prepareHadoopConf(new Configuration());
return true;
}
return false;

@@ -54,12 +54,14 @@ public class ArchivedCommitsCommand implements CommandMarker {
System.out
.println("===============> Showing only " + limit + " archived commits <===============");
FileStatus[] fsStatuses = FSUtils.getFs().globStatus(
new Path(HoodieCLI.tableMetadata.getBasePath() + "/.hoodie/.commits_.archive*"));
String basePath = HoodieCLI.tableMetadata.getBasePath();
FileStatus[] fsStatuses = FSUtils.getFs(basePath, HoodieCLI.conf)
.globStatus(new Path(basePath + "/.hoodie/.commits_.archive*"));
List<String[]> allCommits = new ArrayList<>();
for (FileStatus fs : fsStatuses) {
//read the archived file
HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(),
HoodieLogFormat.Reader reader = HoodieLogFormat
.newReader(FSUtils.getFs(basePath, HoodieCLI.conf),
new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema(), false);
List<IndexedRecord> readRecords = new ArrayList<>();
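The recurring change in the CLI hunks above, and throughout the rest of the diff, is replacing the no-argument FSUtils.getFs() with FSUtils.getFs(path, conf): the FileSystem is resolved from the table's actual path plus a passed-in Configuration instead of the JVM-wide default. A sketch of what that resolution presumably reduces to, with simplified error handling:

```java
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/**
 * Illustrative sketch of path-based FileSystem resolution, which is what the
 * FSUtils.getFs(basePath, conf) calls in this diff appear to boil down to.
 * Hoodie wraps failures in HoodieIOException; that is simplified here.
 */
public final class FsResolutionSketch {

  public static FileSystem getFs(String pathStr, Configuration conf) {
    try {
      // Path.getFileSystem picks the implementation from the path's scheme
      // (hdfs://, s3://, s3a://, file://, ...) instead of fs.defaultFS alone.
      return new Path(pathStr).getFileSystem(conf);
    } catch (IOException e) {
      throw new RuntimeException("Failed to get FileSystem for " + pathStr, e);
    }
  }

  public static void main(String[] args) {
    Configuration conf = new Configuration();
    System.out.println(getFs("file:///tmp/hoodie_table", conf).getScheme());
  }
}
```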
@@ -77,7 +77,7 @@ public class CleansCommand implements CommandMarker {
@CliCommand(value = "cleans refresh", help = "Refresh the commits")
public String refreshCleans() throws IOException {
HoodieTableMetaClient metadata =
new HoodieTableMetaClient(HoodieCLI.fs, HoodieCLI.tableMetadata.getBasePath());
new HoodieTableMetaClient(HoodieCLI.conf, HoodieCLI.tableMetadata.getBasePath());
HoodieCLI.setTableMetadata(metadata);
return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed.";
}

@@ -96,7 +96,7 @@ public class CommitsCommand implements CommandMarker {
@CliCommand(value = "commits refresh", help = "Refresh the commits")
public String refreshCommits() throws IOException {
HoodieTableMetaClient metadata =
new HoodieTableMetaClient(HoodieCLI.fs, HoodieCLI.tableMetadata.getBasePath());
new HoodieTableMetaClient(HoodieCLI.conf, HoodieCLI.tableMetadata.getBasePath());
HoodieCLI.setTableMetadata(metadata);
return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed.";
}

@@ -224,14 +224,13 @@ public class CommitsCommand implements CommandMarker {
public String compareCommits(
@CliOption(key = {"path"}, help = "Path of the dataset to compare to")
final String path) throws Exception {
HoodieTableMetaClient target = new HoodieTableMetaClient(HoodieCLI.fs, path);
HoodieTableMetaClient target = new HoodieTableMetaClient(HoodieCLI.conf, path);
HoodieTimeline targetTimeline = target.getActiveTimeline().getCommitsTimeline()
.filterCompletedInstants();
;
HoodieTableMetaClient source = HoodieCLI.tableMetadata;
HoodieTimeline sourceTimeline = source.getActiveTimeline().getCommitsTimeline()
.filterCompletedInstants();
;
String targetLatestCommit =
targetTimeline.getInstants().iterator().hasNext() ? "0"
: targetTimeline.lastInstant().get().getTimestamp();

@@ -266,7 +265,7 @@ public class CommitsCommand implements CommandMarker {
public String syncCommits(
@CliOption(key = {"path"}, help = "Path of the dataset to compare to")
final String path) throws Exception {
HoodieCLI.syncTableMetadata = new HoodieTableMetaClient(HoodieCLI.fs, path);
HoodieCLI.syncTableMetadata = new HoodieTableMetaClient(HoodieCLI.conf, path);
HoodieCLI.state = HoodieCLI.CLIState.SYNC;
return "Load sync state between " + HoodieCLI.tableMetadata.getTableConfig().getTableName()
+ " and " + HoodieCLI.syncTableMetadata.getTableConfig().getTableName();

@@ -33,7 +33,7 @@ public class DatasetsCommand implements CommandMarker {
final String path) throws IOException {
boolean initialized = HoodieCLI.initConf();
HoodieCLI.initFS(initialized);
HoodieCLI.setTableMetadata(new HoodieTableMetaClient(HoodieCLI.fs, path));
HoodieCLI.setTableMetadata(new HoodieTableMetaClient(HoodieCLI.conf, path));
HoodieCLI.state = HoodieCLI.CLIState.DATASET;
return "Metadata for table " + HoodieCLI.tableMetadata.getTableConfig().getTableName()
+ " loaded";

@@ -138,7 +138,7 @@ public class SavepointsCommand implements CommandMarker {
@CliCommand(value = "savepoints refresh", help = "Refresh the savepoints")
public String refreshMetaClient() throws IOException {
HoodieTableMetaClient metadata =
new HoodieTableMetaClient(HoodieCLI.fs, HoodieCLI.tableMetadata.getBasePath());
new HoodieTableMetaClient(HoodieCLI.conf, HoodieCLI.tableMetadata.getBasePath());
HoodieCLI.setTableMetadata(metadata);
return "Metadata for table " + metadata.getTableConfig().getTableName() + " refreshed.";
}

@@ -97,8 +97,8 @@ public class SparkMain {
String repairedOutputPath,
String basePath)
throws Exception {
DedupeSparkJob job = new DedupeSparkJob(basePath,
duplicatedPartitionPath, repairedOutputPath, new SQLContext(jsc), FSUtils.getFs());
DedupeSparkJob job = new DedupeSparkJob(basePath, duplicatedPartitionPath, repairedOutputPath,
new SQLContext(jsc), FSUtils.getFs(basePath, jsc.hadoopConfiguration()));
job.fixDuplicates(true);
return 0;
}

@@ -18,6 +18,7 @@ package com.uber.hoodie.cli.utils;
import com.uber.hoodie.HoodieWriteClient;
import com.uber.hoodie.cli.commands.SparkMain;
import com.uber.hoodie.common.util.FSUtils;
import java.io.File;
import java.net.URISyntaxException;
import org.apache.log4j.Logger;

@@ -66,6 +67,7 @@ public class SparkUtil {
sparkConf = HoodieWriteClient.registerClasses(sparkConf);
JavaSparkContext jsc = new JavaSparkContext(sparkConf);
jsc.hadoopConfiguration().setBoolean("parquet.enable.summary-metadata", false);
FSUtils.prepareHadoopConf(jsc.hadoopConfiguration());
return jsc;
}
}

@@ -75,7 +75,7 @@ class DedupeSparkJob(basePath: String,
val tmpTableName = s"htbl_${System.currentTimeMillis()}"
val dedupeTblName = s"${tmpTableName}_dupeKeys"
val metadata = new HoodieTableMetaClient(fs, basePath)
val metadata = new HoodieTableMetaClient(fs.getConf, basePath)
val allFiles = fs.listStatus(new org.apache.hadoop.fs.Path(s"${basePath}/${duplicatedPartitionPath}"))
val fsView = new HoodieTableFileSystemView(metadata, metadata.getActiveTimeline.getCommitTimeline.filterCompletedInstants(), allFiles)

@@ -127,7 +127,7 @@ class DedupeSparkJob(basePath: String,
def fixDuplicates(dryRun: Boolean = true) = {
val metadata = new HoodieTableMetaClient(fs, basePath)
val metadata = new HoodieTableMetaClient(fs.getConf, basePath)
val allFiles = fs.listStatus(new Path(s"${basePath}/${duplicatedPartitionPath}"))
val fsView = new HoodieTableFileSystemView(metadata, metadata.getActiveTimeline.getCommitTimeline.filterCompletedInstants(), allFiles)

@@ -38,7 +38,7 @@ import scala.collection.mutable._
object SparkHelpers {
@throws[Exception]
def skipKeysAndWriteNewFile(commitTime: String, fs: FileSystem, sourceFile: Path, destinationFile: Path, keysToSkip: Set[String]) {
val sourceRecords = ParquetUtils.readAvroRecords(sourceFile)
val sourceRecords = ParquetUtils.readAvroRecords(fs.getConf, sourceFile)
val schema: Schema = sourceRecords.get(0).getSchema
val filter: BloomFilter = new BloomFilter(HoodieIndexConfig.DEFAULT_BLOOM_FILTER_NUM_ENTRIES.toInt, HoodieIndexConfig.DEFAULT_BLOOM_FILTER_FPP.toDouble)
val writeSupport: HoodieAvroWriteSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter)

@@ -47,6 +47,7 @@ object SparkHelpers {
for (rec <- sourceRecords) {
val key: String = rec.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString
if (!keysToSkip.contains(key)) {
writer.writeAvro(key, rec)
}
}

@@ -67,10 +67,10 @@ public class HoodieReadClient implements Serializable {
*/
public HoodieReadClient(JavaSparkContext jsc, String basePath) {
this.jsc = jsc;
this.fs = FSUtils.getFs();
this.fs = FSUtils.getFs(basePath, jsc.hadoopConfiguration());
// Create a Hoodie table which encapsulated the commits and files visible
this.hoodieTable = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(fs, basePath, true), null);
.getHoodieTable(new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath, true), null);
this.commitTimeline = hoodieTable.getCommitTimeline().filterCompletedInstants();
this.index =
new HoodieBloomIndex(HoodieWriteConfig.newBuilder().withPath(basePath).build(), jsc);

@@ -129,8 +129,8 @@ public class HoodieReadClient implements Serializable {
JavaPairRDD<HoodieKey, Row> keyRowRDD = originalDF.javaRDD()
.mapToPair(row -> {
HoodieKey key = new HoodieKey(
row.<String>getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD),
row.<String>getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD));
row.getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD),
row.getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD));
return new Tuple2<>(key, row);
});

@@ -53,18 +53,6 @@ import com.uber.hoodie.table.HoodieTable;
import com.uber.hoodie.table.UserDefinedBulkInsertPartitioner;
import com.uber.hoodie.table.WorkloadProfile;
import com.uber.hoodie.table.WorkloadStat;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.Partitioner;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.storage.StorageLevel;
import scala.Option;
import scala.Tuple2;
import java.io.IOException;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;

@@ -76,6 +64,17 @@ import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.Partitioner;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.storage.StorageLevel;
import scala.Option;
import scala.Tuple2;

/**
* Hoodie Write Client helps you build datasets on HDFS [insert()] and then perform efficient

@@ -112,7 +111,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
*/
public HoodieWriteClient(JavaSparkContext jsc, HoodieWriteConfig clientConfig,
boolean rollbackInFlight) {
this.fs = FSUtils.getFs();
this.fs = FSUtils.getFs(clientConfig.getBasePath(), jsc.hadoopConfiguration());
this.jsc = jsc;
this.config = clientConfig;
this.index = HoodieIndex.createIndex(config, jsc);
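The constructor change above swaps a pre-built FileSystem for jsc.hadoopConfiguration() when creating HoodieTableMetaClient, which matches the commit note about shipping the Hadoop conf to executors: a Hadoop FileSystem is not Serializable, while a Configuration can travel with Spark tasks and each side can rebuild its FileSystem from the path. A driver-side usage sketch (the Spark setup and base path below are illustrative, not from the diff):

```java
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;

/**
 * Usage sketch only: hand the Spark context's Hadoop Configuration to the
 * meta client rather than a pre-built FileSystem, mirroring the constructor
 * shape used in the replacement lines of this diff.
 */
public class MetaClientFromSparkConfSketch {
  public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("hoodie-multi-fs").setMaster("local[2]");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    String basePath = "s3a://my-bucket/hoodie/table"; // hypothetical table location

    // Same call pattern as: new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath, true)
    HoodieTableMetaClient metaClient =
        new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath, true);
    System.out.println("Loaded table at " + metaClient.getBasePath());
    jsc.stop();
  }
}
```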
@@ -133,8 +132,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
*/
public JavaRDD<HoodieRecord<T>> filterExists(JavaRDD<HoodieRecord<T>> hoodieRecords) {
// Create a Hoodie table which encapsulated the commits and files visible
HoodieTable<T> table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
HoodieTable<T> table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true),
config);
JavaRDD<HoodieRecord<T>> recordsWithLocation = index.tagLocation(hoodieRecords, table);
return recordsWithLocation.filter(v1 -> !v1.isCurrentLocationKnown());

@@ -146,8 +146,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
public JavaRDD<WriteStatus> upsert(JavaRDD<HoodieRecord<T>> records, final String commitTime) {
writeContext = metrics.getCommitCtx();
// Create a Hoodie table which encapsulated the commits and files visible
HoodieTable<T> table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
HoodieTable<T> table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true),
config);
try {
// De-dupe/merge if needed

@@ -180,8 +181,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
public JavaRDD<WriteStatus> insert(JavaRDD<HoodieRecord<T>> records, final String commitTime) {
writeContext = metrics.getCommitCtx();
// Create a Hoodie table which encapsulated the commits and files visible
HoodieTable<T> table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
HoodieTable<T> table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true),
config);
try {
// De-dupe/merge if needed
JavaRDD<HoodieRecord<T>> dedupedRecords =

@@ -237,8 +239,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
Option<UserDefinedBulkInsertPartitioner> bulkInsertPartitioner) {
writeContext = metrics.getCommitCtx();
// Create a Hoodie table which encapsulated the commits and files visible
HoodieTable<T> table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
HoodieTable<T> table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true),
config);
try {
// De-dupe/merge if needed

@@ -306,8 +309,7 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
* file instead of using HoodieCommitMetadata
*/
private void saveWorkloadProfileMetadataToInflight(WorkloadProfile profile,
HoodieTable<T> table,
String commitTime) throws HoodieCommitException {
HoodieTable<T> table, String commitTime) throws HoodieCommitException {
try {
HoodieCommitMetadata metadata = new HoodieCommitMetadata();
profile.getPartitionPaths().stream().forEach(path -> {

@@ -409,8 +411,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
logger.info("Commiting " + commitTime);
// Create a Hoodie table which encapsulated the commits and files visible
HoodieTable<T> table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
HoodieTable<T> table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true),
config);
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();

@@ -485,8 +488,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
* @return true if the savepoint was created successfully
*/
public boolean savepoint(String user, String comment) {
HoodieTable<T> table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
HoodieTable<T> table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true),
config);
if (table.getCompletedCommitTimeline().empty()) {
throw new HoodieSavepointException("Could not savepoint. Commit timeline is empty");
}

@@ -512,8 +516,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
* @return true if the savepoint was created successfully
*/
public boolean savepoint(String commitTime, String user, String comment) {
HoodieTable<T> table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
HoodieTable<T> table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true),
config);
Optional<HoodieInstant> cleanInstant = table.getCompletedCleanTimeline().lastInstant();
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION,

@@ -575,8 +580,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
* @return true if the savepoint was deleted successfully
*/
public void deleteSavepoint(String savepointTime) {
HoodieTable<T> table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
HoodieTable<T> table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true),
config);
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
HoodieInstant savePoint =

@@ -602,8 +608,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
* @return true if the savepoint was rollecback to successfully
*/
public boolean rollbackToSavepoint(String savepointTime) {
HoodieTable<T> table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
HoodieTable<T> table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true),
config);
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
HoodieTimeline commitTimeline = table.getCommitsTimeline();

@@ -653,8 +660,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
String startRollbackTime = HoodieActiveTimeline.COMMIT_FORMATTER.format(new Date());
// Create a Hoodie table which encapsulated the commits and files visible
HoodieTable<T> table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
HoodieTable<T> table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true),
config);
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
HoodieTimeline inflightTimeline = table.getInflightCommitTimeline();
HoodieTimeline commitTimeline = table.getCompletedCommitTimeline();

@@ -761,8 +769,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
final Timer.Context context = metrics.getCleanCtx();
// Create a Hoodie table which encapsulated the commits and files visible
HoodieTable<T> table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
HoodieTable<T> table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(),
true), config);
List<HoodieCleanStat> cleanStats = table.clean(jsc);
if (cleanStats.isEmpty()) {

@@ -810,8 +819,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
public void startCommitWithTime(String commitTime) {
logger.info("Generate a new commit time " + commitTime);
HoodieTable<T> table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
HoodieTable<T> table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true),
config);
HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
String commitActionType = table.getCommitActionType();
activeTimeline.createInflight(

@@ -827,8 +837,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
*/
private void compact(String compactionCommitTime) throws IOException {
// Create a Hoodie table which encapsulated the commits and files visible
HoodieTable<T> table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
HoodieTable<T> table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true),
config);
Optional<HoodieCommitMetadata> compactionMetadata = table.compact(jsc, compactionCommitTime);
if (compactionMetadata.isPresent()) {
logger.info("Compacted successfully on commit " + compactionCommitTime);

@@ -876,8 +887,9 @@ public class HoodieWriteClient<T extends HoodieRecordPayload> implements Seriali
* Cleanup all inflight commits
*/
private void rollbackInflightCommits() {
HoodieTable<T> table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
HoodieTable<T> table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true),
config);
HoodieTimeline inflightTimeline = table.getCommitsTimeline().filterInflights();
List<String> commits = inflightTimeline.getInstants().map(HoodieInstant::getTimestamp)
.collect(Collectors.toList());

@@ -72,7 +72,8 @@ public class LazyInsertIterable<T extends HoodieRecordPayload> extends
HoodieIOHandle.cleanupTmpFilesFromCurrentCommit(hoodieConfig,
commitTime,
record.getPartitionPath(),
TaskContext.getPartitionId());
TaskContext.getPartitionId(),
hoodieTable);
partitionsCleaned.add(record.getPartitionPath());
}

@@ -155,7 +155,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
// Step 3: Obtain a RDD, for each incoming record, that already exists, with the file id, that contains it.
int parallelism = autoComputeParallelism(recordsPerPartition, partitionToFileInfo,
partitionRecordKeyPairRDD);
return findMatchingFilesForRecordKeys(partitionToFileInfo, partitionRecordKeyPairRDD,
return findMatchingFilesForRecordKeys(hoodieTable, partitionToFileInfo,
partitionRecordKeyPairRDD,
parallelism);
}

@@ -257,7 +258,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
.mapToPair(ft -> {
try {
String[] minMaxKeys = ParquetUtils
.readMinMaxRecordKeys(ft._2().getFileStatus().getPath());
.readMinMaxRecordKeys(hoodieTable.getHadoopConf(),
ft._2().getFileStatus().getPath());
return new Tuple2<>(ft._1(),
new BloomIndexFileInfo(ft._2().getFileName(), minMaxKeys[0], minMaxKeys[1]));
} catch (MetadataNotFoundException me) {

@@ -358,7 +360,7 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
* Make sure the parallelism is atleast the groupby parallelism for tagging location
*/
@VisibleForTesting
JavaPairRDD<String, String> findMatchingFilesForRecordKeys(
JavaPairRDD<String, String> findMatchingFilesForRecordKeys(HoodieTable hoodieTable,
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo,
JavaPairRDD<String, String> partitionRecordKeyPairRDD,
int totalSubpartitions) {

@@ -372,7 +374,8 @@ public class HoodieBloomIndex<T extends HoodieRecordPayload> extends HoodieIndex
.sortByKey(true, joinParallelism);
return fileSortedTripletRDD
.mapPartitionsWithIndex(new HoodieBloomIndexCheckFunction(config.getBasePath()), true)
.mapPartitionsWithIndex(
new HoodieBloomIndexCheckFunction(hoodieTable, config.getBasePath()), true)
.flatMap(indexLookupResults -> indexLookupResults.iterator())
.filter(lookupResult -> lookupResult.getMatchingRecordKeys().size() > 0)
.flatMapToPair(lookupResult -> {

@@ -24,10 +24,12 @@ import com.uber.hoodie.common.util.ParquetUtils;
import com.uber.hoodie.exception.HoodieException;
import com.uber.hoodie.exception.HoodieIndexException;
import com.uber.hoodie.func.LazyIterableIterator;
import com.uber.hoodie.table.HoodieTable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

@@ -45,20 +47,24 @@ public class HoodieBloomIndexCheckFunction implements
private final String basePath;
public HoodieBloomIndexCheckFunction(String basePath) {
private final HoodieTable table;
public HoodieBloomIndexCheckFunction(HoodieTable table, String basePath) {
this.table = table;
this.basePath = basePath;
}
/**
* Given a list of row keys and one file, return only row keys existing in that file.
*/
public static List<String> checkCandidatesAgainstFile(List<String> candidateRecordKeys,
public static List<String> checkCandidatesAgainstFile(Configuration configuration,
List<String> candidateRecordKeys,
Path filePath) throws HoodieIndexException {
List<String> foundRecordKeys = new ArrayList<>();
try {
// Load all rowKeys from the file, to double-confirm
if (!candidateRecordKeys.isEmpty()) {
Set<String> fileRowKeys = ParquetUtils.readRowKeysFromParquet(filePath);
Set<String> fileRowKeys = ParquetUtils.readRowKeysFromParquet(configuration, filePath);
logger.info("Loading " + fileRowKeys.size() + " row keys from " + filePath);
if (logger.isDebugEnabled()) {
logger.debug("Keys from " + filePath + " => " + fileRowKeys);

@@ -107,7 +113,8 @@ public class HoodieBloomIndexCheckFunction implements
private void initState(String fileName, String partitionPath) throws HoodieIndexException {
try {
Path filePath = new Path(basePath + "/" + partitionPath + "/" + fileName);
bloomFilter = ParquetUtils.readBloomFilterFromParquetMetadata(filePath);
bloomFilter = ParquetUtils
.readBloomFilterFromParquetMetadata(table.getHadoopConf(), filePath);
candidateRecordKeys = new ArrayList<>();
currentFile = fileName;
currentParitionPath = partitionPath;

@@ -154,7 +161,7 @@ public class HoodieBloomIndexCheckFunction implements
.debug("#The candidate row keys for " + filePath + " => " + candidateRecordKeys);
}
ret.add(new IndexLookupResult(currentFile,
checkCandidatesAgainstFile(candidateRecordKeys, filePath)));
checkCandidatesAgainstFile(table.getHadoopConf(), candidateRecordKeys, filePath)));
initState(fileName, partitionPath);
if (bloomFilter.mightContain(recordKey)) {

@@ -177,7 +184,7 @@ public class HoodieBloomIndexCheckFunction implements
logger.debug("#The candidate row keys for " + filePath + " => " + candidateRecordKeys);
}
ret.add(new IndexLookupResult(currentFile,
checkCandidatesAgainstFile(candidateRecordKeys, filePath)));
checkCandidatesAgainstFile(table.getHadoopConf(), candidateRecordKeys, filePath)));
}
} catch (Throwable e) {
@@ -33,7 +33,6 @@ import java.util.Iterator;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;

@@ -51,14 +50,12 @@ public class HoodieCleanHelper<T extends HoodieRecordPayload<T>> {
private final HoodieTimeline commitTimeline;
private HoodieTable<T> hoodieTable;
private HoodieWriteConfig config;
private FileSystem fs;
public HoodieCleanHelper(HoodieTable<T> hoodieTable, HoodieWriteConfig config) {
this.hoodieTable = hoodieTable;
this.fileSystemView = hoodieTable.getCompletedFileSystemView();
this.commitTimeline = hoodieTable.getCompletedCommitTimeline();
this.config = config;
this.fs = hoodieTable.getFs();
}

@@ -126,7 +126,8 @@ public class HoodieCommitArchiveLog {
int minCommitsToKeep = config.getMinCommitsToKeep();
HoodieTable table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(fs, config.getBasePath(), true), config);
.getHoodieTable(new HoodieTableMetaClient(fs.getConf(), config.getBasePath(), true),
config);
// GroupBy each action and limit each action timeline to maxCommitsToKeep
HoodieTimeline cleanAndRollbackTimeline = table.getActiveTimeline()

@@ -165,7 +166,7 @@ public class HoodieCommitArchiveLog {
private boolean deleteArchivedInstants(List<HoodieInstant> archivedInstants) {
log.info("Deleting instants " + archivedInstants);
HoodieTableMetaClient metaClient =
new HoodieTableMetaClient(fs, config.getBasePath(), true);
new HoodieTableMetaClient(fs.getConf(), config.getBasePath(), true);
boolean success = true;
for (HoodieInstant archivedInstant : archivedInstants) {

@@ -188,7 +189,7 @@ public class HoodieCommitArchiveLog {
try {
HoodieTableMetaClient metaClient =
new HoodieTableMetaClient(fs, config.getBasePath(), true);
new HoodieTableMetaClient(fs.getConf(), config.getBasePath(), true);
HoodieTimeline commitTimeline =
metaClient.getActiveTimeline().getAllCommitsTimeline().filterCompletedInstants();

@@ -47,7 +47,7 @@ public abstract class HoodieIOHandle<T extends HoodieRecordPayload> {
HoodieTable<T> hoodieTable) {
this.commitTime = commitTime;
this.config = config;
this.fs = FSUtils.getFs();
this.fs = hoodieTable.getMetaClient().getFs();
this.hoodieTable = hoodieTable;
this.hoodieTimeline = hoodieTable.getCompletedCommitTimeline();
this.fileSystemView = hoodieTable.getROFileSystemView();

@@ -73,8 +73,9 @@ public abstract class HoodieIOHandle<T extends HoodieRecordPayload> {
public static void cleanupTmpFilesFromCurrentCommit(HoodieWriteConfig config,
String commitTime,
String partitionPath,
int taskPartitionId) {
FileSystem fs = FSUtils.getFs();
int taskPartitionId,
HoodieTable hoodieTable) {
FileSystem fs = hoodieTable.getMetaClient().getFs();
try {
FileStatus[] prevFailedFiles = fs.globStatus(new Path(String
.format("%s/%s/%s", config.getBasePath(), partitionPath,

@@ -16,6 +16,8 @@
package com.uber.hoodie.io.compact;
import static java.util.stream.Collectors.toList;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;

@@ -34,13 +36,6 @@ import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieCompactionException;
import com.uber.hoodie.table.HoodieCopyOnWriteTable;
import com.uber.hoodie.table.HoodieTable;
import org.apache.avro.Schema;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Collection;

@@ -49,8 +44,12 @@ import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
import static java.util.stream.Collectors.toList;
import org.apache.avro.Schema;
import org.apache.hadoop.fs.FileSystem;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;

/**
* HoodieRealtimeTableCompactor compacts a hoodie table with merge on read storage. Computes all

@@ -73,7 +72,6 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
.getTableType().name());
// TODO - rollback any compactions in flight
HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
log.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommitTime);
List<String> partitionPaths =

@@ -102,15 +100,9 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
log.info("After filtering, Compacting " + operations + " files");
List<HoodieWriteStat> updateStatusMap =
jsc.parallelize(operations, operations.size())
.map(s -> executeCompaction(metaClient, config, s, compactionCommitTime))
.flatMap(new FlatMapFunction<List<HoodieWriteStat>, HoodieWriteStat>() {
@Override
public Iterator<HoodieWriteStat> call(
List<HoodieWriteStat> hoodieWriteStats)
throws Exception {
return hoodieWriteStats.iterator();
}
}).collect();
.map(s -> executeCompaction(hoodieTable, config, s, compactionCommitTime))
.flatMap(writeStatList -> writeStatList.iterator())
.collect();
HoodieCommitMetadata metadata = new HoodieCommitMetadata(true);
for (HoodieWriteStat stat : updateStatusMap) {

@@ -134,10 +126,11 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
return true;
}
private List<HoodieWriteStat> executeCompaction(HoodieTableMetaClient metaClient,
HoodieWriteConfig config, CompactionOperation operation, String commitTime)
private List<HoodieWriteStat> executeCompaction(HoodieTable hoodieTable,
HoodieWriteConfig config, CompactionOperation operation, String commitTime)
throws IOException {
FileSystem fs = FSUtils.getFs();
FileSystem fs = hoodieTable.getMetaClient().getFs();
Schema readerSchema =
HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()));

@@ -147,7 +140,7 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
// Reads the entire avro file. Always only specific blocks should be read from the avro file (failure recover).
// Load all the delta commits since the last compaction commit and get all the blocks to be loaded and load it using CompositeAvroLogReader
// Since a DeltaCommit is not defined yet, reading all the records. revisit this soon.
HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
String maxInstantTime = metaClient.getActiveTimeline()
.getTimelineOfActions(
Sets.newHashSet(HoodieTimeline.COMMIT_ACTION,

@@ -162,8 +155,7 @@ public class HoodieRealtimeTableCompactor implements HoodieCompactor {
}
// Compacting is very similar to applying updates to existing file
HoodieCopyOnWriteTable table =
new HoodieCopyOnWriteTable(config, metaClient);
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metaClient);
Iterator<List<WriteStatus>> result = table
.handleUpdate(commitTime, operation.getFileId(), scanner.iterator());
Iterable<List<WriteStatus>> resultIterable = () -> result;

@@ -19,6 +19,7 @@ package com.uber.hoodie.io.storage;
import com.uber.hoodie.avro.HoodieAvroWriteSupport;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.HoodieAvroUtils;
import java.io.IOException;
import java.util.concurrent.atomic.AtomicLong;

@@ -26,7 +27,6 @@ import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.ParquetWriter;

@@ -51,9 +51,9 @@ public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends Indexe
private final Schema schema;
private static Configuration registerFileSystem(Configuration conf) {
private static Configuration registerFileSystem(Path file, Configuration conf) {
Configuration returnConf = new Configuration(conf);
String scheme = FileSystem.getDefaultUri(conf).getScheme();
String scheme = FSUtils.getFs(file.toString(), conf).getScheme();
returnConf.set("fs." + HoodieWrapperFileSystem.getHoodieScheme(scheme) + ".impl",
HoodieWrapperFileSystem.class.getName());
return returnConf;

@@ -67,11 +67,11 @@ public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends Indexe
parquetConfig.getPageSize(), parquetConfig.getPageSize(),
ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED,
ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED, ParquetWriter.DEFAULT_WRITER_VERSION,
registerFileSystem(parquetConfig.getHadoopConf()));
registerFileSystem(file, parquetConfig.getHadoopConf()));
this.file =
HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf());
this.fs = (HoodieWrapperFileSystem) this.file
.getFileSystem(registerFileSystem(parquetConfig.getHadoopConf()));
.getFileSystem(registerFileSystem(file, parquetConfig.getHadoopConf()));
// We cannot accurately measure the snappy compressed output file size. We are choosing a conservative 10%
// TODO - compute this compression ratio dynamically by looking at the bytes written to the stream and the actual file size reported by HDFS
this.maxFileSize = parquetConfig.getMaxFileSize() + Math

@@ -19,7 +19,6 @@ package com.uber.hoodie.io.storage;
import com.uber.hoodie.avro.HoodieAvroWriteSupport;
import com.uber.hoodie.common.BloomFilter;
import com.uber.hoodie.common.model.HoodieRecordPayload;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.table.HoodieTable;
import java.io.IOException;

@@ -37,11 +36,12 @@ public class HoodieStorageWriterFactory {
throws IOException {
//TODO - based on the metadata choose the implementation of HoodieStorageWriter
// Currently only parquet is supported
return newParquetStorageWriter(commitTime, path, config, schema);
return newParquetStorageWriter(commitTime, path, config, schema, hoodieTable);
}
private static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieStorageWriter<R> newParquetStorageWriter(
String commitTime, Path path, HoodieWriteConfig config, Schema schema) throws IOException {
String commitTime, Path path, HoodieWriteConfig config, Schema schema,
HoodieTable hoodieTable) throws IOException {
BloomFilter filter =
new BloomFilter(config.getBloomFilterNumEntries(), config.getBloomFilterFPP());
HoodieAvroWriteSupport writeSupport =

@@ -50,7 +50,7 @@ public class HoodieStorageWriterFactory {
HoodieParquetConfig parquetConfig =
new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP,
config.getParquetBlockSize(), config.getParquetPageSize(),
config.getParquetMaxFileSize(), FSUtils.getFs().getConf());
config.getParquetMaxFileSize(), hoodieTable.getHadoopConf());
return new HoodieParquetWriter<>(commitTime, path, parquetConfig, schema);
}

@@ -16,7 +16,8 @@
package com.uber.hoodie.io.storage;
import java.io.FileNotFoundException;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.exception.HoodieIOException;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;

@@ -33,7 +34,6 @@ import org.apache.hadoop.fs.ContentSummary;
import org.apache.hadoop.fs.CreateFlag;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileAlreadyExistsException;
import org.apache.hadoop.fs.FileChecksum;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;

@@ -41,17 +41,14 @@ import org.apache.hadoop.fs.FsServerDefaults;
import org.apache.hadoop.fs.FsStatus;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Options;
import org.apache.hadoop.fs.ParentNotDirectoryException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.fs.UnsupportedFileSystemException;
import org.apache.hadoop.fs.XAttrSetFlag;
import org.apache.hadoop.fs.permission.AclEntry;
import org.apache.hadoop.fs.permission.AclStatus;
import org.apache.hadoop.fs.permission.FsAction;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.security.AccessControlException;
import org.apache.hadoop.security.Credentials;
import org.apache.hadoop.security.token.Token;
import org.apache.hadoop.util.Progressable;

@@ -70,6 +67,8 @@ public class HoodieWrapperFileSystem extends FileSystem {
SUPPORT_SCHEMES.add("file");
SUPPORT_SCHEMES.add("hdfs");
SUPPORT_SCHEMES.add("s3");
SUPPORT_SCHEMES.add("s3a");

// Hoodie currently relies on underlying object store being fully
// consistent so only regional buckets should be used.

@@ -85,7 +84,12 @@ public class HoodieWrapperFileSystem extends FileSystem {
@Override
public void initialize(URI uri, Configuration conf) throws IOException {
// Get the default filesystem to decorate
fileSystem = FileSystem.get(conf);
Path path = new Path(uri);
// Remove 'hoodie-' prefix from path
if (path.toString().startsWith(HOODIE_SCHEME_PREFIX)) {
path = new Path(path.toString().replace(HOODIE_SCHEME_PREFIX, ""));
}
this.fileSystem = FSUtils.getFs(path.toString(), conf);
// Do not need to explicitly initialize the default filesystem, its done already in the above FileSystem.get
// fileSystem.initialize(FileSystem.getDefaultUri(conf), conf);
// fileSystem.setConf(conf);
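The initialize() hunk above strips the hoodie- prefix from the incoming URI before resolving the underlying FileSystem from the path, and an earlier hunk in this file adds s3a to the supported schemes. A sketch of the scheme conversion involved; the helper names mirror the diff, but the bodies are illustrative, not the verified implementation:

```java
import java.net.URI;
import org.apache.hadoop.fs.Path;

/**
 * Illustrative sketch of the scheme juggling HoodieWrapperFileSystem does:
 * a real scheme ("hdfs", "s3a", ...) gets a "hoodie-" twin so writes can be
 * routed through the wrapper, and initialize() strips the prefix again before
 * resolving the underlying FileSystem.
 */
public final class HoodieSchemeSketch {

  static final String HOODIE_SCHEME_PREFIX = "hoodie-";

  static String getHoodieScheme(String scheme) {
    // "s3a" -> "hoodie-s3a"; already-wrapped schemes pass through unchanged.
    return scheme.startsWith(HOODIE_SCHEME_PREFIX) ? scheme : HOODIE_SCHEME_PREFIX + scheme;
  }

  static Path convertPathWithScheme(Path oldPath, String newScheme) {
    URI uri = oldPath.toUri();
    return new Path(newScheme + "://" + uri.getAuthority() + uri.getPath());
  }

  public static void main(String[] args) {
    Path p = new Path("s3a://my-bucket/hoodie/table/2016/01/01/file.parquet");
    Path hoodiePath = convertPathWithScheme(p, getHoodieScheme(p.toUri().getScheme()));
    System.out.println(hoodiePath); // hoodie-s3a://my-bucket/hoodie/table/2016/01/01/file.parquet
  }
}
```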
@@ -219,7 +223,7 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
||||
}
|
||||
|
||||
@Override
|
||||
public FileStatus[] listStatus(Path f) throws FileNotFoundException, IOException {
|
||||
public FileStatus[] listStatus(Path f) throws IOException {
|
||||
return fileSystem.listStatus(convertToDefaultPath(f));
|
||||
}
|
||||
|
||||
@@ -415,19 +419,19 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
||||
|
||||
@Override
|
||||
public FileStatus[] listStatus(Path f, PathFilter filter)
|
||||
throws FileNotFoundException, IOException {
|
||||
throws IOException {
|
||||
return fileSystem.listStatus(convertToDefaultPath(f), filter);
|
||||
}
|
||||
|
||||
@Override
|
||||
public FileStatus[] listStatus(Path[] files)
|
||||
throws FileNotFoundException, IOException {
|
||||
throws IOException {
|
||||
return fileSystem.listStatus(convertDefaults(files));
|
||||
}
|
||||
|
||||
@Override
|
||||
public FileStatus[] listStatus(Path[] files, PathFilter filter)
|
||||
throws FileNotFoundException, IOException {
|
||||
throws IOException {
|
||||
return fileSystem.listStatus(convertDefaults(files), filter);
|
||||
}
|
||||
|
||||
@@ -444,13 +448,13 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
||||
|
||||
@Override
|
||||
public RemoteIterator<LocatedFileStatus> listLocatedStatus(Path f)
|
||||
throws FileNotFoundException, IOException {
|
||||
throws IOException {
|
||||
return fileSystem.listLocatedStatus(convertToDefaultPath(f));
|
||||
}
|
||||
|
||||
@Override
|
||||
public RemoteIterator<LocatedFileStatus> listFiles(Path f, boolean recursive)
|
||||
throws FileNotFoundException, IOException {
|
||||
throws IOException {
|
||||
return fileSystem.listFiles(convertToDefaultPath(f), recursive);
|
||||
}
|
||||
|
||||
@@ -571,21 +575,21 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
||||
|
||||
@Override
|
||||
public void access(Path path, FsAction mode)
|
||||
throws AccessControlException, FileNotFoundException, IOException {
|
||||
throws IOException {
|
||||
fileSystem.access(convertToDefaultPath(path), mode);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void createSymlink(Path target, Path link, boolean createParent)
|
||||
throws AccessControlException, FileAlreadyExistsException, FileNotFoundException,
|
||||
ParentNotDirectoryException, UnsupportedFileSystemException, IOException {
|
||||
throws
|
||||
IOException {
|
||||
fileSystem
|
||||
.createSymlink(convertToDefaultPath(target), convertToDefaultPath(link), createParent);
|
||||
}
|
||||
|
||||
@Override
|
||||
public FileStatus getFileLinkStatus(Path f)
|
||||
throws AccessControlException, FileNotFoundException, UnsupportedFileSystemException,
|
||||
throws
|
||||
IOException {
|
||||
return fileSystem.getFileLinkStatus(convertToDefaultPath(f));
|
||||
}
|
||||
@@ -759,8 +763,12 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
||||
}
|
||||
|
||||
public static Path convertToHoodiePath(Path file, Configuration conf) {
|
||||
String scheme = FileSystem.getDefaultUri(conf).getScheme();
|
||||
return convertPathWithScheme(file, getHoodieScheme(scheme));
|
||||
try {
|
||||
String scheme = FSUtils.getFs(file.toString(), conf).getScheme();
|
||||
return convertPathWithScheme(file, getHoodieScheme(scheme));
|
||||
} catch (HoodieIOException e) {
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
|
||||
private Path convertToDefaultPath(Path oldPath) {
|
||||
|
||||
@@ -52,7 +52,6 @@ import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.avro.generic.IndexedRecord;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
@@ -413,10 +412,10 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
throw new HoodieUpsertException("Error in finding the old file path at commit " +
|
||||
commitTime + " at fileLoc: " + fileLoc);
|
||||
} else {
|
||||
Configuration conf = FSUtils.getFs().getConf();
|
||||
AvroReadSupport.setAvroReadSchema(conf, upsertHandle.getSchema());
|
||||
AvroReadSupport.setAvroReadSchema(getHadoopConf(), upsertHandle.getSchema());
|
||||
ParquetReader<IndexedRecord> reader =
|
||||
AvroParquetReader.builder(upsertHandle.getOldFilePath()).withConf(conf).build();
|
||||
AvroParquetReader.builder(upsertHandle.getOldFilePath()).withConf(getHadoopConf())
|
||||
.build();
|
||||
try {
|
||||
IndexedRecord record;
|
||||
while ((record = reader.read()) != null) {
|
||||
@@ -500,8 +499,9 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
@Override
|
||||
public List<HoodieCleanStat> clean(JavaSparkContext jsc) {
|
||||
try {
|
||||
FileSystem fs = getMetaClient().getFs();
|
||||
List<String> partitionsToClean =
|
||||
FSUtils.getAllPartitionPaths(getFs(), getMetaClient().getBasePath(),
|
||||
FSUtils.getAllPartitionPaths(fs, getMetaClient().getBasePath(),
|
||||
config.shouldAssumeDatePartitioning());
|
||||
logger.info("Partitions to clean up : " + partitionsToClean + ", with policy " + config
|
||||
.getCleanerPolicy());
|
||||
@@ -522,7 +522,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
protected Map<FileStatus, Boolean> deleteCleanedFiles(String partitionPath, List<String> commits)
|
||||
throws IOException {
|
||||
logger.info("Cleaning path " + partitionPath);
|
||||
FileSystem fs = FSUtils.getFs();
|
||||
FileSystem fs = getMetaClient().getFs();
|
||||
FileStatus[] toBeDeleted =
|
||||
fs.listStatus(new Path(config.getBasePath(), partitionPath), path -> {
|
||||
if (!path.toString().contains(".parquet")) {
|
||||
@@ -558,7 +558,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
// delete all the data files for all these commits
|
||||
logger.info("Clean out all parquet files generated for commits: " + commits);
|
||||
List<HoodieRollbackStat> stats = jsc.parallelize(
|
||||
FSUtils.getAllPartitionPaths(FSUtils.getFs(), this.getMetaClient().getBasePath(),
|
||||
FSUtils.getAllPartitionPaths(metaClient.getFs(), getMetaClient().getBasePath(),
|
||||
config.shouldAssumeDatePartitioning()))
|
||||
.map((Function<String, HoodieRollbackStat>) partitionPath -> {
|
||||
// Scan all partitions files with this commit time
|
||||
@@ -618,7 +618,7 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
.parallelize(partitionsToClean, cleanerParallelism)
|
||||
.flatMapToPair(getFilesToDeleteFunc(this, config))
|
||||
.repartition(cleanerParallelism) // repartition to remove skews
|
||||
.mapPartitionsToPair(deleteFilesFunc(this, config))
|
||||
.mapPartitionsToPair(deleteFilesFunc(this))
|
||||
.reduceByKey(
|
||||
// merge partition level clean stats below
|
||||
(Function2<PartitionCleanStat, PartitionCleanStat, PartitionCleanStat>) (e1, e2) -> e1
|
||||
@@ -646,17 +646,17 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
}).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private PairFlatMapFunction<Iterator<Tuple2<String, String>>, String, PartitionCleanStat> deleteFilesFunc(
|
||||
HoodieTable table, HoodieWriteConfig config) {
|
||||
private static PairFlatMapFunction<Iterator<Tuple2<String, String>>, String, PartitionCleanStat> deleteFilesFunc(
|
||||
HoodieTable table) {
|
||||
return (PairFlatMapFunction<Iterator<Tuple2<String, String>>, String, PartitionCleanStat>) iter -> {
|
||||
HoodieCleanHelper cleaner = new HoodieCleanHelper(table, config);
|
||||
Map<String, PartitionCleanStat> partitionCleanStatMap = new HashMap<>();
|
||||
|
||||
FileSystem fs = table.getMetaClient().getFs();
|
||||
while (iter.hasNext()) {
|
||||
Tuple2<String, String> partitionDelFileTuple = iter.next();
|
||||
String partitionPath = partitionDelFileTuple._1();
|
||||
String deletePathStr = partitionDelFileTuple._2();
|
||||
Boolean deletedFileResult = deleteFileAndGetResult(deletePathStr);
|
||||
Boolean deletedFileResult = deleteFileAndGetResult(fs, deletePathStr);
|
||||
if (!partitionCleanStatMap.containsKey(partitionPath)) {
|
||||
partitionCleanStatMap.put(partitionPath,
|
||||
new PartitionCleanStat(partitionPath));
|
||||
@@ -682,10 +682,11 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
|
||||
};
|
||||
}
|
||||
|
||||
private Boolean deleteFileAndGetResult(String deletePathStr) throws IOException {
|
||||
private static Boolean deleteFileAndGetResult(FileSystem fs, String deletePathStr)
|
||||
throws IOException {
|
||||
Path deletePath = new Path(deletePathStr);
|
||||
logger.debug("Working on delete path :" + deletePath);
|
||||
boolean deleteResult = getFs().delete(deletePath, false);
|
||||
boolean deleteResult = fs.delete(deletePath, false);
|
||||
if (deleteResult) {
|
||||
logger.debug("Cleaned file at path :" + deletePath);
|
||||
}
|
||||
|
||||
@@ -32,19 +32,11 @@ import com.uber.hoodie.common.table.log.block.HoodieCommandBlock;
|
||||
import com.uber.hoodie.common.table.log.block.HoodieLogBlock;
|
||||
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
|
||||
import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.exception.HoodieCompactionException;
|
||||
import com.uber.hoodie.exception.HoodieRollbackException;
|
||||
import com.uber.hoodie.io.HoodieAppendHandle;
|
||||
import com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.Function;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.UncheckedIOException;
|
||||
import java.util.Arrays;
|
||||
@@ -55,6 +47,12 @@ import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.Function;
|
||||
|
||||
|
||||
/**
|
||||
@@ -74,8 +72,7 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
|
||||
|
||||
private static Logger logger = LogManager.getLogger(HoodieMergeOnReadTable.class);
|
||||
|
||||
public HoodieMergeOnReadTable(HoodieWriteConfig config,
|
||||
HoodieTableMetaClient metaClient) {
|
||||
public HoodieMergeOnReadTable(HoodieWriteConfig config, HoodieTableMetaClient metaClient) {
|
||||
super(config, metaClient);
|
||||
}
|
||||
|
||||
@@ -195,7 +192,7 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
|
||||
.onParentPath(
|
||||
new Path(this.getMetaClient().getBasePath(), partitionPath))
|
||||
.withFileId(wStat.getFileId()).overBaseCommit(wStat.getPrevCommit())
|
||||
.withFs(FSUtils.getFs())
|
||||
.withFs(getMetaClient().getFs())
|
||||
.withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();
|
||||
Long numRollbackBlocks = 0L;
|
||||
// generate metadata
|
||||
@@ -215,7 +212,8 @@ public class HoodieMergeOnReadTable<T extends HoodieRecordPayload> extends
numRollbackBlocks++;
}
filesToNumBlocksRollback
.put(FSUtils.getFs().getFileStatus(writer.getLogFile().getPath()),
.put(getMetaClient().getFs()
.getFileStatus(writer.getLogFile().getPath()),
numRollbackBlocks);
} catch (IOException | InterruptedException io) {
throw new HoodieRollbackException(

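Both rollback hunks in HoodieMergeOnReadTable now go through the table's meta client for filesystem access instead of the global FSUtils.getFs(). Putting the two hunks together, the rollback log writer would be built roughly as below; the newWriterBuilder() entry point is assumed here, it is not shown in this diff:

    HoodieLogFormat.Writer writer = HoodieLogFormat.newWriterBuilder()   // builder entry point assumed
        .onParentPath(new Path(this.getMetaClient().getBasePath(), partitionPath))
        .withFileId(wStat.getFileId()).overBaseCommit(wStat.getPrevCommit())
        .withFs(getMetaClient().getFs())                                 // FS comes from the meta client now
        .withFileExtension(HoodieLogFile.DELTA_EXTENSION).build();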
@@ -34,12 +34,6 @@ import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.exception.HoodieCommitException;
|
||||
import com.uber.hoodie.exception.HoodieException;
|
||||
import com.uber.hoodie.exception.HoodieSavepointException;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.Partitioner;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
import java.util.Iterator;
|
||||
@@ -47,6 +41,11 @@ import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.Partitioner;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
|
||||
/**
|
||||
* Abstract implementation of a HoodieTable
|
||||
@@ -55,6 +54,7 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
|
||||
|
||||
protected final HoodieWriteConfig config;
|
||||
protected final HoodieTableMetaClient metaClient;
|
||||
|
||||
private static Logger logger = LogManager.getLogger(HoodieTable.class);
|
||||
|
||||
protected HoodieTable(HoodieWriteConfig config, HoodieTableMetaClient metaClient) {
|
||||
@@ -87,8 +87,8 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
return metaClient;
}

public FileSystem getFs() {
return metaClient.getFs();
public Configuration getHadoopConf() {
return metaClient.getHadoopConf();
}

/**

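HoodieTable no longer hands out a FileSystem of its own; callers ask it for the Hadoop Configuration, and anything that still needs an FS goes through the meta client, which is bound to the table's base path. A small sketch of the resulting call sites (illustrative only):

    Configuration hadoopConf = table.getHadoopConf();   // replaces the old table.getFs()
    FileSystem fs = table.getMetaClient().getFs();      // FS resolved for the table's base path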
@@ -81,7 +81,7 @@ public class HoodieClientExample {

// initialize the table, if not done already
Path path = new Path(tablePath);
FileSystem fs = FSUtils.getFs();
FileSystem fs = FSUtils.getFs(tablePath, jsc.hadoopConfiguration());
if (!fs.exists(path)) {
HoodieTableMetaClient
.initTableType(fs, tablePath, HoodieTableType.valueOf(tableType), tableName,

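In the client example, the FileSystem is now derived from the table path together with the Spark context's Hadoop configuration, so a file://, hdfs:// or s3a:// base path resolves to the right implementation. A hedged end-to-end sketch of the initialization; the payload class argument follows the pattern used elsewhere in this change:

    FileSystem fs = FSUtils.getFs(tablePath, jsc.hadoopConfiguration());
    if (!fs.exists(new Path(tablePath))) {
      HoodieTableMetaClient.initTableType(fs, tablePath, HoodieTableType.valueOf(tableType),
          tableName, HoodieAvroPayload.class.getName());
    }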
@@ -84,6 +84,7 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
|
||||
private transient JavaSparkContext jsc = null;
|
||||
private transient SQLContext sqlContext;
|
||||
private transient FileSystem fs;
|
||||
private String basePath = null;
|
||||
private transient HoodieTestDataGenerator dataGen = null;
|
||||
private String[] partitionPaths = {"2016/01/01", "2016/02/02", "2016/06/02"};
|
||||
@@ -100,6 +101,7 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
TemporaryFolder folder = new TemporaryFolder();
|
||||
folder.create();
|
||||
basePath = folder.getRoot().getAbsolutePath();
|
||||
fs = FSUtils.getFs(basePath.toString(), jsc.hadoopConfiguration());
|
||||
HoodieTestUtils.init(basePath);
|
||||
dataGen = new HoodieTestDataGenerator();
|
||||
}
|
||||
@@ -212,7 +214,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
HoodieWriteConfig cfg = getConfig();
|
||||
HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
|
||||
HoodieIndex index = HoodieIndex.createIndex(cfg, jsc);
|
||||
FileSystem fs = FSUtils.getFs();
|
||||
|
||||
/**
|
||||
* Write 1 (only inserts)
|
||||
@@ -230,8 +231,9 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
assertPartitionMetadata(HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, fs);

// verify that there is a commit
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath())
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
basePath);
HoodieTimeline timeline = new HoodieActiveTimeline(metaClient)
.getCommitTimeline();

assertEquals("Expecting a single commit.", 1,
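This is the pattern repeated throughout the client tests: the meta client is constructed from a Hadoop Configuration rather than a FileSystem, and the active timeline is built from the meta client. Consolidated, the new idiom looks roughly like:

    HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
    HoodieTimeline timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
    HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());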
@@ -242,7 +244,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
records.size(),
|
||||
HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count());
|
||||
// Should have 100 records in table (check using Index), all in locations marked at commit
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
|
||||
HoodieTable table = HoodieTable
|
||||
.getHoodieTable(metaClient, getConfig());
|
||||
|
||||
List<HoodieRecord> taggedRecords = index.tagLocation(jsc.parallelize(records, 1), table)
|
||||
.collect();
|
||||
@@ -268,13 +271,13 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
assertNoWriteErrors(statuses);
|
||||
|
||||
// verify there are now 2 commits
|
||||
timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline();
|
||||
timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
|
||||
assertEquals("Expecting two commits.",
|
||||
timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), 2);
|
||||
assertEquals("Latest commit should be 004", timeline.lastInstant().get().getTimestamp(),
|
||||
newCommitTime);
|
||||
|
||||
metaClient = new HoodieTableMetaClient(fs, basePath);
|
||||
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
||||
table = HoodieTable.getHoodieTable(metaClient, getConfig());
|
||||
|
||||
// Index should be able to locate all updates in correct locations.
|
||||
@@ -305,7 +308,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
HoodieWriteConfig cfg = getConfig();
|
||||
HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
|
||||
HoodieIndex index = HoodieIndex.createIndex(cfg, jsc);
|
||||
FileSystem fs = FSUtils.getFs();
|
||||
|
||||
/**
|
||||
* Write 1 (inserts and deletes)
|
||||
@@ -326,8 +328,9 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
assertNoWriteErrors(statuses);
|
||||
|
||||
// verify that there is a commit
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath())
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
|
||||
basePath);
|
||||
HoodieTimeline timeline = new HoodieActiveTimeline(metaClient)
|
||||
.getCommitTimeline();
|
||||
assertEquals("Expecting a single commit.", 1,
|
||||
timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
|
||||
@@ -336,7 +339,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
assertEquals("Must contain 200 records", fewRecordsForInsert.size(),
|
||||
HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count());
|
||||
// Should have 100 records in table (check using Index), all in locations marked at commit
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
|
||||
HoodieTable table = HoodieTable
|
||||
.getHoodieTable(metaClient, getConfig());
|
||||
|
||||
List<HoodieRecord> taggedRecords = index
|
||||
.tagLocation(jsc.parallelize(fewRecordsForInsert, 1), table).collect();
|
||||
@@ -359,7 +363,7 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
assertNoWriteErrors(statuses);
|
||||
|
||||
// verify there are now 2 commits
|
||||
timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath()).getCommitTimeline();
|
||||
timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
|
||||
assertEquals("Expecting two commits.",
|
||||
timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), 2);
|
||||
assertEquals("Latest commit should be 004", timeline.lastInstant().get().getTimestamp(),
|
||||
@@ -393,7 +397,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
.withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(1)
|
||||
.build()).build();
|
||||
HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
|
||||
FileSystem fs = FSUtils.getFs();
|
||||
HoodieTestDataGenerator
|
||||
.writePartitionMetadata(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, basePath);
|
||||
|
||||
@@ -451,8 +454,10 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
|
||||
List<String> partitionPaths = FSUtils
|
||||
.getAllPartitionPaths(fs, cfg.getBasePath(), getConfig().shouldAssumeDatePartitioning());
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
|
||||
basePath);
|
||||
HoodieTable table = HoodieTable
|
||||
.getHoodieTable(metaClient, getConfig());
|
||||
final TableFileSystemView.ReadOptimizedView view = table.getROFileSystemView();
|
||||
List<HoodieDataFile> dataFiles = partitionPaths.stream().flatMap(s -> {
|
||||
return view.getAllDataFiles(s).filter(f -> f.getCommitTime().equals("002"));
|
||||
@@ -472,7 +477,7 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
// Verify there are no errors
|
||||
assertNoWriteErrors(statuses);
|
||||
|
||||
metaClient = new HoodieTableMetaClient(fs, basePath);
|
||||
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
||||
table = HoodieTable.getHoodieTable(metaClient, getConfig());
|
||||
final TableFileSystemView.ReadOptimizedView view1 = table.getROFileSystemView();
|
||||
dataFiles = partitionPaths.stream().flatMap(s -> {
|
||||
@@ -490,7 +495,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
.withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(1)
|
||||
.build()).build();
|
||||
HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
|
||||
FileSystem fs = FSUtils.getFs();
|
||||
HoodieTestDataGenerator
|
||||
.writePartitionMetadata(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, basePath);
|
||||
|
||||
@@ -531,7 +535,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
assertNoWriteErrors(statuses);
|
||||
List<String> partitionPaths = FSUtils
|
||||
.getAllPartitionPaths(fs, cfg.getBasePath(), getConfig().shouldAssumeDatePartitioning());
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
|
||||
basePath);
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
|
||||
final TableFileSystemView.ReadOptimizedView view1 = table.getROFileSystemView();
|
||||
|
||||
@@ -551,7 +556,7 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
// Verify there are no errors
|
||||
assertNoWriteErrors(statuses);
|
||||
|
||||
metaClient = new HoodieTableMetaClient(fs, basePath);
|
||||
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
||||
table = HoodieTable.getHoodieTable(metaClient, getConfig());
|
||||
final TableFileSystemView.ReadOptimizedView view2 = table.getROFileSystemView();
|
||||
|
||||
@@ -573,7 +578,7 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
table.getCompletedSavepointTimeline().getInstants().findFirst().get();
|
||||
client.rollbackToSavepoint(savepoint.getTimestamp());
|
||||
|
||||
metaClient = new HoodieTableMetaClient(fs, basePath);
|
||||
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
||||
table = HoodieTable.getHoodieTable(metaClient, getConfig());
|
||||
final TableFileSystemView.ReadOptimizedView view3 = table.getROFileSystemView();
|
||||
dataFiles = partitionPaths.stream().flatMap(s -> {
|
||||
@@ -602,7 +607,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
.retainFileVersions(maxVersions).build()).build();
|
||||
HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
|
||||
HoodieIndex index = HoodieIndex.createIndex(cfg, jsc);
|
||||
FileSystem fs = FSUtils.getFs();
|
||||
|
||||
/**
|
||||
* do a big insert
|
||||
@@ -618,8 +622,9 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
assertNoWriteErrors(statuses);
|
||||
|
||||
// verify that there is a commit
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath())
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
|
||||
basePath);
|
||||
HoodieTimeline timeline = new HoodieActiveTimeline(metaClient)
|
||||
.getCommitTimeline();
|
||||
assertEquals("Expecting a single commit.", 1,
|
||||
timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
|
||||
@@ -647,7 +652,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
// Verify there are no errors
|
||||
assertNoWriteErrors(statuses);
|
||||
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
|
||||
basePath);
|
||||
table = HoodieTable.getHoodieTable(metadata, getConfig());
|
||||
timeline = table.getCommitsTimeline();
|
||||
|
||||
@@ -702,7 +708,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
.retainCommits(maxCommits).build()).build();
|
||||
HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
|
||||
HoodieIndex index = HoodieIndex.createIndex(cfg, jsc);
|
||||
FileSystem fs = FSUtils.getFs();
|
||||
|
||||
/**
|
||||
* do a big insert
|
||||
@@ -718,13 +723,15 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
assertNoWriteErrors(statuses);
|
||||
|
||||
// verify that there is a commit
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieTimeline timeline = new HoodieActiveTimeline(fs, metaClient.getMetaPath())
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
|
||||
basePath);
|
||||
HoodieTimeline timeline = new HoodieActiveTimeline(metaClient)
|
||||
.getCommitTimeline();
|
||||
assertEquals("Expecting a single commit.", 1,
|
||||
timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
|
||||
// Should have 100 records in table (check using Index), all in locations marked at commit
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
|
||||
HoodieTable table = HoodieTable
|
||||
.getHoodieTable(metaClient, getConfig());
|
||||
|
||||
assertFalse(table.getCompletedCommitTimeline().empty());
|
||||
String commitTime =
|
||||
@@ -747,7 +754,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
// Verify there are no errors
|
||||
assertNoWriteErrors(statuses);
|
||||
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
|
||||
basePath);
|
||||
HoodieTable table1 = HoodieTable.getHoodieTable(metadata, cfg);
|
||||
HoodieTimeline activeTimeline = table1.getCompletedCommitTimeline();
|
||||
Optional<HoodieInstant>
|
||||
@@ -788,8 +796,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
String commitTime2 = "20160502020601";
|
||||
String commitTime3 = "20160506030611";
|
||||
new File(basePath + "/.hoodie").mkdirs();
|
||||
HoodieTestDataGenerator.writePartitionMetadata(FSUtils.getFs(),
|
||||
new String[]{"2016/05/01", "2016/05/02", "2016/05/06"},
|
||||
HoodieTestDataGenerator
|
||||
.writePartitionMetadata(fs, new String[]{"2016/05/01", "2016/05/02", "2016/05/06"},
|
||||
basePath);
|
||||
|
||||
// Only first two have commit files
|
||||
@@ -878,9 +886,9 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
String commitTime2 = "20160502020601";
|
||||
String commitTime3 = "20160506030611";
|
||||
new File(basePath + "/.hoodie").mkdirs();
|
||||
HoodieTestDataGenerator.writePartitionMetadata(FSUtils.getFs(),
|
||||
new String[]{"2016/05/01", "2016/05/02", "2016/05/06"},
|
||||
basePath);
|
||||
HoodieTestDataGenerator
|
||||
.writePartitionMetadata(fs, new String[]{"2016/05/01", "2016/05/02", "2016/05/06"},
|
||||
basePath);
|
||||
|
||||
// One good commit
|
||||
HoodieTestUtils.createCommitFiles(basePath, commitTime1);
|
||||
@@ -956,8 +964,6 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
|
||||
@Test
|
||||
public void testSmallInsertHandlingForUpserts() throws Exception {
|
||||
|
||||
FileSystem fs = FSUtils.getFs();
|
||||
final String TEST_PARTITION_PATH = "2016/09/26";
|
||||
final int INSERT_SPLIT_LIMIT = 100;
|
||||
// setup the small file handling params
|
||||
@@ -982,7 +988,7 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
assertEquals("Just 1 file needs to be added.", 1, statuses.size());
|
||||
String file1 = statuses.get(0).getFileId();
|
||||
assertEquals("file should contain 100 records",
|
||||
ParquetUtils.readRowKeysFromParquet(new Path(basePath,
|
||||
ParquetUtils.readRowKeysFromParquet(jsc.hadoopConfiguration(), new Path(basePath,
|
||||
TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1))).size(),
|
||||
100);
|
||||
|
||||
@@ -1006,9 +1012,9 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
Path newFile = new Path(basePath,
TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1));
assertEquals("file should contain 140 records",
ParquetUtils.readRowKeysFromParquet(newFile).size(), 140);
ParquetUtils.readRowKeysFromParquet(jsc.hadoopConfiguration(), newFile).size(), 140);

List<GenericRecord> records = ParquetUtils.readAvroRecords(newFile);
List<GenericRecord> records = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), newFile);
for (GenericRecord record : records) {
String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
assertEquals("only expect commit2", commitTime2,
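The ParquetUtils readers now take the Hadoop Configuration as their first argument, so the Parquet reads open the file on whatever filesystem the path points at. A minimal sketch of the updated calls as used here (the path variable is illustrative):

    Configuration conf = jsc.hadoopConfiguration();
    int rowKeyCount = ParquetUtils.readRowKeysFromParquet(conf, parquetFilePath).size();
    List<GenericRecord> avroRecords = ParquetUtils.readAvroRecords(conf, parquetFilePath);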
@@ -1030,7 +1036,7 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
assertNoWriteErrors(statuses);
|
||||
|
||||
assertEquals("2 files needs to be committed.", 2, statuses.size());
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metadata, config);
|
||||
TableFileSystemView.ReadOptimizedView fileSystemView = table.getROFileSystemView();
|
||||
List<HoodieDataFile> files = fileSystemView
|
||||
@@ -1040,7 +1046,7 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
for (HoodieDataFile file : files) {
|
||||
if (file.getFileName().contains(file1)) {
|
||||
assertEquals("Existing file should be expanded", commitTime3, file.getCommitTime());
|
||||
records = ParquetUtils.readAvroRecords(new Path(file.getPath()));
|
||||
records = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), new Path(file.getPath()));
|
||||
for (GenericRecord record : records) {
|
||||
String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
|
||||
String recordCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString();
|
||||
@@ -1057,7 +1063,7 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
keys2.size());
|
||||
} else {
|
||||
assertEquals("New file must be written for commit 3", commitTime3, file.getCommitTime());
|
||||
records = ParquetUtils.readAvroRecords(new Path(file.getPath()));
|
||||
records = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), new Path(file.getPath()));
|
||||
for (GenericRecord record : records) {
|
||||
String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
|
||||
assertEquals("only expect commit3", commitTime3,
|
||||
@@ -1091,12 +1097,12 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
List<WriteStatus> statuses = client.insert(insertRecordsRDD1, commitTime1).collect();
|
||||
|
||||
assertNoWriteErrors(statuses);
|
||||
assertPartitionMetadata(new String[]{TEST_PARTITION_PATH}, FSUtils.getFs());
|
||||
assertPartitionMetadata(new String[]{TEST_PARTITION_PATH}, fs);
|
||||
|
||||
assertEquals("Just 1 file needs to be added.", 1, statuses.size());
|
||||
String file1 = statuses.get(0).getFileId();
|
||||
assertEquals("file should contain 100 records",
|
||||
ParquetUtils.readRowKeysFromParquet(new Path(basePath,
|
||||
ParquetUtils.readRowKeysFromParquet(jsc.hadoopConfiguration(), new Path(basePath,
|
||||
TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime1, 0, file1))).size(),
|
||||
100);
|
||||
|
||||
@@ -1116,9 +1122,9 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
Path newFile = new Path(basePath,
|
||||
TEST_PARTITION_PATH + "/" + FSUtils.makeDataFileName(commitTime2, 0, file1));
|
||||
assertEquals("file should contain 140 records",
|
||||
ParquetUtils.readRowKeysFromParquet(newFile).size(), 140);
|
||||
ParquetUtils.readRowKeysFromParquet(jsc.hadoopConfiguration(), newFile).size(), 140);
|
||||
|
||||
List<GenericRecord> records = ParquetUtils.readAvroRecords(newFile);
|
||||
List<GenericRecord> records = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), newFile);
|
||||
for (GenericRecord record : records) {
|
||||
String recordKey = record.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString();
|
||||
String recCommitTime = record.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString();
|
||||
@@ -1137,8 +1143,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
assertNoWriteErrors(statuses);
|
||||
assertEquals("2 files needs to be committed.", 2, statuses.size());
|
||||
|
||||
FileSystem fs = FSUtils.getFs();
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
|
||||
basePath);
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);
|
||||
List<HoodieDataFile> files =
|
||||
table.getROFileSystemView().getLatestDataFilesBeforeOrOn(TEST_PARTITION_PATH, commitTime3)
|
||||
@@ -1148,7 +1154,7 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
int totalInserts = 0;
|
||||
for (HoodieDataFile file : files) {
|
||||
assertEquals("All files must be at commit 3", commitTime3, file.getCommitTime());
|
||||
records = ParquetUtils.readAvroRecords(new Path(file.getPath()));
|
||||
records = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), new Path(file.getPath()));
|
||||
totalInserts += records.size();
|
||||
}
|
||||
assertEquals("Total number of records must add up", totalInserts,
|
||||
@@ -1169,7 +1175,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
String file1P0C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "000");
|
||||
String file1P1C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "000");
|
||||
HoodieTable table = HoodieTable
|
||||
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
|
||||
.getHoodieTable(
|
||||
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true),
|
||||
config);
|
||||
|
||||
List<HoodieCleanStat> hoodieCleanStatsOne = table.clean(jsc);
|
||||
@@ -1183,7 +1190,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
// make next commit, with 1 insert & 1 update per partition
|
||||
HoodieTestUtils.createCommitFiles(basePath, "001");
|
||||
table = HoodieTable
|
||||
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
|
||||
.getHoodieTable(
|
||||
new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath, true),
|
||||
config);
|
||||
|
||||
String file2P0C1 = HoodieTestUtils
|
||||
@@ -1206,7 +1214,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
// make next commit, with 2 updates to existing files, and 1 insert
|
||||
HoodieTestUtils.createCommitFiles(basePath, "002");
|
||||
table = HoodieTable
|
||||
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
|
||||
.getHoodieTable(
|
||||
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true),
|
||||
config);
|
||||
|
||||
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file1P0C0); // update
|
||||
@@ -1260,8 +1269,7 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
// make 1 compaction commit
|
||||
HoodieTestUtils.createCompactionCommitFiles(basePath, "001");
|
||||
|
||||
HoodieTable table = HoodieTable
|
||||
.getHoodieTable(metaClient, config);
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);
|
||||
List<HoodieCleanStat> hoodieCleanStats = table.clean(jsc);
|
||||
assertEquals("Must clean three files, one parquet and 2 log files", 3,
|
||||
getCleanStat(hoodieCleanStats, partitionPaths[0]).getSuccessDeleteFiles().size());
|
||||
@@ -1286,9 +1294,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
String file1P0C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[0], "000");
|
||||
String file1P1C0 = HoodieTestUtils.createNewDataFile(basePath, partitionPaths[1], "000");
|
||||
|
||||
HoodieTable table = HoodieTable
|
||||
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
|
||||
config);
|
||||
HoodieTable table = HoodieTable.getHoodieTable(
|
||||
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config);
|
||||
|
||||
List<HoodieCleanStat> hoodieCleanStatsOne = table.clean(jsc);
|
||||
assertEquals("Must not clean any files", 0,
|
||||
@@ -1300,9 +1307,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
|
||||
// make next commit, with 1 insert & 1 update per partition
|
||||
HoodieTestUtils.createCommitFiles(basePath, "001");
|
||||
table = HoodieTable
|
||||
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
|
||||
config);
|
||||
table = HoodieTable.getHoodieTable(
|
||||
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true), config);
|
||||
|
||||
String file2P0C1 = HoodieTestUtils
|
||||
.createNewDataFile(basePath, partitionPaths[0], "001"); // insert
|
||||
@@ -1324,7 +1330,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
// make next commit, with 2 updates to existing files, and 1 insert
|
||||
HoodieTestUtils.createCommitFiles(basePath, "002");
|
||||
table = HoodieTable
|
||||
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
|
||||
.getHoodieTable(
|
||||
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true),
|
||||
config);
|
||||
|
||||
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "002", file1P0C0); // update
|
||||
@@ -1341,7 +1348,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
// make next commit, with 2 updates to existing files, and 1 insert
|
||||
HoodieTestUtils.createCommitFiles(basePath, "003");
|
||||
table = HoodieTable
|
||||
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
|
||||
.getHoodieTable(
|
||||
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true),
|
||||
config);
|
||||
|
||||
HoodieTestUtils.createDataFile(basePath, partitionPaths[0], "003", file1P0C0); // update
|
||||
@@ -1384,7 +1392,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
HoodieTestUtils.createCommitFiles(basePath, "000");
|
||||
|
||||
HoodieTable table = HoodieTable
|
||||
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
|
||||
.getHoodieTable(
|
||||
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true),
|
||||
config);
|
||||
|
||||
List<HoodieCleanStat> hoodieCleanStatsOne = table.clean(jsc);
|
||||
@@ -1449,7 +1458,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
updateAllFilesInPartition(filesP2C0, partitionPaths[2], "003");
|
||||
|
||||
HoodieTable table = HoodieTable
|
||||
.getHoodieTable(new HoodieTableMetaClient(FSUtils.getFs(), config.getBasePath(), true),
|
||||
.getHoodieTable(
|
||||
new HoodieTableMetaClient(jsc.hadoopConfiguration(), config.getBasePath(), true),
|
||||
config);
|
||||
List<HoodieCleanStat> hoodieCleanStats = table.clean(jsc);
|
||||
|
||||
@@ -1476,8 +1486,8 @@ public class TestHoodieClientOnCopyOnWriteStorage implements Serializable {
|
||||
|
||||
HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).build();
|
||||
HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
|
||||
FileSystem fs = FSUtils.getFs();
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
|
||||
basePath);
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, cfg);
|
||||
|
||||
String commitTime = "000";
|
||||
|
||||
hoodie-client/src/test/java/com/uber/hoodie/TestMultiFS.java (new file, 159 lines)
@@ -0,0 +1,159 @@
|
||||
/*
|
||||
* Copyright (c) 2017 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package com.uber.hoodie;
|
||||
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
import com.uber.hoodie.common.HoodieClientTestUtils;
|
||||
import com.uber.hoodie.common.HoodieTestDataGenerator;
|
||||
import com.uber.hoodie.common.minicluster.HdfsTestService;
|
||||
import com.uber.hoodie.common.model.HoodieAvroPayload;
|
||||
import com.uber.hoodie.common.model.HoodieRecord;
|
||||
import com.uber.hoodie.common.model.HoodieTableType;
|
||||
import com.uber.hoodie.common.model.HoodieTestUtils;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import com.uber.hoodie.config.HoodieIndexConfig;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.index.HoodieIndex;
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hdfs.DistributedFileSystem;
|
||||
import org.apache.hadoop.hdfs.MiniDFSCluster;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SQLContext;
|
||||
import org.junit.After;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
public class TestMultiFS implements Serializable {
|
||||
|
||||
private static String dfsBasePath;
|
||||
private static HdfsTestService hdfsTestService;
|
||||
private static MiniDFSCluster dfsCluster;
|
||||
private static DistributedFileSystem dfs;
|
||||
private static Logger logger = LogManager.getLogger(TestMultiFS.class);
|
||||
private String tablePath = "file:///tmp/hoodie/sample-table";
|
||||
private String tableName = "hoodie_rt";
|
||||
private String tableType = HoodieTableType.COPY_ON_WRITE.name();
|
||||
private static JavaSparkContext jsc;
|
||||
private static SQLContext sqlContext;
|
||||
|
||||
@Before
|
||||
public void initClass() throws Exception {
|
||||
hdfsTestService = new HdfsTestService();
|
||||
dfsCluster = hdfsTestService.start(true);
|
||||
|
||||
// Create a temp folder as the base path
|
||||
dfs = dfsCluster.getFileSystem();
|
||||
dfsBasePath = dfs.getWorkingDirectory().toString();
|
||||
dfs.mkdirs(new Path(dfsBasePath));
|
||||
|
||||
SparkConf sparkConf = new SparkConf().setAppName("hoodie-client-example");
|
||||
sparkConf.setMaster("local[1]");
|
||||
sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
|
||||
sparkConf.set("spark.kryoserializer.buffer.max", "512m");
|
||||
jsc = new JavaSparkContext(sparkConf);
|
||||
sqlContext = new SQLContext(jsc);
|
||||
}
|
||||
|
||||
@After
|
||||
public void cleanupClass() throws Exception {
|
||||
if (hdfsTestService != null) {
|
||||
hdfsTestService.stop();
|
||||
}
|
||||
if (jsc != null) {
|
||||
jsc.stop();
|
||||
}
|
||||
FSUtils.setFs(null);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void readLocalWriteHDFS() throws Exception {
|
||||
|
||||
// Generator of some records to be loaded in.
|
||||
HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
|
||||
|
||||
// Initialize table and filesystem
|
||||
FileSystem hdfs = FSUtils.getFs(dfsBasePath, jsc.hadoopConfiguration());
|
||||
HoodieTableMetaClient
|
||||
.initTableType(hdfs, dfsBasePath, HoodieTableType.valueOf(tableType), tableName,
|
||||
HoodieAvroPayload.class.getName());
|
||||
|
||||
//Create write client to write some records in
|
||||
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(dfsBasePath)
|
||||
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
||||
.forTable(tableName).withIndexConfig(
|
||||
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
|
||||
.build();
|
||||
HoodieWriteClient hdfsWriteClient = new HoodieWriteClient(jsc, cfg);
|
||||
|
||||
// Write generated data to hdfs (only inserts)
|
||||
String readCommitTime = hdfsWriteClient.startCommit();
|
||||
logger.info("Starting commit " + readCommitTime);
|
||||
List<HoodieRecord> records = dataGen.generateInserts(readCommitTime, 100);
|
||||
JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
|
||||
hdfsWriteClient.upsert(writeRecords, readCommitTime);
|
||||
|
||||
// Read from hdfs
|
||||
FileSystem fs = FSUtils.getFs(dfsBasePath, HoodieTestUtils.getDefaultHadoopConf());
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), dfsBasePath);
|
||||
HoodieTimeline timeline = new HoodieActiveTimeline(metaClient)
|
||||
.getCommitTimeline();
|
||||
Dataset<Row> readRecords = HoodieClientTestUtils
|
||||
.readCommit(dfsBasePath, sqlContext, timeline, readCommitTime);
|
||||
assertEquals("Should contain 100 records", readRecords.count(), records.size());
|
||||
|
||||
// Write to local
|
||||
FileSystem local = FSUtils.getFs(tablePath, jsc.hadoopConfiguration());
|
||||
HoodieTableMetaClient
|
||||
.initTableType(local, tablePath, HoodieTableType.valueOf(tableType), tableName,
|
||||
HoodieAvroPayload.class.getName());
|
||||
HoodieWriteConfig localConfig = HoodieWriteConfig.newBuilder().withPath(tablePath)
|
||||
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
||||
.forTable(tableName).withIndexConfig(
|
||||
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
|
||||
.build();
|
||||
HoodieWriteClient localWriteClient = new HoodieWriteClient(jsc, localConfig);
|
||||
|
||||
String writeCommitTime = localWriteClient.startCommit();
|
||||
logger.info("Starting write commit " + writeCommitTime);
|
||||
List<HoodieRecord> localRecords = dataGen.generateInserts(writeCommitTime, 100);
|
||||
JavaRDD<HoodieRecord> localWriteRecords = jsc.parallelize(localRecords, 1);
|
||||
logger.info("Writing to path: " + tablePath);
|
||||
localWriteClient.upsert(localWriteRecords, writeCommitTime);
|
||||
|
||||
logger.info("Reading from path: " + tablePath);
|
||||
fs = FSUtils.getFs(tablePath, HoodieTestUtils.getDefaultHadoopConf());
|
||||
metaClient = new HoodieTableMetaClient(fs.getConf(), tablePath);
|
||||
timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
|
||||
Dataset<Row> localReadRecords = HoodieClientTestUtils
|
||||
.readCommit(tablePath, sqlContext, timeline, writeCommitTime);
|
||||
assertEquals("Should contain 100 records", localReadRecords.count(), localRecords.size());
|
||||
}
|
||||
}
|
||||
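The new TestMultiFS test exercises the core of this change: one Spark session works against two filesystems at once, resolving each FileSystem from the path's scheme rather than from a single cached instance. The essence of the per-path resolution, with names as in the test above:

    Configuration conf = jsc.hadoopConfiguration();
    FileSystem hdfs = FSUtils.getFs(dfsBasePath, conf);   // hdfs:// path from the MiniDFSCluster
    FileSystem local = FSUtils.getFs(tablePath, conf);    // file:///tmp/... path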
@@ -174,7 +174,7 @@ public class HoodieClientTestUtils {
|
||||
List<String> filteredPaths = new ArrayList<>();
|
||||
try {
|
||||
HoodieTable hoodieTable = HoodieTable
|
||||
.getHoodieTable(new HoodieTableMetaClient(fs, basePath, true), null);
|
||||
.getHoodieTable(new HoodieTableMetaClient(fs.getConf(), basePath, true), null);
|
||||
for (String path : paths) {
|
||||
TableFileSystemView.ReadOptimizedView fileSystemView = new HoodieTableFileSystemView(
|
||||
hoodieTable.getMetaClient(),
|
||||
|
||||
@@ -18,7 +18,7 @@ package com.uber.hoodie.common;
|
||||
|
||||
import static com.uber.hoodie.common.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA;
|
||||
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import com.uber.hoodie.common.model.HoodieTestUtils;
|
||||
import com.uber.hoodie.common.util.HoodieAvroUtils;
|
||||
import com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat;
|
||||
import java.io.IOException;
|
||||
@@ -42,12 +42,13 @@ import org.apache.hadoop.mapred.RecordReader;
|
||||
*/
|
||||
public class HoodieMergeOnReadTestUtils {
|
||||
|
||||
public static List<GenericRecord> getRecordsUsingInputFormat(List<String> inputPaths)
|
||||
public static List<GenericRecord> getRecordsUsingInputFormat(List<String> inputPaths,
|
||||
String basePath)
|
||||
throws IOException {
|
||||
JobConf jobConf = new JobConf();
|
||||
Schema schema = HoodieAvroUtils.addMetadataFields(Schema.parse(TRIP_EXAMPLE_SCHEMA));
|
||||
HoodieRealtimeInputFormat inputFormat = new HoodieRealtimeInputFormat();
|
||||
setPropsForInputFormat(inputFormat, jobConf, schema);
|
||||
setPropsForInputFormat(inputFormat, jobConf, schema, basePath);
|
||||
return inputPaths.stream().map(path -> {
|
||||
setInputPath(jobConf, path);
|
||||
List<GenericRecord> records = new ArrayList<>();
|
||||
@@ -76,12 +77,12 @@ public class HoodieMergeOnReadTestUtils {
}

private static void setPropsForInputFormat(HoodieRealtimeInputFormat inputFormat, JobConf jobConf,
Schema schema) {
Schema schema, String basePath) {
List<Schema.Field> fields = schema.getFields();
String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
String postions = fields.stream().map(f -> String.valueOf(f.pos()))
.collect(Collectors.joining(","));
Configuration conf = FSUtils.getFs().getConf();
Configuration conf = HoodieTestUtils.getDefaultHadoopConf();
jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
jobConf.set("partition_columns", "datestr");

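The MOR test utility now takes the table's base path along with the input paths and seeds its Configuration from HoodieTestUtils.getDefaultHadoopConf() instead of a globally cached FileSystem. A sketch of the updated call, with an illustrative input path:

    List<String> inputPaths = Arrays.asList(basePath + "/2016/01/01");
    List<GenericRecord> records =
        HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(inputPaths, basePath);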
@@ -20,6 +20,7 @@ import com.uber.hoodie.common.model.HoodieCommitMetadata;
|
||||
import com.uber.hoodie.common.model.HoodieKey;
|
||||
import com.uber.hoodie.common.model.HoodiePartitionMetadata;
|
||||
import com.uber.hoodie.common.model.HoodieRecord;
|
||||
import com.uber.hoodie.common.model.HoodieTestUtils;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
@@ -193,7 +194,7 @@ public class HoodieTestDataGenerator {
|
||||
Path commitFile =
|
||||
new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline
|
||||
.makeCommitFileName(commitTime));
|
||||
FileSystem fs = FSUtils.getFs();
|
||||
FileSystem fs = FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf());
|
||||
FSDataOutputStream os = fs.create(commitFile, true);
|
||||
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
|
||||
try {
|
||||
@@ -209,7 +210,7 @@ public class HoodieTestDataGenerator {
Path commitFile =
new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline
.makeSavePointFileName(commitTime));
FileSystem fs = FSUtils.getFs();
FileSystem fs = FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf());
FSDataOutputStream os = fs.create(commitFile, true);
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
try {

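Both commit-file helpers in the test data generator follow the same recipe: resolve the FileSystem for the base path with the default test Hadoop configuration, then write the marker file. In short:

    FileSystem fs = FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf());
    FSDataOutputStream os = fs.create(commitFile, true);  // true = overwrite an existing marker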
@@ -55,8 +55,9 @@ public class TestUpdateMapFunction {
|
||||
public void testSchemaEvolutionOnUpdate() throws Exception {
|
||||
// Create a bunch of records with a old version of schema
|
||||
HoodieWriteConfig config = makeHoodieClientConfig("/exampleSchema.txt");
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
||||
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(
|
||||
HoodieTestUtils.getDefaultHadoopConf(), basePath);
|
||||
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metaClient);
|
||||
|
||||
String recordStr1 =
|
||||
"{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
||||
@@ -80,16 +81,16 @@ public class TestUpdateMapFunction {
|
||||
Iterator<List<WriteStatus>> insertResult = table.handleInsert("100", records.iterator());
|
||||
Path commitFile =
|
||||
new Path(config.getBasePath() + "/.hoodie/" + HoodieTimeline.makeCommitFileName("100"));
|
||||
FSUtils.getFs().create(commitFile);
|
||||
FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf()).create(commitFile);
|
||||
|
||||
// Now try an update with an evolved schema
|
||||
// Evolved schema does not have guarantee on preserving the original field ordering
|
||||
config = makeHoodieClientConfig("/exampleEvolvedSchema.txt");
|
||||
metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
||||
metaClient = new HoodieTableMetaClient(HoodieTestUtils.getDefaultHadoopConf(), basePath);
|
||||
String fileId = insertResult.next().get(0).getFileId();
|
||||
System.out.println(fileId);
|
||||
|
||||
table = new HoodieCopyOnWriteTable(config, metadata);
|
||||
table = new HoodieCopyOnWriteTable(config, metaClient);
|
||||
// New content with values for the newly added field
|
||||
recordStr1 =
|
||||
"{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12,\"added_field\":1}";
|
||||
|
||||
@@ -54,7 +54,6 @@ import java.util.stream.Collectors;
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.parquet.avro.AvroSchemaConverter;
|
||||
@@ -73,12 +72,11 @@ public class TestHoodieBloomIndex {
|
||||
|
||||
private JavaSparkContext jsc = null;
|
||||
private String basePath = null;
|
||||
private transient final FileSystem fs;
|
||||
private transient FileSystem fs;
|
||||
private String schemaStr;
|
||||
private Schema schema;
|
||||
|
||||
public TestHoodieBloomIndex() throws Exception {
|
||||
fs = FSUtils.getFs();
|
||||
}
|
||||
|
||||
@Before
|
||||
@@ -89,6 +87,7 @@ public class TestHoodieBloomIndex {
|
||||
TemporaryFolder folder = new TemporaryFolder();
|
||||
folder.create();
|
||||
basePath = folder.getRoot().getAbsolutePath();
|
||||
fs = FSUtils.getFs(basePath, jsc.hadoopConfiguration());
|
||||
HoodieTestUtils.init(basePath);
|
||||
// We have some records to be tagged (two different partitions)
|
||||
schemaStr = IOUtils.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8");
|
||||
@@ -120,8 +119,6 @@ public class TestHoodieBloomIndex {
|
||||
.parallelize(Arrays.asList(record1, record2, record3, record4));
|
||||
|
||||
// Load to memory
|
||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
|
||||
|
||||
Map<String, Iterable<String>> map = recordRDD
|
||||
.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()))
|
||||
.groupByKey().collectAsMap();
|
||||
@@ -174,7 +171,7 @@ public class TestHoodieBloomIndex {
|
||||
Arrays.asList(record2, record3, record4), schema, null, false);
|
||||
|
||||
List<String> partitions = Arrays.asList("2016/01/21", "2016/04/01", "2015/03/12");
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metadata, config);
|
||||
List<Tuple2<String, BloomIndexFileInfo>> filesList = index.loadInvolvedFiles(partitions, table);
|
||||
// Still 0, as no valid commit
|
||||
@@ -291,7 +288,8 @@ public class TestHoodieBloomIndex {
|
||||
List<String> uuids = Arrays.asList(record1.getRecordKey(), record2.getRecordKey(),
|
||||
record3.getRecordKey(), record4.getRecordKey());
|
||||
|
||||
List<String> results = HoodieBloomIndexCheckFunction.checkCandidatesAgainstFile(uuids,
|
||||
List<String> results = HoodieBloomIndexCheckFunction
|
||||
.checkCandidatesAgainstFile(jsc.hadoopConfiguration(), uuids,
|
||||
new Path(basePath + "/2016/01/31/" + filename));
|
||||
assertEquals(results.size(), 2);
|
||||
assertTrue(results.get(0).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")
|
||||
@@ -308,7 +306,7 @@ public class TestHoodieBloomIndex {
|
||||
// We have some records to be tagged (two different partitions)
|
||||
JavaRDD<HoodieRecord> recordRDD = jsc.emptyRDD();
|
||||
// Also create the metadata and config
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metadata, config);
|
||||
|
||||
@@ -348,7 +346,7 @@ public class TestHoodieBloomIndex {
|
||||
.parallelize(Arrays.asList(record1, record2, record3, record4));
|
||||
|
||||
// Also create the metadata and config
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metadata, config);
|
||||
|
||||
@@ -367,7 +365,7 @@ public class TestHoodieBloomIndex {
|
||||
String filename3 = writeParquetFile("2015/01/31", Arrays.asList(record4), schema, null, true);
|
||||
|
||||
// We do the tag again
|
||||
metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
||||
table = HoodieTable.getHoodieTable(metadata, config);
|
||||
|
||||
taggedRecordRDD = bloomIndex.tagLocation(recordRDD, table);
|
||||
@@ -409,7 +407,7 @@ public class TestHoodieBloomIndex {
|
||||
JavaRDD<HoodieKey> keysRDD = jsc.parallelize(Arrays.asList(key1, key2, key3, key4));
|
||||
|
||||
// Also create the metadata and config
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metadata, config);
|
||||
|
||||
@@ -429,7 +427,7 @@ public class TestHoodieBloomIndex {
|
||||
String filename3 = writeParquetFile("2015/01/31", Arrays.asList(record4), schema, null, true);
|
||||
|
||||
// We do the tag again
|
||||
metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
||||
table = HoodieTable.getHoodieTable(metadata, config);
|
||||
taggedRecordRDD = bloomIndex.fetchRecordLocation(keysRDD, table);
|
||||
|
||||
@@ -476,7 +474,7 @@ public class TestHoodieBloomIndex {
|
||||
|
||||
// We do the tag
|
||||
JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2));
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
|
||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metadata, config);
|
||||
|
||||
@@ -515,7 +513,7 @@ public class TestHoodieBloomIndex {
|
||||
String commitTime = FSUtils.getCommitTime(filename);
|
||||
HoodieParquetConfig config = new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP,
|
||||
ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024,
|
||||
new Configuration());
|
||||
HoodieTestUtils.getDefaultHadoopConf());
|
||||
HoodieParquetWriter writer = new HoodieParquetWriter(
|
||||
commitTime,
|
||||
new Path(basePath + "/" + partitionPath + "/" + filename),
|
||||
|
||||
@@ -55,7 +55,7 @@ public class TestHoodieCommitArchiveLog {
|
||||
folder.create();
|
||||
basePath = folder.getRoot().getAbsolutePath();
|
||||
HoodieTestUtils.init(basePath);
|
||||
fs = FSUtils.getFs();
|
||||
fs = FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf());
|
||||
}
|
||||
|
||||
@Test
|
||||
@@ -82,7 +82,7 @@ public class TestHoodieCommitArchiveLog {
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "104");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "105");
|
||||
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs.getConf(), basePath);
|
||||
HoodieTimeline timeline =
|
||||
metadata.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
|
||||
|
||||
@@ -112,8 +112,8 @@ public class TestHoodieCommitArchiveLog {
|
||||
originalCommits.removeAll(timeline.getInstants().collect(Collectors.toList()));
|
||||
|
||||
//read the file
|
||||
HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(),
|
||||
new HoodieLogFile(new Path(basePath + "/.hoodie/.commits_.archive.1")),
|
||||
HoodieLogFormat.Reader reader = HoodieLogFormat
|
||||
.newReader(fs, new HoodieLogFile(new Path(basePath + "/.hoodie/.commits_.archive.1")),
|
||||
HoodieArchivedMetaEntry.getClassSchema(), false);
|
||||
|
||||
int archivedRecordsCount = 0;
|
||||
@@ -147,7 +147,7 @@ public class TestHoodieCommitArchiveLog {
|
||||
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
||||
.forTable("test-trip-table").withCompactionConfig(
|
||||
HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 5).build()).build();
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs.getConf(), basePath);
|
||||
HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, fs);
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "100");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "101");
|
||||
@@ -173,7 +173,7 @@ public class TestHoodieCommitArchiveLog {
|
||||
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
||||
.forTable("test-trip-table").withCompactionConfig(
|
||||
HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 5).build()).build();
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs.getConf(), basePath);
|
||||
HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, fs);
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "100");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "101");
|
||||
@@ -206,7 +206,7 @@ public class TestHoodieCommitArchiveLog {
|
||||
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
||||
.forTable("test-trip-table").withCompactionConfig(
|
||||
HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 5).build()).build();
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs.getConf(), basePath);
|
||||
HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, fs);
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "100");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "101");
|
||||
|
||||
@@ -16,6 +16,9 @@
|
||||
|
||||
package com.uber.hoodie.io;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import com.uber.hoodie.HoodieWriteClient;
|
||||
import com.uber.hoodie.WriteStatus;
|
||||
import com.uber.hoodie.common.HoodieClientTestUtils;
|
||||
@@ -28,7 +31,6 @@ import com.uber.hoodie.common.model.HoodieTestUtils;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import com.uber.hoodie.config.HoodieCompactionConfig;
|
||||
import com.uber.hoodie.config.HoodieIndexConfig;
|
||||
import com.uber.hoodie.config.HoodieStorageConfig;
|
||||
@@ -38,7 +40,10 @@ import com.uber.hoodie.index.bloom.HoodieBloomIndex;
|
||||
import com.uber.hoodie.io.compact.HoodieCompactor;
|
||||
import com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor;
|
||||
import com.uber.hoodie.table.HoodieTable;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.junit.After;
|
||||
@@ -46,14 +51,6 @@ import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import org.junit.rules.TemporaryFolder;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
public class TestHoodieCompactor {
|
||||
|
||||
private transient JavaSparkContext jsc = null;
|
||||
@@ -104,18 +101,19 @@ public class TestHoodieCompactor {
@Test(expected = IllegalArgumentException.class)
public void testCompactionOnCopyOnWriteFail() throws Exception {
HoodieTestUtils.initTableType(basePath, HoodieTableType.COPY_ON_WRITE);

HoodieTableMetaClient metaClient = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
basePath);
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());

compactor.compact(jsc, getConfig(), table, HoodieActiveTimeline.createNewCommitTime());
}

@Test
public void testCompactionEmpty() throws Exception {
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
basePath);
HoodieWriteConfig config = getConfig();
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);
HoodieTable table = HoodieTable
.getHoodieTable(metaClient, config);
HoodieWriteClient writeClient = new HoodieWriteClient(jsc, config);

String newCommitTime = writeClient.startCommit();
@@ -132,7 +130,6 @@ public class TestHoodieCompactor {

@Test
public void testLogFileCountsAfterCompaction() throws Exception {
FileSystem fs = FSUtils.getFs();
// insert 100 records
HoodieWriteConfig config = getConfig();
HoodieWriteClient writeClient = new HoodieWriteClient(jsc, config);
@@ -144,7 +141,8 @@ public class TestHoodieCompactor {
List<WriteStatus> statuses = writeClient.insert(recordsRDD, newCommitTime).collect();

// Update all the 100 records
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
basePath);
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);

newCommitTime = "101";
@@ -161,7 +159,7 @@ public class TestHoodieCompactor {
updatedRecords);

// Verify that all data file has one log file
metaClient = new HoodieTableMetaClient(fs, basePath);
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
table = HoodieTable.getHoodieTable(metaClient, config);
for (String partitionPath : dataGen.getPartitionPaths()) {
List<FileSlice> groupedLogFiles =
@@ -174,14 +172,14 @@ public class TestHoodieCompactor {
}

// Do a compaction
metaClient = new HoodieTableMetaClient(fs, basePath);
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
table = HoodieTable.getHoodieTable(metaClient, config);

HoodieCommitMetadata result =
compactor.compact(jsc, getConfig(), table, HoodieActiveTimeline.createNewCommitTime());

// Verify that recently written compacted data file has no log file
metaClient = new HoodieTableMetaClient(fs, basePath);
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
table = HoodieTable.getHoodieTable(metaClient, config);
HoodieActiveTimeline timeline = metaClient.getActiveTimeline();
@@ -89,7 +89,8 @@ public class TestCopyOnWriteTable {

String commitTime = HoodieTestUtils.makeNewCommitTime();
HoodieWriteConfig config = makeHoodieClientConfig();
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
basePath);
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);

HoodieCreateHandle io = new HoodieCreateHandle(config, commitTime, table, partitionPath);
@@ -115,7 +116,7 @@ public class TestCopyOnWriteTable {
// Prepare the AvroParquetIO
HoodieWriteConfig config = makeHoodieClientConfig();
String firstCommitTime = HoodieTestUtils.makeNewCommitTime();
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);

String partitionPath = "/2016/01/31";
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
@@ -154,7 +155,8 @@ public class TestCopyOnWriteTable {

// Read out the bloom filter and make sure filter can answer record exist or not
Path parquetFilePath = new Path(parquetFile.getAbsolutePath());
BloomFilter filter = ParquetUtils.readBloomFilterFromParquetMetadata(parquetFilePath);
BloomFilter filter = ParquetUtils
.readBloomFilterFromParquetMetadata(jsc.hadoopConfiguration(), parquetFilePath);
for (HoodieRecord record : records) {
assertTrue(filter.mightContain(record.getRecordKey()));
}
@@ -163,7 +165,8 @@ public class TestCopyOnWriteTable {
+ FSUtils.getCommitTime(parquetFile.getName()) + ".commit").createNewFile();

// Read the parquet file, check the record content
List<GenericRecord> fileRecords = ParquetUtils.readAvroRecords(parquetFilePath);
List<GenericRecord> fileRecords = ParquetUtils
.readAvroRecords(jsc.hadoopConfiguration(), parquetFilePath);
GenericRecord newRecord;
int index = 0;
for (GenericRecord record : fileRecords) {
@@ -188,7 +191,7 @@ public class TestCopyOnWriteTable {

Thread.sleep(1000);
String newCommitTime = HoodieTestUtils.makeNewCommitTime();
metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
table = new HoodieCopyOnWriteTable(config, metadata);
Iterator<List<WriteStatus>> iter = table
.handleUpdate(newCommitTime, updatedRecord1.getCurrentLocation().getFileId(),
@@ -211,7 +214,7 @@ public class TestCopyOnWriteTable {
// Check whether the record has been updated
Path updatedParquetFilePath = new Path(updatedParquetFile.getAbsolutePath());
BloomFilter updatedFilter = ParquetUtils
.readBloomFilterFromParquetMetadata(updatedParquetFilePath);
.readBloomFilterFromParquetMetadata(jsc.hadoopConfiguration(), updatedParquetFilePath);
for (HoodieRecord record : records) {
// No change to the _row_key
assertTrue(updatedFilter.mightContain(record.getRecordKey()));
@@ -261,7 +264,7 @@ public class TestCopyOnWriteTable {
HoodieWriteConfig config = makeHoodieClientConfigBuilder()
.withWriteStatusClass(MetadataMergeWriteStatus.class).build();
String firstCommitTime = HoodieTestUtils.makeNewCommitTime();
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);

HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);

@@ -298,8 +301,8 @@ public class TestCopyOnWriteTable {
public void testInsertWithPartialFailures() throws Exception {
HoodieWriteConfig config = makeHoodieClientConfig();
String commitTime = HoodieTestUtils.makeNewCommitTime();
FileSystem fs = FSUtils.getFs();
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
FileSystem fs = FSUtils.getFs(basePath, jsc.hadoopConfiguration());
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);

// Write a few records, and get atleast one file
@@ -340,7 +343,7 @@ public class TestCopyOnWriteTable {
public void testInsertRecords() throws Exception {
HoodieWriteConfig config = makeHoodieClientConfig();
String commitTime = HoodieTestUtils.makeNewCommitTime();
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);

// Case 1:
@@ -389,7 +392,7 @@ public class TestCopyOnWriteTable {
HoodieStorageConfig.newBuilder().limitFileSize(64 * 1024).parquetBlockSize(64 * 1024)
.parquetPageSize(64 * 1024).build()).build();
String commitTime = HoodieTestUtils.makeNewCommitTime();
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);

List<HoodieRecord> records = new ArrayList<>();
@@ -437,7 +440,7 @@ public class TestCopyOnWriteTable {
HoodieClientTestUtils.fakeCommitFile(basePath, "001");
HoodieClientTestUtils.fakeDataFile(basePath, TEST_PARTITION_PATH, "001", "file1", fileSize);

HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);

HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(
@@ -19,6 +19,11 @@
package com.uber.hoodie.table;


import static com.uber.hoodie.common.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import com.uber.hoodie.HoodieWriteClient;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.HoodieClientTestUtils;
@@ -45,6 +50,14 @@ import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.index.HoodieIndex;
import com.uber.hoodie.io.compact.HoodieCompactor;
import com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
@@ -61,25 +74,11 @@ import org.junit.BeforeClass;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static com.uber.hoodie.common.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

public class TestMergeOnReadTable {

private transient JavaSparkContext jsc = null;
private transient SQLContext sqlContext;
private String basePath = null;
private static String basePath = null;
private HoodieCompactor compactor;
private FileSystem fs;

@@ -94,12 +93,11 @@ public class TestMergeOnReadTable {
if (hdfsTestService != null) {
hdfsTestService.stop();
dfsCluster.shutdown();
;
}
FSUtils.setFs(null);
// Need to closeAll to clear FileSystem.Cache, required because DFS and LocalFS used in the same JVM
FileSystem.closeAll();
HoodieTestUtils.resetFS();
HoodieTestUtils.resetFS(basePath);
}

@BeforeClass
@@ -113,30 +111,28 @@ public class TestMergeOnReadTable {
dfs = dfsCluster.getFileSystem();
}
FSUtils.setFs(dfs);
HoodieTestUtils.resetFS();
HoodieTestUtils.resetFS(basePath);
}

@Before
public void init() throws IOException {
this.fs = FSUtils.getFs();

// Initialize a local spark env
jsc = new JavaSparkContext(
HoodieClientTestUtils.getSparkConfForTest("TestHoodieMergeOnReadTable"));
jsc.hadoopConfiguration().addResource(FSUtils.getFs().getConf());

// Create a temp folder as the base path
TemporaryFolder folder = new TemporaryFolder();
folder.create();
basePath = folder.getRoot().getAbsolutePath();
fs = FSUtils.getFs(basePath, jsc.hadoopConfiguration());
jsc.hadoopConfiguration().addResource(fs.getConf());

dfs.mkdirs(new Path(basePath));
FSUtils.setFs(dfs);
HoodieTestUtils.initTableType(basePath, HoodieTableType.MERGE_ON_READ);

sqlContext = new SQLContext(jsc); // SQLContext stuff
compactor = new HoodieRealtimeTableCompactor();

//SQLContext stuff
sqlContext = new SQLContext(jsc);
}

@After
@@ -167,7 +163,8 @@ public class TestMergeOnReadTable {
List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
assertNoWriteErrors(statuses);

HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath());
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
cfg.getBasePath());
HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);

Optional<HoodieInstant> deltaCommit =
@@ -209,7 +206,7 @@ public class TestMergeOnReadTable {
statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
// Verify there are no errors
assertNoWriteErrors(statuses);
metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath());
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant();
assertTrue(deltaCommit.isPresent());
assertEquals("Latest Delta commit should be 004", "004", deltaCommit.get().getTimestamp());
@@ -229,8 +226,9 @@ public class TestMergeOnReadTable {
assertTrue(dataFilesToRead.findAny().isPresent());

// verify that there is a commit
table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(fs, cfg.getBasePath(), true), getConfig(false));
table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath(), true),
getConfig(false));
HoodieTimeline timeline = table.getCommitTimeline().filterCompletedInstants();
assertEquals("Expecting a single commit.", 1,
timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
@@ -284,7 +282,8 @@ public class TestMergeOnReadTable {
List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
assertNoWriteErrors(statuses);

HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath());
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
cfg.getBasePath());
HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);

Optional<HoodieInstant> deltaCommit =
@@ -332,7 +331,7 @@ public class TestMergeOnReadTable {
// Verify there are no errors
assertNoWriteErrors(statuses);

metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath());
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant();
assertTrue(deltaCommit.isPresent());
assertEquals("Latest Delta commit should be 004", "004", deltaCommit.get().getTimestamp());
@@ -349,7 +348,7 @@ public class TestMergeOnReadTable {
List<String> dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath())
.collect(Collectors.toList());
List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils
.getRecordsUsingInputFormat(dataFiles);
.getRecordsUsingInputFormat(dataFiles, basePath);
//Wrote 40 records and deleted 20 records, so remaining 40-20 = 20
assertEquals("Must contain 20 records", 20, recordsRead.size());
}
@@ -377,9 +376,8 @@ public class TestMergeOnReadTable {
//verify there are no errors
assertNoWriteErrors(statuses);

HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath());
HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);

HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
cfg.getBasePath());
Optional<HoodieInstant> commit =
metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
assertTrue(commit.isPresent());
@@ -403,8 +401,9 @@ public class TestMergeOnReadTable {
//rollback a COW commit when TableType is MOR
client.rollback(newCommitTime);

metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath());
hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
HoodieTable hoodieTable = HoodieTable
.getHoodieTable(metaClient, cfg);
FileStatus[] allFiles = HoodieTestUtils
.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
HoodieTableFileSystemView roView = new HoodieTableFileSystemView(metaClient,
@@ -440,7 +439,8 @@ public class TestMergeOnReadTable {
List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
assertNoWriteErrors(statuses);

HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath());
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
cfg.getBasePath());
HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);

Optional<HoodieInstant> deltaCommit =
@@ -476,7 +476,7 @@ public class TestMergeOnReadTable {
statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
// Verify there are no errors
assertNoWriteErrors(statuses);
metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath());
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant();
assertTrue(deltaCommit.isPresent());
assertEquals("Latest Delta commit should be 002", "002", deltaCommit.get().getTimestamp());
@@ -487,19 +487,19 @@ public class TestMergeOnReadTable {
List<String> dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath())
.collect(Collectors.toList());
List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils
.getRecordsUsingInputFormat(dataFiles);
.getRecordsUsingInputFormat(dataFiles, basePath);

assertEquals(recordsRead.size(), 200);

// Test delta commit rollback
client.rollback(newCommitTime);

metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath());
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(),
allFiles);
dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()).collect(Collectors.toList());
recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles);
recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles, basePath);

assertEquals(recordsRead.size(), 200);
@@ -516,13 +516,14 @@ public class TestMergeOnReadTable {
assertNoWriteErrors(statuses);

HoodieCompactor compactor = new HoodieRealtimeTableCompactor();
metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath());
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig(true));
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
HoodieTable table = HoodieTable
.getHoodieTable(metaClient, getConfig(true));

compactor.compact(jsc, getConfig(true), table, HoodieActiveTimeline.createNewCommitTime());

allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath());
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCommitsTimeline(),
allFiles);
@@ -541,7 +542,7 @@ public class TestMergeOnReadTable {
client.rollback(compactedCommitTime);

allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
metaClient = new HoodieTableMetaClient(fs, cfg.getBasePath());
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCommitsTimeline(),
allFiles);
@@ -140,5 +140,11 @@
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>com.github.stefanbirkner</groupId>
<artifactId>system-rules</artifactId>
<version>1.16.0</version>
<scope>test</scope>
</dependency>
</dependencies>
</project>
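Note on the new test-scoped dependency above: system-rules provides the JUnit EnvironmentVariables rule, which the new TestFSUtils (later in this commit) uses to set process environment variables inside a test. A minimal hedged sketch of how the rule is used -- the variable name is only an example, and imports from org.junit, org.junit.contrib.java.lang.system and org.apache.hadoop.conf are assumed:

  @Rule
  public final EnvironmentVariables environmentVariables = new EnvironmentVariables();

  @Test
  public void picksUpPrefixedEnvVars() {
    // pretend HOODIE_ENV_fs_DOT_key1=value1 was exported before the JVM started
    environmentVariables.set("HOODIE_ENV_fs_DOT_key1", "value1");
    Configuration conf = FSUtils.prepareHadoopConf(new Configuration());
    assertEquals("value1", conf.get("fs.key1"));
  }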
@@ -77,7 +77,7 @@ public class MercifulJsonConverter {
switch (schema.getType()) {
case BOOLEAN:
if (value instanceof Boolean) {
return (Boolean) value;
return value;
}
break;
case DOUBLE:
@@ -0,0 +1,56 @@
/*
* Copyright (c) 2017 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*
*/

package com.uber.hoodie.common;

import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import org.apache.hadoop.conf.Configuration;

public class SerializableConfiguration implements Serializable {

private transient Configuration configuration;

public SerializableConfiguration(Configuration configuration) {
this.configuration = configuration;
}

public Configuration get() {
return configuration;
}

private void writeObject(ObjectOutputStream out) throws IOException {
out.defaultWriteObject();
configuration.write(out);
}

private void readObject(ObjectInputStream in) throws IOException {
configuration = new Configuration(false);
configuration.readFields(in);
}

@Override
public String toString() {
StringBuilder str = new StringBuilder();
configuration.iterator().forEachRemaining(e ->
str.append(String.format("%s => %s \n", e.getKey(), e.getValue())));
return configuration.toString();
}
}
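Why this wrapper exists: Hadoop's Configuration is not Serializable, so it cannot be captured by a Spark closure on its own; wrapping it lets the driver's Hadoop conf travel to executor code, which can then rebuild a FileSystem per path. A minimal usage sketch, not part of this commit -- the class name, the demo path and the helper method are illustrative assumptions:

  import com.uber.hoodie.common.SerializableConfiguration;
  import com.uber.hoodie.common.util.FSUtils;
  import java.util.Collections;
  import java.util.List;
  import org.apache.spark.api.java.JavaSparkContext;

  public class SerializableConfigurationSketch {
    // Wrap the driver-side Hadoop conf so the lambda below stays serializable.
    static List<String> schemesSeenOnExecutors(JavaSparkContext jsc) {
      SerializableConfiguration conf = new SerializableConfiguration(jsc.hadoopConfiguration());
      return jsc.parallelize(Collections.singletonList("/tmp/hoodie/demo"), 1)
          .map(path -> FSUtils.getFs(path, conf.get()).getUri().getScheme())
          .collect();
    }
  }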
@@ -17,5 +17,5 @@
package com.uber.hoodie.common.model;

public enum ActionType {
commit, savepoint, compaction, clean, rollback;
commit, savepoint, compaction, clean, rollback
}
@@ -16,6 +16,7 @@

package com.uber.hoodie.common.table;

import com.uber.hoodie.common.SerializableConfiguration;
import com.uber.hoodie.common.model.HoodieTableType;
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
import com.uber.hoodie.common.table.timeline.HoodieArchivedTimeline;
@@ -26,6 +27,7 @@ import java.io.IOException;
import java.io.Serializable;
import java.util.Objects;
import java.util.Properties;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
@@ -52,24 +54,28 @@ public class HoodieTableMetaClient implements Serializable {
private String basePath;
private transient FileSystem fs;
private String metaPath;
private SerializableConfiguration hadoopConf;
private HoodieTableType tableType;
private HoodieTableConfig tableConfig;
private HoodieActiveTimeline activeTimeline;
private HoodieArchivedTimeline archivedTimeline;

public HoodieTableMetaClient(FileSystem fs, String basePath) throws DatasetNotFoundException {
public HoodieTableMetaClient(Configuration conf, String basePath)
throws DatasetNotFoundException {
// Do not load any timeline by default
this(fs, basePath, false);
this(conf, basePath, false);
}

public HoodieTableMetaClient(FileSystem fs, String basePath, boolean loadActiveTimelineOnLoad)
public HoodieTableMetaClient(Configuration conf, String basePath,
boolean loadActiveTimelineOnLoad)
throws DatasetNotFoundException {
log.info("Loading HoodieTableMetaClient from " + basePath);
this.basePath = basePath;
this.fs = fs;
this.hadoopConf = new SerializableConfiguration(conf);
Path basePathDir = new Path(this.basePath);
this.metaPath = basePath + File.separator + METAFOLDER_NAME;
Path metaPathDir = new Path(this.metaPath);
this.fs = getFs();
DatasetNotFoundException.checkValidDataset(fs, basePathDir, metaPathDir);
this.tableConfig = new HoodieTableConfig(fs, metaPath);
this.tableType = tableConfig.getTableType();
@@ -96,7 +102,7 @@ public class HoodieTableMetaClient implements Serializable {
private void readObject(java.io.ObjectInputStream in)
throws IOException, ClassNotFoundException {
in.defaultReadObject();
this.fs = FSUtils.getFs();
fs = null; // will be lazily inited
}

private void writeObject(java.io.ObjectOutputStream out)
@@ -136,9 +142,16 @@ public class HoodieTableMetaClient implements Serializable {
* Get the FS implementation for this table
*/
public FileSystem getFs() {
if (fs == null) {
fs = FSUtils.getFs(metaPath, hadoopConf.get());
}
return fs;
}

public Configuration getHadoopConf() {
return hadoopConf.get();
}

/**
* Get the active instants as a timeline
*
@@ -146,7 +159,7 @@ public class HoodieTableMetaClient implements Serializable {
*/
public synchronized HoodieActiveTimeline getActiveTimeline() {
if (activeTimeline == null) {
activeTimeline = new HoodieActiveTimeline(fs, metaPath);
activeTimeline = new HoodieActiveTimeline(this);
}
return activeTimeline;
}
@@ -159,7 +172,7 @@ public class HoodieTableMetaClient implements Serializable {
*/
public synchronized HoodieArchivedTimeline getArchivedTimeline() {
if (archivedTimeline == null) {
archivedTimeline = new HoodieArchivedTimeline(fs, metaPath);
archivedTimeline = new HoodieArchivedTimeline(this);
}
return archivedTimeline;
}
@@ -196,7 +209,7 @@ public class HoodieTableMetaClient implements Serializable {
fs.mkdirs(metaPathDir);
}
HoodieTableConfig.createHoodieProperties(fs, metaPathDir, props);
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), basePath);
log.info("Finished initializing Table of type " + metaClient.getTableConfig().getTableType()
+ " from " + basePath);
return metaClient;
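Taken together, the meta client is now constructed from a Hadoop Configuration and resolves its FileSystem lazily from the meta path, which is what makes it safe to serialize to executors. A hedged driver-side sketch of the new call pattern (the class name and base path below are examples, not code from this commit):

  import com.uber.hoodie.common.table.HoodieTableMetaClient;
  import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
  import com.uber.hoodie.common.util.FSUtils;
  import org.apache.hadoop.conf.Configuration;
  import org.apache.hadoop.fs.FileSystem;

  public class MetaClientSketch {
    static HoodieActiveTimeline load(String basePath) {
      // prepareHadoopConf also folds in any HOODIE_ENV_-prefixed variables
      Configuration conf = FSUtils.prepareHadoopConf(new Configuration());
      HoodieTableMetaClient metaClient = new HoodieTableMetaClient(conf, basePath);
      FileSystem fs = metaClient.getFs();     // resolved on demand from the meta path + conf
      return metaClient.getActiveTimeline();  // timelines are built from the meta client now
    }
  }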
@@ -83,7 +83,7 @@ public class HoodieCompactedLogRecordScanner implements
Schema readerSchema, String latestInstantTime) {
this.readerSchema = readerSchema;
this.latestInstantTime = latestInstantTime;
this.hoodieTableMetaClient = new HoodieTableMetaClient(fs, basePath);
this.hoodieTableMetaClient = new HoodieTableMetaClient(fs.getConf(), basePath);
// load class from the payload fully qualified class name
this.payloadClassFQN = this.hoodieTableMetaClient.getTableConfig().getPayloadClass();
@@ -150,7 +150,7 @@ public interface HoodieLogFormat {
public Writer build() throws IOException, InterruptedException {
log.info("Building HoodieLogFormat Writer");
if (fs == null) {
fs = FSUtils.getFs();
throw new IllegalArgumentException("fs is not specified");
}
if (logFileId == null) {
throw new IllegalArgumentException("FileID is not specified");
@@ -58,7 +58,7 @@ public abstract class HoodieLogBlock {
*/
public enum LogMetadataType {
INSTANT_TIME,
TARGET_INSTANT_TIME;
TARGET_INSTANT_TIME
}

public HoodieLogBlock(Map<LogMetadataType, String> logMetadata) {
@@ -37,7 +37,6 @@ import java.util.stream.Stream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
@@ -54,11 +53,8 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {

public static final SimpleDateFormat COMMIT_FORMATTER = new SimpleDateFormat("yyyyMMddHHmmss");


private final transient static Logger log = LogManager.getLogger(HoodieActiveTimeline.class);
private String metaPath;
private transient FileSystem fs;

private HoodieTableMetaClient metaClient;

/**
* Returns next commit time in the {@link #COMMIT_FORMATTER} format.
@@ -67,12 +63,14 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
return HoodieActiveTimeline.COMMIT_FORMATTER.format(new Date());
}

protected HoodieActiveTimeline(FileSystem fs, String metaPath, String[] includedExtensions) {
protected HoodieActiveTimeline(HoodieTableMetaClient metaClient, String[] includedExtensions) {
// Filter all the filter in the metapath and include only the extensions passed and
// convert them into HoodieInstant
try {
this.instants =
Arrays.stream(HoodieTableMetaClient.scanFiles(fs, new Path(metaPath), path -> {
Arrays.stream(
HoodieTableMetaClient
.scanFiles(metaClient.getFs(), new Path(metaClient.getMetaPath()), path -> {
// Include only the meta files with extensions that needs to be included
String extension = FSUtils.getFileExtension(path.getName());
return Arrays.stream(includedExtensions).anyMatch(Predicate.isEqual(extension));
@@ -85,14 +83,13 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
} catch (IOException e) {
throw new HoodieIOException("Failed to scan metadata", e);
}
this.fs = fs;
this.metaPath = metaPath;
this.metaClient = metaClient;
// multiple casts will make this lambda serializable - http://docs.oracle.com/javase/specs/jls/se8/html/jls-15.html#jls-15.16
this.details = (Function<HoodieInstant, Optional<byte[]>> & Serializable) this::getInstantDetails;
}

public HoodieActiveTimeline(FileSystem fs, String metaPath) {
this(fs, metaPath,
public HoodieActiveTimeline(HoodieTableMetaClient metaClient) {
this(metaClient,
new String[]{COMMIT_EXTENSION, INFLIGHT_COMMIT_EXTENSION, DELTA_COMMIT_EXTENSION,
INFLIGHT_DELTA_COMMIT_EXTENSION, SAVEPOINT_EXTENSION, INFLIGHT_SAVEPOINT_EXTENSION,
CLEAN_EXTENSION, INFLIGHT_CLEAN_EXTENSION});
@@ -114,7 +111,6 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
private void readObject(java.io.ObjectInputStream in)
throws IOException, ClassNotFoundException {
in.defaultReadObject();
this.fs = FSUtils.getFs();
}

/**
@@ -214,9 +210,9 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {

public void deleteInflight(HoodieInstant instant) {
log.info("Deleting in-flight " + instant);
Path inFlightCommitFilePath = new Path(metaPath, instant.getFileName());
Path inFlightCommitFilePath = new Path(metaClient.getMetaPath(), instant.getFileName());
try {
boolean result = fs.delete(inFlightCommitFilePath, false);
boolean result = metaClient.getFs().delete(inFlightCommitFilePath, false);
if (result) {
log.info("Removed in-flight " + instant);
} else {
@@ -230,18 +226,18 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {

@Override
public Optional<byte[]> getInstantDetails(HoodieInstant instant) {
Path detailPath = new Path(metaPath, instant.getFileName());
Path detailPath = new Path(metaClient.getMetaPath(), instant.getFileName());
return readDataFromPath(detailPath);
}

protected void moveInflightToComplete(HoodieInstant inflight, HoodieInstant completed,
Optional<byte[]> data) {
Path commitFilePath = new Path(metaPath, completed.getFileName());
Path commitFilePath = new Path(metaClient.getMetaPath(), completed.getFileName());
try {
// open a new file and write the commit metadata in
Path inflightCommitFile = new Path(metaPath, inflight.getFileName());
Path inflightCommitFile = new Path(metaClient.getMetaPath(), inflight.getFileName());
createFileInMetaPath(inflight.getFileName(), data);
boolean success = fs.rename(inflightCommitFile, commitFilePath);
boolean success = metaClient.getFs().rename(inflightCommitFile, commitFilePath);
if (!success) {
throw new HoodieIOException(
"Could not rename " + inflightCommitFile + " to " + commitFilePath);
@@ -252,11 +248,11 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
}

protected void moveCompleteToInflight(HoodieInstant completed, HoodieInstant inflight) {
Path inFlightCommitFilePath = new Path(metaPath, inflight.getFileName());
Path inFlightCommitFilePath = new Path(metaClient.getMetaPath(), inflight.getFileName());
try {
if (!fs.exists(inFlightCommitFilePath)) {
Path commitFilePath = new Path(metaPath, completed.getFileName());
boolean success = fs.rename(commitFilePath, inFlightCommitFilePath);
if (!metaClient.getFs().exists(inFlightCommitFilePath)) {
Path commitFilePath = new Path(metaClient.getMetaPath(), completed.getFileName());
boolean success = metaClient.getFs().rename(commitFilePath, inFlightCommitFilePath);
if (!success) {
throw new HoodieIOException(
"Could not rename " + commitFilePath + " to " + inFlightCommitFilePath);
@@ -272,15 +268,15 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
}

protected void createFileInMetaPath(String filename, Optional<byte[]> content) {
Path fullPath = new Path(metaPath, filename);
Path fullPath = new Path(metaClient.getMetaPath(), filename);
try {
if (!content.isPresent()) {
if (fs.createNewFile(fullPath)) {
if (metaClient.getFs().createNewFile(fullPath)) {
log.info("Created a new file in meta path: " + fullPath);
return;
}
} else {
FSDataOutputStream fsout = fs.create(fullPath, true);
FSDataOutputStream fsout = metaClient.getFs().create(fullPath, true);
fsout.write(content.get());
fsout.close();
return;
@@ -292,7 +288,7 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
}

protected Optional<byte[]> readDataFromPath(Path detailPath) {
try (FSDataInputStream is = fs.open(detailPath)) {
try (FSDataInputStream is = metaClient.getFs().open(detailPath)) {
return Optional.of(IOUtils.toByteArray(is));
} catch (IOException e) {
throw new HoodieIOException("Could not read commit details from " + detailPath, e);
@@ -300,6 +296,6 @@ public class HoodieActiveTimeline extends HoodieDefaultTimeline {
}

public HoodieActiveTimeline reload() {
return new HoodieActiveTimeline(fs, metaPath);
return new HoodieActiveTimeline(metaClient);
}
}
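The active timeline no longer carries its own FileSystem and metaPath; every filesystem touch now goes through the meta client. A short hedged sketch of the updated call pattern (the helper method is illustrative; imports from com.uber.hoodie.common.table and its timeline package are assumed):

  static HoodieTimeline completedCommits(HoodieTableMetaClient metaClient) {
    // built straight from the meta client; reload() reuses the same meta client
    HoodieActiveTimeline timeline = new HoodieActiveTimeline(metaClient);
    return timeline.getCommitTimeline().filterCompletedInstants();
  }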
@@ -16,8 +16,8 @@

package com.uber.hoodie.common.table.timeline;

import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.exception.HoodieIOException;
import java.io.IOException;
import java.io.Serializable;
@@ -27,7 +27,6 @@ import java.util.Map;
import java.util.Optional;
import java.util.function.Function;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
@@ -44,17 +43,17 @@ import org.apache.log4j.Logger;
public class HoodieArchivedTimeline extends HoodieDefaultTimeline {

private static final String HOODIE_COMMIT_ARCHIVE_LOG_FILE = "commits";
private transient FileSystem fs;
private String metaPath;
private HoodieTableMetaClient metaClient;
private Map<String, byte[]> readCommits = new HashMap<>();

private final transient static Logger log = LogManager.getLogger(HoodieArchivedTimeline.class);

public HoodieArchivedTimeline(FileSystem fs, String metaPath) {
public HoodieArchivedTimeline(HoodieTableMetaClient metaClient) {
// Read back the commits to make sure
Path archiveLogPath = getArchiveLogPath(metaPath);
Path archiveLogPath = getArchiveLogPath(metaClient.getMetaPath());
try (SequenceFile.Reader reader =
new SequenceFile.Reader(fs.getConf(), SequenceFile.Reader.file(archiveLogPath))) {
new SequenceFile.Reader(metaClient.getHadoopConf(),
SequenceFile.Reader.file(archiveLogPath))) {
Text key = new Text();
Text val = new Text();
while (reader.next(key, val)) {
@@ -71,8 +70,7 @@ public class HoodieArchivedTimeline extends HoodieDefaultTimeline {
}
// multiple casts will make this lambda serializable - http://docs.oracle.com/javase/specs/jls/se8/html/jls-15.html#jls-15.16
this.details = (Function<HoodieInstant, Optional<byte[]>> & Serializable) this::getInstantDetails;
this.fs = fs;
this.metaPath = metaPath;
this.metaClient = metaClient;
}

/**
@@ -91,7 +89,6 @@ public class HoodieArchivedTimeline extends HoodieDefaultTimeline {
private void readObject(java.io.ObjectInputStream in)
throws IOException, ClassNotFoundException {
in.defaultReadObject();
this.fs = FSUtils.getFs();
}


@@ -105,7 +102,7 @@ public class HoodieArchivedTimeline extends HoodieDefaultTimeline {
}

public HoodieArchivedTimeline reload() {
return new HoodieArchivedTimeline(fs, metaPath);
return new HoodieArchivedTimeline(metaClient);
}

}
@@ -40,7 +40,6 @@ import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.lang3.tuple.Pair;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/**
@@ -57,7 +56,6 @@ public class HoodieTableFileSystemView implements TableFileSystemView,
TableFileSystemView.RealtimeView, Serializable {

protected HoodieTableMetaClient metaClient;
protected transient FileSystem fs;
// This is the commits that will be visible for all views extending this view
protected HoodieTimeline visibleActiveTimeline;

@@ -72,7 +70,6 @@ public class HoodieTableFileSystemView implements TableFileSystemView,
public HoodieTableFileSystemView(HoodieTableMetaClient metaClient,
HoodieTimeline visibleActiveTimeline) {
this.metaClient = metaClient;
this.fs = metaClient.getFs();
this.visibleActiveTimeline = visibleActiveTimeline;
this.fileGroupMap = new HashMap<>();
this.partitionToFileGroupsMap = new HashMap<>();
@@ -98,7 +95,6 @@ public class HoodieTableFileSystemView implements TableFileSystemView,
private void readObject(java.io.ObjectInputStream in)
throws IOException, ClassNotFoundException {
in.defaultReadObject();
this.fs = FSUtils.getFs();
}

private void writeObject(java.io.ObjectOutputStream out)
@@ -255,8 +251,8 @@ public class HoodieTableFileSystemView implements TableFileSystemView,
try {
// Create the path if it does not exist already
Path partitionPath = new Path(metaClient.getBasePath(), partitionPathStr);
FSUtils.createPathIfNotExists(fs, partitionPath);
FileStatus[] statuses = fs.listStatus(partitionPath);
FSUtils.createPathIfNotExists(metaClient.getFs(), partitionPath);
FileStatus[] statuses = metaClient.getFs().listStatus(partitionPath);
List<HoodieFileGroup> fileGroups = addFilesToView(statuses);
return fileGroups.stream();
} catch (IOException e) {
@@ -94,7 +94,7 @@ public class AvroUtils {
public static HoodieCleanMetadata convertCleanMetadata(String startCleanTime,
Optional<Long> durationInMs, List<HoodieCleanStat> cleanStats) {
ImmutableMap.Builder<String, HoodieCleanPartitionMetadata> partitionMetadataBuilder =
ImmutableMap.<String, HoodieCleanPartitionMetadata>builder();
ImmutableMap.builder();
int totalDeleted = 0;
String earliestCommitToRetain = null;
for (HoodieCleanStat stat : cleanStats) {
@@ -116,7 +116,7 @@ public class AvroUtils {
public static HoodieRollbackMetadata convertRollbackMetadata(String startRollbackTime,
Optional<Long> durationInMs, List<String> commits, List<HoodieRollbackStat> stats) {
ImmutableMap.Builder<String, HoodieRollbackPartitionMetadata> partitionMetadataBuilder =
ImmutableMap.<String, HoodieRollbackPartitionMetadata>builder();
ImmutableMap.builder();
int totalDeleted = 0;
for (HoodieRollbackStat stat : stats) {
HoodieRollbackPartitionMetadata metadata =
@@ -132,7 +132,7 @@ public class AvroUtils {
public static HoodieSavepointMetadata convertSavepointMetadata(String user, String comment,
Map<String, List<String>> latestFiles) {
ImmutableMap.Builder<String, HoodieSavepointPartitionMetadata> partitionMetadataBuilder =
ImmutableMap.<String, HoodieSavepointPartitionMetadata>builder();
ImmutableMap.builder();
for (Map.Entry<String, List<String>> stat : latestFiles.entrySet()) {
HoodieSavepointPartitionMetadata metadata =
new HoodieSavepointPartitionMetadata(stat.getKey(), stat.getValue());
@@ -29,6 +29,7 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;
import java.util.Map.Entry;
import java.util.Optional;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -55,6 +56,7 @@ public class FSUtils {
private static final int MAX_ATTEMPTS_RECOVER_LEASE = 10;
private static final long MIN_CLEAN_TO_KEEP = 10;
private static final long MIN_ROLLBACK_TO_KEEP = 10;
private static final String HOODIE_ENV_PROPS_PREFIX = "HOODIE_ENV_";
private static FileSystem fs;

/**
@@ -65,17 +67,32 @@ public class FSUtils {
FSUtils.fs = fs;
}

public static Configuration prepareHadoopConf(Configuration conf) {
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());

public static FileSystem getFs() {
// look for all properties, prefixed to be picked up
for (Entry<String, String> prop : System.getenv().entrySet()) {
if (prop.getKey().startsWith(HOODIE_ENV_PROPS_PREFIX)) {
LOG.info("Picking up value for hoodie env var :" + prop.getKey());
conf.set(prop.getKey()
.replace(HOODIE_ENV_PROPS_PREFIX, "")
.replaceAll("_DOT_", "."),
prop.getValue());
}
}
return conf;
}


public static FileSystem getFs(String path, Configuration conf) {
if (fs != null) {
return fs;
}
Configuration conf = new Configuration();
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
FileSystem fs;
conf = prepareHadoopConf(conf);
try {
fs = FileSystem.get(conf);
fs = new Path(path).getFileSystem(conf);
} catch (IOException e) {
throw new HoodieIOException("Failed to get instance of " + FileSystem.class.getName(),
e);
@@ -83,7 +100,6 @@ public class FSUtils {
LOG.info(
String.format("Hadoop Configuration: fs.defaultFS: [%s], Config:[%s], FileSystem: [%s]",
conf.getRaw("fs.defaultFS"), conf.toString(), fs.toString()));

return fs;
}
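The two new helpers work together: prepareHadoopConf copies HOODIE_ENV_-prefixed environment variables into the Configuration (with _DOT_ standing in for "."), and getFs resolves the FileSystem from the path's own scheme rather than fs.defaultFS. A hedged sketch -- the variable and bucket names below are examples only:

  // e.g. exporting HOODIE_ENV_fs_DOT_s3a_DOT_buffer_DOT_dir=/tmp before launch
  // surfaces as fs.s3a.buffer.dir in the prepared Configuration
  Configuration conf = FSUtils.prepareHadoopConf(new Configuration());

  FileSystem localFs = FSUtils.getFs("file:///tmp/hoodie", conf);        // LocalFileSystem
  FileSystem s3aFs = FSUtils.getFs("s3a://example-bucket/hoodie", conf); // S3AFileSystem, if the s3a jars are on the classpath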
@@ -51,10 +51,11 @@ public class ParquetUtils {
* Read the rowKey list from the given parquet file.
*
* @param filePath The parquet file path.
* @param configuration configuration to build fs object
*/
public static Set<String> readRowKeysFromParquet(Path filePath) {
Configuration conf = new Configuration();
conf.addResource(getFs().getConf());
public static Set<String> readRowKeysFromParquet(Configuration configuration, Path filePath) {
Configuration conf = new Configuration(configuration);
conf.addResource(getFs(filePath.toString(), conf).getConf());
Schema readSchema = HoodieAvroUtils.getRecordKeySchema();
AvroReadSupport.setAvroReadSchema(conf, readSchema);
AvroReadSupport.setRequestedProjection(conf, readSchema);
@@ -84,19 +85,12 @@ public class ParquetUtils {
return rowKeys;
}


/**
* Read the metadata from a parquet file
*/
public static ParquetMetadata readMetadata(Path parquetFilePath) {
return readMetadata(new Configuration(), parquetFilePath);
}

public static ParquetMetadata readMetadata(Configuration conf, Path parquetFilePath) {
ParquetMetadata footer;
try {
// TODO(vc): Should we use the parallel reading version here?
footer = ParquetFileReader.readFooter(getFs().getConf(), parquetFilePath);
footer = ParquetFileReader
.readFooter(getFs(parquetFilePath.toString(), conf).getConf(), parquetFilePath);
} catch (IOException e) {
throw new HoodieIOException("Failed to read footer for parquet " + parquetFilePath,
e);
@@ -108,14 +102,15 @@ public class ParquetUtils {
/**
* Get the schema of the given parquet file.
*/
public static MessageType readSchema(Path parquetFilePath) {
return readMetadata(parquetFilePath).getFileMetaData().getSchema();
public static MessageType readSchema(Configuration configuration, Path parquetFilePath) {
return readMetadata(configuration, parquetFilePath).getFileMetaData().getSchema();
}


private static List<String> readParquetFooter(Path parquetFilePath, String... footerNames) {
private static List<String> readParquetFooter(Configuration configuration, Path parquetFilePath,
String... footerNames) {
List<String> footerVals = new ArrayList<>();
ParquetMetadata footer = readMetadata(parquetFilePath);
ParquetMetadata footer = readMetadata(configuration, parquetFilePath);
Map<String, String> metadata = footer.getFileMetaData().getKeyValueMetaData();
for (String footerName : footerNames) {
if (metadata.containsKey(footerName)) {
@@ -128,21 +123,22 @@ public class ParquetUtils {
return footerVals;
}

public static Schema readAvroSchema(Path parquetFilePath) {
return new AvroSchemaConverter().convert(readSchema(parquetFilePath));
public static Schema readAvroSchema(Configuration configuration, Path parquetFilePath) {
return new AvroSchemaConverter().convert(readSchema(configuration, parquetFilePath));
}

/**
* Read out the bloom filter from the parquet file meta data.
*/
public static BloomFilter readBloomFilterFromParquetMetadata(Path parquetFilePath) {
String footerVal = readParquetFooter(parquetFilePath,
public static BloomFilter readBloomFilterFromParquetMetadata(Configuration configuration,
Path parquetFilePath) {
String footerVal = readParquetFooter(configuration, parquetFilePath,
HoodieAvroWriteSupport.HOODIE_AVRO_BLOOM_FILTER_METADATA_KEY).get(0);
return new BloomFilter(footerVal);
}

public static String[] readMinMaxRecordKeys(Path parquetFilePath) {
List<String> minMaxKeys = readParquetFooter(parquetFilePath,
public static String[] readMinMaxRecordKeys(Configuration configuration, Path parquetFilePath) {
List<String> minMaxKeys = readParquetFooter(configuration, parquetFilePath,
HoodieAvroWriteSupport.HOODIE_MIN_RECORD_KEY_FOOTER,
HoodieAvroWriteSupport.HOODIE_MAX_RECORD_KEY_FOOTER);
if (minMaxKeys.size() != 2) {
@@ -156,11 +152,11 @@ public class ParquetUtils {
/**
* NOTE: This literally reads the entire file contents, thus should be used with caution.
*/
public static List<GenericRecord> readAvroRecords(Path filePath) {
public static List<GenericRecord> readAvroRecords(Configuration configuration, Path filePath) {
ParquetReader reader = null;
List<GenericRecord> records = new ArrayList<>();
try {
reader = AvroParquetReader.builder(filePath).build();
reader = AvroParquetReader.builder(filePath).withConf(configuration).build();
Object obj = reader.read();
while (obj != null) {
if (obj instanceof GenericRecord) {
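All ParquetUtils readers now take the caller's Configuration (in Spark code typically jsc.hadoopConfiguration()) so the FileSystem is resolved for the file's actual scheme. A hedged sketch of the updated call sites; the path is an example and jsc is assumed to be an existing JavaSparkContext:

  Configuration conf = jsc.hadoopConfiguration();
  Path parquetFilePath = new Path("file:///tmp/hoodie/2016/03/15/example.parquet");

  Schema avroSchema = ParquetUtils.readAvroSchema(conf, parquetFilePath);
  BloomFilter filter = ParquetUtils.readBloomFilterFromParquetMetadata(conf, parquetFilePath);
  List<GenericRecord> records = ParquetUtils.readAvroRecords(conf, parquetFilePath);
  Set<String> rowKeys = ParquetUtils.readRowKeysFromParquet(conf, parquetFilePath);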
@@ -19,6 +19,7 @@ package com.uber.hoodie.common.minicluster;

import com.google.common.base.Preconditions;
import com.google.common.io.Files;
import com.uber.hoodie.common.model.HoodieTestUtils;
import java.io.File;
import java.io.IOException;
import org.apache.commons.io.FileUtils;
@@ -54,7 +55,7 @@ public class HdfsTestService {
private MiniDFSCluster miniDfsCluster;

public HdfsTestService() {
hadoopConf = new Configuration();
hadoopConf = HoodieTestUtils.getDefaultHadoopConf();
workDir = Files.createTempDir().getAbsolutePath();
}

@@ -67,7 +68,7 @@ public class HdfsTestService {
.checkState(workDir != null, "The work dir must be set before starting cluster.");

if (hadoopConf == null) {
hadoopConf = new Configuration();
hadoopConf = HoodieTestUtils.getDefaultHadoopConf();
}

// If clean, then remove the work dir so we can start fresh.
@@ -58,6 +58,7 @@ import java.util.stream.Stream;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
@@ -69,18 +70,23 @@ import org.junit.rules.TemporaryFolder;

public class HoodieTestUtils {

public static FileSystem fs = FSUtils.getFs();
public static FileSystem fs;
public static final String TEST_EXTENSION = ".test";
public static final String RAW_TRIPS_TEST_NAME = "raw_trips";
public static final int DEFAULT_TASK_PARTITIONID = 1;
public static final String[] DEFAULT_PARTITION_PATHS = {"2016/03/15", "2015/03/16", "2015/03/17"};
private static Random rand = new Random(46474747);

public static void resetFS() {
HoodieTestUtils.fs = FSUtils.getFs();
public static void resetFS(String basePath) {
HoodieTestUtils.fs = FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf());
}

public static Configuration getDefaultHadoopConf() {
return new Configuration();
}

public static HoodieTableMetaClient init(String basePath) throws IOException {
fs = FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf());
return initTableType(basePath, HoodieTableType.COPY_ON_WRITE);
}

@@ -211,7 +217,7 @@ public class HoodieTestUtils {
Path commitFile =
new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline
.makeCleanerFileName(commitTime));
FileSystem fs = FSUtils.getFs();
FileSystem fs = FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf());
FSDataOutputStream os = fs.create(commitFile, true);
try {
HoodieCleanStat cleanStats = new HoodieCleanStat(
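For the test utilities the pattern is the same: no process-wide static FileSystem, only a default Configuration plus a FileSystem scoped to the test's base path. A hedged sketch of the new test bootstrap (basePath is assumed to come from the test's TemporaryFolder):

  Configuration conf = HoodieTestUtils.getDefaultHadoopConf();
  FileSystem fs = FSUtils.getFs(basePath, conf);   // FileSystem for this test's base path
  HoodieTestUtils.resetFS(basePath);               // repoints HoodieTestUtils.fs at the same location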
@@ -67,7 +67,7 @@ public class HoodieLogFormatTest {

private FileSystem fs;
private Path partitionPath;
private String basePath;
private static String basePath;

@BeforeClass
public static void setUpClass() throws IOException, InterruptedException {
@@ -78,7 +78,7 @@ public class HoodieLogFormatTest {
@AfterClass
public static void tearDownClass() {
MiniClusterUtil.shutdown();
HoodieTestUtils.resetFS();
HoodieTestUtils.resetFS(basePath);
}

@Before
@@ -343,7 +343,7 @@ public class HoodieLogFormatTest {
writer.close();

// Append some arbit byte[] to thee end of the log (mimics a partially written commit)
fs = FileSystem.get(fs.getConf());
fs = FSUtils.getFs(fs.getUri().toString(), fs.getConf());
FSDataOutputStream outputStream = fs.append(writer.getLogFile().getPath());
// create a block with
outputStream.write(HoodieLogFormat.MAGIC);
@@ -533,7 +533,7 @@ public class HoodieLogFormatTest {
writer.close();

// Append some arbit byte[] to thee end of the log (mimics a partially written commit)
fs = FileSystem.get(fs.getConf());
fs = FSUtils.getFs(fs.getUri().toString(), fs.getConf());
FSDataOutputStream outputStream = fs.append(writer.getLogFile().getPath());
// create a block with
outputStream.write(HoodieLogFormat.MAGIC);
@@ -74,7 +74,7 @@ public class HoodieActiveTimelineTest {
|
||||
HoodieInstant instant5 =
|
||||
new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, "9");
|
||||
|
||||
timeline = new HoodieActiveTimeline(HoodieTestUtils.fs, metaClient.getMetaPath());
|
||||
timeline = new HoodieActiveTimeline(metaClient);
|
||||
timeline.saveAsComplete(instant1, Optional.empty());
|
||||
timeline.saveAsComplete(instant2, Optional.empty());
|
||||
timeline.saveAsComplete(instant3, Optional.empty());
|
||||
@@ -98,7 +98,7 @@ public class HoodieActiveTimelineTest {
|
||||
|
||||
@Test
|
||||
public void testTimelineOperationsBasic() throws Exception {
|
||||
timeline = new HoodieActiveTimeline(HoodieTestUtils.fs, metaClient.getMetaPath());
|
||||
timeline = new HoodieActiveTimeline(metaClient);
|
||||
assertTrue(timeline.empty());
|
||||
assertEquals("", 0, timeline.countInstants());
|
||||
assertEquals("", Optional.empty(), timeline.firstInstant());
|
||||
|
||||
@@ -69,7 +69,7 @@ public class HoodieTableFileSystemViewTest {
|
||||
}
|
||||
|
||||
private void refreshFsView(FileStatus[] statuses) {
|
||||
metaClient = new HoodieTableMetaClient(HoodieTestUtils.fs, basePath, true);
|
||||
metaClient = new HoodieTableMetaClient(HoodieTestUtils.fs.getConf(), basePath, true);
|
||||
if (statuses != null) {
|
||||
fsView = new HoodieTableFileSystemView(metaClient,
|
||||
metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(),
|
||||
|
||||
@@ -16,15 +16,24 @@
|
||||
|
||||
package com.uber.hoodie.common.util;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import com.uber.hoodie.common.model.HoodieTestUtils;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Date;
|
||||
import java.util.UUID;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.junit.Rule;
|
||||
import org.junit.Test;
|
||||
import org.junit.contrib.java.lang.system.EnvironmentVariables;
|
||||
|
||||
public class TestFSUtils {
|
||||
|
||||
@Rule
|
||||
public final EnvironmentVariables environmentVariables
|
||||
= new EnvironmentVariables();
|
||||
|
||||
@Test
|
||||
public void testMakeDataFileName() {
|
||||
String commitTime = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date());
|
||||
@@ -59,4 +68,15 @@ public class TestFSUtils {
|
||||
String fullFileName = FSUtils.makeDataFileName(commitTime, taskPartitionId, fileName);
|
||||
assertTrue(FSUtils.getFileId(fullFileName).equals(fileName));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testEnvVarVariablesPickedup() {
|
||||
environmentVariables.set("HOODIE_ENV_fs_DOT_key1", "value1");
|
||||
Configuration conf = FSUtils.prepareHadoopConf(HoodieTestUtils.getDefaultHadoopConf());
|
||||
assertEquals("value1", conf.get("fs.key1"));
|
||||
conf.set("fs.key1", "value11");
|
||||
conf.set("fs.key2", "value2");
|
||||
assertEquals("value11", conf.get("fs.key1"));
|
||||
assertEquals("value2", conf.get("fs.key2"));
|
||||
}
|
||||
}
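The new testEnvVarVariablesPickedup test above pins down the env-var behaviour from the commit summary: variables prefixed with HOODIE_ENV_ land in the Configuration, with _DOT_ standing in for ".". A hedged sketch of how that pickup could be implemented (inferred from the test, not copied from FSUtils):

  public static Configuration prepareHadoopConf(Configuration conf) {
    for (java.util.Map.Entry<String, String> entry : System.getenv().entrySet()) {
      if (entry.getKey().startsWith("HOODIE_ENV_")) {
        // e.g. HOODIE_ENV_fs_DOT_key1 -> fs.key1 ("_DOT_" escapes the '.' that shells disallow)
        String prop = entry.getKey().substring("HOODIE_ENV_".length()).replace("_DOT_", ".");
        conf.set(prop, entry.getValue());
      }
    }
    return conf;
  }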
@@ -22,6 +22,7 @@ import static org.junit.Assert.assertTrue;
import com.uber.hoodie.avro.HoodieAvroWriteSupport;
import com.uber.hoodie.common.BloomFilter;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieTestUtils;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
@@ -79,12 +80,15 @@ public class TestParquetUtils {

// Read and verify
List<String> rowKeysInFile = new ArrayList<>(
ParquetUtils.readRowKeysFromParquet(new Path(filePath)));
ParquetUtils
.readRowKeysFromParquet(HoodieTestUtils.getDefaultHadoopConf(), new Path(filePath)));
Collections.sort(rowKeysInFile);
Collections.sort(rowKeys);

assertEquals("Did not read back the expected list of keys", rowKeys, rowKeysInFile);
BloomFilter filterInFile = ParquetUtils.readBloomFilterFromParquetMetadata(new Path(filePath));
BloomFilter filterInFile = ParquetUtils
.readBloomFilterFromParquetMetadata(HoodieTestUtils.getDefaultHadoopConf(),
new Path(filePath));
for (String rowKey : rowKeys) {
assertTrue("key should be found in bloom filter", filterInFile.mightContain(rowKey));
}

@@ -295,6 +295,6 @@ public class HoodieInputFormat extends MapredParquetInputFormat
}
Path baseDir = HoodieHiveUtil.getNthParent(dataPath, levels);
LOG.info("Reading hoodie metadata from path " + baseDir.toString());
return new HoodieTableMetaClient(fs, baseDir.toString());
return new HoodieTableMetaClient(fs.getConf(), baseDir.toString());
}
}
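As in HoodieInputFormat above, callers throughout this commit stop handing a FileSystem to HoodieTableMetaClient and pass a Hadoop Configuration instead, letting the meta client build the right FileSystem for its base path. A hypothetical caller-side view (the getFs()-style accessor name follows the commit summary; treat the exact signature as an assumption):

  // Configuration travels with the caller; the FileSystem is derived per base path
  HoodieTableMetaClient metaClient = new HoodieTableMetaClient(hadoopConf, basePath, true);
  FileSystem tableFs = metaClient.getFs();   // scheme-aware: hdfs://, s3a://, file://, ...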
@@ -19,6 +19,7 @@ import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodiePartitionMetadata;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.view.HoodieTableFileSystemView;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.exception.DatasetNotFoundException;
import com.uber.hoodie.exception.HoodieException;
import java.io.Serializable;
@@ -86,7 +87,7 @@ public class HoodieROTablePathFilter implements PathFilter, Serializable {
}
Path folder = null;
try {
FileSystem fs = path.getFileSystem(new Configuration());
FileSystem fs = path.getFileSystem(FSUtils.prepareHadoopConf(new Configuration()));
if (fs.isDirectory(path)) {
return true;
}
@@ -123,7 +124,7 @@ public class HoodieROTablePathFilter implements PathFilter, Serializable {
if (baseDir != null) {
try {
HoodieTableMetaClient metaClient =
new HoodieTableMetaClient(fs, baseDir.toString());
new HoodieTableMetaClient(fs.getConf(), baseDir.toString());
HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient,
metaClient.getActiveTimeline().getCommitTimeline()
.filterCompletedInstants(),

@@ -84,7 +84,7 @@ public class HoodieRealtimeRecordReader implements RecordReader<Void, ArrayWrita
LOG.info("cfg ==> " + job.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR));
try {
baseFileSchema = readSchema(jobConf, split.getPath());
readAndCompactLog();
readAndCompactLog(jobConf);
} catch (IOException e) {
throw new HoodieIOException(
"Could not create HoodieRealtimeRecordReader on path " + this.split.getPath(), e);
@@ -110,7 +110,7 @@ public class HoodieRealtimeRecordReader implements RecordReader<Void, ArrayWrita
* Goes through the log files and populates a map with latest version of each key logged, since
* the base split was written.
*/
private void readAndCompactLog() throws IOException {
private void readAndCompactLog(JobConf jobConf) throws IOException {
Schema writerSchema = new AvroSchemaConverter().convert(baseFileSchema);
List<String> projectionFields = orderFields(
jobConf.get(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR),
@@ -123,7 +123,8 @@ public class HoodieRealtimeRecordReader implements RecordReader<Void, ArrayWrita
String.format("About to read compacted logs %s for base split %s, projecting cols %s",
split.getDeltaFilePaths(), split.getPath(), projectionFields));
HoodieCompactedLogRecordScanner compactedLogRecordScanner =
new HoodieCompactedLogRecordScanner(FSUtils.getFs(), split.getBasePath(),
new HoodieCompactedLogRecordScanner(FSUtils.getFs(split.getPath().toString(), jobConf),
split.getBasePath(),
split.getDeltaFilePaths(),
readerSchema, split.getMaxCommitTime());
// NOTE: HoodieCompactedLogRecordScanner will not return records for an in-flight commit

@@ -41,6 +41,7 @@ import java.util.Map;
import java.util.stream.Collectors;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat;
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
@@ -64,10 +65,13 @@ import org.junit.rules.TemporaryFolder;
public class HoodieRealtimeRecordReaderTest {

private JobConf jobConf;
private FileSystem fs;

@Before
public void setUp() {
jobConf = new JobConf();
fs = FSUtils
.getFs(basePath.getRoot().getAbsolutePath(), HoodieTestUtils.getDefaultHadoopConf());
}

@Rule
@@ -79,7 +83,9 @@ public class HoodieRealtimeRecordReaderTest {
HoodieLogFormat.Writer writer = HoodieLogFormat.newWriterBuilder()
.onParentPath(new Path(partitionDir.getPath()))
.withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId(fileId)
.overBaseCommit(baseCommit).withFs(FSUtils.getFs()).build();
.overBaseCommit(baseCommit)
.withFs(fs)
.build();
List<IndexedRecord> records = new ArrayList<>();
for (int i = 0; i < numberOfRecords; i++) {
records.add(SchemaTestUtil.generateAvroRecordFromJson(schema, i, newCommit, "fileid0"));
@@ -123,8 +129,8 @@ public class HoodieRealtimeRecordReaderTest {
//create a RecordReader to be used by HoodieRealtimeRecordReader
RecordReader<Void, ArrayWritable> reader =
new MapredParquetInputFormat().
getRecordReader(new FileSplit(split.getPath(), 0,
FSUtils.getFs().getLength(split.getPath()), (String[]) null), jobConf, null);
getRecordReader(new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()),
(String[]) null), jobConf, null);
JobConf jobConf = new JobConf();
List<Schema.Field> fields = schema.getFields();
String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
@@ -183,8 +189,8 @@ public class HoodieRealtimeRecordReaderTest {
//create a RecordReader to be used by HoodieRealtimeRecordReader
RecordReader<Void, ArrayWritable> reader =
new MapredParquetInputFormat().
getRecordReader(new FileSplit(split.getPath(), 0,
FSUtils.getFs().getLength(split.getPath()), (String[]) null), jobConf, null);
getRecordReader(new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()),
(String[]) null), jobConf, null);
JobConf jobConf = new JobConf();
List<Schema.Field> fields = schema.getFields();

@@ -30,6 +30,7 @@ import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.metastore.api.Partition;
@@ -183,7 +184,7 @@ public class HiveSyncTool {
cmd.usage();
System.exit(1);
}
FileSystem fs = FSUtils.getFs();
FileSystem fs = FSUtils.getFs(cfg.basePath, new Configuration());
HiveConf hiveConf = new HiveConf();
hiveConf.addResource(fs.getConf());
new HiveSyncTool(cfg, hiveConf, fs).syncHoodieTable();

@@ -91,7 +91,7 @@ public class HoodieHiveClient {
HoodieHiveClient(HiveSyncConfig cfg, HiveConf configuration, FileSystem fs) {
this.syncConfig = cfg;
this.fs = fs;
this.metaClient = new HoodieTableMetaClient(fs, cfg.basePath, true);
this.metaClient = new HoodieTableMetaClient(fs.getConf(), cfg.basePath, true);
this.tableType = metaClient.getTableType();

LOG.info("Creating hive connection " + cfg.jdbcUrl);

@@ -20,6 +20,7 @@ package com.uber.hoodie.hive.util;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import com.google.common.io.Files;
import com.uber.hoodie.common.model.HoodieTestUtils;
import java.io.File;
import java.io.IOException;
import java.net.InetSocketAddress;
@@ -87,7 +88,7 @@ public class HiveTestService {
.checkState(workDir != null, "The work dir must be set before starting cluster.");

if (hadoopConf == null) {
hadoopConf = new Configuration();
hadoopConf = HoodieTestUtils.getDefaultHadoopConf();
}

String localHiveLocation = getHiveLocation(workDir);

@@ -67,7 +67,7 @@ public class HoodieDataSourceHelpers {
*/
public static HoodieTimeline allCompletedCommitsCompactions(FileSystem fs, String basePath) {
HoodieTable table = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(fs, basePath, true), null);
.getHoodieTable(new HoodieTableMetaClient(fs.getConf(), basePath, true), null);
if (table.getMetaClient().getTableType().equals(HoodieTableType.MERGE_ON_READ)) {
return table.getActiveTimeline().getTimelineOfActions(
Sets.newHashSet(HoodieActiveTimeline.COMMIT_ACTION,

@@ -47,7 +47,7 @@ class IncrementalRelation(val sqlContext: SQLContext,
private val log = LogManager.getLogger(classOf[IncrementalRelation])

val fs = new Path(basePath).getFileSystem(sqlContext.sparkContext.hadoopConfiguration)
val metaClient = new HoodieTableMetaClient(fs, basePath, true)
val metaClient = new HoodieTableMetaClient(sqlContext.sparkContext.hadoopConfiguration, basePath, true)
// MOR datasets not supported yet
if (metaClient.getTableType.equals(HoodieTableType.MERGE_ON_READ)) {
throw new HoodieException("Incremental view not implemented yet, for merge-on-read datasets")
@@ -72,7 +72,8 @@ class IncrementalRelation(val sqlContext: SQLContext,
val latestMeta = HoodieCommitMetadata
.fromBytes(commitTimeline.getInstantDetails(commitsToReturn.last).get)
val metaFilePath = latestMeta.getFileIdAndFullPaths(basePath).values().iterator().next()
AvroConversionUtils.convertAvroSchemaToStructType(ParquetUtils.readAvroSchema(new Path(metaFilePath)))
AvroConversionUtils.convertAvroSchemaToStructType(ParquetUtils.readAvroSchema(
sqlContext.sparkContext.hadoopConfiguration, new Path(metaFilePath)))
}

override def schema: StructType = latestSchema

@@ -57,7 +57,7 @@ class DataSourceTest extends AssertionsForJUnit {
val folder = new TemporaryFolder
folder.create
basePath = folder.getRoot.getAbsolutePath
fs = FSUtils.getFs
fs = FSUtils.getFs(basePath, spark.sparkContext.hadoopConfiguration)
}

@Test def testCopyOnWriteStorage() {

@@ -56,21 +56,18 @@ import org.apache.spark.Accumulator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.VoidFunction;
import scala.Tuple2;

public class HDFSParquetImporter implements Serializable {

private static volatile Logger logger = LogManager.getLogger(HDFSParquetImporter.class);
private final Config cfg;
private final transient FileSystem fs;
private transient FileSystem fs;
public static final SimpleDateFormat PARTITION_FORMATTER = new SimpleDateFormat("yyyy/MM/dd");

public HDFSParquetImporter(
Config cfg) throws IOException {
this.cfg = cfg;
fs = FSUtils.getFs();
}

public static class FormatValidator implements IValueValidator<String> {
@@ -203,6 +200,7 @@ public class HDFSParquetImporter implements Serializable {
}

public int dataImport(JavaSparkContext jsc, int retry) throws Exception {
this.fs = FSUtils.getFs(cfg.targetPath, jsc.hadoopConfiguration());
int ret = -1;
try {
// Verify that targetPath is not present.
@@ -251,43 +249,36 @@ public class HDFSParquetImporter implements Serializable {
GenericRecord.class, job.getConfiguration())
// To reduce large number of tasks.
.coalesce(16 * cfg.parallelism)
.map(new Function<Tuple2<Void, GenericRecord>, HoodieRecord<HoodieJsonPayload>>() {
@Override
public HoodieRecord<HoodieJsonPayload> call(Tuple2<Void, GenericRecord> entry)
throws Exception {
GenericRecord genericRecord = entry._2();
Object partitionField = genericRecord.get(cfg.partitionKey);
if (partitionField == null) {
throw new HoodieIOException(
"partition key is missing. :" + cfg.partitionKey);
}
Object rowField = genericRecord.get(cfg.rowKey);
if (rowField == null) {
throw new HoodieIOException(
"row field is missing. :" + cfg.rowKey);
}
long ts = (long) ((Double) partitionField * 1000l);
String partitionPath = PARTITION_FORMATTER.format(new Date(ts));
return new HoodieRecord<HoodieJsonPayload>(
new HoodieKey((String) rowField, partitionPath),
new HoodieJsonPayload(genericRecord.toString()));
}
}
.map(entry -> {
GenericRecord genericRecord = ((Tuple2<Void, GenericRecord>) entry)._2();
Object partitionField = genericRecord.get(cfg.partitionKey);
if (partitionField == null) {
throw new HoodieIOException(
"partition key is missing. :" + cfg.partitionKey);
}
Object rowField = genericRecord.get(cfg.rowKey);
if (rowField == null) {
throw new HoodieIOException(
"row field is missing. :" + cfg.rowKey);
}
long ts = (long) ((Double) partitionField * 1000l);
String partitionPath = PARTITION_FORMATTER.format(new Date(ts));
return new HoodieRecord<>(
new HoodieKey((String) rowField, partitionPath),
new HoodieJsonPayload(genericRecord.toString()));
}
);
// Get commit time.
String commitTime = client.startCommit();

JavaRDD<WriteStatus> writeResponse = client.bulkInsert(hoodieRecords, commitTime);
Accumulator<Integer> errors = jsc.accumulator(0);
writeResponse.foreach(new VoidFunction<WriteStatus>() {
@Override
public void call(WriteStatus writeStatus) throws Exception {
writeResponse.foreach(writeStatus -> {
if (writeStatus.hasErrors()) {
errors.add(1);
logger.error(String.format("Error processing records :writeStatus:%s",
writeStatus.getStat().toString()));
}
}
});
if (errors.value() == 0) {
logger.info(String

@@ -291,7 +291,7 @@ public class HiveIncrementalPuller {
if (!fs.exists(new Path(targetDataPath)) || !fs.exists(new Path(targetDataPath + "/.hoodie"))) {
return "0";
}
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, targetDataPath);
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs.getConf(), targetDataPath);

Optional<HoodieInstant>
lastCommit = metadata.getActiveTimeline().getCommitsTimeline()
@@ -331,7 +331,7 @@ public class HiveIncrementalPuller {

private String getLastCommitTimePulled(FileSystem fs, String sourceTableLocation)
throws IOException {
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, sourceTableLocation);
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs.getConf(), sourceTableLocation);
List<String> commitsToSync = metadata.getActiveTimeline().getCommitsTimeline()
.filterCompletedInstants()
.findInstantsAfter(config.fromCommitTime, config.maxCommits).getInstants()

@@ -20,6 +20,7 @@ package com.uber.hoodie.utilities;

import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import com.uber.hoodie.common.SerializableConfiguration;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodiePartitionMetadata;
import com.uber.hoodie.common.table.HoodieTableConfig;
@@ -70,8 +71,10 @@ public class HoodieSnapshotCopier implements Serializable {

public void snapshot(JavaSparkContext jsc, String baseDir, final String outputDir,
final boolean shouldAssumeDatePartitioning) throws IOException {
FileSystem fs = FSUtils.getFs();
final HoodieTableMetaClient tableMetadata = new HoodieTableMetaClient(fs, baseDir);
FileSystem fs = FSUtils.getFs(baseDir, jsc.hadoopConfiguration());
final SerializableConfiguration serConf = new SerializableConfiguration(
jsc.hadoopConfiguration());
final HoodieTableMetaClient tableMetadata = new HoodieTableMetaClient(fs.getConf(), baseDir);
final TableFileSystemView.ReadOptimizedView fsView = new HoodieTableFileSystemView(
tableMetadata,
tableMetadata.getActiveTimeline().getCommitsTimeline()
@@ -104,7 +107,7 @@ public class HoodieSnapshotCopier implements Serializable {
jsc.parallelize(partitions, partitions.size())
.flatMap(partition -> {
// Only take latest version files <= latestCommit.
FileSystem fs1 = FSUtils.getFs();
FileSystem fs1 = FSUtils.getFs(baseDir, serConf.get());
List<Tuple2<String, String>> filePaths = new ArrayList<>();
Stream<HoodieDataFile> dataFiles = fsView
.getLatestDataFilesBeforeOrOn(partition, latestCommitTimestamp);
@@ -123,13 +126,13 @@ public class HoodieSnapshotCopier implements Serializable {
String partition = tuple._1();
Path sourceFilePath = new Path(tuple._2());
Path toPartitionPath = new Path(outputDir, partition);
FileSystem fs1 = FSUtils.getFs();
FileSystem ifs = FSUtils.getFs(baseDir, serConf.get());

if (!fs1.exists(toPartitionPath)) {
fs1.mkdirs(toPartitionPath);
if (!ifs.exists(toPartitionPath)) {
ifs.mkdirs(toPartitionPath);
}
FileUtil.copy(fs1, sourceFilePath, fs1,
new Path(toPartitionPath, sourceFilePath.getName()), false, fs1.getConf());
FileUtil.copy(ifs, sourceFilePath, ifs,
new Path(toPartitionPath, sourceFilePath.getName()), false, ifs.getConf());
});

// Also copy the .commit files
@@ -115,10 +115,11 @@ public class HoodieDeltaStreamer implements Serializable {

public HoodieDeltaStreamer(Config cfg) throws IOException {
this.cfg = cfg;
this.fs = FSUtils.getFs();
this.jssc = getSparkContext();
this.fs = FSUtils.getFs(cfg.targetBasePath, jssc.hadoopConfiguration());

if (fs.exists(new Path(cfg.targetBasePath))) {
HoodieTableMetaClient meta = new HoodieTableMetaClient(fs, cfg.targetBasePath);
HoodieTableMetaClient meta = new HoodieTableMetaClient(fs.getConf(), cfg.targetBasePath);
this.commitTimelineOpt = Optional
.of(meta.getActiveTimeline().getCommitsTimeline()
.filterCompletedInstants());
@@ -129,8 +130,6 @@ public class HoodieDeltaStreamer implements Serializable {
//TODO(vc) Should these be passed from outside?
initSchemaProvider();
initKeyGenerator();
this.jssc = getSparkContext();

initSource();
}

@@ -203,7 +202,9 @@ public class HoodieDeltaStreamer implements Serializable {
Properties properties = new Properties();
properties.put(HoodieWriteConfig.TABLE_NAME, cfg.targetTableName);
HoodieTableMetaClient
.initializePathAsHoodieDataset(FSUtils.getFs(), cfg.targetBasePath, properties);
.initializePathAsHoodieDataset(
FSUtils.getFs(cfg.targetBasePath, jssc.hadoopConfiguration()), cfg.targetBasePath,
properties);
}
log.info("Checkpoint to resume from : " + resumeCheckpointStr);

@@ -25,6 +25,7 @@ import java.io.IOException;
import java.util.Arrays;
import org.apache.avro.Schema;
import org.apache.commons.configuration.PropertiesConfiguration;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

@@ -50,7 +51,7 @@ public class FilebasedSchemaProvider extends SchemaProvider {

public FilebasedSchemaProvider(PropertiesConfiguration config) {
super(config);
this.fs = FSUtils.getFs();
this.fs = FSUtils.getFs(config.getBasePath(), new Configuration());

DataSourceUtils.checkRequiredProperties(config,
Arrays.asList(Config.SOURCE_SCHEMA_FILE_PROP, Config.TARGET_SCHEMA_FILE_PROP));

@@ -65,7 +65,7 @@ public class DFSSource extends Source {
public DFSSource(PropertiesConfiguration config, JavaSparkContext sparkContext,
SourceDataFormat dataFormat, SchemaProvider schemaProvider) {
super(config, sparkContext, dataFormat, schemaProvider);
this.fs = FSUtils.getFs();
this.fs = FSUtils.getFs(config.getBasePath(), sparkContext.hadoopConfiguration());
DataSourceUtils.checkRequiredProperties(config, Arrays.asList(Config.ROOT_INPUT_PATH_PROP));
}

@@ -72,7 +72,7 @@ public class HiveIncrPullSource extends Source {
public HiveIncrPullSource(PropertiesConfiguration config, JavaSparkContext sparkContext,
SourceDataFormat dataFormat, SchemaProvider schemaProvider) {
super(config, sparkContext, dataFormat, schemaProvider);
this.fs = FSUtils.getFs();
this.fs = FSUtils.getFs(config.getBasePath(), sparkContext.hadoopConfiguration());
DataSourceUtils.checkRequiredProperties(config, Arrays.asList(Config.ROOT_INPUT_PATH_PROP));
this.incrPullRootPath = config.getString(Config.ROOT_INPUT_PATH_PROP);
}

@@ -48,7 +48,6 @@ import org.apache.spark.streaming.kafka.KafkaCluster;
import org.apache.spark.streaming.kafka.KafkaUtils;
import org.apache.spark.streaming.kafka.OffsetRange;
import scala.Predef;
import scala.Tuple2;
import scala.collection.JavaConverters;
import scala.collection.immutable.Map;
import scala.collection.immutable.Set;
@@ -134,16 +133,16 @@ public class KafkaSource extends Source {

public static <K, V> Map<K, V> toScalaMap(HashMap<K, V> m) {
return JavaConverters.mapAsScalaMapConverter(m).asScala().toMap(
Predef.<Tuple2<K, V>>conforms()
Predef.conforms()
);
}

public static Set<String> toScalaSet(HashSet<String> s) {
return JavaConverters.asScalaSetConverter(s).asScala().<String>toSet();
return JavaConverters.asScalaSetConverter(s).asScala().toSet();
}

public static <K, V> java.util.Map<K, V> toJavaMap(Map<K, V> m) {
return JavaConverters.<K, V>mapAsJavaMapConverter(m).asJava();
return JavaConverters.mapAsJavaMapConverter(m).asJava();
}
}

@@ -24,6 +24,7 @@ import com.uber.hoodie.HoodieReadClient;
import com.uber.hoodie.HoodieWriteClient;
import com.uber.hoodie.common.HoodieTestDataGenerator;
import com.uber.hoodie.common.minicluster.HdfsTestService;
import com.uber.hoodie.common.model.HoodieTestUtils;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
import com.uber.hoodie.common.util.FSUtils;
@@ -38,7 +39,6 @@ import java.util.Map.Entry;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
@@ -174,7 +174,7 @@ public class TestHDFSParquetImporter implements Serializable {
ParquetWriter<GenericRecord> writer = AvroParquetWriter
.<GenericRecord>builder(srcFile)
.withSchema(HoodieTestDataGenerator.avroSchema)
.withConf(new Configuration())
.withConf(HoodieTestUtils.getDefaultHadoopConf())
.build();
for (GenericRecord record : records) {
writer.write(record);

@@ -44,17 +44,22 @@ public class TestHoodieSnapshotCopier {

@Before
public void init() throws IOException {
// Prepare directories
TemporaryFolder folder = new TemporaryFolder();
folder.create();
rootPath = folder.getRoot().getAbsolutePath();
basePath = rootPath + "/" + HoodieTestUtils.RAW_TRIPS_TEST_NAME;
HoodieTestUtils.init(basePath);
outputPath = rootPath + "/output";
fs = FSUtils.getFs();
// Start a local Spark job
SparkConf conf = new SparkConf().setAppName("snapshot-test-job").setMaster("local[2]");
jsc = new JavaSparkContext(conf);
try {
// Prepare directories
TemporaryFolder folder = new TemporaryFolder();
folder.create();
rootPath = "file://" + folder.getRoot().getAbsolutePath();
basePath = rootPath + "/" + HoodieTestUtils.RAW_TRIPS_TEST_NAME;
outputPath = rootPath + "/output";

fs = FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf());
HoodieTestUtils.init(basePath);
// Start a local Spark job
SparkConf conf = new SparkConf().setAppName("snapshot-test-job").setMaster("local[2]");
jsc = new JavaSparkContext(conf);
} catch (Exception e) {
e.printStackTrace();
}
}

@Test