Reformatting code per Google Code Style all over

committed by vinoth chandar
parent 5a62480a92
commit e45679f5e2
@@ -17,25 +17,19 @@

package com.uber.hoodie;

import com.google.common.base.Optional;
import com.uber.hoodie.common.model.HoodieCommitMetadata;
import com.uber.hoodie.common.model.HoodieDataFile;
import com.uber.hoodie.common.model.HoodieKey;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.HoodieTimeline;
import com.uber.hoodie.common.table.TableFileSystemView;
import com.uber.hoodie.common.table.timeline.HoodieInstant;
import com.uber.hoodie.common.table.view.HoodieTableFileSystemView;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.exception.HoodieException;
import com.uber.hoodie.index.bloom.HoodieBloomIndex;
import com.uber.hoodie.table.HoodieTable;
import java.io.Serializable;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
@@ -46,136 +40,126 @@ import org.apache.spark.sql.Dataset;

import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.StructType;
import scala.Tuple2;

/**
 * Provides an RDD based API for accessing/filtering Hoodie tables, based on keys.
 */
public class HoodieReadClient implements Serializable {

  private static Logger logger = LogManager.getLogger(HoodieReadClient.class);

  private transient final JavaSparkContext jsc;
  private transient final FileSystem fs;
  /**
   * TODO: We need to persist the index type into hoodie.properties and be able to access the index
   * just with a simple basepath pointing to the dataset. Until, then just always assume a
   * BloomIndex
   */
  private transient final HoodieBloomIndex index;
  private final HoodieTimeline commitTimeline;
  private HoodieTable hoodieTable;
  private transient Optional<SQLContext> sqlContextOpt;

  /**
   * @param basePath path to Hoodie dataset
   */
  public HoodieReadClient(JavaSparkContext jsc, String basePath) {
    this.jsc = jsc;
    this.fs = FSUtils.getFs();
    // Create a Hoodie table which encapsulated the commits and files visible
    this.hoodieTable = HoodieTable
        .getHoodieTable(new HoodieTableMetaClient(fs, basePath, true), null);
    this.commitTimeline = hoodieTable.getCompletedCompactionCommitTimeline();
    this.index =
        new HoodieBloomIndex(HoodieWriteConfig.newBuilder().withPath(basePath).build(), jsc);
    this.sqlContextOpt = Optional.absent();
  }

  /**
   * @param jsc
   * @param basePath
   * @param sqlContext
   */
  public HoodieReadClient(JavaSparkContext jsc, String basePath, SQLContext sqlContext) {
    this(jsc, basePath);
    this.sqlContextOpt = Optional.of(sqlContext);
  }

  /**
   * Adds support for accessing Hoodie built tables from SparkSQL, as you normally would.
   *
   * @return SparkConf object to be used to construct the SparkContext by caller
   */
  public static SparkConf addHoodieSupport(SparkConf conf) {
    conf.set("spark.sql.hive.convertMetastoreParquet", "false");
    return conf;
  }

  private void assertSqlContext() {
    if (!sqlContextOpt.isPresent()) {
      throw new IllegalStateException(
          "SQLContext must be set, when performing dataframe operations");
    }
  }

  /**
   * Given a bunch of hoodie keys, fetches all the individual records out as a data frame
   *
   * @return a dataframe
   */
  public Dataset<Row> read(JavaRDD<HoodieKey> hoodieKeys, int parallelism)
      throws Exception {
    assertSqlContext();
    JavaPairRDD<HoodieKey, Optional<String>> keyToFileRDD =
        index.fetchRecordLocation(hoodieKeys, hoodieTable);
    List<String> paths = keyToFileRDD
        .filter(keyFileTuple -> keyFileTuple._2().isPresent())
        .map(keyFileTuple -> keyFileTuple._2().get())
        .collect();

    // record locations might be same for multiple keys, so need a unique list
    Set<String> uniquePaths = new HashSet<>(paths);
    Dataset<Row> originalDF = sqlContextOpt.get().read()
        .parquet(uniquePaths.toArray(new String[uniquePaths.size()]));
    StructType schema = originalDF.schema();
    JavaPairRDD<HoodieKey, Row> keyRowRDD = originalDF.javaRDD()
        .mapToPair(row -> {
          HoodieKey key = new HoodieKey(
              row.<String>getAs(HoodieRecord.RECORD_KEY_METADATA_FIELD),
              row.<String>getAs(HoodieRecord.PARTITION_PATH_METADATA_FIELD));
          return new Tuple2<>(key, row);
        });

    // Now, we need to further filter out, for only rows that match the supplied hoodie keys
    JavaRDD<Row> rowRDD = keyRowRDD.join(keyToFileRDD, parallelism)
        .map(tuple -> tuple._2()._1());

    return sqlContextOpt.get().createDataFrame(rowRDD, schema);
  }

  /**
   * Checks if the given [Keys] exists in the hoodie table and returns [Key, Optional[FullFilePath]]
   * If the optional FullFilePath value is not present, then the key is not found. If the
   * FullFilePath value is present, it is the path component (without scheme) of the URI underlying
   * file
   */
  public JavaPairRDD<HoodieKey, Optional<String>> checkExists(JavaRDD<HoodieKey> hoodieKeys) {
    return index.fetchRecordLocation(hoodieKeys, hoodieTable);
  }

  /**
   * Filter out HoodieRecords that already exists in the output folder. This is useful in
   * deduplication.
   *
   * @param hoodieRecords Input RDD of Hoodie records.
   * @return A subset of hoodieRecords RDD, with existing records filtered out.
   */
  public JavaRDD<HoodieRecord> filterExists(JavaRDD<HoodieRecord> hoodieRecords) {
    JavaRDD<HoodieRecord> recordsWithLocation = index.tagLocation(hoodieRecords, hoodieTable);
    return recordsWithLocation.filter(v1 -> !v1.isCurrentLocationKnown());
  }
}
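
For reference, a minimal usage sketch of the client touched by this commit (not part of the commit itself). It assumes a Spark 2.x application with hoodie-client on the classpath; the app name, dataset base path, key values, and the incomingRecords RDD are all hypothetical, and imports are elided:

    // Enable Hoodie-aware reads in SparkSQL, then build the contexts
    SparkConf conf = HoodieReadClient.addHoodieSupport(new SparkConf().setAppName("hoodie-read"));
    JavaSparkContext jsc = new JavaSparkContext(conf);
    SQLContext sqlContext = new SQLContext(jsc);

    // Point the client at a (hypothetical) Hoodie dataset base path
    HoodieReadClient client = new HoodieReadClient(jsc, "/tmp/hoodie/sample-table", sqlContext);

    // checkExists: an absent Optional means the key is not in the dataset
    JavaRDD<HoodieKey> keys = jsc.parallelize(
        java.util.Arrays.asList(new HoodieKey("uuid-1", "2016/04/01")));
    JavaPairRDD<HoodieKey, Optional<String>> locations = client.checkExists(keys);

    // read: fetch the records matching the keys back as a DataFrame
    // (read(...) declares throws Exception, so call it from a method that can propagate it)
    Dataset<Row> rows = client.read(keys, 2);

    // filterExists: drop records whose keys are already present, e.g. before an insert;
    // incomingRecords stands in for some upstream JavaRDD<HoodieRecord>
    JavaRDD<HoodieRecord> newRecords = client.filterExists(incomingRecords);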