1
0

Moving to Spark 2.1.0

This commit is contained in:
Prasanna Rajaperumal
2017-02-20 16:47:52 -08:00
parent be1dd9444f
commit 0e234ac0ef
6 changed files with 35 additions and 33 deletions

View File

@@ -40,7 +40,7 @@ import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.DataFrame;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SQLContext;
import org.apache.spark.sql.types.StructType;
@@ -122,7 +122,7 @@ public class HoodieReadClient implements Serializable {
*
* @return a dataframe
*/
public DataFrame read(JavaRDD<HoodieKey> hoodieKeys, int parallelism)
public Dataset<Row> read(JavaRDD<HoodieKey> hoodieKeys, int parallelism)
throws Exception {
assertSqlContext();
@@ -145,7 +145,7 @@ public class HoodieReadClient implements Serializable {
// record locations might be same for multiple keys, so need a unique list
Set<String> uniquePaths = new HashSet<>(paths);
DataFrame originalDF = sqlContextOpt.get().read()
Dataset<Row> originalDF = sqlContextOpt.get().read()
.parquet(uniquePaths.toArray(new String[uniquePaths.size()]));
StructType schema = originalDF.schema();
JavaPairRDD<HoodieKey, Row> keyRowRDD = originalDF.javaRDD()
@@ -174,7 +174,7 @@ public class HoodieReadClient implements Serializable {
/**
* Reads the paths under a hoodie dataset out as a DataFrame
*/
public DataFrame read(String... paths) {
public Dataset<Row> read(String... paths) {
assertSqlContext();
List<String> filteredPaths = new ArrayList<>();
try {
@@ -203,7 +203,7 @@ public class HoodieReadClient implements Serializable {
* If you made a prior call to {@link HoodieReadClient#latestCommit()}, it gives you all data in
* the time window (commitTimestamp, latestCommit)
*/
public DataFrame readSince(String lastCommitTimestamp) {
public Dataset<Row> readSince(String lastCommitTimestamp) {
List<String> commitsToReturn = metadata.findCommitsAfter(lastCommitTimestamp, Integer.MAX_VALUE);
//TODO: we can potentially trim this down to only affected partitions, using CommitMetadata
@@ -227,7 +227,7 @@ public class HoodieReadClient implements Serializable {
/**
* Obtains the data written as part of the given commit time (NOTE: sentence truncated in original — confirm intent against full source)
*/
public DataFrame readCommit(String commitTime) {
public Dataset<Row> readCommit(String commitTime) {
assertSqlContext();
HoodieCommits commits = metadata.getAllCommits();
if (!commits.contains(commitTime)) {