Reformatting code per Google Code Style all over
This commit is contained in:
committed by
vinoth chandar
parent
5a62480a92
commit
e45679f5e2
File diff suppressed because it is too large
Load Diff
@@ -29,15 +29,6 @@ import com.uber.hoodie.common.table.view.HoodieTableFileSystemView;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import com.uber.hoodie.exception.HoodieException;
|
||||
import com.uber.hoodie.table.HoodieTable;
|
||||
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SQLContext;
|
||||
|
||||
import org.apache.spark.SparkConf;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.RandomAccessFile;
|
||||
@@ -49,6 +40,12 @@ import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Row;
|
||||
import org.apache.spark.sql.SQLContext;
|
||||
|
||||
/**
|
||||
* Utility methods to aid testing inside the HoodieClient module.
|
||||
@@ -56,133 +53,142 @@ import java.util.stream.Collectors;
|
||||
public class HoodieClientTestUtils {
|
||||
|
||||
|
||||
public static List<WriteStatus> collectStatuses(Iterator<List<WriteStatus>> statusListItr) {
|
||||
List<WriteStatus> statuses = new ArrayList<>();
|
||||
while (statusListItr.hasNext()) {
|
||||
statuses.addAll(statusListItr.next());
|
||||
}
|
||||
return statuses;
|
||||
}
|
||||
|
||||
public static Set<String> getRecordKeys(List<HoodieRecord> hoodieRecords) {
|
||||
Set<String> keys = new HashSet<>();
|
||||
for (HoodieRecord rec: hoodieRecords) {
|
||||
keys.add(rec.getRecordKey());
|
||||
}
|
||||
return keys;
|
||||
}
|
||||
|
||||
private static void fakeMetaFile(String basePath, String commitTime, String suffix) throws IOException {
|
||||
String parentPath = basePath + "/"+ HoodieTableMetaClient.METAFOLDER_NAME;
|
||||
new File(parentPath).mkdirs();
|
||||
new File(parentPath + "/" + commitTime + suffix).createNewFile();
|
||||
}
|
||||
|
||||
|
||||
public static void fakeCommitFile(String basePath, String commitTime) throws IOException {
|
||||
fakeMetaFile(basePath, commitTime, HoodieTimeline.COMMIT_EXTENSION);
|
||||
}
|
||||
|
||||
public static void fakeInFlightFile(String basePath, String commitTime) throws IOException {
|
||||
fakeMetaFile(basePath, commitTime, HoodieTimeline.INFLIGHT_EXTENSION);
|
||||
}
|
||||
|
||||
public static void fakeDataFile(String basePath, String partitionPath, String commitTime, String fileId) throws Exception {
|
||||
fakeDataFile(basePath, partitionPath, commitTime, fileId, 0);
|
||||
}
|
||||
|
||||
public static void fakeDataFile(String basePath, String partitionPath, String commitTime, String fileId, long length) throws Exception {
|
||||
String parentPath = String.format("%s/%s", basePath, partitionPath);
|
||||
new File(parentPath).mkdirs();
|
||||
String path = String.format("%s/%s", parentPath, FSUtils.makeDataFileName(commitTime, 0, fileId));
|
||||
new File(path).createNewFile();
|
||||
new RandomAccessFile(path, "rw").setLength(length);
|
||||
}
|
||||
|
||||
public static SparkConf getSparkConfForTest(String appName) {
|
||||
SparkConf sparkConf = new SparkConf()
|
||||
.setAppName(appName)
|
||||
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
|
||||
.setMaster("local[1]");
|
||||
return HoodieReadClient.addHoodieSupport(sparkConf);
|
||||
}
|
||||
|
||||
public static HashMap<String, String> getLatestFileIDsToFullPath(String basePath,
|
||||
HoodieTimeline commitTimeline,
|
||||
List<HoodieInstant> commitsToReturn) throws IOException {
|
||||
HashMap<String, String> fileIdToFullPath = new HashMap<>();
|
||||
for (HoodieInstant commit : commitsToReturn) {
|
||||
HoodieCommitMetadata metadata =
|
||||
HoodieCommitMetadata.fromBytes(commitTimeline.getInstantDetails(commit).get());
|
||||
fileIdToFullPath.putAll(metadata.getFileIdAndFullPaths(basePath));
|
||||
}
|
||||
return fileIdToFullPath;
|
||||
}
|
||||
|
||||
public static Dataset<Row> readCommit(String basePath,
|
||||
SQLContext sqlContext,
|
||||
HoodieTimeline commitTimeline,
|
||||
String commitTime) {
|
||||
HoodieInstant commitInstant =
|
||||
new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
|
||||
if (!commitTimeline.containsInstant(commitInstant)) {
|
||||
new HoodieException("No commit exists at " + commitTime);
|
||||
}
|
||||
try {
|
||||
HashMap<String, String> paths = getLatestFileIDsToFullPath(basePath, commitTimeline, Arrays.asList(commitInstant));
|
||||
return sqlContext.read()
|
||||
.parquet(paths.values().toArray(new String[paths.size()]))
|
||||
.filter(String.format("%s ='%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitTime));
|
||||
} catch (Exception e) {
|
||||
throw new HoodieException("Error reading commit " + commitTime, e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Obtain all new data written into the Hoodie dataset since the given timestamp.
|
||||
*/
|
||||
public static Dataset<Row> readSince(String basePath,
|
||||
SQLContext sqlContext,
|
||||
HoodieTimeline commitTimeline,
|
||||
String lastCommitTime) {
|
||||
List<HoodieInstant> commitsToReturn =
|
||||
commitTimeline.findInstantsAfter(lastCommitTime, Integer.MAX_VALUE)
|
||||
.getInstants().collect(Collectors.toList());
|
||||
try {
|
||||
// Go over the commit metadata, and obtain the new files that need to be read.
|
||||
HashMap<String, String> fileIdToFullPath = getLatestFileIDsToFullPath(basePath, commitTimeline, commitsToReturn);
|
||||
return sqlContext.read()
|
||||
.parquet(fileIdToFullPath.values().toArray(new String[fileIdToFullPath.size()]))
|
||||
.filter(String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTime));
|
||||
} catch (IOException e) {
|
||||
throw new HoodieException("Error pulling data incrementally from commitTimestamp :" + lastCommitTime, e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads the paths under the a hoodie dataset out as a DataFrame
|
||||
*/
|
||||
public static Dataset<Row> read(String basePath,
|
||||
SQLContext sqlContext,
|
||||
FileSystem fs,
|
||||
String... paths) {
|
||||
List<String> filteredPaths = new ArrayList<>();
|
||||
try {
|
||||
HoodieTable hoodieTable = HoodieTable
|
||||
.getHoodieTable(new HoodieTableMetaClient(fs, basePath, true), null);
|
||||
for (String path : paths) {
|
||||
TableFileSystemView.ReadOptimizedView fileSystemView = new HoodieTableFileSystemView(hoodieTable.getMetaClient(),
|
||||
hoodieTable.getCompletedCommitTimeline(), fs.globStatus(new Path(path)));
|
||||
List<HoodieDataFile> latestFiles = fileSystemView.getLatestDataFiles().collect(
|
||||
Collectors.toList());
|
||||
for (HoodieDataFile file : latestFiles) {
|
||||
filteredPaths.add(file.getPath());
|
||||
}
|
||||
}
|
||||
return sqlContext.read()
|
||||
.parquet(filteredPaths.toArray(new String[filteredPaths.size()]));
|
||||
} catch (Exception e) {
|
||||
throw new HoodieException("Error reading hoodie dataset as a dataframe", e);
|
||||
public static List<WriteStatus> collectStatuses(Iterator<List<WriteStatus>> statusListItr) {
|
||||
List<WriteStatus> statuses = new ArrayList<>();
|
||||
while (statusListItr.hasNext()) {
|
||||
statuses.addAll(statusListItr.next());
|
||||
}
|
||||
return statuses;
|
||||
}
|
||||
|
||||
public static Set<String> getRecordKeys(List<HoodieRecord> hoodieRecords) {
|
||||
Set<String> keys = new HashSet<>();
|
||||
for (HoodieRecord rec : hoodieRecords) {
|
||||
keys.add(rec.getRecordKey());
|
||||
}
|
||||
return keys;
|
||||
}
|
||||
|
||||
private static void fakeMetaFile(String basePath, String commitTime, String suffix)
|
||||
throws IOException {
|
||||
String parentPath = basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME;
|
||||
new File(parentPath).mkdirs();
|
||||
new File(parentPath + "/" + commitTime + suffix).createNewFile();
|
||||
}
|
||||
|
||||
|
||||
public static void fakeCommitFile(String basePath, String commitTime) throws IOException {
|
||||
fakeMetaFile(basePath, commitTime, HoodieTimeline.COMMIT_EXTENSION);
|
||||
}
|
||||
|
||||
public static void fakeInFlightFile(String basePath, String commitTime) throws IOException {
|
||||
fakeMetaFile(basePath, commitTime, HoodieTimeline.INFLIGHT_EXTENSION);
|
||||
}
|
||||
|
||||
public static void fakeDataFile(String basePath, String partitionPath, String commitTime,
|
||||
String fileId) throws Exception {
|
||||
fakeDataFile(basePath, partitionPath, commitTime, fileId, 0);
|
||||
}
|
||||
|
||||
public static void fakeDataFile(String basePath, String partitionPath, String commitTime,
|
||||
String fileId, long length) throws Exception {
|
||||
String parentPath = String.format("%s/%s", basePath, partitionPath);
|
||||
new File(parentPath).mkdirs();
|
||||
String path = String
|
||||
.format("%s/%s", parentPath, FSUtils.makeDataFileName(commitTime, 0, fileId));
|
||||
new File(path).createNewFile();
|
||||
new RandomAccessFile(path, "rw").setLength(length);
|
||||
}
|
||||
|
||||
public static SparkConf getSparkConfForTest(String appName) {
|
||||
SparkConf sparkConf = new SparkConf()
|
||||
.setAppName(appName)
|
||||
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
|
||||
.setMaster("local[1]");
|
||||
return HoodieReadClient.addHoodieSupport(sparkConf);
|
||||
}
|
||||
|
||||
public static HashMap<String, String> getLatestFileIDsToFullPath(String basePath,
|
||||
HoodieTimeline commitTimeline,
|
||||
List<HoodieInstant> commitsToReturn) throws IOException {
|
||||
HashMap<String, String> fileIdToFullPath = new HashMap<>();
|
||||
for (HoodieInstant commit : commitsToReturn) {
|
||||
HoodieCommitMetadata metadata =
|
||||
HoodieCommitMetadata.fromBytes(commitTimeline.getInstantDetails(commit).get());
|
||||
fileIdToFullPath.putAll(metadata.getFileIdAndFullPaths(basePath));
|
||||
}
|
||||
return fileIdToFullPath;
|
||||
}
|
||||
|
||||
public static Dataset<Row> readCommit(String basePath,
|
||||
SQLContext sqlContext,
|
||||
HoodieTimeline commitTimeline,
|
||||
String commitTime) {
|
||||
HoodieInstant commitInstant =
|
||||
new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
|
||||
if (!commitTimeline.containsInstant(commitInstant)) {
|
||||
new HoodieException("No commit exists at " + commitTime);
|
||||
}
|
||||
try {
|
||||
HashMap<String, String> paths = getLatestFileIDsToFullPath(basePath, commitTimeline,
|
||||
Arrays.asList(commitInstant));
|
||||
return sqlContext.read()
|
||||
.parquet(paths.values().toArray(new String[paths.size()]))
|
||||
.filter(String.format("%s ='%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitTime));
|
||||
} catch (Exception e) {
|
||||
throw new HoodieException("Error reading commit " + commitTime, e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Obtain all new data written into the Hoodie dataset since the given timestamp.
|
||||
*/
|
||||
public static Dataset<Row> readSince(String basePath,
|
||||
SQLContext sqlContext,
|
||||
HoodieTimeline commitTimeline,
|
||||
String lastCommitTime) {
|
||||
List<HoodieInstant> commitsToReturn =
|
||||
commitTimeline.findInstantsAfter(lastCommitTime, Integer.MAX_VALUE)
|
||||
.getInstants().collect(Collectors.toList());
|
||||
try {
|
||||
// Go over the commit metadata, and obtain the new files that need to be read.
|
||||
HashMap<String, String> fileIdToFullPath = getLatestFileIDsToFullPath(basePath,
|
||||
commitTimeline, commitsToReturn);
|
||||
return sqlContext.read()
|
||||
.parquet(fileIdToFullPath.values().toArray(new String[fileIdToFullPath.size()]))
|
||||
.filter(
|
||||
String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTime));
|
||||
} catch (IOException e) {
|
||||
throw new HoodieException(
|
||||
"Error pulling data incrementally from commitTimestamp :" + lastCommitTime, e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads the paths under the a hoodie dataset out as a DataFrame
|
||||
*/
|
||||
public static Dataset<Row> read(String basePath,
|
||||
SQLContext sqlContext,
|
||||
FileSystem fs,
|
||||
String... paths) {
|
||||
List<String> filteredPaths = new ArrayList<>();
|
||||
try {
|
||||
HoodieTable hoodieTable = HoodieTable
|
||||
.getHoodieTable(new HoodieTableMetaClient(fs, basePath, true), null);
|
||||
for (String path : paths) {
|
||||
TableFileSystemView.ReadOptimizedView fileSystemView = new HoodieTableFileSystemView(
|
||||
hoodieTable.getMetaClient(),
|
||||
hoodieTable.getCompletedCommitTimeline(), fs.globStatus(new Path(path)));
|
||||
List<HoodieDataFile> latestFiles = fileSystemView.getLatestDataFiles().collect(
|
||||
Collectors.toList());
|
||||
for (HoodieDataFile file : latestFiles) {
|
||||
filteredPaths.add(file.getPath());
|
||||
}
|
||||
}
|
||||
return sqlContext.read()
|
||||
.parquet(filteredPaths.toArray(new String[filteredPaths.size()]));
|
||||
} catch (Exception e) {
|
||||
throw new HoodieException("Error reading hoodie dataset as a dataframe", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,9 +16,16 @@
|
||||
|
||||
package com.uber.hoodie.common;
|
||||
|
||||
import static com.uber.hoodie.common.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA;
|
||||
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import com.uber.hoodie.common.util.HoodieAvroUtils;
|
||||
import com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.avro.generic.GenericRecordBuilder;
|
||||
@@ -30,69 +37,64 @@ import org.apache.hadoop.mapred.InputSplit;
|
||||
import org.apache.hadoop.mapred.JobConf;
|
||||
import org.apache.hadoop.mapred.RecordReader;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static com.uber.hoodie.common.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA;
|
||||
|
||||
/**
|
||||
* Utility methods to aid in testing MergeOnRead (workaround for HoodieReadClient for MOR)
|
||||
*/
|
||||
public class HoodieMergeOnReadTestUtils {
|
||||
|
||||
public static List<GenericRecord> getRecordsUsingInputFormat(List<String> inputPaths) throws IOException {
|
||||
JobConf jobConf = new JobConf();
|
||||
Schema schema = HoodieAvroUtils.addMetadataFields(Schema.parse(TRIP_EXAMPLE_SCHEMA));
|
||||
HoodieRealtimeInputFormat inputFormat = new HoodieRealtimeInputFormat();
|
||||
setPropsForInputFormat(inputFormat, jobConf, schema);
|
||||
return inputPaths.stream().map(path -> {
|
||||
setInputPath(jobConf, path);
|
||||
List<GenericRecord> records = new ArrayList<>();
|
||||
try {
|
||||
List<InputSplit> splits = Arrays.asList(inputFormat.getSplits(jobConf, 1));
|
||||
RecordReader recordReader = inputFormat.getRecordReader(splits.get(0), jobConf, null);
|
||||
Void key = (Void) recordReader.createKey();
|
||||
ArrayWritable writable = (ArrayWritable) recordReader.createValue();
|
||||
while (recordReader.next(key, writable)) {
|
||||
GenericRecordBuilder newRecord = new GenericRecordBuilder(schema);
|
||||
// writable returns an array with [field1, field2, _hoodie_commit_time, _hoodie_commit_seqno]
|
||||
Writable[] values = writable.get();
|
||||
schema.getFields().forEach(field -> {
|
||||
newRecord.set(field, values[2]);
|
||||
});
|
||||
records.add(newRecord.build());
|
||||
}
|
||||
} catch (IOException ie) {
|
||||
ie.printStackTrace();
|
||||
}
|
||||
return records;
|
||||
}).reduce((a, b) -> {
|
||||
a.addAll(b);
|
||||
return a;
|
||||
}).get();
|
||||
}
|
||||
public static List<GenericRecord> getRecordsUsingInputFormat(List<String> inputPaths)
|
||||
throws IOException {
|
||||
JobConf jobConf = new JobConf();
|
||||
Schema schema = HoodieAvroUtils.addMetadataFields(Schema.parse(TRIP_EXAMPLE_SCHEMA));
|
||||
HoodieRealtimeInputFormat inputFormat = new HoodieRealtimeInputFormat();
|
||||
setPropsForInputFormat(inputFormat, jobConf, schema);
|
||||
return inputPaths.stream().map(path -> {
|
||||
setInputPath(jobConf, path);
|
||||
List<GenericRecord> records = new ArrayList<>();
|
||||
try {
|
||||
List<InputSplit> splits = Arrays.asList(inputFormat.getSplits(jobConf, 1));
|
||||
RecordReader recordReader = inputFormat.getRecordReader(splits.get(0), jobConf, null);
|
||||
Void key = (Void) recordReader.createKey();
|
||||
ArrayWritable writable = (ArrayWritable) recordReader.createValue();
|
||||
while (recordReader.next(key, writable)) {
|
||||
GenericRecordBuilder newRecord = new GenericRecordBuilder(schema);
|
||||
// writable returns an array with [field1, field2, _hoodie_commit_time, _hoodie_commit_seqno]
|
||||
Writable[] values = writable.get();
|
||||
schema.getFields().forEach(field -> {
|
||||
newRecord.set(field, values[2]);
|
||||
});
|
||||
records.add(newRecord.build());
|
||||
}
|
||||
} catch (IOException ie) {
|
||||
ie.printStackTrace();
|
||||
}
|
||||
return records;
|
||||
}).reduce((a, b) -> {
|
||||
a.addAll(b);
|
||||
return a;
|
||||
}).get();
|
||||
}
|
||||
|
||||
private static void setPropsForInputFormat(HoodieRealtimeInputFormat inputFormat, JobConf jobConf, Schema schema) {
|
||||
List<Schema.Field> fields = schema.getFields();
|
||||
String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
|
||||
String postions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
|
||||
Configuration conf = FSUtils.getFs().getConf();
|
||||
jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
|
||||
jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
|
||||
jobConf.set("partition_columns", "datestr");
|
||||
conf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
|
||||
conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
|
||||
conf.set("partition_columns", "datestr");
|
||||
inputFormat.setConf(conf);
|
||||
jobConf.addResource(conf);
|
||||
}
|
||||
private static void setPropsForInputFormat(HoodieRealtimeInputFormat inputFormat, JobConf jobConf,
|
||||
Schema schema) {
|
||||
List<Schema.Field> fields = schema.getFields();
|
||||
String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
|
||||
String postions = fields.stream().map(f -> String.valueOf(f.pos()))
|
||||
.collect(Collectors.joining(","));
|
||||
Configuration conf = FSUtils.getFs().getConf();
|
||||
jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
|
||||
jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
|
||||
jobConf.set("partition_columns", "datestr");
|
||||
conf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
|
||||
conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
|
||||
conf.set("partition_columns", "datestr");
|
||||
inputFormat.setConf(conf);
|
||||
jobConf.addResource(conf);
|
||||
}
|
||||
|
||||
private static void setInputPath(JobConf jobConf, String inputPath) {
|
||||
jobConf.set("mapreduce.input.fileinputformat.inputdir", inputPath);
|
||||
jobConf.set("mapreduce.input.fileinputformat.inputdir", inputPath);
|
||||
jobConf.set("map.input.dir", inputPath);
|
||||
}
|
||||
private static void setInputPath(JobConf jobConf, String inputPath) {
|
||||
jobConf.set("mapreduce.input.fileinputformat.inputdir", inputPath);
|
||||
jobConf.set("mapreduce.input.fileinputformat.inputdir", inputPath);
|
||||
jobConf.set("map.input.dir", inputPath);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,17 +16,21 @@
|
||||
|
||||
package com.uber.hoodie.common;
|
||||
|
||||
import com.uber.hoodie.avro.model.HoodieCleanMetadata;
|
||||
import com.uber.hoodie.common.model.HoodieCleaningPolicy;
|
||||
import com.uber.hoodie.common.model.HoodieCommitMetadata;
|
||||
import com.uber.hoodie.common.model.HoodieKey;
|
||||
import com.uber.hoodie.common.model.HoodiePartitionMetadata;
|
||||
import com.uber.hoodie.common.model.HoodieRecord;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
import com.uber.hoodie.common.util.AvroUtils;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import com.uber.hoodie.common.util.HoodieAvroUtils;
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.Random;
|
||||
import java.util.UUID;
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.GenericData;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
@@ -34,15 +38,6 @@ import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.Random;
|
||||
import java.util.UUID;
|
||||
|
||||
/**
|
||||
* Class to be used in tests to keep generating test inserts and updates against a corpus.
|
||||
*
|
||||
@@ -51,153 +46,164 @@ import java.util.UUID;
|
||||
public class HoodieTestDataGenerator {
|
||||
|
||||
static class KeyPartition {
|
||||
HoodieKey key;
|
||||
String partitionPath;
|
||||
|
||||
HoodieKey key;
|
||||
String partitionPath;
|
||||
}
|
||||
|
||||
public static String TRIP_EXAMPLE_SCHEMA = "{\"type\": \"record\","
|
||||
+ "\"name\": \"triprec\","
|
||||
+ "\"fields\": [ "
|
||||
+ "{\"name\": \"timestamp\",\"type\": \"double\"},"
|
||||
+ "{\"name\": \"_row_key\", \"type\": \"string\"},"
|
||||
+ "{\"name\": \"rider\", \"type\": \"string\"},"
|
||||
+ "{\"name\": \"driver\", \"type\": \"string\"},"
|
||||
+ "{\"name\": \"begin_lat\", \"type\": \"double\"},"
|
||||
+ "{\"name\": \"begin_lon\", \"type\": \"double\"},"
|
||||
+ "{\"name\": \"end_lat\", \"type\": \"double\"},"
|
||||
+ "{\"name\": \"end_lon\", \"type\": \"double\"},"
|
||||
+ "{\"name\":\"fare\",\"type\": \"double\"}]}";
|
||||
|
||||
// based on examination of sample file, the schema produces the following per record size
|
||||
public static final int SIZE_PER_RECORD = 50 * 1024;
|
||||
|
||||
public static final String[] DEFAULT_PARTITION_PATHS = {"2016/03/15", "2015/03/16", "2015/03/17"};
|
||||
|
||||
|
||||
public static void writePartitionMetadata(FileSystem fs, String[] partitionPaths,
|
||||
String basePath) {
|
||||
for (String partitionPath : partitionPaths) {
|
||||
new HoodiePartitionMetadata(fs, "000", new Path(basePath), new Path(basePath, partitionPath))
|
||||
.trySave(0);
|
||||
}
|
||||
}
|
||||
|
||||
public static String TRIP_EXAMPLE_SCHEMA = "{\"type\": \"record\","
|
||||
+ "\"name\": \"triprec\","
|
||||
+ "\"fields\": [ "
|
||||
+ "{\"name\": \"timestamp\",\"type\": \"double\"},"
|
||||
+ "{\"name\": \"_row_key\", \"type\": \"string\"},"
|
||||
+ "{\"name\": \"rider\", \"type\": \"string\"},"
|
||||
+ "{\"name\": \"driver\", \"type\": \"string\"},"
|
||||
+ "{\"name\": \"begin_lat\", \"type\": \"double\"},"
|
||||
+ "{\"name\": \"begin_lon\", \"type\": \"double\"},"
|
||||
+ "{\"name\": \"end_lat\", \"type\": \"double\"},"
|
||||
+ "{\"name\": \"end_lon\", \"type\": \"double\"},"
|
||||
+ "{\"name\":\"fare\",\"type\": \"double\"}]}";
|
||||
private List<KeyPartition> existingKeysList = new ArrayList<>();
|
||||
public static Schema avroSchema = HoodieAvroUtils
|
||||
.addMetadataFields(new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA));
|
||||
private static Random rand = new Random(46474747);
|
||||
private String[] partitionPaths = DEFAULT_PARTITION_PATHS;
|
||||
|
||||
// based on examination of sample file, the schema produces the following per record size
|
||||
public static final int SIZE_PER_RECORD = 50 * 1024;
|
||||
public HoodieTestDataGenerator(String[] partitionPaths) {
|
||||
this.partitionPaths = partitionPaths;
|
||||
}
|
||||
|
||||
public static final String[] DEFAULT_PARTITION_PATHS = {"2016/03/15", "2015/03/16", "2015/03/17"};
|
||||
public HoodieTestDataGenerator() {
|
||||
this(new String[]{"2016/03/15", "2015/03/16", "2015/03/17"});
|
||||
}
|
||||
|
||||
|
||||
public static void writePartitionMetadata(FileSystem fs, String[] partitionPaths, String basePath) {
|
||||
for (String partitionPath: partitionPaths) {
|
||||
new HoodiePartitionMetadata(fs, "000", new Path(basePath), new Path(basePath, partitionPath)).trySave(0);
|
||||
}
|
||||
/**
|
||||
* Generates new inserts, uniformly across the partition paths above. It also updates the list of
|
||||
* existing keys.
|
||||
*/
|
||||
public List<HoodieRecord> generateInserts(String commitTime, int n) throws IOException {
|
||||
List<HoodieRecord> inserts = new ArrayList<>();
|
||||
for (int i = 0; i < n; i++) {
|
||||
String partitionPath = partitionPaths[rand.nextInt(partitionPaths.length)];
|
||||
HoodieKey key = new HoodieKey(UUID.randomUUID().toString(), partitionPath);
|
||||
HoodieRecord record = new HoodieRecord(key, generateRandomValue(key, commitTime));
|
||||
inserts.add(record);
|
||||
|
||||
KeyPartition kp = new KeyPartition();
|
||||
kp.key = key;
|
||||
kp.partitionPath = partitionPath;
|
||||
existingKeysList.add(kp);
|
||||
}
|
||||
return inserts;
|
||||
}
|
||||
|
||||
private List<KeyPartition> existingKeysList = new ArrayList<>();
|
||||
public static Schema avroSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA));
|
||||
private static Random rand = new Random(46474747);
|
||||
private String[] partitionPaths = DEFAULT_PARTITION_PATHS;
|
||||
public List<HoodieRecord> generateDeletes(String commitTime, int n) throws IOException {
|
||||
List<HoodieRecord> inserts = generateInserts(commitTime, n);
|
||||
return generateDeletesFromExistingRecords(inserts);
|
||||
}
|
||||
|
||||
public List<HoodieRecord> generateDeletesFromExistingRecords(List<HoodieRecord> existingRecords)
|
||||
throws IOException {
|
||||
List<HoodieRecord> deletes = new ArrayList<>();
|
||||
for (HoodieRecord existingRecord : existingRecords) {
|
||||
HoodieRecord record = generateDeleteRecord(existingRecord);
|
||||
deletes.add(record);
|
||||
|
||||
public HoodieTestDataGenerator(String[] partitionPaths) {
|
||||
this.partitionPaths = partitionPaths;
|
||||
}
|
||||
return deletes;
|
||||
}
|
||||
|
||||
public HoodieTestDataGenerator() {
|
||||
this(new String[]{"2016/03/15", "2015/03/16", "2015/03/17"});
|
||||
public HoodieRecord generateDeleteRecord(HoodieRecord existingRecord) throws IOException {
|
||||
HoodieKey key = existingRecord.getKey();
|
||||
TestRawTripPayload payload = new TestRawTripPayload(Optional.empty(), key.getRecordKey(),
|
||||
key.getPartitionPath(), null, true);
|
||||
return new HoodieRecord(key, payload);
|
||||
}
|
||||
|
||||
public List<HoodieRecord> generateUpdates(String commitTime, List<HoodieRecord> baseRecords)
|
||||
throws IOException {
|
||||
List<HoodieRecord> updates = new ArrayList<>();
|
||||
for (HoodieRecord baseRecord : baseRecords) {
|
||||
HoodieRecord record = new HoodieRecord(baseRecord.getKey(),
|
||||
generateRandomValue(baseRecord.getKey(), commitTime));
|
||||
updates.add(record);
|
||||
}
|
||||
return updates;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Generates new inserts, uniformly across the partition paths above. It also updates the list
|
||||
* of existing keys.
|
||||
*/
|
||||
public List<HoodieRecord> generateInserts(String commitTime, int n) throws IOException {
|
||||
List<HoodieRecord> inserts = new ArrayList<>();
|
||||
for (int i = 0; i < n; i++) {
|
||||
String partitionPath = partitionPaths[rand.nextInt(partitionPaths.length)];
|
||||
HoodieKey key = new HoodieKey(UUID.randomUUID().toString(), partitionPath);
|
||||
HoodieRecord record = new HoodieRecord(key, generateRandomValue(key, commitTime));
|
||||
inserts.add(record);
|
||||
|
||||
KeyPartition kp = new KeyPartition();
|
||||
kp.key = key;
|
||||
kp.partitionPath = partitionPath;
|
||||
existingKeysList.add(kp);
|
||||
}
|
||||
return inserts;
|
||||
/**
|
||||
* Generates new updates, randomly distributed across the keys above.
|
||||
*/
|
||||
public List<HoodieRecord> generateUpdates(String commitTime, int n) throws IOException {
|
||||
List<HoodieRecord> updates = new ArrayList<>();
|
||||
for (int i = 0; i < n; i++) {
|
||||
KeyPartition kp = existingKeysList.get(rand.nextInt(existingKeysList.size() - 1));
|
||||
HoodieRecord record = new HoodieRecord(kp.key, generateRandomValue(kp.key, commitTime));
|
||||
updates.add(record);
|
||||
}
|
||||
return updates;
|
||||
}
|
||||
|
||||
public List<HoodieRecord> generateDeletes(String commitTime, int n) throws IOException {
|
||||
List<HoodieRecord> inserts = generateInserts(commitTime, n);
|
||||
return generateDeletesFromExistingRecords(inserts);
|
||||
}
|
||||
|
||||
public List<HoodieRecord> generateDeletesFromExistingRecords(List<HoodieRecord> existingRecords) throws IOException {
|
||||
List<HoodieRecord> deletes = new ArrayList<>();
|
||||
for (HoodieRecord existingRecord: existingRecords) {
|
||||
HoodieRecord record = generateDeleteRecord(existingRecord);
|
||||
deletes.add(record);
|
||||
|
||||
}
|
||||
return deletes;
|
||||
}
|
||||
|
||||
public HoodieRecord generateDeleteRecord(HoodieRecord existingRecord) throws IOException {
|
||||
HoodieKey key = existingRecord.getKey();
|
||||
TestRawTripPayload payload = new TestRawTripPayload(Optional.empty(), key.getRecordKey(), key.getPartitionPath(), null, true);
|
||||
return new HoodieRecord(key, payload);
|
||||
}
|
||||
|
||||
public List<HoodieRecord> generateUpdates(String commitTime, List<HoodieRecord> baseRecords) throws IOException {
|
||||
List<HoodieRecord> updates = new ArrayList<>();
|
||||
for (HoodieRecord baseRecord: baseRecords) {
|
||||
HoodieRecord record = new HoodieRecord(baseRecord.getKey(), generateRandomValue(baseRecord.getKey(), commitTime));
|
||||
updates.add(record);
|
||||
}
|
||||
return updates;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates new updates, randomly distributed across the keys above.
|
||||
*/
|
||||
public List<HoodieRecord> generateUpdates(String commitTime, int n) throws IOException {
|
||||
List<HoodieRecord> updates = new ArrayList<>();
|
||||
for (int i = 0; i < n; i++) {
|
||||
KeyPartition kp = existingKeysList.get(rand.nextInt(existingKeysList.size() - 1));
|
||||
HoodieRecord record = new HoodieRecord(kp.key, generateRandomValue(kp.key, commitTime));
|
||||
updates.add(record);
|
||||
}
|
||||
return updates;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Generates a new avro record of the above schema format, retaining the key if optionally
|
||||
* provided.
|
||||
*/
|
||||
public static TestRawTripPayload generateRandomValue(HoodieKey key, String commitTime) throws IOException {
|
||||
GenericRecord rec = generateGenericRecord(key.getRecordKey(), "rider-" + commitTime,
|
||||
"driver-" + commitTime, 0.0);
|
||||
HoodieAvroUtils.addCommitMetadataToRecord(rec, commitTime, "-1");
|
||||
return new TestRawTripPayload(rec.toString(), key.getRecordKey(), key.getPartitionPath(), TRIP_EXAMPLE_SCHEMA);
|
||||
}
|
||||
|
||||
public static GenericRecord generateGenericRecord(String rowKey, String riderName,
|
||||
String driverName, double timestamp) {
|
||||
GenericRecord rec = new GenericData.Record(avroSchema);
|
||||
rec.put("_row_key", rowKey);
|
||||
rec.put("timestamp", timestamp);
|
||||
rec.put("rider", riderName);
|
||||
rec.put("driver", driverName);
|
||||
rec.put("begin_lat", rand.nextDouble());
|
||||
rec.put("begin_lon", rand.nextDouble());
|
||||
rec.put("end_lat", rand.nextDouble());
|
||||
rec.put("end_lon", rand.nextDouble());
|
||||
rec.put("fare", rand.nextDouble() * 100);
|
||||
return rec;
|
||||
}
|
||||
|
||||
public static void createCommitFile(String basePath, String commitTime) throws IOException {
|
||||
Path commitFile =
|
||||
new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline.makeCommitFileName(commitTime));
|
||||
FileSystem fs = FSUtils.getFs();
|
||||
FSDataOutputStream os = fs.create(commitFile, true);
|
||||
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
|
||||
try {
|
||||
// Write empty commit metadata
|
||||
os.writeBytes(new String(commitMetadata.toJsonString().getBytes(
|
||||
StandardCharsets.UTF_8)));
|
||||
} finally {
|
||||
os.close();
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates a new avro record of the above schema format, retaining the key if optionally
|
||||
* provided.
|
||||
*/
|
||||
public static TestRawTripPayload generateRandomValue(HoodieKey key, String commitTime)
|
||||
throws IOException {
|
||||
GenericRecord rec = generateGenericRecord(key.getRecordKey(), "rider-" + commitTime,
|
||||
"driver-" + commitTime, 0.0);
|
||||
HoodieAvroUtils.addCommitMetadataToRecord(rec, commitTime, "-1");
|
||||
return new TestRawTripPayload(rec.toString(), key.getRecordKey(), key.getPartitionPath(),
|
||||
TRIP_EXAMPLE_SCHEMA);
|
||||
}
|
||||
|
||||
public static GenericRecord generateGenericRecord(String rowKey, String riderName,
|
||||
String driverName, double timestamp) {
|
||||
GenericRecord rec = new GenericData.Record(avroSchema);
|
||||
rec.put("_row_key", rowKey);
|
||||
rec.put("timestamp", timestamp);
|
||||
rec.put("rider", riderName);
|
||||
rec.put("driver", driverName);
|
||||
rec.put("begin_lat", rand.nextDouble());
|
||||
rec.put("begin_lon", rand.nextDouble());
|
||||
rec.put("end_lat", rand.nextDouble());
|
||||
rec.put("end_lon", rand.nextDouble());
|
||||
rec.put("fare", rand.nextDouble() * 100);
|
||||
return rec;
|
||||
}
|
||||
|
||||
public static void createCommitFile(String basePath, String commitTime) throws IOException {
|
||||
Path commitFile =
|
||||
new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline
|
||||
.makeCommitFileName(commitTime));
|
||||
FileSystem fs = FSUtils.getFs();
|
||||
FSDataOutputStream os = fs.create(commitFile, true);
|
||||
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
|
||||
try {
|
||||
// Write empty commit metadata
|
||||
os.writeBytes(new String(commitMetadata.toJsonString().getBytes(
|
||||
StandardCharsets.UTF_8)));
|
||||
} finally {
|
||||
os.close();
|
||||
}
|
||||
}
|
||||
|
||||
public static void createSavepointFile(String basePath, String commitTime) throws IOException {
|
||||
Path commitFile =
|
||||
@@ -215,7 +221,7 @@ public class HoodieTestDataGenerator {
|
||||
}
|
||||
}
|
||||
|
||||
public String[] getPartitionPaths() {
|
||||
return partitionPaths;
|
||||
}
|
||||
public String[] getPartitionPaths() {
|
||||
return partitionPaths;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -17,174 +17,182 @@
|
||||
package com.uber.hoodie.common;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import com.uber.hoodie.WriteStatus;
|
||||
import com.uber.hoodie.avro.MercifulJsonConverter;
|
||||
import com.uber.hoodie.common.model.HoodieRecord;
|
||||
import com.uber.hoodie.common.model.HoodieRecordPayload;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.StringWriter;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map.Entry;
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.IndexedRecord;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.Optional;
|
||||
import java.util.zip.Deflater;
|
||||
import java.util.zip.DeflaterOutputStream;
|
||||
import java.util.zip.InflaterInputStream;
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.IndexedRecord;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
||||
/**
|
||||
* Example row change event based on some example data used by testcases. The data avro schema is
|
||||
* src/test/resources/schema1.
|
||||
*/
|
||||
public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayload> {
|
||||
private transient static final ObjectMapper mapper = new ObjectMapper();
|
||||
private String partitionPath;
|
||||
private String rowKey;
|
||||
private byte[] jsonDataCompressed;
|
||||
private int dataSize;
|
||||
private boolean isDeleted;
|
||||
|
||||
public TestRawTripPayload(Optional<String> jsonData, String rowKey, String partitionPath,
|
||||
String schemaStr, Boolean isDeleted) throws IOException {
|
||||
if(jsonData.isPresent()) {
|
||||
this.jsonDataCompressed = compressData(jsonData.get());
|
||||
this.dataSize = jsonData.get().length();
|
||||
}
|
||||
this.rowKey = rowKey;
|
||||
this.partitionPath = partitionPath;
|
||||
this.isDeleted = isDeleted;
|
||||
private transient static final ObjectMapper mapper = new ObjectMapper();
|
||||
private String partitionPath;
|
||||
private String rowKey;
|
||||
private byte[] jsonDataCompressed;
|
||||
private int dataSize;
|
||||
private boolean isDeleted;
|
||||
|
||||
public TestRawTripPayload(Optional<String> jsonData, String rowKey, String partitionPath,
|
||||
String schemaStr, Boolean isDeleted) throws IOException {
|
||||
if (jsonData.isPresent()) {
|
||||
this.jsonDataCompressed = compressData(jsonData.get());
|
||||
this.dataSize = jsonData.get().length();
|
||||
}
|
||||
this.rowKey = rowKey;
|
||||
this.partitionPath = partitionPath;
|
||||
this.isDeleted = isDeleted;
|
||||
}
|
||||
|
||||
public TestRawTripPayload(String jsonData, String rowKey, String partitionPath,
|
||||
String schemaStr)throws IOException {
|
||||
this(Optional.of(jsonData), rowKey, partitionPath, schemaStr, false);
|
||||
public TestRawTripPayload(String jsonData, String rowKey, String partitionPath,
|
||||
String schemaStr) throws IOException {
|
||||
this(Optional.of(jsonData), rowKey, partitionPath, schemaStr, false);
|
||||
}
|
||||
|
||||
public TestRawTripPayload(String jsonData) throws IOException {
|
||||
this.jsonDataCompressed = compressData(jsonData);
|
||||
this.dataSize = jsonData.length();
|
||||
Map<String, Object> jsonRecordMap = mapper.readValue(jsonData, Map.class);
|
||||
this.rowKey = jsonRecordMap.get("_row_key").toString();
|
||||
this.partitionPath = jsonRecordMap.get("time").toString().split("T")[0].replace("-", "/");
|
||||
this.isDeleted = false;
|
||||
}
|
||||
|
||||
public String getPartitionPath() {
|
||||
return partitionPath;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public TestRawTripPayload preCombine(TestRawTripPayload another) {
|
||||
return another;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Optional<IndexedRecord> combineAndGetUpdateValue(IndexedRecord oldRec, Schema schema)
|
||||
throws IOException {
|
||||
return this.getInsertValue(schema);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Optional<IndexedRecord> getInsertValue(Schema schema) throws IOException {
|
||||
if (isDeleted) {
|
||||
return Optional.empty();
|
||||
} else {
|
||||
MercifulJsonConverter jsonConverter = new MercifulJsonConverter(schema);
|
||||
return Optional.of(jsonConverter.convert(getJsonData()));
|
||||
}
|
||||
}
|
||||
|
||||
public TestRawTripPayload(String jsonData) throws IOException {
|
||||
this.jsonDataCompressed = compressData(jsonData);
|
||||
this.dataSize = jsonData.length();
|
||||
Map<String, Object> jsonRecordMap = mapper.readValue(jsonData, Map.class);
|
||||
this.rowKey = jsonRecordMap.get("_row_key").toString();
|
||||
this.partitionPath = jsonRecordMap.get("time").toString().split("T")[0].replace("-", "/");
|
||||
this.isDeleted = false;
|
||||
}
|
||||
|
||||
public String getPartitionPath() {
|
||||
return partitionPath;
|
||||
@Override
|
||||
public Optional<Map<String, String>> getMetadata() {
|
||||
// Let's assume we want to count the number of input row change events
|
||||
// that are processed. Let the time-bucket for this row change event be 1506582000.
|
||||
Map<String, String> metadataMap = new HashMap<>();
|
||||
metadataMap.put("InputRecordCount_1506582000", "2");
|
||||
return Optional.of(metadataMap);
|
||||
}
|
||||
|
||||
public String getRowKey() {
|
||||
return rowKey;
|
||||
}
|
||||
|
||||
public String getJsonData() throws IOException {
|
||||
return unCompressData(jsonDataCompressed);
|
||||
}
|
||||
|
||||
private byte[] compressData(String jsonData) throws IOException {
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
DeflaterOutputStream dos =
|
||||
new DeflaterOutputStream(baos, new Deflater(Deflater.BEST_COMPRESSION), true);
|
||||
try {
|
||||
dos.write(jsonData.getBytes());
|
||||
} finally {
|
||||
dos.flush();
|
||||
dos.close();
|
||||
}
|
||||
return baos.toByteArray();
|
||||
}
|
||||
|
||||
|
||||
@Override public TestRawTripPayload preCombine(TestRawTripPayload another) {
|
||||
return another;
|
||||
}
|
||||
private String unCompressData(byte[] data) throws IOException {
|
||||
InflaterInputStream iis = new InflaterInputStream(new ByteArrayInputStream(data));
|
||||
StringWriter sw = new StringWriter(dataSize);
|
||||
IOUtils.copy(iis, sw);
|
||||
return sw.toString();
|
||||
}
|
||||
|
||||
@Override public Optional<IndexedRecord> combineAndGetUpdateValue(IndexedRecord oldRec, Schema schema) throws IOException {
|
||||
return this.getInsertValue(schema);
|
||||
}
|
||||
/**
|
||||
* A custom {@link WriteStatus} that merges passed metadata key value map to {@code
|
||||
* WriteStatus.markSuccess()} and {@code WriteStatus.markFailure()}.
|
||||
*/
|
||||
public static class MetadataMergeWriteStatus extends WriteStatus {
|
||||
|
||||
@Override public Optional<IndexedRecord> getInsertValue(Schema schema) throws IOException {
|
||||
if(isDeleted){
|
||||
return Optional.empty();
|
||||
} else {
|
||||
MercifulJsonConverter jsonConverter = new MercifulJsonConverter(schema);
|
||||
return Optional.of(jsonConverter.convert(getJsonData()));
|
||||
}
|
||||
private Map<String, String> mergedMetadataMap = new HashMap<>();
|
||||
|
||||
@Override
|
||||
public void markSuccess(HoodieRecord record, Optional<Map<String, String>> recordMetadata) {
|
||||
super.markSuccess(record, recordMetadata);
|
||||
if (recordMetadata.isPresent()) {
|
||||
mergeMetadataMaps(recordMetadata.get(), mergedMetadataMap);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public Optional<Map<String, String>> getMetadata() {
|
||||
// Let's assume we want to count the number of input row change events
|
||||
// that are processed. Let the time-bucket for this row change event be 1506582000.
|
||||
Map<String, String> metadataMap = new HashMap<>();
|
||||
metadataMap.put("InputRecordCount_1506582000", "2");
|
||||
return Optional.of(metadataMap);
|
||||
public void markFailure(HoodieRecord record, Throwable t,
|
||||
Optional<Map<String, String>> recordMetadata) {
|
||||
super.markFailure(record, t, recordMetadata);
|
||||
if (recordMetadata.isPresent()) {
|
||||
mergeMetadataMaps(recordMetadata.get(), mergedMetadataMap);
|
||||
}
|
||||
}
|
||||
|
||||
public String getRowKey() {
|
||||
return rowKey;
|
||||
public static Map<String, String> mergeMetadataForWriteStatuses(
|
||||
List<WriteStatus> writeStatuses) {
|
||||
Map<String, String> allWriteStatusMergedMetadataMap = new HashMap<>();
|
||||
for (WriteStatus writeStatus : writeStatuses) {
|
||||
MetadataMergeWriteStatus.mergeMetadataMaps(
|
||||
((MetadataMergeWriteStatus) writeStatus).getMergedMetadataMap(),
|
||||
allWriteStatusMergedMetadataMap);
|
||||
}
|
||||
return allWriteStatusMergedMetadataMap;
|
||||
}
|
||||
|
||||
public String getJsonData() throws IOException {
|
||||
return unCompressData(jsonDataCompressed);
|
||||
private static void mergeMetadataMaps(Map<String, String> mergeFromMap,
|
||||
Map<String, String> mergeToMap) {
|
||||
for (Entry<String, String> entry : mergeFromMap.entrySet()) {
|
||||
String key = entry.getKey();
|
||||
if (!mergeToMap.containsKey(key)) {
|
||||
mergeToMap.put(key, "0");
|
||||
}
|
||||
mergeToMap
|
||||
.put(key, addStrsAsInt(entry.getValue(), mergeToMap.get(key)));
|
||||
}
|
||||
}
|
||||
|
||||
private byte[] compressData(String jsonData) throws IOException {
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
DeflaterOutputStream dos =
|
||||
new DeflaterOutputStream(baos, new Deflater(Deflater.BEST_COMPRESSION), true);
|
||||
try {
|
||||
dos.write(jsonData.getBytes());
|
||||
} finally {
|
||||
dos.flush();
|
||||
dos.close();
|
||||
}
|
||||
return baos.toByteArray();
|
||||
private Map<String, String> getMergedMetadataMap() {
|
||||
return mergedMetadataMap;
|
||||
}
|
||||
|
||||
|
||||
private String unCompressData(byte[] data) throws IOException {
|
||||
InflaterInputStream iis = new InflaterInputStream(new ByteArrayInputStream(data));
|
||||
StringWriter sw = new StringWriter(dataSize);
|
||||
IOUtils.copy(iis, sw);
|
||||
return sw.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* A custom {@link WriteStatus} that merges passed metadata key value map
|
||||
* to {@code WriteStatus.markSuccess()} and {@code WriteStatus.markFailure()}.
|
||||
*/
|
||||
public static class MetadataMergeWriteStatus extends WriteStatus {
|
||||
private Map<String, String> mergedMetadataMap = new HashMap<>();
|
||||
|
||||
@Override
|
||||
public void markSuccess(HoodieRecord record, Optional<Map<String, String>> recordMetadata) {
|
||||
super.markSuccess(record, recordMetadata);
|
||||
if(recordMetadata.isPresent()) {
|
||||
mergeMetadataMaps(recordMetadata.get(), mergedMetadataMap);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void markFailure(HoodieRecord record, Throwable t,
|
||||
Optional<Map<String, String>> recordMetadata) {
|
||||
super.markFailure(record, t, recordMetadata);
|
||||
if(recordMetadata.isPresent()) {
|
||||
mergeMetadataMaps(recordMetadata.get(), mergedMetadataMap);
|
||||
}
|
||||
}
|
||||
|
||||
public static Map<String, String> mergeMetadataForWriteStatuses(List<WriteStatus> writeStatuses) {
|
||||
Map<String, String> allWriteStatusMergedMetadataMap = new HashMap<>();
|
||||
for (WriteStatus writeStatus : writeStatuses) {
|
||||
MetadataMergeWriteStatus.mergeMetadataMaps(
|
||||
((MetadataMergeWriteStatus)writeStatus).getMergedMetadataMap(),
|
||||
allWriteStatusMergedMetadataMap);
|
||||
}
|
||||
return allWriteStatusMergedMetadataMap;
|
||||
}
|
||||
|
||||
private static void mergeMetadataMaps(Map<String, String> mergeFromMap, Map<String, String> mergeToMap) {
|
||||
for (Entry<String, String> entry : mergeFromMap.entrySet()) {
|
||||
String key = entry.getKey();
|
||||
if(!mergeToMap.containsKey(key)) {
|
||||
mergeToMap.put(key, "0");
|
||||
}
|
||||
mergeToMap
|
||||
.put(key, addStrsAsInt(entry.getValue(), mergeToMap.get(key)));
|
||||
}
|
||||
}
|
||||
|
||||
private Map<String, String> getMergedMetadataMap() {
|
||||
return mergedMetadataMap;
|
||||
}
|
||||
|
||||
private static String addStrsAsInt(String a, String b) {
|
||||
return String.valueOf(Integer.parseInt(a) + Integer.parseInt(b));
|
||||
}
|
||||
private static String addStrsAsInt(String a, String b) {
|
||||
return String.valueOf(Integer.parseInt(a) + Integer.parseInt(b));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
|
||||
package com.uber.hoodie.config;
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
|
||||
import com.google.common.collect.Maps;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig.Builder;
|
||||
@@ -29,6 +29,7 @@ import java.util.Properties;
|
||||
import org.junit.Test;
|
||||
|
||||
public class HoodieWriteConfigTest {
|
||||
|
||||
@Test
|
||||
public void testPropertyLoading() throws IOException {
|
||||
Builder builder = HoodieWriteConfig.newBuilder().withPath("/tmp");
|
||||
@@ -46,13 +47,14 @@ public class HoodieWriteConfigTest {
|
||||
HoodieWriteConfig config = builder.build();
|
||||
assertEquals(config.getMaxCommitsToKeep(), 5);
|
||||
assertEquals(config.getMinCommitsToKeep(), 2);
|
||||
}
|
||||
}
|
||||
|
||||
private ByteArrayOutputStream saveParamsIntoOutputStream(Map<String, String> params) throws IOException {
|
||||
private ByteArrayOutputStream saveParamsIntoOutputStream(Map<String, String> params)
|
||||
throws IOException {
|
||||
Properties properties = new Properties();
|
||||
properties.putAll(params);
|
||||
ByteArrayOutputStream outStream = new ByteArrayOutputStream();
|
||||
properties.store(outStream, "Saved on " + new Date(System.currentTimeMillis()));
|
||||
return outStream;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,105 +16,103 @@
|
||||
|
||||
package com.uber.hoodie.func;
|
||||
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import static org.junit.Assert.fail;
|
||||
|
||||
import com.uber.hoodie.WriteStatus;
|
||||
import com.uber.hoodie.common.TestRawTripPayload;
|
||||
import com.uber.hoodie.common.model.HoodieKey;
|
||||
import com.uber.hoodie.common.model.HoodieRecord;
|
||||
import com.uber.hoodie.common.model.HoodieRecordLocation;
|
||||
import com.uber.hoodie.common.model.HoodieTestUtils;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.table.HoodieCopyOnWriteTable;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import org.junit.rules.TemporaryFolder;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
||||
import static org.junit.Assert.fail;
|
||||
|
||||
public class TestUpdateMapFunction {
|
||||
private String basePath = null;
|
||||
|
||||
@Before
|
||||
public void init() throws Exception {
|
||||
// Create a temp folder as the base path
|
||||
TemporaryFolder folder = new TemporaryFolder();
|
||||
folder.create();
|
||||
this.basePath = folder.getRoot().getAbsolutePath();
|
||||
HoodieTestUtils.init(basePath);
|
||||
private String basePath = null;
|
||||
|
||||
@Before
|
||||
public void init() throws Exception {
|
||||
// Create a temp folder as the base path
|
||||
TemporaryFolder folder = new TemporaryFolder();
|
||||
folder.create();
|
||||
this.basePath = folder.getRoot().getAbsolutePath();
|
||||
HoodieTestUtils.init(basePath);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSchemaEvolutionOnUpdate() throws Exception {
|
||||
// Create a bunch of records with a old version of schema
|
||||
HoodieWriteConfig config = makeHoodieClientConfig("/exampleSchema.txt");
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
||||
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
|
||||
|
||||
String recordStr1 =
|
||||
"{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
||||
String recordStr2 =
|
||||
"{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
|
||||
String recordStr3 =
|
||||
"{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
||||
List<HoodieRecord> records = new ArrayList<>();
|
||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||
records.add(
|
||||
new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
|
||||
rowChange1));
|
||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||
records.add(
|
||||
new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
|
||||
rowChange2));
|
||||
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
||||
records.add(
|
||||
new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
|
||||
rowChange3));
|
||||
Iterator<List<WriteStatus>> insertResult = table.handleInsert("100", records.iterator());
|
||||
Path commitFile =
|
||||
new Path(config.getBasePath() + "/.hoodie/" + HoodieTimeline.makeCommitFileName("100"));
|
||||
FSUtils.getFs().create(commitFile);
|
||||
|
||||
// Now try an update with an evolved schema
|
||||
// Evolved schema does not have guarantee on preserving the original field ordering
|
||||
config = makeHoodieClientConfig("/exampleEvolvedSchema.txt");
|
||||
metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
||||
String fileId = insertResult.next().get(0).getFileId();
|
||||
System.out.println(fileId);
|
||||
|
||||
table = new HoodieCopyOnWriteTable(config, metadata);
|
||||
// New content with values for the newly added field
|
||||
recordStr1 =
|
||||
"{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12,\"added_field\":1}";
|
||||
records = new ArrayList<>();
|
||||
rowChange1 = new TestRawTripPayload(recordStr1);
|
||||
HoodieRecord record1 =
|
||||
new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
|
||||
rowChange1);
|
||||
record1.setCurrentLocation(new HoodieRecordLocation("100", fileId));
|
||||
records.add(record1);
|
||||
|
||||
try {
|
||||
table.handleUpdate("101", fileId, records.iterator());
|
||||
} catch (ClassCastException e) {
|
||||
fail(
|
||||
"UpdateFunction could not read records written with exampleSchema.txt using the exampleEvolvedSchema.txt");
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSchemaEvolutionOnUpdate() throws Exception {
|
||||
// Create a bunch of records with a old version of schema
|
||||
HoodieWriteConfig config = makeHoodieClientConfig("/exampleSchema.txt");
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
||||
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
|
||||
|
||||
String recordStr1 =
|
||||
"{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
||||
String recordStr2 =
|
||||
"{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
|
||||
String recordStr3 =
|
||||
"{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
||||
List<HoodieRecord> records = new ArrayList<>();
|
||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||
records.add(
|
||||
new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
|
||||
rowChange1));
|
||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||
records.add(
|
||||
new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
|
||||
rowChange2));
|
||||
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
||||
records.add(
|
||||
new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
|
||||
rowChange3));
|
||||
Iterator<List<WriteStatus>> insertResult = table.handleInsert("100", records.iterator());
|
||||
Path commitFile =
|
||||
new Path(config.getBasePath() + "/.hoodie/" + HoodieTimeline.makeCommitFileName("100"));
|
||||
FSUtils.getFs().create(commitFile);
|
||||
|
||||
// Now try an update with an evolved schema
|
||||
// Evolved schema does not have guarantee on preserving the original field ordering
|
||||
config = makeHoodieClientConfig("/exampleEvolvedSchema.txt");
|
||||
metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
||||
String fileId = insertResult.next().get(0).getFileId();
|
||||
System.out.println(fileId);
|
||||
|
||||
|
||||
table = new HoodieCopyOnWriteTable(config, metadata);
|
||||
// New content with values for the newly added field
|
||||
recordStr1 =
|
||||
"{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12,\"added_field\":1}";
|
||||
records = new ArrayList<>();
|
||||
rowChange1 = new TestRawTripPayload(recordStr1);
|
||||
HoodieRecord record1 =
|
||||
new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
|
||||
rowChange1);
|
||||
record1.setCurrentLocation(new HoodieRecordLocation("100", fileId));
|
||||
records.add(record1);
|
||||
|
||||
try {
|
||||
table.handleUpdate("101", fileId, records.iterator());
|
||||
} catch (ClassCastException e) {
|
||||
fail(
|
||||
"UpdateFunction could not read records written with exampleSchema.txt using the exampleEvolvedSchema.txt");
|
||||
}
|
||||
}
|
||||
|
||||
private HoodieWriteConfig makeHoodieClientConfig(String schema) throws Exception {
|
||||
// Prepare the AvroParquetIO
|
||||
String schemaStr = IOUtils.toString(getClass().getResourceAsStream(schema), "UTF-8");
|
||||
return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(schemaStr).build();
|
||||
}
|
||||
private HoodieWriteConfig makeHoodieClientConfig(String schema) throws Exception {
|
||||
// Prepare the AvroParquetIO
|
||||
String schemaStr = IOUtils.toString(getClass().getResourceAsStream(schema), "UTF-8");
|
||||
return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(schemaStr).build();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -16,32 +16,31 @@
|
||||
|
||||
package com.uber.hoodie.index;
|
||||
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import com.uber.hoodie.config.HoodieIndexConfig;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.index.bloom.HoodieBloomIndex;
|
||||
import com.uber.hoodie.index.hbase.HBaseIndex;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
public class TestHoodieIndex {
|
||||
@Test
|
||||
public void testCreateIndex() throws Exception {
|
||||
HoodieWriteConfig.Builder clientConfigBuilder = HoodieWriteConfig.newBuilder();
|
||||
HoodieIndexConfig.Builder indexConfigBuilder = HoodieIndexConfig.newBuilder();
|
||||
// Different types
|
||||
HoodieWriteConfig config = clientConfigBuilder.withPath("")
|
||||
.withIndexConfig(indexConfigBuilder.withIndexType(HoodieIndex.IndexType.HBASE).build())
|
||||
.build();
|
||||
assertTrue(HoodieIndex.createIndex(config, null) instanceof HBaseIndex);
|
||||
config = clientConfigBuilder.withPath("").withIndexConfig(
|
||||
indexConfigBuilder.withIndexType(HoodieIndex.IndexType.INMEMORY).build()).build();
|
||||
assertTrue(HoodieIndex.createIndex(config, null) instanceof InMemoryHashIndex);
|
||||
config = clientConfigBuilder.withPath("")
|
||||
.withIndexConfig(indexConfigBuilder.withIndexType(HoodieIndex.IndexType.BLOOM).build())
|
||||
.build();
|
||||
assertTrue(HoodieIndex.createIndex(config, null) instanceof HoodieBloomIndex);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCreateIndex() throws Exception {
|
||||
HoodieWriteConfig.Builder clientConfigBuilder = HoodieWriteConfig.newBuilder();
|
||||
HoodieIndexConfig.Builder indexConfigBuilder = HoodieIndexConfig.newBuilder();
|
||||
// Different types
|
||||
HoodieWriteConfig config = clientConfigBuilder.withPath("")
|
||||
.withIndexConfig(indexConfigBuilder.withIndexType(HoodieIndex.IndexType.HBASE).build())
|
||||
.build();
|
||||
assertTrue(HoodieIndex.createIndex(config, null) instanceof HBaseIndex);
|
||||
config = clientConfigBuilder.withPath("").withIndexConfig(
|
||||
indexConfigBuilder.withIndexType(HoodieIndex.IndexType.INMEMORY).build()).build();
|
||||
assertTrue(HoodieIndex.createIndex(config, null) instanceof InMemoryHashIndex);
|
||||
config = clientConfigBuilder.withPath("")
|
||||
.withIndexConfig(indexConfigBuilder.withIndexType(HoodieIndex.IndexType.BLOOM).build())
|
||||
.build();
|
||||
assertTrue(HoodieIndex.createIndex(config, null) instanceof HoodieBloomIndex);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,28 +18,39 @@
|
||||
|
||||
package com.uber.hoodie.index.bloom;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertNotNull;
|
||||
import static org.junit.Assert.assertNull;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.junit.Assert.fail;
|
||||
|
||||
import com.google.common.base.Optional;
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
import com.uber.hoodie.common.HoodieClientTestUtils;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.config.HoodieIndexConfig;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.avro.HoodieAvroWriteSupport;
|
||||
import com.uber.hoodie.common.BloomFilter;
|
||||
import com.uber.hoodie.common.HoodieClientTestUtils;
|
||||
import com.uber.hoodie.common.TestRawTripPayload;
|
||||
import com.uber.hoodie.common.model.HoodieKey;
|
||||
import com.uber.hoodie.common.model.HoodieRecord;
|
||||
import com.uber.hoodie.common.model.HoodieTestUtils;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import com.uber.hoodie.common.util.HoodieAvroUtils;
|
||||
|
||||
import com.uber.hoodie.index.bloom.BloomIndexFileInfo;
|
||||
import com.uber.hoodie.index.bloom.HoodieBloomIndex;
|
||||
import com.uber.hoodie.index.bloom.HoodieBloomIndexCheckFunction;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.io.storage.HoodieParquetConfig;
|
||||
import com.uber.hoodie.io.storage.HoodieParquetWriter;
|
||||
import com.uber.hoodie.table.HoodieTable;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Arrays;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.UUID;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
@@ -47,11 +58,8 @@ import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.parquet.avro.AvroSchemaConverter;
|
||||
import org.apache.parquet.avro.AvroWriteSupport;
|
||||
import org.apache.parquet.hadoop.ParquetWriter;
|
||||
import org.apache.parquet.hadoop.api.WriteSupport;
|
||||
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaPairRDD;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
@@ -59,464 +67,489 @@ import org.junit.After;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import org.junit.rules.TemporaryFolder;
|
||||
import org.mockito.Mockito;
|
||||
|
||||
import scala.Tuple2;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
|
||||
public class TestHoodieBloomIndex {
|
||||
private JavaSparkContext jsc = null;
|
||||
private String basePath = null;
|
||||
private transient final FileSystem fs;
|
||||
private String schemaStr;
|
||||
private Schema schema;
|
||||
|
||||
public TestHoodieBloomIndex() throws Exception {
|
||||
fs = FSUtils.getFs();
|
||||
}
|
||||
private JavaSparkContext jsc = null;
|
||||
private String basePath = null;
|
||||
private transient final FileSystem fs;
|
||||
private String schemaStr;
|
||||
private Schema schema;
|
||||
|
||||
@Before
|
||||
public void init() throws IOException {
|
||||
// Initialize a local spark env
|
||||
jsc = new JavaSparkContext(HoodieClientTestUtils.getSparkConfForTest("TestHoodieBloomIndex"));
|
||||
// Create a temp folder as the base path
|
||||
TemporaryFolder folder = new TemporaryFolder();
|
||||
folder.create();
|
||||
basePath = folder.getRoot().getAbsolutePath();
|
||||
HoodieTestUtils.init(basePath);
|
||||
// We have some records to be tagged (two different partitions)
|
||||
schemaStr = IOUtils.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8");
|
||||
schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(schemaStr));
|
||||
}
|
||||
public TestHoodieBloomIndex() throws Exception {
|
||||
fs = FSUtils.getFs();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLoadUUIDsInMemory() throws IOException {
|
||||
// Create one RDD of hoodie record
|
||||
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
||||
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
|
||||
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
||||
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
|
||||
@Before
|
||||
public void init() throws IOException {
|
||||
// Initialize a local spark env
|
||||
jsc = new JavaSparkContext(HoodieClientTestUtils.getSparkConfForTest("TestHoodieBloomIndex"));
|
||||
// Create a temp folder as the base path
|
||||
TemporaryFolder folder = new TemporaryFolder();
|
||||
folder.create();
|
||||
basePath = folder.getRoot().getAbsolutePath();
|
||||
HoodieTestUtils.init(basePath);
|
||||
// We have some records to be tagged (two different partitions)
|
||||
schemaStr = IOUtils.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8");
|
||||
schema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(schemaStr));
|
||||
}
|
||||
|
||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
||||
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
||||
HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
|
||||
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
||||
HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
||||
@Test
|
||||
public void testLoadUUIDsInMemory() throws IOException {
|
||||
// Create one RDD of hoodie record
|
||||
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
||||
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
|
||||
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
||||
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
|
||||
|
||||
JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4));
|
||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||
HoodieRecord record1 = new HoodieRecord(
|
||||
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||
HoodieRecord record2 = new HoodieRecord(
|
||||
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
||||
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
||||
HoodieRecord record3 = new HoodieRecord(
|
||||
new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
|
||||
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
||||
HoodieRecord record4 = new HoodieRecord(
|
||||
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
||||
|
||||
// Load to memory
|
||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
|
||||
JavaRDD<HoodieRecord> recordRDD = jsc
|
||||
.parallelize(Arrays.asList(record1, record2, record3, record4));
|
||||
|
||||
Map<String, Iterable<String>> map = recordRDD
|
||||
.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()))
|
||||
.groupByKey().collectAsMap();
|
||||
assertEquals(map.size(), 2);
|
||||
List<String> list1 = Lists.newArrayList(map.get("2016/01/31"));
|
||||
List<String> list2 = Lists.newArrayList(map.get("2015/01/31"));
|
||||
assertEquals(list1.size(), 3);
|
||||
assertEquals(list2.size(), 1);
|
||||
}
|
||||
// Load to memory
|
||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
|
||||
|
||||
@Test
|
||||
public void testLoadInvolvedFiles() throws IOException {
|
||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
|
||||
.withPath(basePath)
|
||||
.build();
|
||||
HoodieBloomIndex index = new HoodieBloomIndex(config, jsc);
|
||||
Map<String, Iterable<String>> map = recordRDD
|
||||
.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()))
|
||||
.groupByKey().collectAsMap();
|
||||
assertEquals(map.size(), 2);
|
||||
List<String> list1 = Lists.newArrayList(map.get("2016/01/31"));
|
||||
List<String> list2 = Lists.newArrayList(map.get("2015/01/31"));
|
||||
assertEquals(list1.size(), 3);
|
||||
assertEquals(list2.size(), 1);
|
||||
}
|
||||
|
||||
// Create some partitions, and put some files
|
||||
// "2016/01/21": 0 file
|
||||
// "2016/04/01": 1 file (2_0_20160401010101.parquet)
|
||||
// "2015/03/12": 3 files (1_0_20150312101010.parquet, 3_0_20150312101010.parquet, 4_0_20150312101010.parquet)
|
||||
new File(basePath + "/2016/01/21").mkdirs();
|
||||
new File(basePath + "/2016/04/01").mkdirs();
|
||||
new File(basePath + "/2015/03/12").mkdirs();
|
||||
@Test
|
||||
public void testLoadInvolvedFiles() throws IOException {
|
||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
|
||||
.withPath(basePath)
|
||||
.build();
|
||||
HoodieBloomIndex index = new HoodieBloomIndex(config, jsc);
|
||||
|
||||
TestRawTripPayload rowChange1 = new TestRawTripPayload("{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
|
||||
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
||||
TestRawTripPayload rowChange2 = new TestRawTripPayload("{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
|
||||
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
||||
TestRawTripPayload rowChange3 = new TestRawTripPayload("{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
|
||||
HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
|
||||
TestRawTripPayload rowChange4 = new TestRawTripPayload("{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
|
||||
HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
||||
// Create some partitions, and put some files
|
||||
// "2016/01/21": 0 file
|
||||
// "2016/04/01": 1 file (2_0_20160401010101.parquet)
|
||||
// "2015/03/12": 3 files (1_0_20150312101010.parquet, 3_0_20150312101010.parquet, 4_0_20150312101010.parquet)
|
||||
new File(basePath + "/2016/01/21").mkdirs();
|
||||
new File(basePath + "/2016/04/01").mkdirs();
|
||||
new File(basePath + "/2015/03/12").mkdirs();
|
||||
|
||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(
|
||||
"{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
|
||||
HoodieRecord record1 = new HoodieRecord(
|
||||
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(
|
||||
"{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
|
||||
HoodieRecord record2 = new HoodieRecord(
|
||||
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
||||
TestRawTripPayload rowChange3 = new TestRawTripPayload(
|
||||
"{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
|
||||
HoodieRecord record3 = new HoodieRecord(
|
||||
new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
|
||||
TestRawTripPayload rowChange4 = new TestRawTripPayload(
|
||||
"{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
|
||||
HoodieRecord record4 = new HoodieRecord(
|
||||
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
||||
|
||||
writeParquetFile("2016/04/01","2_0_20160401010101.parquet", Lists.newArrayList(), schema, null, false);
|
||||
writeParquetFile("2015/03/12","1_0_20150312101010.parquet", Lists.newArrayList(), schema, null, false);
|
||||
writeParquetFile("2015/03/12","3_0_20150312101010.parquet", Arrays.asList(record1), schema, null, false);
|
||||
writeParquetFile("2015/03/12","4_0_20150312101010.parquet", Arrays.asList(record2, record3, record4), schema, null, false);
|
||||
writeParquetFile("2016/04/01", "2_0_20160401010101.parquet", Lists.newArrayList(), schema, null,
|
||||
false);
|
||||
writeParquetFile("2015/03/12", "1_0_20150312101010.parquet", Lists.newArrayList(), schema, null,
|
||||
false);
|
||||
writeParquetFile("2015/03/12", "3_0_20150312101010.parquet", Arrays.asList(record1), schema,
|
||||
null, false);
|
||||
writeParquetFile("2015/03/12", "4_0_20150312101010.parquet",
|
||||
Arrays.asList(record2, record3, record4), schema, null, false);
|
||||
|
||||
List<String> partitions = Arrays.asList("2016/01/21", "2016/04/01", "2015/03/12");
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metadata, config);
|
||||
List<Tuple2<String, BloomIndexFileInfo>> filesList = index.loadInvolvedFiles(partitions, table);
|
||||
// Still 0, as no valid commit
|
||||
assertEquals(filesList.size(), 0);
|
||||
List<String> partitions = Arrays.asList("2016/01/21", "2016/04/01", "2015/03/12");
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metadata, config);
|
||||
List<Tuple2<String, BloomIndexFileInfo>> filesList = index.loadInvolvedFiles(partitions, table);
|
||||
// Still 0, as no valid commit
|
||||
assertEquals(filesList.size(), 0);
|
||||
|
||||
// Add some commits
|
||||
new File(basePath + "/.hoodie").mkdirs();
|
||||
new File(basePath + "/.hoodie/20160401010101.commit").createNewFile();
|
||||
new File(basePath + "/.hoodie/20150312101010.commit").createNewFile();
|
||||
// Add some commits
|
||||
new File(basePath + "/.hoodie").mkdirs();
|
||||
new File(basePath + "/.hoodie/20160401010101.commit").createNewFile();
|
||||
new File(basePath + "/.hoodie/20150312101010.commit").createNewFile();
|
||||
|
||||
filesList = index.loadInvolvedFiles(partitions, table);
|
||||
assertEquals(filesList.size(), 4);
|
||||
// these files will not have the key ranges
|
||||
assertNull(filesList.get(0)._2().getMaxRecordKey());
|
||||
assertNull(filesList.get(0)._2().getMinRecordKey());
|
||||
assertFalse(filesList.get(1)._2().hasKeyRanges());
|
||||
assertNotNull(filesList.get(2)._2().getMaxRecordKey());
|
||||
assertNotNull(filesList.get(2)._2().getMinRecordKey());
|
||||
assertTrue(filesList.get(3)._2().hasKeyRanges());
|
||||
filesList = index.loadInvolvedFiles(partitions, table);
|
||||
assertEquals(filesList.size(), 4);
|
||||
// these files will not have the key ranges
|
||||
assertNull(filesList.get(0)._2().getMaxRecordKey());
|
||||
assertNull(filesList.get(0)._2().getMinRecordKey());
|
||||
assertFalse(filesList.get(1)._2().hasKeyRanges());
|
||||
assertNotNull(filesList.get(2)._2().getMaxRecordKey());
|
||||
assertNotNull(filesList.get(2)._2().getMinRecordKey());
|
||||
assertTrue(filesList.get(3)._2().hasKeyRanges());
|
||||
|
||||
// no longer sorted, but should have same files.
|
||||
// no longer sorted, but should have same files.
|
||||
|
||||
List<Tuple2<String, BloomIndexFileInfo>> expected = Arrays.asList(
|
||||
new Tuple2<>("2016/04/01", new BloomIndexFileInfo("2_0_20160401010101.parquet")),
|
||||
new Tuple2<>("2015/03/12",new BloomIndexFileInfo("1_0_20150312101010.parquet")),
|
||||
new Tuple2<>("2015/03/12",new BloomIndexFileInfo("3_0_20150312101010.parquet", "000", "000")),
|
||||
new Tuple2<>("2015/03/12",new BloomIndexFileInfo("4_0_20150312101010.parquet", "001", "003"))
|
||||
);
|
||||
assertEquals(expected, filesList);
|
||||
}
|
||||
List<Tuple2<String, BloomIndexFileInfo>> expected = Arrays.asList(
|
||||
new Tuple2<>("2016/04/01", new BloomIndexFileInfo("2_0_20160401010101.parquet")),
|
||||
new Tuple2<>("2015/03/12", new BloomIndexFileInfo("1_0_20150312101010.parquet")),
|
||||
new Tuple2<>("2015/03/12",
|
||||
new BloomIndexFileInfo("3_0_20150312101010.parquet", "000", "000")),
|
||||
new Tuple2<>("2015/03/12",
|
||||
new BloomIndexFileInfo("4_0_20150312101010.parquet", "001", "003"))
|
||||
);
|
||||
assertEquals(expected, filesList);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRangePruning() {
|
||||
@Test
|
||||
public void testRangePruning() {
|
||||
|
||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
|
||||
.withPath(basePath)
|
||||
.build();
|
||||
HoodieBloomIndex index = new HoodieBloomIndex(config, jsc);
|
||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
|
||||
.withPath(basePath)
|
||||
.build();
|
||||
HoodieBloomIndex index = new HoodieBloomIndex(config, jsc);
|
||||
|
||||
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo = new HashMap<>();
|
||||
partitionToFileIndexInfo.put("2017/10/22", Arrays.asList(
|
||||
new BloomIndexFileInfo("f1"),
|
||||
new BloomIndexFileInfo("f2", "000", "000"),
|
||||
new BloomIndexFileInfo("f3", "001", "003"),
|
||||
new BloomIndexFileInfo("f4", "002", "007"),
|
||||
new BloomIndexFileInfo("f5", "009", "010")
|
||||
));
|
||||
|
||||
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo = new HashMap<>();
|
||||
partitionToFileIndexInfo.put("2017/10/22", Arrays.asList(
|
||||
new BloomIndexFileInfo("f1"),
|
||||
new BloomIndexFileInfo("f2", "000", "000"),
|
||||
new BloomIndexFileInfo("f3", "001", "003"),
|
||||
new BloomIndexFileInfo("f4", "002", "007"),
|
||||
new BloomIndexFileInfo("f5", "009", "010")
|
||||
JavaPairRDD<String, String> partitionRecordKeyPairRDD = jsc
|
||||
.parallelize(Arrays.asList(
|
||||
new Tuple2<>("2017/10/22", "003"),
|
||||
new Tuple2<>("2017/10/22", "002"),
|
||||
new Tuple2<>("2017/10/22", "005"),
|
||||
new Tuple2<>("2017/10/22", "004")
|
||||
))
|
||||
.mapToPair(t -> t);
|
||||
|
||||
List<Tuple2<String, Tuple2<String, HoodieKey>>> comparisonKeyList = index
|
||||
.explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD)
|
||||
.collect();
|
||||
|
||||
assertEquals(10, comparisonKeyList.size());
|
||||
Map<String, List<String>> recordKeyToFileComps = comparisonKeyList.stream()
|
||||
.collect(Collectors.groupingBy(
|
||||
t -> t._2()._2().getRecordKey(),
|
||||
Collectors.mapping(t -> t._2()._1().split("#")[0], Collectors.toList()
|
||||
)
|
||||
));
|
||||
|
||||
JavaPairRDD<String, String> partitionRecordKeyPairRDD = jsc
|
||||
.parallelize(Arrays.asList(
|
||||
new Tuple2<>("2017/10/22","003"),
|
||||
new Tuple2<>("2017/10/22","002"),
|
||||
new Tuple2<>("2017/10/22","005"),
|
||||
new Tuple2<>("2017/10/22","004")
|
||||
))
|
||||
.mapToPair(t -> t);
|
||||
assertEquals(4, recordKeyToFileComps.size());
|
||||
assertEquals(Arrays.asList("f1", "f3", "f4"), recordKeyToFileComps.get("002"));
|
||||
assertEquals(Arrays.asList("f1", "f3", "f4"), recordKeyToFileComps.get("003"));
|
||||
assertEquals(Arrays.asList("f1", "f4"), recordKeyToFileComps.get("004"));
|
||||
assertEquals(Arrays.asList("f1", "f4"), recordKeyToFileComps.get("005"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCheckUUIDsAgainstOneFile()
|
||||
throws IOException, InterruptedException, ClassNotFoundException {
|
||||
|
||||
// Create some records to use
|
||||
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
||||
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
|
||||
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
||||
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":32}";
|
||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||
HoodieRecord record1 = new HoodieRecord(
|
||||
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||
HoodieRecord record2 = new HoodieRecord(
|
||||
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
||||
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
||||
HoodieRecord record3 = new HoodieRecord(
|
||||
new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
|
||||
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
||||
HoodieRecord record4 = new HoodieRecord(
|
||||
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
||||
|
||||
// We write record1, record2 to a parquet file, but the bloom filter contains (record1, record2, record3).
|
||||
BloomFilter filter = new BloomFilter(10000, 0.0000001);
|
||||
filter.add(record3.getRecordKey());
|
||||
String filename = writeParquetFile("2016/01/31", Arrays.asList(record1, record2), schema,
|
||||
filter, true);
|
||||
|
||||
// The bloom filter contains 3 records
|
||||
assertTrue(filter.mightContain(record1.getRecordKey()));
|
||||
assertTrue(filter.mightContain(record2.getRecordKey()));
|
||||
assertTrue(filter.mightContain(record3.getRecordKey()));
|
||||
assertFalse(filter.mightContain(record4.getRecordKey()));
|
||||
|
||||
// Compare with file
|
||||
List<String> uuids = Arrays.asList(record1.getRecordKey(), record2.getRecordKey(),
|
||||
record3.getRecordKey(), record4.getRecordKey());
|
||||
|
||||
List<String> results = HoodieBloomIndexCheckFunction.checkCandidatesAgainstFile(uuids,
|
||||
new Path(basePath + "/2016/01/31/" + filename));
|
||||
assertEquals(results.size(), 2);
|
||||
assertTrue(results.get(0).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")
|
||||
|| results.get(1).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0"));
|
||||
assertTrue(results.get(0).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")
|
||||
|| results.get(1).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0"));
|
||||
// TODO(vc): Need more coverage on actual filenames
|
||||
//assertTrue(results.get(0)._2().equals(filename));
|
||||
//assertTrue(results.get(1)._2().equals(filename));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTagLocationWithEmptyRDD() throws Exception {
|
||||
// We have some records to be tagged (two different partitions)
|
||||
JavaRDD<HoodieRecord> recordRDD = jsc.emptyRDD();
|
||||
// Also create the metadata and config
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metadata, config);
|
||||
|
||||
// Let's tag
|
||||
HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc);
|
||||
|
||||
try {
|
||||
bloomIndex.tagLocation(recordRDD, table);
|
||||
} catch (IllegalArgumentException e) {
|
||||
fail(
|
||||
"EmptyRDD should not result in IllegalArgumentException: Positive number of slices required");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
List<Tuple2<String, Tuple2<String, HoodieKey>>> comparisonKeyList = index
|
||||
.explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD)
|
||||
.collect();
|
||||
@Test
|
||||
public void testTagLocation() throws Exception {
|
||||
// We have some records to be tagged (two different partitions)
|
||||
|
||||
assertEquals(10, comparisonKeyList.size());
|
||||
Map<String, List<String>> recordKeyToFileComps = comparisonKeyList.stream()
|
||||
.collect(Collectors.groupingBy(
|
||||
t -> t._2()._2().getRecordKey(),
|
||||
Collectors.mapping(t -> t._2()._1().split("#")[0], Collectors.toList()
|
||||
)
|
||||
));
|
||||
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
||||
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
|
||||
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
||||
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
|
||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||
HoodieRecord record1 = new HoodieRecord(
|
||||
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||
HoodieRecord record2 = new HoodieRecord(
|
||||
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
||||
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
||||
HoodieRecord record3 = new HoodieRecord(
|
||||
new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
|
||||
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
||||
HoodieRecord record4 = new HoodieRecord(
|
||||
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
||||
JavaRDD<HoodieRecord> recordRDD = jsc
|
||||
.parallelize(Arrays.asList(record1, record2, record3, record4));
|
||||
|
||||
assertEquals(4, recordKeyToFileComps.size());
|
||||
assertEquals(Arrays.asList("f1", "f3", "f4"), recordKeyToFileComps.get("002"));
|
||||
assertEquals(Arrays.asList("f1", "f3", "f4"), recordKeyToFileComps.get("003"));
|
||||
assertEquals(Arrays.asList("f1", "f4"), recordKeyToFileComps.get("004"));
|
||||
assertEquals(Arrays.asList("f1", "f4"), recordKeyToFileComps.get("005"));
|
||||
// Also create the metadata and config
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metadata, config);
|
||||
|
||||
// Let's tag
|
||||
HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc);
|
||||
JavaRDD<HoodieRecord> taggedRecordRDD = bloomIndex.tagLocation(recordRDD, table);
|
||||
|
||||
// Should not find any files
|
||||
for (HoodieRecord record : taggedRecordRDD.collect()) {
|
||||
assertTrue(!record.isCurrentLocationKnown());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCheckUUIDsAgainstOneFile() throws IOException, InterruptedException, ClassNotFoundException {
|
||||
// We create three parquet file, each having one record. (two different partitions)
|
||||
String filename1 = writeParquetFile("2016/01/31", Arrays.asList(record1), schema, null, true);
|
||||
String filename2 = writeParquetFile("2016/01/31", Arrays.asList(record2), schema, null, true);
|
||||
String filename3 = writeParquetFile("2015/01/31", Arrays.asList(record4), schema, null, true);
|
||||
|
||||
// Create some records to use
|
||||
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
||||
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
|
||||
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
||||
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":32}";
|
||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
||||
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
||||
HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
|
||||
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
||||
HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
||||
// We do the tag again
|
||||
metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
table = HoodieTable.getHoodieTable(metadata, config);
|
||||
|
||||
taggedRecordRDD = bloomIndex.tagLocation(recordRDD, table);
|
||||
|
||||
// We write record1, record2 to a parquet file, but the bloom filter contains (record1, record2, record3).
|
||||
BloomFilter filter = new BloomFilter(10000, 0.0000001);
|
||||
filter.add(record3.getRecordKey());
|
||||
String filename = writeParquetFile("2016/01/31", Arrays.asList(record1, record2), schema, filter, true);
|
||||
// Check results
|
||||
for (HoodieRecord record : taggedRecordRDD.collect()) {
|
||||
if (record.getRecordKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) {
|
||||
assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename1)));
|
||||
} else if (record.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) {
|
||||
assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename2)));
|
||||
} else if (record.getRecordKey().equals("3eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) {
|
||||
assertTrue(!record.isCurrentLocationKnown());
|
||||
} else if (record.getRecordKey().equals("4eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) {
|
||||
assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename3)));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// The bloom filter contains 3 records
|
||||
assertTrue(filter.mightContain(record1.getRecordKey()));
|
||||
assertTrue(filter.mightContain(record2.getRecordKey()));
|
||||
assertTrue(filter.mightContain(record3.getRecordKey()));
|
||||
assertFalse(filter.mightContain(record4.getRecordKey()));
|
||||
@Test
|
||||
public void testCheckExists() throws Exception {
|
||||
// We have some records to be tagged (two different partitions)
|
||||
|
||||
// Compare with file
|
||||
List<String> uuids = Arrays.asList(record1.getRecordKey(), record2.getRecordKey(),
|
||||
record3.getRecordKey(), record4.getRecordKey());
|
||||
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
||||
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
|
||||
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
||||
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
|
||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||
HoodieKey key1 = new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath());
|
||||
HoodieRecord record1 = new HoodieRecord(key1, rowChange1);
|
||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||
HoodieKey key2 = new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath());
|
||||
HoodieRecord record2 = new HoodieRecord(key2, rowChange2);
|
||||
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
||||
HoodieKey key3 = new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath());
|
||||
HoodieRecord record3 = new HoodieRecord(key3, rowChange3);
|
||||
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
||||
HoodieKey key4 = new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath());
|
||||
HoodieRecord record4 = new HoodieRecord(key4, rowChange4);
|
||||
JavaRDD<HoodieKey> keysRDD = jsc.parallelize(Arrays.asList(key1, key2, key3, key4));
|
||||
|
||||
List<String> results = HoodieBloomIndexCheckFunction.checkCandidatesAgainstFile(uuids,
|
||||
new Path(basePath + "/2016/01/31/" + filename));
|
||||
assertEquals(results.size(), 2);
|
||||
assertTrue(results.get(0).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")
|
||||
|| results.get(1).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0"));
|
||||
assertTrue(results.get(0).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")
|
||||
|| results.get(1).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0"));
|
||||
// TODO(vc): Need more coverage on actual filenames
|
||||
//assertTrue(results.get(0)._2().equals(filename));
|
||||
//assertTrue(results.get(1)._2().equals(filename));
|
||||
// Also create the metadata and config
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metadata, config);
|
||||
|
||||
// Let's tag
|
||||
HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc);
|
||||
JavaPairRDD<HoodieKey, Optional<String>> taggedRecordRDD = bloomIndex
|
||||
.fetchRecordLocation(keysRDD, table);
|
||||
|
||||
// Should not find any files
|
||||
for (Tuple2<HoodieKey, Optional<String>> record : taggedRecordRDD.collect()) {
|
||||
assertTrue(!record._2.isPresent());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testTagLocationWithEmptyRDD() throws Exception {
|
||||
// We have some records to be tagged (two different partitions)
|
||||
JavaRDD<HoodieRecord> recordRDD = jsc.emptyRDD();
|
||||
// Also create the metadata and config
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metadata, config);
|
||||
// We create three parquet file, each having one record. (two different partitions)
|
||||
String filename1 = writeParquetFile("2016/01/31", Arrays.asList(record1), schema, null, true);
|
||||
String filename2 = writeParquetFile("2016/01/31", Arrays.asList(record2), schema, null, true);
|
||||
String filename3 = writeParquetFile("2015/01/31", Arrays.asList(record4), schema, null, true);
|
||||
|
||||
// Let's tag
|
||||
HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc);
|
||||
// We do the tag again
|
||||
metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
table = HoodieTable.getHoodieTable(metadata, config);
|
||||
taggedRecordRDD = bloomIndex.fetchRecordLocation(keysRDD, table);
|
||||
|
||||
try {
|
||||
bloomIndex.tagLocation(recordRDD, table);
|
||||
} catch (IllegalArgumentException e) {
|
||||
fail("EmptyRDD should not result in IllegalArgumentException: Positive number of slices required");
|
||||
}
|
||||
// Check results
|
||||
for (Tuple2<HoodieKey, Optional<String>> record : taggedRecordRDD.collect()) {
|
||||
if (record._1.getRecordKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) {
|
||||
assertTrue(record._2.isPresent());
|
||||
Path path1 = new Path(record._2.get());
|
||||
assertEquals(FSUtils.getFileId(filename1), FSUtils.getFileId(path1.getName()));
|
||||
} else if (record._1.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) {
|
||||
assertTrue(record._2.isPresent());
|
||||
Path path2 = new Path(record._2.get());
|
||||
assertEquals(FSUtils.getFileId(filename2), FSUtils.getFileId(path2.getName()));
|
||||
} else if (record._1.getRecordKey().equals("3eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) {
|
||||
assertTrue(!record._2.isPresent());
|
||||
} else if (record._1.getRecordKey().equals("4eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) {
|
||||
assertTrue(record._2.isPresent());
|
||||
Path path3 = new Path(record._2.get());
|
||||
assertEquals(FSUtils.getFileId(filename3), FSUtils.getFileId(path3.getName()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testTagLocation() throws Exception {
|
||||
// We have some records to be tagged (two different partitions)
|
||||
@Test
|
||||
public void testBloomFilterFalseError() throws IOException, InterruptedException {
|
||||
// We have two hoodie records
|
||||
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
||||
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
|
||||
|
||||
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
||||
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
|
||||
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
||||
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
|
||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
||||
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
||||
HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
|
||||
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
||||
HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
||||
JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4));
|
||||
// We write record1 to a parquet file, using a bloom filter having both records
|
||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||
HoodieRecord record1 = new HoodieRecord(
|
||||
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||
HoodieRecord record2 = new HoodieRecord(
|
||||
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
||||
|
||||
// Also create the metadata and config
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metadata, config);
|
||||
BloomFilter filter = new BloomFilter(10000, 0.0000001);
|
||||
filter.add(record2.getRecordKey());
|
||||
String filename = writeParquetFile("2016/01/31", Arrays.asList(record1), schema, filter, true);
|
||||
assertTrue(filter.mightContain(record1.getRecordKey()));
|
||||
assertTrue(filter.mightContain(record2.getRecordKey()));
|
||||
|
||||
// Let's tag
|
||||
HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc);
|
||||
JavaRDD<HoodieRecord> taggedRecordRDD = bloomIndex.tagLocation(recordRDD, table);
|
||||
// We do the tag
|
||||
JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2));
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metadata, config);
|
||||
|
||||
// Should not find any files
|
||||
for (HoodieRecord record : taggedRecordRDD.collect()) {
|
||||
assertTrue(!record.isCurrentLocationKnown());
|
||||
}
|
||||
HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc);
|
||||
JavaRDD<HoodieRecord> taggedRecordRDD = bloomIndex.tagLocation(recordRDD, table);
|
||||
|
||||
// We create three parquet file, each having one record. (two different partitions)
|
||||
String filename1 = writeParquetFile("2016/01/31", Arrays.asList(record1), schema, null, true);
|
||||
String filename2 = writeParquetFile("2016/01/31", Arrays.asList(record2), schema, null, true);
|
||||
String filename3 = writeParquetFile("2015/01/31", Arrays.asList(record4), schema, null, true);
|
||||
|
||||
// We do the tag again
|
||||
metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
table = HoodieTable.getHoodieTable(metadata, config);
|
||||
|
||||
taggedRecordRDD = bloomIndex.tagLocation(recordRDD, table);
|
||||
|
||||
// Check results
|
||||
for (HoodieRecord record : taggedRecordRDD.collect()) {
|
||||
if (record.getRecordKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) {
|
||||
assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename1)));
|
||||
} else if (record.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) {
|
||||
assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename2)));
|
||||
} else if (record.getRecordKey().equals("3eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) {
|
||||
assertTrue(!record.isCurrentLocationKnown());
|
||||
} else if (record.getRecordKey().equals("4eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) {
|
||||
assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename3)));
|
||||
}
|
||||
}
|
||||
// Check results
|
||||
for (HoodieRecord record : taggedRecordRDD.collect()) {
|
||||
if (record.getKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) {
|
||||
assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename)));
|
||||
} else if (record.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) {
|
||||
assertFalse(record.isCurrentLocationKnown());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCheckExists() throws Exception {
|
||||
// We have some records to be tagged (two different partitions)
|
||||
private String writeParquetFile(String partitionPath, List<HoodieRecord> records, Schema schema,
|
||||
BloomFilter filter, boolean createCommitTime) throws IOException, InterruptedException {
|
||||
Thread.sleep(1000);
|
||||
String commitTime = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date());
|
||||
String fileId = UUID.randomUUID().toString();
|
||||
String filename = FSUtils.makeDataFileName(commitTime, 1, fileId);
|
||||
|
||||
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
||||
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
|
||||
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
||||
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
|
||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||
HoodieKey key1 = new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath());
|
||||
HoodieRecord record1 = new HoodieRecord(key1, rowChange1);
|
||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||
HoodieKey key2 = new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath());
|
||||
HoodieRecord record2 = new HoodieRecord(key2, rowChange2);
|
||||
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
||||
HoodieKey key3 = new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath());
|
||||
HoodieRecord record3 = new HoodieRecord(key3, rowChange3);
|
||||
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
||||
HoodieKey key4 = new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath());
|
||||
HoodieRecord record4 = new HoodieRecord(key4, rowChange4);
|
||||
JavaRDD<HoodieKey> keysRDD = jsc.parallelize(Arrays.asList(key1, key2, key3, key4));
|
||||
return writeParquetFile(partitionPath, filename, records, schema, filter, createCommitTime);
|
||||
}
|
||||
|
||||
// Also create the metadata and config
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metadata, config);
|
||||
private String writeParquetFile(String partitionPath, String filename, List<HoodieRecord> records,
|
||||
Schema schema,
|
||||
BloomFilter filter, boolean createCommitTime) throws IOException {
|
||||
|
||||
// Let's tag
|
||||
HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc);
|
||||
JavaPairRDD<HoodieKey, Optional<String>> taggedRecordRDD = bloomIndex.fetchRecordLocation(keysRDD, table);
|
||||
|
||||
// Should not find any files
|
||||
for (Tuple2<HoodieKey, Optional<String>> record : taggedRecordRDD.collect()) {
|
||||
assertTrue(!record._2.isPresent());
|
||||
}
|
||||
|
||||
// We create three parquet file, each having one record. (two different partitions)
|
||||
String filename1 = writeParquetFile("2016/01/31", Arrays.asList(record1), schema, null, true);
|
||||
String filename2 = writeParquetFile("2016/01/31", Arrays.asList(record2), schema, null, true);
|
||||
String filename3 = writeParquetFile("2015/01/31", Arrays.asList(record4), schema, null, true);
|
||||
|
||||
// We do the tag again
|
||||
metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
table = HoodieTable.getHoodieTable(metadata, config);
|
||||
taggedRecordRDD = bloomIndex.fetchRecordLocation(keysRDD, table);
|
||||
|
||||
// Check results
|
||||
for (Tuple2<HoodieKey, Optional<String>> record : taggedRecordRDD.collect()) {
|
||||
if (record._1.getRecordKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) {
|
||||
assertTrue(record._2.isPresent());
|
||||
Path path1 = new Path(record._2.get());
|
||||
assertEquals(FSUtils.getFileId(filename1), FSUtils.getFileId(path1.getName()));
|
||||
} else if (record._1.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) {
|
||||
assertTrue(record._2.isPresent());
|
||||
Path path2 = new Path(record._2.get());
|
||||
assertEquals(FSUtils.getFileId(filename2), FSUtils.getFileId(path2.getName()));
|
||||
} else if (record._1.getRecordKey().equals("3eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) {
|
||||
assertTrue(!record._2.isPresent());
|
||||
} else if (record._1.getRecordKey().equals("4eb5b87c-1fej-4edd-87b4-6ec96dc405a0")) {
|
||||
assertTrue(record._2.isPresent());
|
||||
Path path3 = new Path(record._2.get());
|
||||
assertEquals(FSUtils.getFileId(filename3), FSUtils.getFileId(path3.getName()));
|
||||
}
|
||||
}
|
||||
if (filter == null) {
|
||||
filter = new BloomFilter(10000, 0.0000001);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testBloomFilterFalseError() throws IOException, InterruptedException {
|
||||
// We have two hoodie records
|
||||
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
||||
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
|
||||
|
||||
// We write record1 to a parquet file, using a bloom filter having both records
|
||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
|
||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
|
||||
|
||||
BloomFilter filter = new BloomFilter(10000, 0.0000001);
|
||||
filter.add(record2.getRecordKey());
|
||||
String filename = writeParquetFile("2016/01/31", Arrays.asList(record1), schema, filter, true);
|
||||
assertTrue(filter.mightContain(record1.getRecordKey()));
|
||||
assertTrue(filter.mightContain(record2.getRecordKey()));
|
||||
|
||||
// We do the tag
|
||||
JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2));
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metadata, config);
|
||||
|
||||
HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc);
|
||||
JavaRDD<HoodieRecord> taggedRecordRDD = bloomIndex.tagLocation(recordRDD, table);
|
||||
|
||||
// Check results
|
||||
for (HoodieRecord record : taggedRecordRDD.collect()) {
|
||||
if (record.getKey().equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")) {
|
||||
assertTrue(record.getCurrentLocation().getFileId().equals(FSUtils.getFileId(filename)));
|
||||
} else if (record.getRecordKey().equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")) {
|
||||
assertFalse(record.isCurrentLocationKnown());
|
||||
}
|
||||
}
|
||||
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(
|
||||
new AvroSchemaConverter().convert(schema), schema, filter);
|
||||
String commitTime = FSUtils.getCommitTime(filename);
|
||||
HoodieParquetConfig config = new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP,
|
||||
ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024,
|
||||
new Configuration());
|
||||
HoodieParquetWriter writer = new HoodieParquetWriter(
|
||||
commitTime,
|
||||
new Path(basePath + "/" + partitionPath + "/" + filename),
|
||||
config,
|
||||
schema);
|
||||
int seqId = 1;
|
||||
for (HoodieRecord record : records) {
|
||||
GenericRecord avroRecord = (GenericRecord) record.getData().getInsertValue(schema).get();
|
||||
HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, commitTime, "" + seqId++);
|
||||
HoodieAvroUtils
|
||||
.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(),
|
||||
filename);
|
||||
writer.writeAvro(record.getRecordKey(), avroRecord);
|
||||
filter.add(record.getRecordKey());
|
||||
}
|
||||
writer.close();
|
||||
|
||||
private String writeParquetFile(String partitionPath, List<HoodieRecord> records, Schema schema,
|
||||
BloomFilter filter, boolean createCommitTime) throws IOException, InterruptedException {
|
||||
Thread.sleep(1000);
|
||||
String commitTime = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date());
|
||||
String fileId = UUID.randomUUID().toString();
|
||||
String filename = FSUtils.makeDataFileName(commitTime, 1, fileId);
|
||||
|
||||
|
||||
return writeParquetFile(partitionPath, filename, records, schema, filter, createCommitTime);
|
||||
if (createCommitTime) {
|
||||
// Also make sure the commit is valid
|
||||
new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME).mkdirs();
|
||||
new File(
|
||||
basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitTime + ".commit")
|
||||
.createNewFile();
|
||||
}
|
||||
return filename;
|
||||
}
|
||||
|
||||
private String writeParquetFile(String partitionPath, String filename, List<HoodieRecord> records, Schema schema,
|
||||
BloomFilter filter, boolean createCommitTime) throws IOException {
|
||||
|
||||
|
||||
if (filter == null) {
|
||||
filter = new BloomFilter(10000, 0.0000001);
|
||||
}
|
||||
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter);
|
||||
String commitTime = FSUtils.getCommitTime(filename);
|
||||
HoodieParquetConfig config = new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP,
|
||||
ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024, new Configuration());
|
||||
HoodieParquetWriter writer = new HoodieParquetWriter(
|
||||
commitTime,
|
||||
new Path(basePath + "/" + partitionPath + "/" + filename),
|
||||
config,
|
||||
schema);
|
||||
int seqId = 1;
|
||||
for (HoodieRecord record : records) {
|
||||
GenericRecord avroRecord = (GenericRecord) record.getData().getInsertValue(schema).get();
|
||||
HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, commitTime, "" + seqId++);
|
||||
HoodieAvroUtils.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(), filename);
|
||||
writer.writeAvro(record.getRecordKey(), avroRecord);
|
||||
filter.add(record.getRecordKey());
|
||||
}
|
||||
writer.close();
|
||||
|
||||
if (createCommitTime) {
|
||||
// Also make sure the commit is valid
|
||||
new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME).mkdirs();
|
||||
new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitTime + ".commit").createNewFile();
|
||||
}
|
||||
return filename;
|
||||
@After
|
||||
public void clean() {
|
||||
if (jsc != null) {
|
||||
jsc.stop();
|
||||
}
|
||||
|
||||
@After
|
||||
public void clean() {
|
||||
if (jsc != null) {
|
||||
jsc.stop();
|
||||
}
|
||||
if (basePath != null) {
|
||||
new File(basePath).delete();
|
||||
}
|
||||
if (basePath != null) {
|
||||
new File(basePath).delete();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,9 +16,11 @@
|
||||
|
||||
package com.uber.hoodie.io;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import com.uber.hoodie.avro.model.HoodieArchivedMetaEntry;
|
||||
import com.uber.hoodie.common.HoodieTestDataGenerator;
|
||||
import com.uber.hoodie.common.model.HoodieArchivedLogFile;
|
||||
import com.uber.hoodie.common.model.HoodieLogFile;
|
||||
import com.uber.hoodie.common.model.HoodieTestUtils;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
@@ -29,6 +31,11 @@ import com.uber.hoodie.common.table.timeline.HoodieInstant;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import com.uber.hoodie.config.HoodieCompactionConfig;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.avro.generic.IndexedRecord;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
@@ -37,197 +44,196 @@ import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import org.junit.rules.TemporaryFolder;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
public class TestHoodieCommitArchiveLog {
|
||||
private String basePath;
|
||||
private FileSystem fs;
|
||||
|
||||
@Before
|
||||
public void init() throws Exception {
|
||||
TemporaryFolder folder = new TemporaryFolder();
|
||||
folder.create();
|
||||
basePath = folder.getRoot().getAbsolutePath();
|
||||
HoodieTestUtils.init(basePath);
|
||||
fs = FSUtils.getFs();
|
||||
private String basePath;
|
||||
private FileSystem fs;
|
||||
|
||||
@Before
|
||||
public void init() throws Exception {
|
||||
TemporaryFolder folder = new TemporaryFolder();
|
||||
folder.create();
|
||||
basePath = folder.getRoot().getAbsolutePath();
|
||||
HoodieTestUtils.init(basePath);
|
||||
fs = FSUtils.getFs();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testArchiveEmptyDataset() throws IOException {
|
||||
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath)
|
||||
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
||||
.forTable("test-trip-table").build();
|
||||
HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, fs);
|
||||
boolean result = archiveLog.archiveIfRequired();
|
||||
assertTrue(result);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testArchiveDatasetWithArchival() throws IOException {
|
||||
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath)
|
||||
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
||||
.withCompactionConfig(HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 4).build())
|
||||
.forTable("test-trip-table").build();
|
||||
HoodieTestUtils.init(basePath);
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "100");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "101");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "102");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "103");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "104");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "105");
|
||||
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieTimeline timeline =
|
||||
metadata.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants();
|
||||
|
||||
assertEquals("Loaded 6 commits and the count should match", 6, timeline.countInstants());
|
||||
|
||||
HoodieTestUtils.createCleanFiles(basePath, "100");
|
||||
HoodieTestUtils.createCleanFiles(basePath, "101");
|
||||
HoodieTestUtils.createCleanFiles(basePath, "102");
|
||||
HoodieTestUtils.createCleanFiles(basePath, "103");
|
||||
HoodieTestUtils.createCleanFiles(basePath, "104");
|
||||
HoodieTestUtils.createCleanFiles(basePath, "105");
|
||||
|
||||
//reload the timeline and get all the commmits before archive
|
||||
timeline = metadata.getActiveTimeline().reload().getAllCommitsTimeline()
|
||||
.filterCompletedInstants();
|
||||
List<HoodieInstant> originalCommits = timeline.getInstants().collect(Collectors.toList());
|
||||
|
||||
assertEquals("Loaded 6 commits and the count should match", 12, timeline.countInstants());
|
||||
|
||||
HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, fs);
|
||||
|
||||
assertTrue(archiveLog.archiveIfRequired());
|
||||
|
||||
//reload the timeline and remove the remaining commits
|
||||
timeline = metadata.getActiveTimeline().reload().getAllCommitsTimeline()
|
||||
.filterCompletedInstants();
|
||||
originalCommits.removeAll(timeline.getInstants().collect(Collectors.toList()));
|
||||
|
||||
//read the file
|
||||
HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(),
|
||||
new HoodieLogFile(new Path(basePath + "/.hoodie/.commits_.archive.1")),
|
||||
HoodieArchivedMetaEntry.getClassSchema(), false);
|
||||
|
||||
int archivedRecordsCount = 0;
|
||||
List<IndexedRecord> readRecords = new ArrayList<>();
|
||||
//read the avro blocks and validate the number of records written in each avro block
|
||||
while (reader.hasNext()) {
|
||||
HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next();
|
||||
List<IndexedRecord> records = blk.getRecords();
|
||||
readRecords.addAll(records);
|
||||
assertEquals("Archived and read records for each block are same", 8, records.size());
|
||||
archivedRecordsCount += records.size();
|
||||
}
|
||||
assertEquals("Total archived records and total read records are the same count", 8,
|
||||
archivedRecordsCount);
|
||||
|
||||
@Test
|
||||
public void testArchiveEmptyDataset() throws IOException {
|
||||
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath)
|
||||
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
||||
.forTable("test-trip-table").build();
|
||||
HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, fs);
|
||||
boolean result = archiveLog.archiveIfRequired();
|
||||
assertTrue(result);
|
||||
}
|
||||
//make sure the archived commits are the same as the (originalcommits - commitsleft)
|
||||
List<String> readCommits = readRecords.stream().map(r -> (GenericRecord) r).map(r -> {
|
||||
return r.get("commitTime").toString();
|
||||
}).collect(Collectors.toList());
|
||||
Collections.sort(readCommits);
|
||||
|
||||
@Test
|
||||
public void testArchiveDatasetWithArchival() throws IOException {
|
||||
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath)
|
||||
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
||||
.withCompactionConfig(HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 4).build())
|
||||
.forTable("test-trip-table").build();
|
||||
HoodieTestUtils.init(basePath);
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "100");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "101");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "102");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "103");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "104");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "105");
|
||||
assertEquals(
|
||||
"Read commits map should match the originalCommits - commitsLoadedFromArchival",
|
||||
originalCommits.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList()),
|
||||
readCommits);
|
||||
}
|
||||
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieTimeline timeline =
|
||||
metadata.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants();
|
||||
@Test
|
||||
public void testArchiveDatasetWithNoArchival() throws IOException {
|
||||
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath)
|
||||
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
||||
.forTable("test-trip-table").withCompactionConfig(
|
||||
HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 5).build()).build();
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, fs);
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "100");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "101");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "102");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "103");
|
||||
|
||||
assertEquals("Loaded 6 commits and the count should match", 6, timeline.countInstants());
|
||||
HoodieTimeline timeline =
|
||||
metadata.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants();
|
||||
|
||||
HoodieTestUtils.createCleanFiles(basePath, "100");
|
||||
HoodieTestUtils.createCleanFiles(basePath, "101");
|
||||
HoodieTestUtils.createCleanFiles(basePath, "102");
|
||||
HoodieTestUtils.createCleanFiles(basePath, "103");
|
||||
HoodieTestUtils.createCleanFiles(basePath, "104");
|
||||
HoodieTestUtils.createCleanFiles(basePath, "105");
|
||||
assertEquals("Loaded 4 commits and the count should match", 4, timeline.countInstants());
|
||||
boolean result = archiveLog.archiveIfRequired();
|
||||
assertTrue(result);
|
||||
timeline =
|
||||
metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline()
|
||||
.filterCompletedInstants();
|
||||
assertEquals("Should not archive commits when maxCommitsToKeep is 5", 4,
|
||||
timeline.countInstants());
|
||||
}
|
||||
|
||||
//reload the timeline and get all the commmits before archive
|
||||
timeline = metadata.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants();
|
||||
List<HoodieInstant> originalCommits = timeline.getInstants().collect(Collectors.toList());
|
||||
@Test
|
||||
public void testArchiveCommitSafety() throws IOException {
|
||||
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath)
|
||||
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
||||
.forTable("test-trip-table").withCompactionConfig(
|
||||
HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 5).build()).build();
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, fs);
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "100");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "101");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "102");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "103");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "104");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "105");
|
||||
|
||||
assertEquals("Loaded 6 commits and the count should match", 12, timeline.countInstants());
|
||||
HoodieTimeline timeline =
|
||||
metadata.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants();
|
||||
assertEquals("Loaded 6 commits and the count should match", 6, timeline.countInstants());
|
||||
boolean result = archiveLog.archiveIfRequired();
|
||||
assertTrue(result);
|
||||
timeline =
|
||||
metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline()
|
||||
.filterCompletedInstants();
|
||||
assertTrue("Archived commits should always be safe",
|
||||
timeline.containsOrBeforeTimelineStarts("100"));
|
||||
assertTrue("Archived commits should always be safe",
|
||||
timeline.containsOrBeforeTimelineStarts("101"));
|
||||
assertTrue("Archived commits should always be safe",
|
||||
timeline.containsOrBeforeTimelineStarts("102"));
|
||||
assertTrue("Archived commits should always be safe",
|
||||
timeline.containsOrBeforeTimelineStarts("103"));
|
||||
}
|
||||
|
||||
HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, fs);
|
||||
@Test
|
||||
public void testArchiveCommitSavepointNoHole() throws IOException {
|
||||
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath)
|
||||
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
||||
.forTable("test-trip-table").withCompactionConfig(
|
||||
HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 5).build()).build();
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, fs);
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "100");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "101");
|
||||
HoodieTestDataGenerator.createSavepointFile(basePath, "101");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "102");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "103");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "104");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "105");
|
||||
|
||||
assertTrue(archiveLog.archiveIfRequired());
|
||||
|
||||
//reload the timeline and remove the remaining commits
|
||||
timeline = metadata.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants();
|
||||
originalCommits.removeAll(timeline.getInstants().collect(Collectors.toList()));
|
||||
|
||||
//read the file
|
||||
HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(),
|
||||
new HoodieLogFile(new Path(basePath + "/.hoodie/.commits_.archive.1")), HoodieArchivedMetaEntry.getClassSchema(), false);
|
||||
|
||||
int archivedRecordsCount = 0;
|
||||
List<IndexedRecord> readRecords = new ArrayList<>();
|
||||
//read the avro blocks and validate the number of records written in each avro block
|
||||
while(reader.hasNext()) {
|
||||
HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next();
|
||||
List<IndexedRecord> records = blk.getRecords();
|
||||
readRecords.addAll(records);
|
||||
assertEquals("Archived and read records for each block are same", 8, records.size());
|
||||
archivedRecordsCount += records.size();
|
||||
}
|
||||
assertEquals("Total archived records and total read records are the same count", 8, archivedRecordsCount);
|
||||
|
||||
//make sure the archived commits are the same as the (originalcommits - commitsleft)
|
||||
List<String> readCommits = readRecords.stream().map(r -> (GenericRecord)r).map(r -> {
|
||||
return r.get("commitTime").toString();
|
||||
}).collect(Collectors.toList());
|
||||
Collections.sort(readCommits);
|
||||
|
||||
assertEquals(
|
||||
"Read commits map should match the originalCommits - commitsLoadedFromArchival",
|
||||
originalCommits.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList()),
|
||||
readCommits);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testArchiveDatasetWithNoArchival() throws IOException {
|
||||
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath)
|
||||
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
||||
.forTable("test-trip-table").withCompactionConfig(
|
||||
HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 5).build()).build();
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, fs);
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "100");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "101");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "102");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "103");
|
||||
|
||||
HoodieTimeline timeline =
|
||||
metadata.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants();
|
||||
|
||||
assertEquals("Loaded 4 commits and the count should match", 4, timeline.countInstants());
|
||||
boolean result = archiveLog.archiveIfRequired();
|
||||
assertTrue(result);
|
||||
timeline =
|
||||
metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline().filterCompletedInstants();
|
||||
assertEquals("Should not archive commits when maxCommitsToKeep is 5", 4,
|
||||
timeline.countInstants());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testArchiveCommitSafety() throws IOException {
|
||||
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath)
|
||||
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
||||
.forTable("test-trip-table").withCompactionConfig(
|
||||
HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 5).build()).build();
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, fs);
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "100");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "101");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "102");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "103");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "104");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "105");
|
||||
|
||||
HoodieTimeline timeline =
|
||||
metadata.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants();
|
||||
assertEquals("Loaded 6 commits and the count should match", 6, timeline.countInstants());
|
||||
boolean result = archiveLog.archiveIfRequired();
|
||||
assertTrue(result);
|
||||
timeline =
|
||||
metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline().filterCompletedInstants();
|
||||
assertTrue("Archived commits should always be safe",
|
||||
timeline.containsOrBeforeTimelineStarts("100"));
|
||||
assertTrue("Archived commits should always be safe",
|
||||
timeline.containsOrBeforeTimelineStarts("101"));
|
||||
assertTrue("Archived commits should always be safe",
|
||||
timeline.containsOrBeforeTimelineStarts("102"));
|
||||
assertTrue("Archived commits should always be safe",
|
||||
timeline.containsOrBeforeTimelineStarts("103"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testArchiveCommitSavepointNoHole() throws IOException {
|
||||
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath)
|
||||
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
||||
.forTable("test-trip-table").withCompactionConfig(
|
||||
HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 5).build()).build();
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieCommitArchiveLog archiveLog = new HoodieCommitArchiveLog(cfg, fs);
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "100");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "101");
|
||||
HoodieTestDataGenerator.createSavepointFile(basePath, "101");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "102");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "103");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "104");
|
||||
HoodieTestDataGenerator.createCommitFile(basePath, "105");
|
||||
|
||||
HoodieTimeline timeline =
|
||||
metadata.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants();
|
||||
assertEquals("Loaded 6 commits and the count should match", 6, timeline.countInstants());
|
||||
boolean result = archiveLog.archiveIfRequired();
|
||||
assertTrue(result);
|
||||
timeline =
|
||||
metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline().filterCompletedInstants();
|
||||
assertEquals(
|
||||
"Since we have a savepoint at 101, we should never archive any commit after 101 (we only archive 100)",
|
||||
5, timeline.countInstants());
|
||||
assertTrue("Archived commits should always be safe",
|
||||
timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "101")));
|
||||
assertTrue("Archived commits should always be safe",
|
||||
timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "102")));
|
||||
assertTrue("Archived commits should always be safe",
|
||||
timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "103")));
|
||||
}
|
||||
HoodieTimeline timeline =
|
||||
metadata.getActiveTimeline().getCommitsAndCompactionsTimeline().filterCompletedInstants();
|
||||
assertEquals("Loaded 6 commits and the count should match", 6, timeline.countInstants());
|
||||
boolean result = archiveLog.archiveIfRequired();
|
||||
assertTrue(result);
|
||||
timeline =
|
||||
metadata.getActiveTimeline().reload().getCommitsAndCompactionsTimeline()
|
||||
.filterCompletedInstants();
|
||||
assertEquals(
|
||||
"Since we have a savepoint at 101, we should never archive any commit after 101 (we only archive 100)",
|
||||
5, timeline.countInstants());
|
||||
assertTrue("Archived commits should always be safe",
|
||||
timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "101")));
|
||||
assertTrue("Archived commits should always be safe",
|
||||
timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "102")));
|
||||
assertTrue("Archived commits should always be safe",
|
||||
timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "103")));
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@@ -16,7 +16,9 @@
|
||||
|
||||
package com.uber.hoodie.io;
|
||||
|
||||
import com.uber.hoodie.HoodieReadClient;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import com.uber.hoodie.HoodieWriteClient;
|
||||
import com.uber.hoodie.WriteStatus;
|
||||
import com.uber.hoodie.common.HoodieClientTestUtils;
|
||||
@@ -34,13 +36,16 @@ import com.uber.hoodie.config.HoodieCompactionConfig;
|
||||
import com.uber.hoodie.config.HoodieIndexConfig;
|
||||
import com.uber.hoodie.config.HoodieStorageConfig;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.index.bloom.HoodieBloomIndex;
|
||||
import com.uber.hoodie.index.HoodieIndex;
|
||||
import com.uber.hoodie.index.bloom.HoodieBloomIndex;
|
||||
import com.uber.hoodie.io.compact.HoodieCompactor;
|
||||
import com.uber.hoodie.io.compact.HoodieRealtimeTableCompactor;
|
||||
import com.uber.hoodie.table.HoodieTable;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.junit.After;
|
||||
@@ -48,161 +53,154 @@ import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import org.junit.rules.TemporaryFolder;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
public class TestHoodieCompactor {
|
||||
private transient JavaSparkContext jsc = null;
|
||||
private String basePath = null;
|
||||
private HoodieCompactor compactor;
|
||||
private transient HoodieTestDataGenerator dataGen = null;
|
||||
|
||||
@Before
|
||||
public void init() throws IOException {
|
||||
// Initialize a local spark env
|
||||
jsc = new JavaSparkContext(HoodieClientTestUtils.getSparkConfForTest("TestHoodieCompactor"));
|
||||
private transient JavaSparkContext jsc = null;
|
||||
private String basePath = null;
|
||||
private HoodieCompactor compactor;
|
||||
private transient HoodieTestDataGenerator dataGen = null;
|
||||
|
||||
// Create a temp folder as the base path
|
||||
TemporaryFolder folder = new TemporaryFolder();
|
||||
folder.create();
|
||||
basePath = folder.getRoot().getAbsolutePath();
|
||||
HoodieTestUtils.initTableType(basePath, HoodieTableType.MERGE_ON_READ);
|
||||
@Before
|
||||
public void init() throws IOException {
|
||||
// Initialize a local spark env
|
||||
jsc = new JavaSparkContext(HoodieClientTestUtils.getSparkConfForTest("TestHoodieCompactor"));
|
||||
|
||||
dataGen = new HoodieTestDataGenerator();
|
||||
compactor = new HoodieRealtimeTableCompactor();
|
||||
// Create a temp folder as the base path
|
||||
TemporaryFolder folder = new TemporaryFolder();
|
||||
folder.create();
|
||||
basePath = folder.getRoot().getAbsolutePath();
|
||||
HoodieTestUtils.initTableType(basePath, HoodieTableType.MERGE_ON_READ);
|
||||
|
||||
dataGen = new HoodieTestDataGenerator();
|
||||
compactor = new HoodieRealtimeTableCompactor();
|
||||
}
|
||||
|
||||
@After
|
||||
public void clean() {
|
||||
if (basePath != null) {
|
||||
new File(basePath).delete();
|
||||
}
|
||||
|
||||
@After
|
||||
public void clean() {
|
||||
if (basePath != null) {
|
||||
new File(basePath).delete();
|
||||
}
|
||||
if (jsc != null) {
|
||||
jsc.stop();
|
||||
}
|
||||
if (jsc != null) {
|
||||
jsc.stop();
|
||||
}
|
||||
}
|
||||
|
||||
private HoodieWriteConfig getConfig() {
|
||||
return getConfigBuilder().build();
|
||||
}
|
||||
private HoodieWriteConfig getConfig() {
|
||||
return getConfigBuilder().build();
|
||||
}
|
||||
|
||||
private HoodieWriteConfig.Builder getConfigBuilder() {
|
||||
return HoodieWriteConfig.newBuilder().withPath(basePath)
|
||||
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
||||
.withCompactionConfig(
|
||||
HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024)
|
||||
.withInlineCompaction(false).build())
|
||||
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build())
|
||||
.forTable("test-trip-table").withIndexConfig(
|
||||
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build());
|
||||
}
|
||||
private HoodieWriteConfig.Builder getConfigBuilder() {
|
||||
return HoodieWriteConfig.newBuilder().withPath(basePath)
|
||||
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
|
||||
.withCompactionConfig(
|
||||
HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024)
|
||||
.withInlineCompaction(false).build())
|
||||
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build())
|
||||
.forTable("test-trip-table").withIndexConfig(
|
||||
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build());
|
||||
}
|
||||
|
||||
@Test(expected = IllegalArgumentException.class)
|
||||
public void testCompactionOnCopyOnWriteFail() throws Exception {
|
||||
HoodieTestUtils.initTableType(basePath, HoodieTableType.COPY_ON_WRITE);
|
||||
@Test(expected = IllegalArgumentException.class)
|
||||
public void testCompactionOnCopyOnWriteFail() throws Exception {
|
||||
HoodieTestUtils.initTableType(basePath, HoodieTableType.COPY_ON_WRITE);
|
||||
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
|
||||
|
||||
compactor.compact(jsc, getConfig(), table);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCompactionEmpty() throws Exception {
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
||||
HoodieWriteConfig config = getConfig();
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);
|
||||
HoodieWriteClient writeClient = new HoodieWriteClient(jsc, config);
|
||||
|
||||
String newCommitTime = writeClient.startCommit();
|
||||
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 100);
|
||||
JavaRDD<HoodieRecord> recordsRDD = jsc.parallelize(records, 1);
|
||||
writeClient.insert(recordsRDD, newCommitTime).collect();
|
||||
|
||||
HoodieCompactionMetadata result =
|
||||
compactor.compact(jsc, getConfig(), table);
|
||||
String basePath = table.getMetaClient().getBasePath();
|
||||
assertTrue("If there is nothing to compact, result will be empty",
|
||||
result.getFileIdAndFullPaths(basePath).isEmpty());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLogFileCountsAfterCompaction() throws Exception {
|
||||
FileSystem fs = FSUtils.getFs();
|
||||
// insert 100 records
|
||||
HoodieWriteConfig config = getConfig();
|
||||
HoodieWriteClient writeClient = new HoodieWriteClient(jsc, config);
|
||||
String newCommitTime = "100";
|
||||
writeClient.startCommitWithTime(newCommitTime);
|
||||
|
||||
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 100);
|
||||
JavaRDD<HoodieRecord> recordsRDD = jsc.parallelize(records, 1);
|
||||
List<WriteStatus> statuses = writeClient.insert(recordsRDD, newCommitTime).collect();
|
||||
|
||||
// Update all the 100 records
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);
|
||||
|
||||
newCommitTime = "101";
|
||||
writeClient.startCommitWithTime(newCommitTime);
|
||||
|
||||
List<HoodieRecord> updatedRecords = dataGen.generateUpdates(newCommitTime, records);
|
||||
JavaRDD<HoodieRecord> updatedRecordsRDD = jsc.parallelize(updatedRecords, 1);
|
||||
HoodieIndex index = new HoodieBloomIndex<>(config, jsc);
|
||||
updatedRecords = index.tagLocation(updatedRecordsRDD, table).collect();
|
||||
|
||||
// Write them to corresponding avro logfiles
|
||||
HoodieTestUtils
|
||||
.writeRecordsToLogFiles(metaClient.getBasePath(), HoodieTestDataGenerator.avroSchema,
|
||||
updatedRecords);
|
||||
|
||||
// Verify that all data file has one log file
|
||||
metaClient = new HoodieTableMetaClient(fs, basePath);
|
||||
table = HoodieTable.getHoodieTable(metaClient, config);
|
||||
for (String partitionPath : dataGen.getPartitionPaths()) {
|
||||
List<FileSlice> groupedLogFiles =
|
||||
table.getRTFileSystemView().getLatestFileSlices(partitionPath)
|
||||
.collect(Collectors.toList());
|
||||
for (FileSlice fileSlice : groupedLogFiles) {
|
||||
assertEquals("There should be 1 log file written for every data file", 1,
|
||||
fileSlice.getLogFiles().count());
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCompactionEmpty() throws Exception {
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
||||
HoodieWriteConfig config = getConfig();
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);
|
||||
HoodieWriteClient writeClient = new HoodieWriteClient(jsc, config);
|
||||
// Do a compaction
|
||||
metaClient = new HoodieTableMetaClient(fs, basePath);
|
||||
table = HoodieTable.getHoodieTable(metaClient, config);
|
||||
|
||||
String newCommitTime = writeClient.startCommit();
|
||||
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 100);
|
||||
JavaRDD<HoodieRecord> recordsRDD = jsc.parallelize(records, 1);
|
||||
writeClient.insert(recordsRDD, newCommitTime).collect();
|
||||
HoodieCompactionMetadata result =
|
||||
compactor.compact(jsc, getConfig(), table);
|
||||
|
||||
HoodieCompactionMetadata result =
|
||||
compactor.compact(jsc, getConfig(), table);
|
||||
String basePath = table.getMetaClient().getBasePath();
|
||||
assertTrue("If there is nothing to compact, result will be empty",
|
||||
result.getFileIdAndFullPaths(basePath).isEmpty());
|
||||
// Verify that recently written compacted data file has no log file
|
||||
metaClient = new HoodieTableMetaClient(fs, basePath);
|
||||
table = HoodieTable.getHoodieTable(metaClient, config);
|
||||
HoodieActiveTimeline timeline = metaClient.getActiveTimeline();
|
||||
|
||||
assertTrue("Compaction commit should be > than last insert",
|
||||
HoodieTimeline.compareTimestamps(timeline.lastInstant().get().getTimestamp(), newCommitTime,
|
||||
HoodieTimeline.GREATER));
|
||||
|
||||
for (String partitionPath : dataGen.getPartitionPaths()) {
|
||||
List<FileSlice> groupedLogFiles = table.getRTFileSystemView()
|
||||
.getLatestFileSlices(partitionPath)
|
||||
.collect(Collectors.toList());
|
||||
for (FileSlice slice : groupedLogFiles) {
|
||||
assertTrue(
|
||||
"After compaction there should be no log files visiable on a Realtime view",
|
||||
slice.getLogFiles().collect(Collectors.toList()).isEmpty());
|
||||
}
|
||||
assertTrue(result.getPartitionToCompactionWriteStats().containsKey(partitionPath));
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testLogFileCountsAfterCompaction() throws Exception {
|
||||
FileSystem fs = FSUtils.getFs();
|
||||
// insert 100 records
|
||||
HoodieWriteConfig config = getConfig();
|
||||
HoodieWriteClient writeClient = new HoodieWriteClient(jsc, config);
|
||||
String newCommitTime = "100";
|
||||
writeClient.startCommitWithTime(newCommitTime);
|
||||
|
||||
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 100);
|
||||
JavaRDD<HoodieRecord> recordsRDD = jsc.parallelize(records, 1);
|
||||
List<WriteStatus> statuses = writeClient.insert(recordsRDD, newCommitTime).collect();
|
||||
|
||||
// Update all the 100 records
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);
|
||||
|
||||
newCommitTime = "101";
|
||||
writeClient.startCommitWithTime(newCommitTime);
|
||||
|
||||
List<HoodieRecord> updatedRecords = dataGen.generateUpdates(newCommitTime, records);
|
||||
JavaRDD<HoodieRecord> updatedRecordsRDD = jsc.parallelize(updatedRecords, 1);
|
||||
HoodieIndex index = new HoodieBloomIndex<>(config, jsc);
|
||||
updatedRecords = index.tagLocation(updatedRecordsRDD, table).collect();
|
||||
|
||||
// Write them to corresponding avro logfiles
|
||||
HoodieTestUtils
|
||||
.writeRecordsToLogFiles(metaClient.getBasePath(), HoodieTestDataGenerator.avroSchema,
|
||||
updatedRecords);
|
||||
|
||||
// Verify that all data file has one log file
|
||||
metaClient = new HoodieTableMetaClient(fs, basePath);
|
||||
table = HoodieTable.getHoodieTable(metaClient, config);
|
||||
for (String partitionPath : dataGen.getPartitionPaths()) {
|
||||
List<FileSlice> groupedLogFiles =
|
||||
table.getRTFileSystemView().getLatestFileSlices(partitionPath)
|
||||
.collect(Collectors.toList());
|
||||
for (FileSlice fileSlice : groupedLogFiles) {
|
||||
assertEquals("There should be 1 log file written for every data file", 1,
|
||||
fileSlice.getLogFiles().count());
|
||||
}
|
||||
}
|
||||
|
||||
// Do a compaction
|
||||
metaClient = new HoodieTableMetaClient(fs, basePath);
|
||||
table = HoodieTable.getHoodieTable(metaClient, config);
|
||||
|
||||
HoodieCompactionMetadata result =
|
||||
compactor.compact(jsc, getConfig(), table);
|
||||
|
||||
// Verify that recently written compacted data file has no log file
|
||||
metaClient = new HoodieTableMetaClient(fs, basePath);
|
||||
table = HoodieTable.getHoodieTable(metaClient, config);
|
||||
HoodieActiveTimeline timeline = metaClient.getActiveTimeline();
|
||||
|
||||
assertTrue("Compaction commit should be > than last insert",
|
||||
HoodieTimeline.compareTimestamps(timeline.lastInstant().get().getTimestamp(), newCommitTime,
|
||||
HoodieTimeline.GREATER));
|
||||
|
||||
for (String partitionPath : dataGen.getPartitionPaths()) {
|
||||
List<FileSlice> groupedLogFiles = table.getRTFileSystemView()
|
||||
.getLatestFileSlices(partitionPath)
|
||||
.collect(Collectors.toList());
|
||||
for (FileSlice slice: groupedLogFiles) {
|
||||
assertTrue(
|
||||
"After compaction there should be no log files visiable on a Realtime view",
|
||||
slice.getLogFiles().collect(Collectors.toList()).isEmpty());
|
||||
}
|
||||
assertTrue(result.getPartitionToCompactionWriteStats().containsKey(partitionPath));
|
||||
}
|
||||
}
|
||||
|
||||
// TODO - after modifying HoodieReadClient to support realtime tables - add more tests to make sure the data read is the updated data (compaction correctness)
|
||||
// TODO - add more test cases for compactions after a failed commit/compaction
|
||||
// TODO - after modifying HoodieReadClient to support realtime tables - add more tests to make sure the data read is the updated data (compaction correctness)
|
||||
// TODO - add more test cases for compactions after a failed commit/compaction
|
||||
}
|
||||
|
||||
@@ -17,12 +17,10 @@
|
||||
package com.uber.hoodie.io.strategy;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertThat;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import com.beust.jcommander.internal.Lists;
|
||||
import com.google.common.collect.Maps;
|
||||
|
||||
import com.uber.hoodie.config.HoodieCompactionConfig;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.io.compact.CompactionOperation;
|
||||
|
||||
@@ -17,9 +17,7 @@
|
||||
package com.uber.hoodie.io.strategy;
|
||||
|
||||
import com.uber.hoodie.common.model.HoodieDataFile;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import java.util.UUID;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
|
||||
public class TestHoodieDataFile extends HoodieDataFile {
|
||||
|
||||
|
||||
@@ -18,7 +18,6 @@ package com.uber.hoodie.io.strategy;
|
||||
|
||||
import com.uber.hoodie.common.model.HoodieLogFile;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.hadoop.fs.Path;
|
||||
|
||||
public class TestHoodieLogFile extends HoodieLogFile {
|
||||
|
||||
@@ -16,30 +16,31 @@
|
||||
|
||||
package com.uber.hoodie.metrics;
|
||||
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
|
||||
import org.apache.commons.configuration.ConfigurationException;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.mockito.Mockito.mock;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import org.apache.commons.configuration.ConfigurationException;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
|
||||
public class TestHoodieMetrics {
|
||||
private HoodieMetrics metrics = null;
|
||||
|
||||
@Before
|
||||
public void start() throws ConfigurationException {
|
||||
HoodieWriteConfig config = mock(HoodieWriteConfig.class);
|
||||
when(config.isMetricsOn()).thenReturn(true);
|
||||
when(config.getMetricsReporterType()).thenReturn(MetricsReporterType.INMEMORY);
|
||||
metrics = new HoodieMetrics(config, "raw_table");
|
||||
}
|
||||
private HoodieMetrics metrics = null;
|
||||
|
||||
@Test
|
||||
public void testRegisterGauge() {
|
||||
metrics.registerGauge("metric1", 123L);
|
||||
assertTrue(Metrics.getInstance().getRegistry().getGauges().get("metric1").getValue().toString().equals("123"));
|
||||
}
|
||||
@Before
|
||||
public void start() throws ConfigurationException {
|
||||
HoodieWriteConfig config = mock(HoodieWriteConfig.class);
|
||||
when(config.isMetricsOn()).thenReturn(true);
|
||||
when(config.getMetricsReporterType()).thenReturn(MetricsReporterType.INMEMORY);
|
||||
metrics = new HoodieMetrics(config, "raw_table");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRegisterGauge() {
|
||||
metrics.registerGauge("metric1", 123L);
|
||||
assertTrue(Metrics.getInstance().getRegistry().getGauges().get("metric1").getValue().toString()
|
||||
.equals("123"));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,26 +16,37 @@
|
||||
|
||||
package com.uber.hoodie.table;
|
||||
|
||||
import com.uber.hoodie.common.TestRawTripPayload.MetadataMergeWriteStatus;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.mockito.Mockito.mock;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
import com.uber.hoodie.WriteStatus;
|
||||
import com.uber.hoodie.common.BloomFilter;
|
||||
import com.uber.hoodie.common.HoodieClientTestUtils;
|
||||
import com.uber.hoodie.common.HoodieTestDataGenerator;
|
||||
import com.uber.hoodie.common.TestRawTripPayload;
|
||||
import com.uber.hoodie.common.TestRawTripPayload.MetadataMergeWriteStatus;
|
||||
import com.uber.hoodie.common.model.HoodieKey;
|
||||
import com.uber.hoodie.common.model.HoodieRecord;
|
||||
import com.uber.hoodie.common.model.HoodieRecordLocation;
|
||||
import com.uber.hoodie.common.model.HoodieTestUtils;
|
||||
import com.uber.hoodie.common.table.HoodieTableMetaClient;
|
||||
import com.uber.hoodie.common.table.HoodieTimeline;
|
||||
import com.uber.hoodie.common.util.FSUtils;
|
||||
import com.uber.hoodie.common.util.ParquetUtils;
|
||||
|
||||
import com.uber.hoodie.config.HoodieCompactionConfig;
|
||||
import com.uber.hoodie.io.HoodieCreateHandle;
|
||||
import com.uber.hoodie.config.HoodieStorageConfig;
|
||||
import com.uber.hoodie.config.HoodieWriteConfig;
|
||||
import com.uber.hoodie.io.HoodieCreateHandle;
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.UUID;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
@@ -47,424 +58,452 @@ import org.junit.After;
|
||||
import org.junit.Before;
|
||||
import org.junit.Test;
|
||||
import org.junit.rules.TemporaryFolder;
|
||||
|
||||
import java.io.File;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
|
||||
import scala.Option;
|
||||
import scala.Tuple2;
|
||||
|
||||
import static org.junit.Assert.*;
|
||||
import static org.mockito.Mockito.mock;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
public class TestCopyOnWriteTable {
|
||||
private String basePath = null;
|
||||
private transient JavaSparkContext jsc = null;
|
||||
|
||||
@Before
|
||||
public void init() throws Exception {
|
||||
private String basePath = null;
|
||||
private transient JavaSparkContext jsc = null;
|
||||
|
||||
// Initialize a local spark env
|
||||
jsc = new JavaSparkContext(HoodieClientTestUtils.getSparkConfForTest("TestCopyOnWriteTable"));
|
||||
@Before
|
||||
public void init() throws Exception {
|
||||
|
||||
// Create a temp folder as the base path
|
||||
TemporaryFolder folder = new TemporaryFolder();
|
||||
folder.create();
|
||||
this.basePath = folder.getRoot().getAbsolutePath();
|
||||
HoodieTestUtils.init(basePath);
|
||||
// Initialize a local spark env
|
||||
jsc = new JavaSparkContext(HoodieClientTestUtils.getSparkConfForTest("TestCopyOnWriteTable"));
|
||||
|
||||
// Create a temp folder as the base path
|
||||
TemporaryFolder folder = new TemporaryFolder();
|
||||
folder.create();
|
||||
this.basePath = folder.getRoot().getAbsolutePath();
|
||||
HoodieTestUtils.init(basePath);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMakeNewPath() throws Exception {
|
||||
String fileName = UUID.randomUUID().toString();
|
||||
String partitionPath = "2016/05/04";
|
||||
int unitNumber = (int) (Math.random() * 10);
|
||||
HoodieRecord record = mock(HoodieRecord.class);
|
||||
when(record.getPartitionPath()).thenReturn(partitionPath);
|
||||
|
||||
String commitTime = HoodieTestUtils.makeNewCommitTime();
|
||||
HoodieWriteConfig config = makeHoodieClientConfig();
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);
|
||||
|
||||
HoodieCreateHandle io = new HoodieCreateHandle(config, commitTime, table, partitionPath);
|
||||
Path newPath = io.makeNewPath(record.getPartitionPath(), unitNumber, fileName);
|
||||
assertTrue(newPath.toString().equals(this.basePath + "/" + partitionPath + "/" + FSUtils
|
||||
.makeDataFileName(commitTime, unitNumber, fileName)));
|
||||
}
|
||||
|
||||
private HoodieWriteConfig makeHoodieClientConfig() throws Exception {
|
||||
return makeHoodieClientConfigBuilder().build();
|
||||
}
|
||||
|
||||
private HoodieWriteConfig.Builder makeHoodieClientConfigBuilder() throws Exception {
|
||||
// Prepare the AvroParquetIO
|
||||
String schemaStr = IOUtils
|
||||
.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8");
|
||||
return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(schemaStr);
|
||||
}
|
||||
|
||||
// TODO (weiy): Add testcases for crossing file writing.
|
||||
@Test
|
||||
public void testUpdateRecords() throws Exception {
|
||||
// Prepare the AvroParquetIO
|
||||
HoodieWriteConfig config = makeHoodieClientConfig();
|
||||
String firstCommitTime = HoodieTestUtils.makeNewCommitTime();
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
||||
|
||||
String partitionPath = "/2016/01/31";
|
||||
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
|
||||
|
||||
// Get some records belong to the same partition (2016/01/31)
|
||||
String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
||||
String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
|
||||
String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
||||
String recordStr4 = "{\"_row_key\":\"8eb5b87d-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":51}";
|
||||
|
||||
List<HoodieRecord> records = new ArrayList<>();
|
||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||
records.add(
|
||||
new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
|
||||
rowChange1));
|
||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||
records.add(
|
||||
new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
|
||||
rowChange2));
|
||||
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
||||
records.add(
|
||||
new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
|
||||
rowChange3));
|
||||
|
||||
// Insert new records
|
||||
HoodieClientTestUtils.collectStatuses(table.handleInsert(firstCommitTime, records.iterator()));
|
||||
// We should have a parquet file generated (TODO: better control # files after we revise AvroParquetIO)
|
||||
File parquetFile = null;
|
||||
for (File file : new File(this.basePath + partitionPath).listFiles()) {
|
||||
if (file.getName().endsWith(".parquet")) {
|
||||
parquetFile = file;
|
||||
break;
|
||||
}
|
||||
}
|
||||
assertTrue(parquetFile != null);
|
||||
|
||||
// Read out the bloom filter and make sure filter can answer record exist or not
|
||||
Path parquetFilePath = new Path(parquetFile.getAbsolutePath());
|
||||
BloomFilter filter = ParquetUtils.readBloomFilterFromParquetMetadata(parquetFilePath);
|
||||
for (HoodieRecord record : records) {
|
||||
assertTrue(filter.mightContain(record.getRecordKey()));
|
||||
}
|
||||
// Create a commit file
|
||||
new File(this.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/"
|
||||
+ FSUtils.getCommitTime(parquetFile.getName()) + ".commit").createNewFile();
|
||||
|
||||
// Read the parquet file, check the record content
|
||||
List<GenericRecord> fileRecords = ParquetUtils.readAvroRecords(parquetFilePath);
|
||||
GenericRecord newRecord;
|
||||
int index = 0;
|
||||
for (GenericRecord record : fileRecords) {
|
||||
assertTrue(record.get("_row_key").toString().equals(records.get(index).getRecordKey()));
|
||||
index++;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMakeNewPath() throws Exception {
|
||||
String fileName = UUID.randomUUID().toString();
|
||||
String partitionPath = "2016/05/04";
|
||||
int unitNumber = (int) (Math.random() * 10);
|
||||
HoodieRecord record = mock(HoodieRecord.class);
|
||||
when(record.getPartitionPath()).thenReturn(partitionPath);
|
||||
// We update the 1st record & add a new record
|
||||
String updateRecordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
||||
TestRawTripPayload updateRowChanges1 = new TestRawTripPayload(updateRecordStr1);
|
||||
HoodieRecord updatedRecord1 = new HoodieRecord(
|
||||
new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()),
|
||||
updateRowChanges1);
|
||||
updatedRecord1.setCurrentLocation(
|
||||
new HoodieRecordLocation(null, FSUtils.getFileId(parquetFile.getName())));
|
||||
|
||||
String commitTime = HoodieTestUtils.makeNewCommitTime();
|
||||
HoodieWriteConfig config = makeHoodieClientConfig();
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
||||
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);
|
||||
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
||||
HoodieRecord insertedRecord1 = new HoodieRecord(
|
||||
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
||||
|
||||
HoodieCreateHandle io = new HoodieCreateHandle(config, commitTime, table, partitionPath);
|
||||
Path newPath = io.makeNewPath(record.getPartitionPath(), unitNumber, fileName);
|
||||
assertTrue(newPath.toString().equals(this.basePath + "/" + partitionPath + "/" + FSUtils
|
||||
.makeDataFileName(commitTime, unitNumber, fileName)));
|
||||
}
|
||||
List<HoodieRecord> updatedRecords = Arrays.asList(updatedRecord1, insertedRecord1);
|
||||
|
||||
private HoodieWriteConfig makeHoodieClientConfig() throws Exception {
|
||||
return makeHoodieClientConfigBuilder().build();
|
||||
}
|
||||
Thread.sleep(1000);
|
||||
String newCommitTime = HoodieTestUtils.makeNewCommitTime();
|
||||
metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
||||
table = new HoodieCopyOnWriteTable(config, metadata);
|
||||
Iterator<List<WriteStatus>> iter = table
|
||||
.handleUpdate(newCommitTime, updatedRecord1.getCurrentLocation().getFileId(),
|
||||
updatedRecords.iterator());
|
||||
|
||||
private HoodieWriteConfig.Builder makeHoodieClientConfigBuilder() throws Exception {
|
||||
// Prepare the AvroParquetIO
|
||||
String schemaStr = IOUtils.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8");
|
||||
return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(schemaStr);
|
||||
}
|
||||
|
||||
// TODO (weiy): Add testcases for crossing file writing.
|
||||
@Test
|
||||
public void testUpdateRecords() throws Exception {
|
||||
// Prepare the AvroParquetIO
|
||||
HoodieWriteConfig config = makeHoodieClientConfig();
|
||||
String firstCommitTime = HoodieTestUtils.makeNewCommitTime();
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
||||
|
||||
String partitionPath = "/2016/01/31";
|
||||
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
|
||||
|
||||
// Get some records belong to the same partition (2016/01/31)
|
||||
String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
||||
String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
|
||||
String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
||||
String recordStr4 = "{\"_row_key\":\"8eb5b87d-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":51}";
|
||||
|
||||
List<HoodieRecord> records = new ArrayList<>();
|
||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||
records.add(new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
|
||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||
records.add(new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
|
||||
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
||||
records.add(new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));
|
||||
|
||||
// Insert new records
|
||||
HoodieClientTestUtils.collectStatuses(table.handleInsert(firstCommitTime, records.iterator()));
|
||||
// We should have a parquet file generated (TODO: better control # files after we revise AvroParquetIO)
|
||||
File parquetFile = null;
|
||||
for (File file : new File(this.basePath + partitionPath).listFiles()) {
|
||||
if (file.getName().endsWith(".parquet")) {
|
||||
parquetFile = file;
|
||||
break;
|
||||
}
|
||||
// Check the updated file
|
||||
File updatedParquetFile = null;
|
||||
for (File file : new File(basePath + "/2016/01/31").listFiles()) {
|
||||
if (file.getName().endsWith(".parquet")) {
|
||||
if (FSUtils.getFileId(file.getName())
|
||||
.equals(FSUtils.getFileId(parquetFile.getName())) &&
|
||||
HoodieTimeline.compareTimestamps(FSUtils.getCommitTime(file.getName()),
|
||||
FSUtils.getCommitTime(parquetFile.getName()), HoodieTimeline.GREATER)) {
|
||||
updatedParquetFile = file;
|
||||
break;
|
||||
}
|
||||
assertTrue(parquetFile != null);
|
||||
|
||||
// Read out the bloom filter and make sure filter can answer record exist or not
|
||||
Path parquetFilePath = new Path(parquetFile.getAbsolutePath());
|
||||
BloomFilter filter = ParquetUtils.readBloomFilterFromParquetMetadata(parquetFilePath);
|
||||
for (HoodieRecord record : records) {
|
||||
assertTrue(filter.mightContain(record.getRecordKey()));
|
||||
}
|
||||
// Create a commit file
|
||||
new File(this.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/"
|
||||
+ FSUtils.getCommitTime(parquetFile.getName()) + ".commit").createNewFile();
|
||||
|
||||
// Read the parquet file, check the record content
|
||||
List<GenericRecord> fileRecords = ParquetUtils.readAvroRecords(parquetFilePath);
|
||||
GenericRecord newRecord;
|
||||
int index = 0;
|
||||
for (GenericRecord record: fileRecords) {
|
||||
assertTrue(record.get("_row_key").toString().equals(records.get(index).getRecordKey()));
|
||||
index++;
|
||||
}
|
||||
|
||||
// We update the 1st record & add a new record
|
||||
String updateRecordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
||||
TestRawTripPayload updateRowChanges1 = new TestRawTripPayload(updateRecordStr1);
|
||||
HoodieRecord updatedRecord1 = new HoodieRecord(new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()), updateRowChanges1);
|
||||
updatedRecord1.setCurrentLocation(new HoodieRecordLocation(null, FSUtils.getFileId(parquetFile.getName())));
|
||||
|
||||
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
||||
HoodieRecord insertedRecord1 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
|
||||
|
||||
List<HoodieRecord> updatedRecords = Arrays.asList(updatedRecord1, insertedRecord1);
|
||||
|
||||
Thread.sleep(1000);
|
||||
String newCommitTime = HoodieTestUtils.makeNewCommitTime();
|
||||
metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
||||
table = new HoodieCopyOnWriteTable(config, metadata);
|
||||
Iterator<List<WriteStatus>> iter = table.handleUpdate(newCommitTime, updatedRecord1.getCurrentLocation().getFileId(), updatedRecords.iterator());
|
||||
|
||||
// Check the updated file
|
||||
File updatedParquetFile = null;
|
||||
for (File file : new File(basePath + "/2016/01/31").listFiles()) {
|
||||
if (file.getName().endsWith(".parquet")) {
|
||||
if (FSUtils.getFileId(file.getName())
|
||||
.equals(FSUtils.getFileId(parquetFile.getName())) &&
|
||||
HoodieTimeline.compareTimestamps(FSUtils.getCommitTime(file.getName()),
|
||||
FSUtils.getCommitTime(parquetFile.getName()), HoodieTimeline.GREATER)) {
|
||||
updatedParquetFile = file;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
assertTrue(updatedParquetFile != null);
|
||||
// Check whether the record has been updated
|
||||
Path updatedParquetFilePath = new Path(updatedParquetFile.getAbsolutePath());
|
||||
BloomFilter updatedFilter = ParquetUtils.readBloomFilterFromParquetMetadata(updatedParquetFilePath);
|
||||
for (HoodieRecord record : records) {
|
||||
// No change to the _row_key
|
||||
assertTrue(updatedFilter.mightContain(record.getRecordKey()));
|
||||
}
|
||||
|
||||
assertTrue(updatedFilter.mightContain(insertedRecord1.getRecordKey()));
|
||||
records.add(insertedRecord1);// add this so it can further check below
|
||||
|
||||
ParquetReader updatedReader = ParquetReader.builder(new AvroReadSupport<>(), updatedParquetFilePath).build();
|
||||
index = 0;
|
||||
while ((newRecord = (GenericRecord) updatedReader.read()) != null) {
|
||||
assertTrue(newRecord.get("_row_key").toString().equals(records.get(index).getRecordKey()));
|
||||
if (index == 0) {
|
||||
assertTrue(newRecord.get("number").toString().equals("15"));
|
||||
}
|
||||
index++;
|
||||
}
|
||||
updatedReader.close();
|
||||
// Also check the numRecordsWritten
|
||||
List<WriteStatus> statuses = HoodieClientTestUtils.collectStatuses(iter);
|
||||
WriteStatus writeStatus = statuses.get(0);
|
||||
assertTrue("Should be only one file generated", statuses.size() == 1);
|
||||
assertEquals(4, writeStatus.getStat().getNumWrites());//3 rewritten records + 1 new record
|
||||
}
|
||||
}
|
||||
assertTrue(updatedParquetFile != null);
|
||||
// Check whether the record has been updated
|
||||
Path updatedParquetFilePath = new Path(updatedParquetFile.getAbsolutePath());
|
||||
BloomFilter updatedFilter = ParquetUtils
|
||||
.readBloomFilterFromParquetMetadata(updatedParquetFilePath);
|
||||
for (HoodieRecord record : records) {
|
||||
// No change to the _row_key
|
||||
assertTrue(updatedFilter.mightContain(record.getRecordKey()));
|
||||
}
|
||||
|
||||
assertTrue(updatedFilter.mightContain(insertedRecord1.getRecordKey()));
|
||||
records.add(insertedRecord1);// add this so it can further check below
|
||||
|
||||
private List<HoodieRecord> newHoodieRecords(int n, String time) throws Exception {
|
||||
List<HoodieRecord> records = new ArrayList<>();
|
||||
for (int i = 0; i < n; i++) {
|
||||
String recordStr = String.format("{\"_row_key\":\"%s\",\"time\":\"%s\",\"number\":%d}",
|
||||
UUID.randomUUID().toString(),
|
||||
time,
|
||||
i);
|
||||
TestRawTripPayload rowChange = new TestRawTripPayload(recordStr);
|
||||
records.add(new HoodieRecord(
|
||||
new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()),
|
||||
rowChange));
|
||||
}
|
||||
return records;
|
||||
ParquetReader updatedReader = ParquetReader
|
||||
.builder(new AvroReadSupport<>(), updatedParquetFilePath).build();
|
||||
index = 0;
|
||||
while ((newRecord = (GenericRecord) updatedReader.read()) != null) {
|
||||
assertTrue(newRecord.get("_row_key").toString().equals(records.get(index).getRecordKey()));
|
||||
if (index == 0) {
|
||||
assertTrue(newRecord.get("number").toString().equals("15"));
|
||||
}
|
||||
index++;
|
||||
}
|
||||
updatedReader.close();
|
||||
// Also check the numRecordsWritten
|
||||
List<WriteStatus> statuses = HoodieClientTestUtils.collectStatuses(iter);
|
||||
WriteStatus writeStatus = statuses.get(0);
|
||||
assertTrue("Should be only one file generated", statuses.size() == 1);
|
||||
assertEquals(4, writeStatus.getStat().getNumWrites());//3 rewritten records + 1 new record
|
||||
}
|
||||
|
||||
|
||||
private List<HoodieRecord> newHoodieRecords(int n, String time) throws Exception {
|
||||
List<HoodieRecord> records = new ArrayList<>();
|
||||
for (int i = 0; i < n; i++) {
|
||||
String recordStr = String.format("{\"_row_key\":\"%s\",\"time\":\"%s\",\"number\":%d}",
|
||||
UUID.randomUUID().toString(),
|
||||
time,
|
||||
i);
|
||||
TestRawTripPayload rowChange = new TestRawTripPayload(recordStr);
|
||||
records.add(new HoodieRecord(
|
||||
new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()),
|
||||
rowChange));
|
||||
}
|
||||
return records;
|
||||
}
|
||||
|
||||
// Check if record level metadata is aggregated properly at the end of write.
|
||||
@Test
|
||||
public void testMetadataAggregateFromWriteStatus() throws Exception {
|
||||
// Prepare the AvroParquetIO
|
||||
HoodieWriteConfig config = makeHoodieClientConfigBuilder()
|
||||
.withWriteStatusClass(MetadataMergeWriteStatus.class).build();
|
||||
String firstCommitTime = HoodieTestUtils.makeNewCommitTime();
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
||||
|
||||
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
|
||||
|
||||
// Get some records belong to the same partition (2016/01/31)
|
||||
String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
||||
String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
|
||||
String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
||||
|
||||
List<HoodieRecord> records = new ArrayList<>();
|
||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||
records.add(
|
||||
new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
|
||||
rowChange1));
|
||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||
records.add(
|
||||
new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
|
||||
rowChange2));
|
||||
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
||||
records.add(
|
||||
new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
|
||||
rowChange3));
|
||||
|
||||
// Insert new records
|
||||
List<WriteStatus> writeStatuses = HoodieClientTestUtils
|
||||
.collectStatuses(table.handleInsert(firstCommitTime, records.iterator()));
|
||||
Map<String, String> allWriteStatusMergedMetadataMap = MetadataMergeWriteStatus
|
||||
.mergeMetadataForWriteStatuses(writeStatuses);
|
||||
assertTrue(allWriteStatusMergedMetadataMap.containsKey("InputRecordCount_1506582000"));
|
||||
// For metadata key InputRecordCount_1506582000, value is 2 for each record. So sum of this should be 2 * 3
|
||||
assertEquals("6", allWriteStatusMergedMetadataMap.get("InputRecordCount_1506582000"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testInsertWithPartialFailures() throws Exception {
|
||||
HoodieWriteConfig config = makeHoodieClientConfig();
|
||||
String commitTime = HoodieTestUtils.makeNewCommitTime();
|
||||
FileSystem fs = FSUtils.getFs();
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
|
||||
|
||||
// Write a few records, and get atleast one file
|
||||
// 10 records for partition 1, 1 record for partition 2.
|
||||
List<HoodieRecord> records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z");
|
||||
records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));
|
||||
|
||||
// Simulate crash after first file
|
||||
List<WriteStatus> statuses = HoodieClientTestUtils
|
||||
.collectStatuses(table.handleInsert(commitTime, records.iterator()));
|
||||
WriteStatus status = statuses.get(0);
|
||||
Path partialFile = new Path(String.format("%s/%s/%s",
|
||||
basePath,
|
||||
status.getPartitionPath(),
|
||||
FSUtils.makeDataFileName(commitTime, 0, status.getFileId()))
|
||||
);
|
||||
assertTrue(fs.exists(partialFile));
|
||||
|
||||
// When we retry
|
||||
records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z");
|
||||
records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));
|
||||
|
||||
statuses = HoodieClientTestUtils
|
||||
.collectStatuses(table.handleInsert(commitTime, records.iterator()));
|
||||
status = statuses.get(0);
|
||||
|
||||
Path retriedFIle = new Path(String.format("%s/%s/%s",
|
||||
basePath,
|
||||
status.getPartitionPath(),
|
||||
FSUtils.makeDataFileName(commitTime, 0, status.getFileId()))
|
||||
);
|
||||
assertTrue(fs.exists(retriedFIle));
|
||||
assertFalse(fs.exists(partialFile));
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testInsertRecords() throws Exception {
|
||||
HoodieWriteConfig config = makeHoodieClientConfig();
|
||||
String commitTime = HoodieTestUtils.makeNewCommitTime();
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
||||
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
|
||||
|
||||
// Case 1:
|
||||
// 10 records for partition 1, 1 record for partition 2.
|
||||
List<HoodieRecord> records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z");
|
||||
records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));
|
||||
|
||||
// Insert new records
|
||||
List<WriteStatus> returnedStatuses = HoodieClientTestUtils
|
||||
.collectStatuses(table.handleInsert(commitTime, records.iterator()));
|
||||
|
||||
// TODO: check the actual files and make sure 11 records, total were written.
|
||||
assertEquals(2, returnedStatuses.size());
|
||||
assertEquals("2016/01/31", returnedStatuses.get(0).getPartitionPath());
|
||||
assertEquals(0, returnedStatuses.get(0).getFailedRecords().size());
|
||||
assertEquals(10, returnedStatuses.get(0).getWrittenRecords().size());
|
||||
assertEquals("2016/02/01", returnedStatuses.get(1).getPartitionPath());
|
||||
assertEquals(0, returnedStatuses.get(0).getFailedRecords().size());
|
||||
assertEquals(1, returnedStatuses.get(1).getWrittenRecords().size());
|
||||
|
||||
// Case 2:
|
||||
// 1 record for partition 1, 5 record for partition 2, 1 records for partition 3.
|
||||
records = newHoodieRecords(1, "2016-01-31T03:16:41.415Z");
|
||||
records.addAll(newHoodieRecords(5, "2016-02-01T03:16:41.415Z"));
|
||||
records.addAll(newHoodieRecords(1, "2016-02-02T03:16:41.415Z"));
|
||||
|
||||
// Insert new records
|
||||
returnedStatuses = HoodieClientTestUtils
|
||||
.collectStatuses(table.handleInsert(commitTime, records.iterator()));
|
||||
|
||||
assertEquals(3, returnedStatuses.size());
|
||||
assertEquals("2016/01/31", returnedStatuses.get(0).getPartitionPath());
|
||||
assertEquals(1, returnedStatuses.get(0).getWrittenRecords().size());
|
||||
|
||||
assertEquals("2016/02/01", returnedStatuses.get(1).getPartitionPath());
|
||||
assertEquals(5, returnedStatuses.get(1).getWrittenRecords().size());
|
||||
|
||||
assertEquals("2016/02/02", returnedStatuses.get(2).getPartitionPath());
|
||||
assertEquals(1, returnedStatuses.get(2).getWrittenRecords().size());
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFileSizeUpsertRecords() throws Exception {
|
||||
HoodieWriteConfig config = makeHoodieClientConfigBuilder().withStorageConfig(
|
||||
HoodieStorageConfig.newBuilder().limitFileSize(64 * 1024).parquetBlockSize(64 * 1024)
|
||||
.parquetPageSize(64 * 1024).build()).build();
|
||||
String commitTime = HoodieTestUtils.makeNewCommitTime();
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
||||
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
|
||||
|
||||
List<HoodieRecord> records = new ArrayList<>();
|
||||
// Approx 1150 records are written for block size of 64KB
|
||||
for (int i = 0; i < 2000; i++) {
|
||||
String recordStr = "{\"_row_key\":\"" + UUID.randomUUID().toString()
|
||||
+ "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i + "}";
|
||||
TestRawTripPayload rowChange = new TestRawTripPayload(recordStr);
|
||||
records
|
||||
.add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()),
|
||||
rowChange));
|
||||
}
|
||||
|
||||
// Check if record level metadata is aggregated properly at the end of write.
|
||||
@Test
|
||||
public void testMetadataAggregateFromWriteStatus() throws Exception {
|
||||
// Prepare the AvroParquetIO
|
||||
HoodieWriteConfig config = makeHoodieClientConfigBuilder().withWriteStatusClass(MetadataMergeWriteStatus.class).build();
|
||||
String firstCommitTime = HoodieTestUtils.makeNewCommitTime();
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
||||
// Insert new records
|
||||
HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator()));
|
||||
|
||||
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
|
||||
|
||||
// Get some records belong to the same partition (2016/01/31)
|
||||
String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
|
||||
String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
|
||||
String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
|
||||
|
||||
List<HoodieRecord> records = new ArrayList<>();
|
||||
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
|
||||
records.add(new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
|
||||
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
|
||||
records.add(new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
|
||||
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
|
||||
records.add(new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));
|
||||
|
||||
// Insert new records
|
||||
List<WriteStatus> writeStatuses = HoodieClientTestUtils
|
||||
.collectStatuses(table.handleInsert(firstCommitTime, records.iterator()));
|
||||
Map<String, String> allWriteStatusMergedMetadataMap = MetadataMergeWriteStatus
|
||||
.mergeMetadataForWriteStatuses(writeStatuses);
|
||||
assertTrue(allWriteStatusMergedMetadataMap.containsKey("InputRecordCount_1506582000"));
|
||||
// For metadata key InputRecordCount_1506582000, value is 2 for each record. So sum of this should be 2 * 3
|
||||
assertEquals("6", allWriteStatusMergedMetadataMap.get("InputRecordCount_1506582000"));
|
||||
// Check the updated file
|
||||
int counts = 0;
|
||||
for (File file : new File(basePath + "/2016/01/31").listFiles()) {
|
||||
if (file.getName().endsWith(".parquet") && FSUtils.getCommitTime(file.getName())
|
||||
.equals(commitTime)) {
|
||||
System.out.println(file.getName() + "-" + file.length());
|
||||
counts++;
|
||||
}
|
||||
}
|
||||
assertEquals(
|
||||
"If the number of records are more than 1150, then there should be a new file", 3,
|
||||
counts);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testInsertWithPartialFailures() throws Exception {
|
||||
HoodieWriteConfig config = makeHoodieClientConfig();
|
||||
String commitTime = HoodieTestUtils.makeNewCommitTime();
|
||||
FileSystem fs = FSUtils.getFs();
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(fs, basePath);
|
||||
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
|
||||
|
||||
// Write a few records, and get atleast one file
|
||||
// 10 records for partition 1, 1 record for partition 2.
|
||||
List<HoodieRecord> records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z");
|
||||
records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));
|
||||
private List<HoodieCopyOnWriteTable.InsertBucket> testUpsertPartitioner(int smallFileSize,
|
||||
int numInserts,
|
||||
int numUpdates,
|
||||
int fileSize,
|
||||
boolean autoSplitInserts) throws Exception {
|
||||
final String TEST_PARTITION_PATH = "2016/09/26";
|
||||
HoodieWriteConfig config = makeHoodieClientConfigBuilder()
|
||||
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
|
||||
.compactionSmallFileSize(smallFileSize).insertSplitSize(100)
|
||||
.autoTuneInsertSplits(autoSplitInserts).build())
|
||||
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1000 * 1024).build())
|
||||
.build();
|
||||
|
||||
// Simulate crash after first file
|
||||
List<WriteStatus> statuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator()));
|
||||
WriteStatus status = statuses.get(0);
|
||||
Path partialFile = new Path(String.format("%s/%s/%s",
|
||||
basePath,
|
||||
status.getPartitionPath(),
|
||||
FSUtils.makeDataFileName(commitTime, 0, status.getFileId()))
|
||||
);
|
||||
assertTrue(fs.exists(partialFile));
|
||||
HoodieClientTestUtils.fakeCommitFile(basePath, "001");
|
||||
HoodieClientTestUtils.fakeDataFile(basePath, TEST_PARTITION_PATH, "001", "file1", fileSize);
|
||||
|
||||
// When we retry
|
||||
records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z");
|
||||
records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
||||
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
|
||||
|
||||
statuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator()));
|
||||
status = statuses.get(0);
|
||||
|
||||
Path retriedFIle = new Path(String.format("%s/%s/%s",
|
||||
basePath,
|
||||
status.getPartitionPath(),
|
||||
FSUtils.makeDataFileName(commitTime, 0, status.getFileId()))
|
||||
);
|
||||
assertTrue(fs.exists(retriedFIle));
|
||||
assertFalse(fs.exists(partialFile));
|
||||
HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(
|
||||
new String[]{TEST_PARTITION_PATH});
|
||||
List<HoodieRecord> insertRecords = dataGenerator.generateInserts("001", numInserts);
|
||||
List<HoodieRecord> updateRecords = dataGenerator.generateUpdates("001", numUpdates);
|
||||
for (HoodieRecord updateRec : updateRecords) {
|
||||
updateRec.setCurrentLocation(new HoodieRecordLocation("001", "file1"));
|
||||
}
|
||||
List<HoodieRecord> records = new ArrayList<>();
|
||||
records.addAll(insertRecords);
|
||||
records.addAll(updateRecords);
|
||||
WorkloadProfile profile = new WorkloadProfile(jsc.parallelize(records));
|
||||
HoodieCopyOnWriteTable.UpsertPartitioner partitioner = (HoodieCopyOnWriteTable.UpsertPartitioner)
|
||||
table.getUpsertPartitioner(profile);
|
||||
|
||||
assertEquals("Should have 3 partitions", 3, partitioner.numPartitions());
|
||||
assertEquals("Bucket 0 is UPDATE", HoodieCopyOnWriteTable.BucketType.UPDATE,
|
||||
partitioner.getBucketInfo(0).bucketType);
|
||||
assertEquals("Bucket 1 is INSERT", HoodieCopyOnWriteTable.BucketType.INSERT,
|
||||
partitioner.getBucketInfo(1).bucketType);
|
||||
assertEquals("Bucket 2 is INSERT", HoodieCopyOnWriteTable.BucketType.INSERT,
|
||||
partitioner.getBucketInfo(2).bucketType);
|
||||
assertEquals("Update record should have gone to the 1 update partiton", 0,
|
||||
partitioner.getPartition(new Tuple2<>(updateRecords.get(0).getKey(),
|
||||
Option.apply(updateRecords.get(0).getCurrentLocation()))));
|
||||
return partitioner.getInsertBuckets(TEST_PARTITION_PATH);
|
||||
}
|
||||
|
||||
|
||||
@Test public void testInsertRecords() throws Exception {
|
||||
HoodieWriteConfig config = makeHoodieClientConfig();
|
||||
String commitTime = HoodieTestUtils.makeNewCommitTime();
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
||||
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
|
||||
|
||||
// Case 1:
|
||||
// 10 records for partition 1, 1 record for partition 2.
|
||||
List<HoodieRecord> records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z");
|
||||
records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));
|
||||
|
||||
// Insert new records
|
||||
List<WriteStatus> returnedStatuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator()));
|
||||
@Test
|
||||
public void testUpsertPartitioner() throws Exception {
|
||||
// Inserts + Updates... Check all updates go together & inserts subsplit
|
||||
List<HoodieCopyOnWriteTable.InsertBucket> insertBuckets = testUpsertPartitioner(0, 200, 100,
|
||||
1024, false);
|
||||
assertEquals("Total of 2 insert buckets", 2, insertBuckets.size());
|
||||
}
|
||||
|
||||
|
||||
// TODO: check the actual files and make sure 11 records, total were written.
|
||||
assertEquals(2, returnedStatuses.size());
|
||||
assertEquals("2016/01/31", returnedStatuses.get(0).getPartitionPath());
|
||||
assertEquals(0, returnedStatuses.get(0).getFailedRecords().size());
|
||||
assertEquals(10, returnedStatuses.get(0).getWrittenRecords().size());
|
||||
assertEquals("2016/02/01", returnedStatuses.get(1).getPartitionPath());
|
||||
assertEquals(0, returnedStatuses.get(0).getFailedRecords().size());
|
||||
assertEquals(1, returnedStatuses.get(1).getWrittenRecords().size());
|
||||
@Test
|
||||
public void testUpsertPartitionerWithSmallInsertHandling() throws Exception {
|
||||
// Inserts + Updates .. Check updates go together & inserts subsplit, after expanding smallest file
|
||||
List<HoodieCopyOnWriteTable.InsertBucket> insertBuckets = testUpsertPartitioner(1000 * 1024,
|
||||
400, 100, 800 * 1024, false);
|
||||
assertEquals("Total of 3 insert buckets", 3, insertBuckets.size());
|
||||
assertEquals("First insert bucket must be same as update bucket", 0,
|
||||
insertBuckets.get(0).bucketNumber);
|
||||
assertEquals("First insert bucket should have weight 0.5", 0.5, insertBuckets.get(0).weight,
|
||||
0.01);
|
||||
|
||||
// Case 2:
|
||||
// 1 record for partition 1, 5 record for partition 2, 1 records for partition 3.
|
||||
records = newHoodieRecords(1, "2016-01-31T03:16:41.415Z");
|
||||
records.addAll(newHoodieRecords(5, "2016-02-01T03:16:41.415Z"));
|
||||
records.addAll(newHoodieRecords(1, "2016-02-02T03:16:41.415Z"));
|
||||
|
||||
// Insert new records
|
||||
returnedStatuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator()));
|
||||
|
||||
assertEquals(3, returnedStatuses.size());
|
||||
assertEquals("2016/01/31", returnedStatuses.get(0).getPartitionPath());
|
||||
assertEquals(1, returnedStatuses.get(0).getWrittenRecords().size());
|
||||
|
||||
assertEquals("2016/02/01", returnedStatuses.get(1).getPartitionPath());
|
||||
assertEquals(5, returnedStatuses.get(1).getWrittenRecords().size());
|
||||
|
||||
assertEquals("2016/02/02", returnedStatuses.get(2).getPartitionPath());
|
||||
assertEquals(1, returnedStatuses.get(2).getWrittenRecords().size());
|
||||
// Now with insert split size auto tuned
|
||||
insertBuckets = testUpsertPartitioner(1000 * 1024, 2400, 100, 800 * 1024, true);
|
||||
assertEquals("Total of 3 insert buckets", 3, insertBuckets.size());
|
||||
assertEquals("First insert bucket must be same as update bucket", 0,
|
||||
insertBuckets.get(0).bucketNumber);
|
||||
assertEquals("First insert bucket should have weight 0.5", 200.0 / 2400,
|
||||
insertBuckets.get(0).weight, 0.01);
|
||||
}
|
||||
|
||||
@After
|
||||
public void cleanup() {
|
||||
if (basePath != null) {
|
||||
new File(basePath).delete();
|
||||
}
|
||||
|
||||
@Test public void testFileSizeUpsertRecords() throws Exception {
|
||||
HoodieWriteConfig config = makeHoodieClientConfigBuilder().withStorageConfig(
|
||||
HoodieStorageConfig.newBuilder().limitFileSize(64 * 1024).parquetBlockSize(64 * 1024)
|
||||
.parquetPageSize(64 * 1024).build()).build();
|
||||
String commitTime = HoodieTestUtils.makeNewCommitTime();
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
||||
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
|
||||
|
||||
List<HoodieRecord> records = new ArrayList<>();
|
||||
// Approx 1150 records are written for block size of 64KB
|
||||
for (int i = 0; i < 2000; i++) {
|
||||
String recordStr = "{\"_row_key\":\"" + UUID.randomUUID().toString() + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i + "}";
|
||||
TestRawTripPayload rowChange = new TestRawTripPayload(recordStr);
|
||||
records.add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()),
|
||||
rowChange));
|
||||
}
|
||||
|
||||
// Insert new records
|
||||
HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator()));
|
||||
|
||||
// Check the updated file
|
||||
int counts = 0;
|
||||
for (File file : new File(basePath + "/2016/01/31").listFiles()) {
|
||||
if (file.getName().endsWith(".parquet") && FSUtils.getCommitTime(file.getName()).equals(commitTime)) {
|
||||
System.out.println(file.getName() + "-" + file.length());
|
||||
counts++;
|
||||
}
|
||||
}
|
||||
assertEquals(
|
||||
"If the number of records are more than 1150, then there should be a new file", 3,
|
||||
counts);
|
||||
}
|
||||
|
||||
|
||||
|
||||
private List<HoodieCopyOnWriteTable.InsertBucket> testUpsertPartitioner(int smallFileSize,
|
||||
int numInserts,
|
||||
int numUpdates,
|
||||
int fileSize,
|
||||
boolean autoSplitInserts) throws Exception {
|
||||
final String TEST_PARTITION_PATH = "2016/09/26";
|
||||
HoodieWriteConfig config = makeHoodieClientConfigBuilder()
|
||||
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
|
||||
.compactionSmallFileSize(smallFileSize).insertSplitSize(100).autoTuneInsertSplits(autoSplitInserts).build())
|
||||
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1000 * 1024).build()).build();
|
||||
|
||||
HoodieClientTestUtils.fakeCommitFile(basePath, "001");
|
||||
HoodieClientTestUtils.fakeDataFile(basePath, TEST_PARTITION_PATH, "001", "file1", fileSize);
|
||||
|
||||
HoodieTableMetaClient metadata = new HoodieTableMetaClient(FSUtils.getFs(), basePath);
|
||||
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
|
||||
|
||||
HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[]{TEST_PARTITION_PATH});
|
||||
List<HoodieRecord> insertRecords = dataGenerator.generateInserts("001", numInserts);
|
||||
List<HoodieRecord> updateRecords = dataGenerator.generateUpdates("001", numUpdates);
|
||||
for (HoodieRecord updateRec: updateRecords) {
|
||||
updateRec.setCurrentLocation(new HoodieRecordLocation("001", "file1"));
|
||||
}
|
||||
List<HoodieRecord> records = new ArrayList<>();
|
||||
records.addAll(insertRecords);
|
||||
records.addAll(updateRecords);
|
||||
WorkloadProfile profile = new WorkloadProfile(jsc.parallelize(records));
|
||||
HoodieCopyOnWriteTable.UpsertPartitioner partitioner = (HoodieCopyOnWriteTable.UpsertPartitioner)
|
||||
table.getUpsertPartitioner(profile);
|
||||
|
||||
assertEquals("Should have 3 partitions", 3, partitioner.numPartitions());
|
||||
assertEquals("Bucket 0 is UPDATE", HoodieCopyOnWriteTable.BucketType.UPDATE,
|
||||
partitioner.getBucketInfo(0).bucketType);
|
||||
assertEquals("Bucket 1 is INSERT", HoodieCopyOnWriteTable.BucketType.INSERT,
|
||||
partitioner.getBucketInfo(1).bucketType);
|
||||
assertEquals("Bucket 2 is INSERT", HoodieCopyOnWriteTable.BucketType.INSERT,
|
||||
partitioner.getBucketInfo(2).bucketType);
|
||||
assertEquals("Update record should have gone to the 1 update partiton", 0,
|
||||
partitioner.getPartition(new Tuple2<>(updateRecords.get(0).getKey(), Option.apply(updateRecords.get(0).getCurrentLocation()))));
|
||||
return partitioner.getInsertBuckets(TEST_PARTITION_PATH);
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testUpsertPartitioner() throws Exception {
|
||||
// Inserts + Updates... Check all updates go together & inserts subsplit
|
||||
List<HoodieCopyOnWriteTable.InsertBucket> insertBuckets = testUpsertPartitioner(0, 200, 100, 1024, false);
|
||||
assertEquals("Total of 2 insert buckets", 2, insertBuckets.size());
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testUpsertPartitionerWithSmallInsertHandling() throws Exception {
|
||||
// Inserts + Updates .. Check updates go together & inserts subsplit, after expanding smallest file
|
||||
List<HoodieCopyOnWriteTable.InsertBucket> insertBuckets = testUpsertPartitioner(1000 * 1024, 400, 100, 800 * 1024, false);
|
||||
assertEquals("Total of 3 insert buckets", 3, insertBuckets.size());
|
||||
assertEquals("First insert bucket must be same as update bucket", 0, insertBuckets.get(0).bucketNumber);
|
||||
assertEquals("First insert bucket should have weight 0.5", 0.5, insertBuckets.get(0).weight, 0.01);
|
||||
|
||||
// Now with insert split size auto tuned
|
||||
insertBuckets = testUpsertPartitioner(1000 * 1024, 2400, 100, 800 * 1024, true);
|
||||
assertEquals("Total of 3 insert buckets", 3, insertBuckets.size());
|
||||
assertEquals("First insert bucket must be same as update bucket", 0, insertBuckets.get(0).bucketNumber);
|
||||
assertEquals("First insert bucket should have weight 0.5", 200.0/2400, insertBuckets.get(0).weight, 0.01);
|
||||
}
|
||||
|
||||
@After
|
||||
public void cleanup() {
|
||||
if (basePath != null) {
|
||||
new File(basePath).delete();
|
||||
}
|
||||
if (jsc != null) {
|
||||
jsc.stop();
|
||||
}
|
||||
if (jsc != null) {
|
||||
jsc.stop();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user