1
0

CodeStyle formatting to conform to basic Checkstyle rules.

The code-style rules follow google style with some changes:

1. Increase line length from 100 to 120
2. Disable JavaDoc related checkstyles as this needs more manual work.

Both source and test code are checked for code-style
This commit is contained in:
Balaji Varadarajan
2018-03-20 16:29:20 -07:00
committed by vinoth chandar
parent 987f5d6b96
commit 788e4f2d2e
200 changed files with 6209 additions and 5975 deletions

View File

@@ -14,7 +14,6 @@
* limitations under the License.
*/
import com.beust.jcommander.JCommander;
import com.beust.jcommander.Parameter;
import com.uber.hoodie.HoodieWriteClient;
@@ -38,24 +37,19 @@ import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
/**
* Driver program that uses the Hoodie client with synthetic workload, and performs basic
* operations. <p>
* Driver program that uses the Hoodie client with synthetic workload, and performs basic operations. <p>
*/
public class HoodieClientExample {
@Parameter(names = {"--table-path", "-p"}, description = "path for Hoodie sample table")
private String tablePath = "file:///tmp/hoodie/sample-table";
@Parameter(names = {"--table-name", "-n"}, description = "table name for Hoodie sample table")
private String tableName = "hoodie_rt";
@Parameter(names = {"--table-type", "-t"}, description = "One of COPY_ON_WRITE or MERGE_ON_READ")
private String tableType = HoodieTableType.COPY_ON_WRITE.name();
private static Logger logger = LogManager.getLogger(HoodieClientExample.class);
@Parameter(names = {"--help", "-h"}, help = true)
public Boolean help = false;
private static Logger logger = LogManager.getLogger(HoodieClientExample.class);
@Parameter(names = {"--table-path", "-p"}, description = "path for Hoodie sample table")
private String tablePath = "file:///tmp/hoodie/sample-table";
@Parameter(names = {"--table-name", "-n"}, description = "table name for Hoodie sample table")
private String tableName = "hoodie_rt";
@Parameter(names = {"--table-type", "-t"}, description = "One of COPY_ON_WRITE or MERGE_ON_READ")
private String tableType = HoodieTableType.COPY_ON_WRITE.name();
public static void main(String[] args) throws Exception {
HoodieClientExample cli = new HoodieClientExample();
@@ -92,10 +86,10 @@ public class HoodieClientExample {
// Create the write client to write some records in
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(tablePath)
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
.forTable(tableName).withIndexConfig(
HoodieIndexConfig.newBuilder().withIndexType(IndexType.BLOOM).build())
.withCompactionConfig(HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 3).build())
.build();
.forTable(tableName)
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(IndexType.BLOOM).build())
.withCompactionConfig(
HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 3).build()).build();
HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
/**

View File

@@ -16,7 +16,6 @@
package com.uber.hoodie;
import static org.junit.Assert.assertEquals;
import com.uber.hoodie.common.HoodieClientTestUtils;
@@ -58,11 +57,11 @@ public class TestMultiFS implements Serializable {
private static MiniDFSCluster dfsCluster;
private static DistributedFileSystem dfs;
private static Logger logger = LogManager.getLogger(TestMultiFS.class);
private static JavaSparkContext jsc;
private static SQLContext sqlContext;
private String tablePath = "file:///tmp/hoodie/sample-table";
private String tableName = "hoodie_rt";
private String tableType = HoodieTableType.COPY_ON_WRITE.name();
private static JavaSparkContext jsc;
private static SQLContext sqlContext;
@BeforeClass
public static void initClass() throws Exception {
@@ -92,7 +91,8 @@ public class TestMultiFS implements Serializable {
hdfsTestService.stop();
dfsCluster.shutdown();
}
// Need to closeAll to clear FileSystem.Cache, required because DFS and LocalFS used in the same JVM
// Need to closeAll to clear FileSystem.Cache, required because DFS and LocalFS used in the
// same JVM
FileSystem.closeAll();
}
@@ -111,8 +111,7 @@ public class TestMultiFS implements Serializable {
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(dfsBasePath)
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
.forTable(tableName).withIndexConfig(
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
.build();
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build();
HoodieWriteClient hdfsWriteClient = new HoodieWriteClient(jsc, cfg);
// Write generated data to hdfs (only inserts)
@@ -125,10 +124,8 @@ public class TestMultiFS implements Serializable {
// Read from hdfs
FileSystem fs = FSUtils.getFs(dfsBasePath, HoodieTestUtils.getDefaultHadoopConf());
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), dfsBasePath);
HoodieTimeline timeline = new HoodieActiveTimeline(metaClient)
.getCommitTimeline();
Dataset<Row> readRecords = HoodieClientTestUtils
.readCommit(dfsBasePath, sqlContext, timeline, readCommitTime);
HoodieTimeline timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
Dataset<Row> readRecords = HoodieClientTestUtils.readCommit(dfsBasePath, sqlContext, timeline, readCommitTime);
assertEquals("Should contain 100 records", readRecords.count(), records.size());
// Write to local
@@ -138,8 +135,7 @@ public class TestMultiFS implements Serializable {
HoodieWriteConfig localConfig = HoodieWriteConfig.newBuilder().withPath(tablePath)
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
.forTable(tableName).withIndexConfig(
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
.build();
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build()).build();
HoodieWriteClient localWriteClient = new HoodieWriteClient(jsc, localConfig);
String writeCommitTime = localWriteClient.startCommit();
@@ -153,8 +149,7 @@ public class TestMultiFS implements Serializable {
fs = FSUtils.getFs(tablePath, HoodieTestUtils.getDefaultHadoopConf());
metaClient = new HoodieTableMetaClient(fs.getConf(), tablePath);
timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
Dataset<Row> localReadRecords = HoodieClientTestUtils
.readCommit(tablePath, sqlContext, timeline, writeCommitTime);
Dataset<Row> localReadRecords = HoodieClientTestUtils.readCommit(tablePath, sqlContext, timeline, writeCommitTime);
assertEquals("Should contain 100 records", localReadRecords.count(), localRecords.size());
}
}

View File

@@ -69,8 +69,7 @@ public class HoodieClientTestUtils {
return keys;
}
private static void fakeMetaFile(String basePath, String commitTime, String suffix)
throws IOException {
private static void fakeMetaFile(String basePath, String commitTime, String suffix) throws IOException {
String parentPath = basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME;
new File(parentPath).mkdirs();
new File(parentPath + "/" + commitTime + suffix).createNewFile();
@@ -85,55 +84,48 @@ public class HoodieClientTestUtils {
fakeMetaFile(basePath, commitTime, HoodieTimeline.INFLIGHT_EXTENSION);
}
public static void fakeDataFile(String basePath, String partitionPath, String commitTime,
String fileId) throws Exception {
public static void fakeDataFile(String basePath, String partitionPath, String commitTime, String fileId)
throws Exception {
fakeDataFile(basePath, partitionPath, commitTime, fileId, 0);
}
public static void fakeDataFile(String basePath, String partitionPath, String commitTime,
String fileId, long length) throws Exception {
public static void fakeDataFile(String basePath, String partitionPath, String commitTime, String fileId, long length)
throws Exception {
String parentPath = String.format("%s/%s", basePath, partitionPath);
new File(parentPath).mkdirs();
String path = String
.format("%s/%s", parentPath, FSUtils.makeDataFileName(commitTime, 0, fileId));
String path = String.format("%s/%s", parentPath, FSUtils.makeDataFileName(commitTime, 0, fileId));
new File(path).createNewFile();
new RandomAccessFile(path, "rw").setLength(length);
}
public static SparkConf getSparkConfForTest(String appName) {
SparkConf sparkConf = new SparkConf()
.setAppName(appName)
System.out.println("HIII" + "HII2");
SparkConf sparkConf = new SparkConf().setAppName(appName)
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.setMaster("local[1]");
return HoodieReadClient.addHoodieSupport(sparkConf);
}
public static HashMap<String, String> getLatestFileIDsToFullPath(String basePath,
HoodieTimeline commitTimeline,
public static HashMap<String, String> getLatestFileIDsToFullPath(String basePath, HoodieTimeline commitTimeline,
List<HoodieInstant> commitsToReturn) throws IOException {
HashMap<String, String> fileIdToFullPath = new HashMap<>();
for (HoodieInstant commit : commitsToReturn) {
HoodieCommitMetadata metadata =
HoodieCommitMetadata.fromBytes(commitTimeline.getInstantDetails(commit).get());
HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes(commitTimeline.getInstantDetails(commit).get());
fileIdToFullPath.putAll(metadata.getFileIdAndFullPaths(basePath));
}
return fileIdToFullPath;
}
public static Dataset<Row> readCommit(String basePath,
SQLContext sqlContext,
HoodieTimeline commitTimeline,
public static Dataset<Row> readCommit(String basePath, SQLContext sqlContext, HoodieTimeline commitTimeline,
String commitTime) {
HoodieInstant commitInstant =
new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
HoodieInstant commitInstant = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, commitTime);
if (!commitTimeline.containsInstant(commitInstant)) {
new HoodieException("No commit exists at " + commitTime);
}
try {
HashMap<String, String> paths = getLatestFileIDsToFullPath(basePath, commitTimeline,
Arrays.asList(commitInstant));
return sqlContext.read()
.parquet(paths.values().toArray(new String[paths.size()]))
return sqlContext.read().parquet(paths.values().toArray(new String[paths.size()]))
.filter(String.format("%s ='%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitTime));
} catch (Exception e) {
throw new HoodieException("Error reading commit " + commitTime, e);
@@ -143,50 +135,37 @@ public class HoodieClientTestUtils {
/**
* Obtain all new data written into the Hoodie dataset since the given timestamp.
*/
public static Dataset<Row> readSince(String basePath,
SQLContext sqlContext,
HoodieTimeline commitTimeline,
public static Dataset<Row> readSince(String basePath, SQLContext sqlContext, HoodieTimeline commitTimeline,
String lastCommitTime) {
List<HoodieInstant> commitsToReturn =
commitTimeline.findInstantsAfter(lastCommitTime, Integer.MAX_VALUE)
.getInstants().collect(Collectors.toList());
List<HoodieInstant> commitsToReturn = commitTimeline.findInstantsAfter(lastCommitTime, Integer.MAX_VALUE)
.getInstants().collect(Collectors.toList());
try {
// Go over the commit metadata, and obtain the new files that need to be read.
HashMap<String, String> fileIdToFullPath = getLatestFileIDsToFullPath(basePath,
commitTimeline, commitsToReturn);
return sqlContext.read()
.parquet(fileIdToFullPath.values().toArray(new String[fileIdToFullPath.size()]))
.filter(
String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTime));
HashMap<String, String> fileIdToFullPath = getLatestFileIDsToFullPath(basePath, commitTimeline, commitsToReturn);
return sqlContext.read().parquet(fileIdToFullPath.values().toArray(new String[fileIdToFullPath.size()]))
.filter(String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTime));
} catch (IOException e) {
throw new HoodieException(
"Error pulling data incrementally from commitTimestamp :" + lastCommitTime, e);
throw new HoodieException("Error pulling data incrementally from commitTimestamp :" + lastCommitTime, e);
}
}
/**
* Reads the paths under the a hoodie dataset out as a DataFrame
*/
public static Dataset<Row> read(String basePath,
SQLContext sqlContext,
FileSystem fs,
String... paths) {
public static Dataset<Row> read(String basePath, SQLContext sqlContext, FileSystem fs, String... paths) {
List<String> filteredPaths = new ArrayList<>();
try {
HoodieTable hoodieTable = HoodieTable
.getHoodieTable(new HoodieTableMetaClient(fs.getConf(), basePath, true), null);
for (String path : paths) {
TableFileSystemView.ReadOptimizedView fileSystemView = new HoodieTableFileSystemView(
hoodieTable.getMetaClient(),
hoodieTable.getCompletedCommitTimeline(), fs.globStatus(new Path(path)));
List<HoodieDataFile> latestFiles = fileSystemView.getLatestDataFiles().collect(
Collectors.toList());
hoodieTable.getMetaClient(), hoodieTable.getCompletedCommitTimeline(), fs.globStatus(new Path(path)));
List<HoodieDataFile> latestFiles = fileSystemView.getLatestDataFiles().collect(Collectors.toList());
for (HoodieDataFile file : latestFiles) {
filteredPaths.add(file.getPath());
}
}
return sqlContext.read()
.parquet(filteredPaths.toArray(new String[filteredPaths.size()]));
return sqlContext.read().parquet(filteredPaths.toArray(new String[filteredPaths.size()]));
} catch (Exception e) {
throw new HoodieException("Error reading hoodie dataset as a dataframe", e);
}

View File

@@ -42,8 +42,7 @@ import org.apache.hadoop.mapred.RecordReader;
*/
public class HoodieMergeOnReadTestUtils {
public static List<GenericRecord> getRecordsUsingInputFormat(List<String> inputPaths,
String basePath)
public static List<GenericRecord> getRecordsUsingInputFormat(List<String> inputPaths, String basePath)
throws IOException {
JobConf jobConf = new JobConf();
Schema schema = HoodieAvroUtils.addMetadataFields(Schema.parse(TRIP_EXAMPLE_SCHEMA));
@@ -59,7 +58,8 @@ public class HoodieMergeOnReadTestUtils {
ArrayWritable writable = (ArrayWritable) recordReader.createValue();
while (recordReader.next(key, writable)) {
GenericRecordBuilder newRecord = new GenericRecordBuilder(schema);
// writable returns an array with [field1, field2, _hoodie_commit_time, _hoodie_commit_seqno]
// writable returns an array with [field1, field2, _hoodie_commit_time,
// _hoodie_commit_seqno]
Writable[] values = writable.get();
schema.getFields().forEach(field -> {
newRecord.set(field, values[2]);
@@ -76,12 +76,11 @@ public class HoodieMergeOnReadTestUtils {
}).get();
}
private static void setPropsForInputFormat(HoodieRealtimeInputFormat inputFormat, JobConf jobConf,
Schema schema, String basePath) {
private static void setPropsForInputFormat(HoodieRealtimeInputFormat inputFormat, JobConf jobConf, Schema schema,
String basePath) {
List<Schema.Field> fields = schema.getFields();
String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
String postions = fields.stream().map(f -> String.valueOf(f.pos()))
.collect(Collectors.joining(","));
String postions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
Configuration conf = HoodieTestUtils.getDefaultHadoopConf();
jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);

View File

@@ -41,20 +41,15 @@ import org.apache.hadoop.fs.Path;
/**
* Class to be used in tests to keep generating test inserts and updates against a corpus.
*
* <p>
* Test data uses a toy Uber trips, data model.
*/
public class HoodieTestDataGenerator {
static class KeyPartition {
HoodieKey key;
String partitionPath;
}
public static String TRIP_EXAMPLE_SCHEMA = "{\"type\": \"record\","
+ "\"name\": \"triprec\","
+ "\"fields\": [ "
// based on examination of sample file, the schema produces the following per record size
public static final int SIZE_PER_RECORD = 50 * 1024;
public static final String[] DEFAULT_PARTITION_PATHS = {"2016/03/15", "2015/03/16", "2015/03/17"};
public static String TRIP_EXAMPLE_SCHEMA = "{\"type\": \"record\"," + "\"name\": \"triprec\"," + "\"fields\": [ "
+ "{\"name\": \"timestamp\",\"type\": \"double\"},"
+ "{\"name\": \"_row_key\", \"type\": \"string\"},"
+ "{\"name\": \"rider\", \"type\": \"string\"},"
@@ -64,25 +59,9 @@ public class HoodieTestDataGenerator {
+ "{\"name\": \"end_lat\", \"type\": \"double\"},"
+ "{\"name\": \"end_lon\", \"type\": \"double\"},"
+ "{\"name\":\"fare\",\"type\": \"double\"}]}";
// based on examination of sample file, the schema produces the following per record size
public static final int SIZE_PER_RECORD = 50 * 1024;
public static final String[] DEFAULT_PARTITION_PATHS = {"2016/03/15", "2015/03/16", "2015/03/17"};
public static void writePartitionMetadata(FileSystem fs, String[] partitionPaths,
String basePath) {
for (String partitionPath : partitionPaths) {
new HoodiePartitionMetadata(fs, "000", new Path(basePath), new Path(basePath, partitionPath))
.trySave(0);
}
}
private List<KeyPartition> existingKeysList = new ArrayList<>();
public static Schema avroSchema = HoodieAvroUtils
.addMetadataFields(new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA));
public static Schema avroSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(TRIP_EXAMPLE_SCHEMA));
private static Random rand = new Random(46474747);
private List<KeyPartition> existingKeysList = new ArrayList<>();
private String[] partitionPaths = DEFAULT_PARTITION_PATHS;
public HoodieTestDataGenerator(String[] partitionPaths) {
@@ -90,13 +69,69 @@ public class HoodieTestDataGenerator {
}
public HoodieTestDataGenerator() {
this(new String[]{"2016/03/15", "2015/03/16", "2015/03/17"});
this(new String[] {"2016/03/15", "2015/03/16", "2015/03/17"});
}
public static void writePartitionMetadata(FileSystem fs, String[] partitionPaths, String basePath) {
for (String partitionPath : partitionPaths) {
new HoodiePartitionMetadata(fs, "000", new Path(basePath), new Path(basePath, partitionPath)).trySave(0);
}
}
/**
* Generates new inserts, uniformly across the partition paths above. It also updates the list of
* existing keys.
* Generates a new avro record of the above schema format, retaining the key if optionally provided.
*/
public static TestRawTripPayload generateRandomValue(HoodieKey key, String commitTime) throws IOException {
GenericRecord rec = generateGenericRecord(key.getRecordKey(), "rider-" + commitTime, "driver-" + commitTime, 0.0);
HoodieAvroUtils.addCommitMetadataToRecord(rec, commitTime, "-1");
return new TestRawTripPayload(rec.toString(), key.getRecordKey(), key.getPartitionPath(), TRIP_EXAMPLE_SCHEMA);
}
public static GenericRecord generateGenericRecord(String rowKey, String riderName, String driverName,
double timestamp) {
GenericRecord rec = new GenericData.Record(avroSchema);
rec.put("_row_key", rowKey);
rec.put("timestamp", timestamp);
rec.put("rider", riderName);
rec.put("driver", driverName);
rec.put("begin_lat", rand.nextDouble());
rec.put("begin_lon", rand.nextDouble());
rec.put("end_lat", rand.nextDouble());
rec.put("end_lon", rand.nextDouble());
rec.put("fare", rand.nextDouble() * 100);
return rec;
}
public static void createCommitFile(String basePath, String commitTime) throws IOException {
Path commitFile = new Path(
basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline.makeCommitFileName(commitTime));
FileSystem fs = FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf());
FSDataOutputStream os = fs.create(commitFile, true);
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
try {
// Write empty commit metadata
os.writeBytes(new String(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
} finally {
os.close();
}
}
public static void createSavepointFile(String basePath, String commitTime) throws IOException {
Path commitFile = new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME
+ "/" + HoodieTimeline.makeSavePointFileName(commitTime));
FileSystem fs = FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf());
FSDataOutputStream os = fs.create(commitFile, true);
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
try {
// Write empty commit metadata
os.writeBytes(new String(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
} finally {
os.close();
}
}
/**
* Generates new inserts, uniformly across the partition paths above. It also updates the list of existing keys.
*/
public List<HoodieRecord> generateInserts(String commitTime, int n) throws IOException {
List<HoodieRecord> inserts = new ArrayList<>();
@@ -119,8 +154,7 @@ public class HoodieTestDataGenerator {
return generateDeletesFromExistingRecords(inserts);
}
public List<HoodieRecord> generateDeletesFromExistingRecords(List<HoodieRecord> existingRecords)
throws IOException {
public List<HoodieRecord> generateDeletesFromExistingRecords(List<HoodieRecord> existingRecords) throws IOException {
List<HoodieRecord> deletes = new ArrayList<>();
for (HoodieRecord existingRecord : existingRecords) {
HoodieRecord record = generateDeleteRecord(existingRecord);
@@ -132,17 +166,15 @@ public class HoodieTestDataGenerator {
public HoodieRecord generateDeleteRecord(HoodieRecord existingRecord) throws IOException {
HoodieKey key = existingRecord.getKey();
TestRawTripPayload payload = new TestRawTripPayload(Optional.empty(), key.getRecordKey(),
key.getPartitionPath(), null, true);
TestRawTripPayload payload = new TestRawTripPayload(Optional.empty(), key.getRecordKey(), key.getPartitionPath(),
null, true);
return new HoodieRecord(key, payload);
}
public List<HoodieRecord> generateUpdates(String commitTime, List<HoodieRecord> baseRecords)
throws IOException {
public List<HoodieRecord> generateUpdates(String commitTime, List<HoodieRecord> baseRecords) throws IOException {
List<HoodieRecord> updates = new ArrayList<>();
for (HoodieRecord baseRecord : baseRecords) {
HoodieRecord record = new HoodieRecord(baseRecord.getKey(),
generateRandomValue(baseRecord.getKey(), commitTime));
HoodieRecord record = new HoodieRecord(baseRecord.getKey(), generateRandomValue(baseRecord.getKey(), commitTime));
updates.add(record);
}
return updates;
@@ -161,68 +193,13 @@ public class HoodieTestDataGenerator {
return updates;
}
/**
* Generates a new avro record of the above schema format, retaining the key if optionally
* provided.
*/
public static TestRawTripPayload generateRandomValue(HoodieKey key, String commitTime)
throws IOException {
GenericRecord rec = generateGenericRecord(key.getRecordKey(), "rider-" + commitTime,
"driver-" + commitTime, 0.0);
HoodieAvroUtils.addCommitMetadataToRecord(rec, commitTime, "-1");
return new TestRawTripPayload(rec.toString(), key.getRecordKey(), key.getPartitionPath(),
TRIP_EXAMPLE_SCHEMA);
}
public static GenericRecord generateGenericRecord(String rowKey, String riderName,
String driverName, double timestamp) {
GenericRecord rec = new GenericData.Record(avroSchema);
rec.put("_row_key", rowKey);
rec.put("timestamp", timestamp);
rec.put("rider", riderName);
rec.put("driver", driverName);
rec.put("begin_lat", rand.nextDouble());
rec.put("begin_lon", rand.nextDouble());
rec.put("end_lat", rand.nextDouble());
rec.put("end_lon", rand.nextDouble());
rec.put("fare", rand.nextDouble() * 100);
return rec;
}
public static void createCommitFile(String basePath, String commitTime) throws IOException {
Path commitFile =
new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline
.makeCommitFileName(commitTime));
FileSystem fs = FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf());
FSDataOutputStream os = fs.create(commitFile, true);
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
try {
// Write empty commit metadata
os.writeBytes(new String(commitMetadata.toJsonString().getBytes(
StandardCharsets.UTF_8)));
} finally {
os.close();
}
}
public static void createSavepointFile(String basePath, String commitTime) throws IOException {
Path commitFile =
new Path(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + HoodieTimeline
.makeSavePointFileName(commitTime));
FileSystem fs = FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf());
FSDataOutputStream os = fs.create(commitFile, true);
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
try {
// Write empty commit metadata
os.writeBytes(new String(commitMetadata.toJsonString().getBytes(
StandardCharsets.UTF_8)));
} finally {
os.close();
}
}
public String[] getPartitionPaths() {
return partitionPaths;
}
static class KeyPartition {
HoodieKey key;
String partitionPath;
}
}

View File

@@ -43,15 +43,15 @@ import org.apache.commons.io.IOUtils;
*/
public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayload> {
private transient static final ObjectMapper mapper = new ObjectMapper();
private static final transient ObjectMapper mapper = new ObjectMapper();
private String partitionPath;
private String rowKey;
private byte[] jsonDataCompressed;
private int dataSize;
private boolean isDeleted;
public TestRawTripPayload(Optional<String> jsonData, String rowKey, String partitionPath,
String schemaStr, Boolean isDeleted) throws IOException {
public TestRawTripPayload(Optional<String> jsonData, String rowKey, String partitionPath, String schemaStr,
Boolean isDeleted) throws IOException {
if (jsonData.isPresent()) {
this.jsonDataCompressed = compressData(jsonData.get());
this.dataSize = jsonData.get().length();
@@ -61,8 +61,7 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
this.isDeleted = isDeleted;
}
public TestRawTripPayload(String jsonData, String rowKey, String partitionPath,
String schemaStr) throws IOException {
public TestRawTripPayload(String jsonData, String rowKey, String partitionPath, String schemaStr) throws IOException {
this(Optional.of(jsonData), rowKey, partitionPath, schemaStr, false);
}
@@ -86,8 +85,7 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
}
@Override
public Optional<IndexedRecord> combineAndGetUpdateValue(IndexedRecord oldRec, Schema schema)
throws IOException {
public Optional<IndexedRecord> combineAndGetUpdateValue(IndexedRecord oldRec, Schema schema) throws IOException {
return this.getInsertValue(schema);
}
@@ -120,8 +118,7 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
private byte[] compressData(String jsonData) throws IOException {
ByteArrayOutputStream baos = new ByteArrayOutputStream();
DeflaterOutputStream dos =
new DeflaterOutputStream(baos, new Deflater(Deflater.BEST_COMPRESSION), true);
DeflaterOutputStream dos = new DeflaterOutputStream(baos, new Deflater(Deflater.BEST_COMPRESSION), true);
try {
dos.write(jsonData.getBytes());
} finally {
@@ -140,13 +137,36 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
}
/**
* A custom {@link WriteStatus} that merges passed metadata key value map to {@code
* WriteStatus.markSuccess()} and {@code WriteStatus.markFailure()}.
* A custom {@link WriteStatus} that merges passed metadata key value map to {@code WriteStatus.markSuccess()} and
* {@code WriteStatus.markFailure()}.
*/
public static class MetadataMergeWriteStatus extends WriteStatus {
private Map<String, String> mergedMetadataMap = new HashMap<>();
public static Map<String, String> mergeMetadataForWriteStatuses(List<WriteStatus> writeStatuses) {
Map<String, String> allWriteStatusMergedMetadataMap = new HashMap<>();
for (WriteStatus writeStatus : writeStatuses) {
MetadataMergeWriteStatus.mergeMetadataMaps(((MetadataMergeWriteStatus) writeStatus).getMergedMetadataMap(),
allWriteStatusMergedMetadataMap);
}
return allWriteStatusMergedMetadataMap;
}
private static void mergeMetadataMaps(Map<String, String> mergeFromMap, Map<String, String> mergeToMap) {
for (Entry<String, String> entry : mergeFromMap.entrySet()) {
String key = entry.getKey();
if (!mergeToMap.containsKey(key)) {
mergeToMap.put(key, "0");
}
mergeToMap.put(key, addStrsAsInt(entry.getValue(), mergeToMap.get(key)));
}
}
private static String addStrsAsInt(String a, String b) {
return String.valueOf(Integer.parseInt(a) + Integer.parseInt(b));
}
@Override
public void markSuccess(HoodieRecord record, Optional<Map<String, String>> recordMetadata) {
super.markSuccess(record, recordMetadata);
@@ -156,43 +176,15 @@ public class TestRawTripPayload implements HoodieRecordPayload<TestRawTripPayloa
}
@Override
public void markFailure(HoodieRecord record, Throwable t,
Optional<Map<String, String>> recordMetadata) {
public void markFailure(HoodieRecord record, Throwable t, Optional<Map<String, String>> recordMetadata) {
super.markFailure(record, t, recordMetadata);
if (recordMetadata.isPresent()) {
mergeMetadataMaps(recordMetadata.get(), mergedMetadataMap);
}
}
public static Map<String, String> mergeMetadataForWriteStatuses(
List<WriteStatus> writeStatuses) {
Map<String, String> allWriteStatusMergedMetadataMap = new HashMap<>();
for (WriteStatus writeStatus : writeStatuses) {
MetadataMergeWriteStatus.mergeMetadataMaps(
((MetadataMergeWriteStatus) writeStatus).getMergedMetadataMap(),
allWriteStatusMergedMetadataMap);
}
return allWriteStatusMergedMetadataMap;
}
private static void mergeMetadataMaps(Map<String, String> mergeFromMap,
Map<String, String> mergeToMap) {
for (Entry<String, String> entry : mergeFromMap.entrySet()) {
String key = entry.getKey();
if (!mergeToMap.containsKey(key)) {
mergeToMap.put(key, "0");
}
mergeToMap
.put(key, addStrsAsInt(entry.getValue(), mergeToMap.get(key)));
}
}
private Map<String, String> getMergedMetadataMap() {
return mergedMetadataMap;
}
private static String addStrsAsInt(String a, String b) {
return String.valueOf(Integer.parseInt(a) + Integer.parseInt(b));
}
}
}

View File

@@ -49,8 +49,7 @@ public class HoodieWriteConfigTest {
assertEquals(config.getMinCommitsToKeep(), 2);
}
private ByteArrayOutputStream saveParamsIntoOutputStream(Map<String, String> params)
throws IOException {
private ByteArrayOutputStream saveParamsIntoOutputStream(Map<String, String> params) throws IOException {
Properties properties = new Properties();
properties.putAll(params);
ByteArrayOutputStream outStream = new ByteArrayOutputStream();

View File

@@ -16,18 +16,13 @@
package com.uber.hoodie.func;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
import com.uber.hoodie.common.HoodieTestDataGenerator;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.table.timeline.HoodieActiveTimeline;
import com.uber.hoodie.exception.HoodieException;
import org.apache.avro.generic.IndexedRecord;
import org.apache.commons.io.FileUtils;
import org.apache.spark.util.SizeEstimator;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
@@ -37,171 +32,168 @@ import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.Semaphore;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
import org.apache.avro.generic.IndexedRecord;
import org.apache.commons.io.FileUtils;
import org.apache.spark.util.SizeEstimator;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Test;
public class TestBufferedIterator {
private final HoodieTestDataGenerator hoodieTestDataGenerator = new HoodieTestDataGenerator();
private final String commitTime = HoodieActiveTimeline.createNewCommitTime();
private ExecutorService recordReader = null;
private final HoodieTestDataGenerator hoodieTestDataGenerator = new HoodieTestDataGenerator();
private final String commitTime = HoodieActiveTimeline.createNewCommitTime();
private ExecutorService recordReader = null;
@Before
public void beforeTest() {
this.recordReader = Executors.newFixedThreadPool(1);
@Before
public void beforeTest() {
this.recordReader = Executors.newFixedThreadPool(1);
}
@After
public void afterTest() {
if (this.recordReader != null) {
this.recordReader.shutdownNow();
this.recordReader = null;
}
}
// Test to ensure that we are reading all records from buffered iterator in the same order
// without any exceptions.
@Test(timeout = 60000)
public void testRecordReading() throws IOException, ExecutionException, InterruptedException {
final int numRecords = 128;
final List<HoodieRecord> hoodieRecords = hoodieTestDataGenerator.generateInserts(commitTime, numRecords);
final BufferedIterator bufferedIterator = new BufferedIterator(hoodieRecords.iterator(), FileUtils.ONE_KB,
HoodieTestDataGenerator.avroSchema);
Future<Boolean> result = recordReader.submit(() -> {
bufferedIterator.startBuffering();
return true;
});
final Iterator<HoodieRecord> originalRecordIterator = hoodieRecords.iterator();
int recordsRead = 0;
while (bufferedIterator.hasNext()) {
final HoodieRecord originalRecord = originalRecordIterator.next();
final Optional<IndexedRecord> originalInsertValue = originalRecord.getData()
.getInsertValue(HoodieTestDataGenerator.avroSchema);
final BufferedIterator.BufferedIteratorPayload payload = bufferedIterator.next();
// Ensure that record ordering is guaranteed.
Assert.assertEquals(originalRecord, payload.record);
// cached insert value matches the expected insert value.
Assert.assertEquals(originalInsertValue, payload.insertValue);
recordsRead++;
}
Assert.assertFalse(bufferedIterator.hasNext() || originalRecordIterator.hasNext());
// all the records should be read successfully.
Assert.assertEquals(numRecords, recordsRead);
// should not throw any exceptions.
Assert.assertTrue(result.get());
}
// Test to ensure that record buffering is throttled when we hit memory limit.
@Test(timeout = 60000)
public void testMemoryLimitForBuffering() throws IOException, InterruptedException {
final int numRecords = 128;
final List<HoodieRecord> hoodieRecords = hoodieTestDataGenerator.generateInserts(commitTime, numRecords);
// maximum number of records to keep in memory.
final int recordLimit = 5;
final long memoryLimitInBytes = recordLimit * SizeEstimator.estimate(hoodieRecords.get(0));
final BufferedIterator bufferedIterator = new BufferedIterator(hoodieRecords.iterator(), memoryLimitInBytes,
HoodieTestDataGenerator.avroSchema);
Future<Boolean> result = recordReader.submit(() -> {
bufferedIterator.startBuffering();
return true;
});
// waiting for permits to expire.
while (!isQueueFull(bufferedIterator.rateLimiter)) {
Thread.sleep(10);
}
Assert.assertEquals(0, bufferedIterator.rateLimiter.availablePermits());
Assert.assertEquals(recordLimit, bufferedIterator.currentRateLimit);
Assert.assertEquals(recordLimit, bufferedIterator.size());
Assert.assertEquals(recordLimit - 1, bufferedIterator.samplingRecordCounter.get());
// try to read 2 records.
Assert.assertEquals(hoodieRecords.get(0), bufferedIterator.next().record);
Assert.assertEquals(hoodieRecords.get(1), bufferedIterator.next().record);
// waiting for permits to expire.
while (!isQueueFull(bufferedIterator.rateLimiter)) {
Thread.sleep(10);
}
// No change is expected in rate limit or number of buffered records. We only expect
// buffering thread to read
// 2 more records into the buffer.
Assert.assertEquals(0, bufferedIterator.rateLimiter.availablePermits());
Assert.assertEquals(recordLimit, bufferedIterator.currentRateLimit);
Assert.assertEquals(recordLimit, bufferedIterator.size());
Assert.assertEquals(recordLimit - 1 + 2, bufferedIterator.samplingRecordCounter.get());
}
// Test to ensure that exception in either buffering thread or BufferedIterator-reader thread
// is propagated to
// another thread.
@Test(timeout = 60000)
public void testException() throws IOException, InterruptedException {
final int numRecords = 256;
final List<HoodieRecord> hoodieRecords = hoodieTestDataGenerator.generateInserts(commitTime, numRecords);
// buffer memory limit
final long memoryLimitInBytes = 4 * SizeEstimator.estimate(hoodieRecords.get(0));
// first let us throw exception from bufferIterator reader and test that buffering thread
// stops and throws
// correct exception back.
BufferedIterator bufferedIterator1 = new BufferedIterator(hoodieRecords.iterator(), memoryLimitInBytes,
HoodieTestDataGenerator.avroSchema);
Future<Boolean> result = recordReader.submit(() -> {
bufferedIterator1.startBuffering();
return true;
});
// waiting for permits to expire.
while (!isQueueFull(bufferedIterator1.rateLimiter)) {
Thread.sleep(10);
}
// notify buffering thread of an exception and ensure that it exits.
final Exception e = new Exception("Failing it :)");
bufferedIterator1.markAsFailed(e);
try {
result.get();
Assert.fail("exception is expected");
} catch (ExecutionException e1) {
Assert.assertEquals(HoodieException.class, e1.getCause().getClass());
Assert.assertEquals(e, e1.getCause().getCause());
}
@After
public void afterTest() {
if (this.recordReader != null) {
this.recordReader.shutdownNow();
this.recordReader = null;
}
// second let us raise an exception while doing record buffering. this exception should get
// propagated to
// buffered iterator reader.
final RuntimeException expectedException = new RuntimeException("failing record reading");
final Iterator<HoodieRecord> mockHoodieRecordsIterator = mock(Iterator.class);
when(mockHoodieRecordsIterator.hasNext()).thenReturn(true);
when(mockHoodieRecordsIterator.next()).thenThrow(expectedException);
BufferedIterator bufferedIterator2 = new BufferedIterator(mockHoodieRecordsIterator, memoryLimitInBytes,
HoodieTestDataGenerator.avroSchema);
Future<Boolean> result2 = recordReader.submit(() -> {
bufferedIterator2.startBuffering();
return true;
});
try {
bufferedIterator2.hasNext();
Assert.fail("exception is expected");
} catch (Exception e1) {
Assert.assertEquals(expectedException, e1.getCause());
}
// Test to ensure that we are reading all records from buffered iterator in the same order without any exceptions.
@Test(timeout = 60000)
public void testRecordReading() throws IOException, ExecutionException, InterruptedException {
final int numRecords = 128;
final List<HoodieRecord> hoodieRecords = hoodieTestDataGenerator.generateInserts(commitTime, numRecords);
final BufferedIterator bufferedIterator =
new BufferedIterator(hoodieRecords.iterator(), FileUtils.ONE_KB, HoodieTestDataGenerator.avroSchema);
Future<Boolean> result =
recordReader.submit(
() -> {
bufferedIterator.startBuffering();
return true;
}
);
final Iterator<HoodieRecord> originalRecordIterator = hoodieRecords.iterator();
int recordsRead = 0;
while (bufferedIterator.hasNext()) {
final HoodieRecord originalRecord = originalRecordIterator.next();
final Optional<IndexedRecord> originalInsertValue =
originalRecord.getData().getInsertValue(HoodieTestDataGenerator.avroSchema);
final BufferedIterator.BufferedIteratorPayload payload = bufferedIterator.next();
// Ensure that record ordering is guaranteed.
Assert.assertEquals(originalRecord, payload.record);
// cached insert value matches the expected insert value.
Assert.assertEquals(originalInsertValue, payload.insertValue);
recordsRead++;
}
Assert.assertFalse(bufferedIterator.hasNext() || originalRecordIterator.hasNext());
// all the records should be read successfully.
Assert.assertEquals(numRecords, recordsRead);
// should not throw any exceptions.
Assert.assertTrue(result.get());
// buffering thread should also have exited. make sure that it is not running.
try {
result2.get();
Assert.fail("exception is expected");
} catch (ExecutionException e2) {
Assert.assertEquals(expectedException, e2.getCause());
}
}
// Test to ensure that record buffering is throttled when we hit memory limit.
@Test(timeout = 60000)
public void testMemoryLimitForBuffering() throws IOException, InterruptedException {
final int numRecords = 128;
final List<HoodieRecord> hoodieRecords = hoodieTestDataGenerator.generateInserts(commitTime, numRecords);
// maximum number of records to keep in memory.
final int recordLimit = 5;
final long memoryLimitInBytes = recordLimit * SizeEstimator.estimate(hoodieRecords.get(0));
final BufferedIterator bufferedIterator =
new BufferedIterator(hoodieRecords.iterator(), memoryLimitInBytes, HoodieTestDataGenerator.avroSchema);
Future<Boolean> result =
recordReader.submit(
() -> {
bufferedIterator.startBuffering();
return true;
}
);
// waiting for permits to expire.
while (!isQueueFull(bufferedIterator.rateLimiter)) {
Thread.sleep(10);
}
Assert.assertEquals(0, bufferedIterator.rateLimiter.availablePermits());
Assert.assertEquals(recordLimit, bufferedIterator.currentRateLimit);
Assert.assertEquals(recordLimit, bufferedIterator.size());
Assert.assertEquals(recordLimit - 1, bufferedIterator.samplingRecordCounter.get());
// try to read 2 records.
Assert.assertEquals(hoodieRecords.get(0), bufferedIterator.next().record);
Assert.assertEquals(hoodieRecords.get(1), bufferedIterator.next().record);
// waiting for permits to expire.
while (!isQueueFull(bufferedIterator.rateLimiter)) {
Thread.sleep(10);
}
// No change is expected in rate limit or number of buffered records. We only expect buffering thread to read
// 2 more records into the buffer.
Assert.assertEquals(0, bufferedIterator.rateLimiter.availablePermits());
Assert.assertEquals(recordLimit, bufferedIterator.currentRateLimit);
Assert.assertEquals(recordLimit, bufferedIterator.size());
Assert.assertEquals(recordLimit - 1 + 2, bufferedIterator.samplingRecordCounter.get());
}
// Test to ensure that exception in either buffering thread or BufferedIterator-reader thread is propagated to
// another thread.
@Test(timeout = 60000)
public void testException() throws IOException, InterruptedException {
final int numRecords = 256;
final List<HoodieRecord> hoodieRecords = hoodieTestDataGenerator.generateInserts(commitTime, numRecords);
// buffer memory limit
final long memoryLimitInBytes = 4 * SizeEstimator.estimate(hoodieRecords.get(0));
// first let us throw exception from bufferIterator reader and test that buffering thread stops and throws
// correct exception back.
BufferedIterator bufferedIterator1 =
new BufferedIterator(hoodieRecords.iterator(), memoryLimitInBytes, HoodieTestDataGenerator.avroSchema);
Future<Boolean> result =
recordReader.submit(
() -> {
bufferedIterator1.startBuffering();
return true;
}
);
// waiting for permits to expire.
while (!isQueueFull(bufferedIterator1.rateLimiter)) {
Thread.sleep(10);
}
// notify buffering thread of an exception and ensure that it exits.
final Exception e = new Exception("Failing it :)");
bufferedIterator1.markAsFailed(e);
try {
result.get();
Assert.fail("exception is expected");
} catch (ExecutionException e1) {
Assert.assertEquals(HoodieException.class, e1.getCause().getClass());
Assert.assertEquals(e, e1.getCause().getCause());
}
// second let us raise an exception while doing record buffering. this exception should get propagated to
// buffered iterator reader.
final RuntimeException expectedException = new RuntimeException("failing record reading");
final Iterator<HoodieRecord> mockHoodieRecordsIterator = mock(Iterator.class);
when(mockHoodieRecordsIterator.hasNext()).thenReturn(true);
when(mockHoodieRecordsIterator.next()).thenThrow(expectedException);
BufferedIterator bufferedIterator2 =
new BufferedIterator(mockHoodieRecordsIterator, memoryLimitInBytes, HoodieTestDataGenerator.avroSchema);
Future<Boolean> result2 =
recordReader.submit(
() -> {
bufferedIterator2.startBuffering();
return true;
}
);
try {
bufferedIterator2.hasNext();
Assert.fail("exception is expected");
} catch (Exception e1) {
Assert.assertEquals(expectedException, e1.getCause());
}
// buffering thread should also have exited. make sure that it is not running.
try {
result2.get();
Assert.fail("exception is expected");
} catch (ExecutionException e2) {
Assert.assertEquals(expectedException, e2.getCause());
}
}
private boolean isQueueFull(Semaphore rateLimiter) {
return (rateLimiter.availablePermits() == 0 && rateLimiter.hasQueuedThreads());
}
private boolean isQueueFull(Semaphore rateLimiter) {
return (rateLimiter.availablePermits() == 0 && rateLimiter.hasQueuedThreads());
}
}

View File

@@ -55,32 +55,24 @@ public class TestUpdateMapFunction {
public void testSchemaEvolutionOnUpdate() throws Exception {
// Create a bunch of records with a old version of schema
HoodieWriteConfig config = makeHoodieClientConfig("/exampleSchema.txt");
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(
HoodieTestUtils.getDefaultHadoopConf(), basePath);
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(HoodieTestUtils.getDefaultHadoopConf(), basePath);
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metaClient);
String recordStr1 =
"{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
String recordStr2 =
"{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
String recordStr3 =
"{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
List<HoodieRecord> records = new ArrayList<>();
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
records.add(
new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
rowChange1));
records.add(new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
records.add(
new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
rowChange2));
records.add(new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
records.add(
new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
rowChange3));
records.add(new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));
Iterator<List<WriteStatus>> insertResult = table.handleInsert("100", records.iterator());
Path commitFile =
new Path(config.getBasePath() + "/.hoodie/" + HoodieTimeline.makeCommitFileName("100"));
Path commitFile = new Path(config.getBasePath() + "/.hoodie/" + HoodieTimeline.makeCommitFileName("100"));
FSUtils.getFs(basePath, HoodieTestUtils.getDefaultHadoopConf()).create(commitFile);
// Now try an update with an evolved schema
@@ -92,21 +84,20 @@ public class TestUpdateMapFunction {
table = new HoodieCopyOnWriteTable(config, metaClient);
// New content with values for the newly added field
recordStr1 =
"{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12,\"added_field\":1}";
recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12,\"added_field\":1}";
records = new ArrayList<>();
rowChange1 = new TestRawTripPayload(recordStr1);
HoodieRecord record1 =
new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
rowChange1);
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
rowChange1);
record1.setCurrentLocation(new HoodieRecordLocation("100", fileId));
records.add(record1);
try {
table.handleUpdate("101", fileId, records.iterator());
} catch (ClassCastException e) {
fail(
"UpdateFunction could not read records written with exampleSchema.txt using the exampleEvolvedSchema.txt");
fail("UpdateFunction could not read records written with exampleSchema.txt using the "
+ "exampleEvolvedSchema.txt");
}
}

View File

@@ -16,6 +16,12 @@
package com.uber.hoodie.index;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.mockito.Matchers.anyObject;
import static org.mockito.Mockito.atMost;
import static org.mockito.Mockito.times;
import com.uber.hoodie.HoodieWriteClient;
import com.uber.hoodie.WriteStatus;
import com.uber.hoodie.common.HoodieTestDataGenerator;
@@ -23,19 +29,16 @@ import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieTableType;
import com.uber.hoodie.common.table.HoodieTableConfig;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import com.uber.hoodie.common.table.TableFileSystemView;
import com.uber.hoodie.common.table.view.HoodieTableFileSystemView;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.config.HoodieCompactionConfig;
import com.uber.hoodie.config.HoodieIndexConfig;
import com.uber.hoodie.config.HoodieStorageConfig;
import com.uber.hoodie.config.HoodieWriteConfig;
import com.uber.hoodie.index.hbase.HBaseIndex;
import com.uber.hoodie.table.HoodieTable;
import java.io.File;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
@@ -56,37 +59,26 @@ import org.junit.Test;
import org.junit.rules.TemporaryFolder;
import org.junit.runners.MethodSorters;
import org.mockito.Mockito;
import scala.Tuple2;
import java.io.File;
import java.io.IOException;
import java.util.List;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import static org.mockito.Matchers.anyObject;
import static org.mockito.Mockito.atLeast;
import static org.mockito.Mockito.atMost;
import static org.mockito.Mockito.times;
/**
* Note :: HBaseTestingUtility is really flaky with issues where the HbaseMiniCluster fails to shutdown
* across tests, (see one problem here : https://issues.apache.org/jira/browse/HBASE-15835).
* Hence, the need to use MethodSorters.NAME_ASCENDING to make sure the tests run in order. Please alter
* the order of tests running carefully.
* Note :: HBaseTestingUtility is really flaky with issues where the HbaseMiniCluster fails to shutdown across tests,
* (see one problem here : https://issues.apache .org/jira/browse/HBASE-15835). Hence, the need to use
* MethodSorters.NAME_ASCENDING to make sure the tests run in order. Please alter the order of tests running carefully.
*/
@FixMethodOrder(MethodSorters.NAME_ASCENDING)
public class TestHbaseIndex {
private static JavaSparkContext jsc = null;
private String basePath = null;
private transient FileSystem fs;
private static HBaseTestingUtility utility;
private static Configuration hbaseConfig;
private static String tableName = "test_table";
private String basePath = null;
private transient FileSystem fs;
private HoodieTableMetaClient metaClient;
public TestHbaseIndex() throws Exception {
}
@AfterClass
public static void clean() throws Exception {
if (jsc != null) {
@@ -97,6 +89,20 @@ public class TestHbaseIndex {
}
}
@BeforeClass
public static void init() throws Exception {
// Initialize HbaseMiniCluster
utility = new HBaseTestingUtility();
utility.startMiniCluster();
hbaseConfig = utility.getConnection().getConfiguration();
utility.createTable(TableName.valueOf(tableName), Bytes.toBytes("_s"));
// Initialize a local spark env
SparkConf sparkConf = new SparkConf().setAppName("TestHbaseIndex").setMaster("local[1]");
jsc = new JavaSparkContext(sparkConf);
jsc.hadoopConfiguration().addResource(utility.getConfiguration());
}
@After
public void clear() throws Exception {
if (basePath != null) {
@@ -112,25 +118,8 @@ public class TestHbaseIndex {
basePath = folder.getRoot().getAbsolutePath();
// Initialize table
metaClient = HoodieTableMetaClient
.initTableType(utility.getConfiguration(), basePath, HoodieTableType.COPY_ON_WRITE,
tableName, HoodieTableConfig.DEFAULT_PAYLOAD_CLASS);
}
public TestHbaseIndex() throws Exception {
}
@BeforeClass
public static void init() throws Exception {
// Initialize HbaseMiniCluster
utility = new HBaseTestingUtility();
utility.startMiniCluster();
hbaseConfig = utility.getConnection().getConfiguration();
utility.createTable(TableName.valueOf(tableName), Bytes.toBytes("_s"));
// Initialize a local spark env
SparkConf sparkConf = new SparkConf().setAppName("TestHbaseIndex").setMaster("local[1]");
jsc = new JavaSparkContext(sparkConf);
jsc.hadoopConfiguration().addResource(utility.getConfiguration());
.initTableType(utility.getConfiguration(), basePath, HoodieTableType.COPY_ON_WRITE, tableName,
HoodieTableConfig.DEFAULT_PAYLOAD_CLASS);
}
@Test
@@ -156,7 +145,8 @@ public class TestHbaseIndex {
JavaRDD<WriteStatus> writeStatues = writeClient.upsert(writeRecords, newCommitTime);
assertNoWriteErrors(writeStatues.collect());
// Now tagLocation for these records, hbaseIndex should not tag them since it was a failed commit
// Now tagLocation for these records, hbaseIndex should not tag them since it was a failed
// commit
javaRDD = index.tagLocation(writeRecords, hoodieTable);
assert (javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 0);
@@ -167,8 +157,9 @@ public class TestHbaseIndex {
javaRDD = index.tagLocation(writeRecords, hoodieTable);
assertTrue(javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 200);
assertTrue(javaRDD.map(record -> record.getKey().getRecordKey()).distinct().count() == 200);
assertTrue(javaRDD.filter(record -> (record.getCurrentLocation() != null
&& record.getCurrentLocation().getCommitTime().equals(newCommitTime))).distinct().count() == 200);
assertTrue(javaRDD.filter(
record -> (record.getCurrentLocation() != null && record.getCurrentLocation().getCommitTime()
.equals(newCommitTime))).distinct().count() == 200);
}
@@ -208,7 +199,8 @@ public class TestHbaseIndex {
// Rollback the last commit
writeClient.rollback(newCommitTime);
// Now tagLocation for these records, hbaseIndex should not tag them since it was a rolled back commit
// Now tagLocation for these records, hbaseIndex should not tag them since it was a rolled
// back commit
javaRDD = index.tagLocation(writeRecords, hoodieTable);
assert (javaRDD.filter(record -> record.isCurrentLocationKnown()).collect().size() == 0);
assert (javaRDD.filter(record -> record.getCurrentLocation() != null).collect().size() == 0);
@@ -302,12 +294,10 @@ public class TestHbaseIndex {
}
private HoodieWriteConfig.Builder getConfigBuilder() {
return HoodieWriteConfig.newBuilder().withPath(basePath)
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(1, 1)
.withCompactionConfig(
HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024)
.withInlineCompaction(false).build())
.withAutoCommit(false)
return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)
.withParallelism(1, 1).withCompactionConfig(
HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).withInlineCompaction(false)
.build()).withAutoCommit(false)
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build())
.forTable("test-trip-table").withIndexConfig(
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.HBASE)

View File

@@ -31,16 +31,14 @@ public class TestHoodieIndex {
HoodieWriteConfig.Builder clientConfigBuilder = HoodieWriteConfig.newBuilder();
HoodieIndexConfig.Builder indexConfigBuilder = HoodieIndexConfig.newBuilder();
// Different types
HoodieWriteConfig config = clientConfigBuilder.withPath("")
.withIndexConfig(indexConfigBuilder.withIndexType(HoodieIndex.IndexType.HBASE).build())
.build();
HoodieWriteConfig config = clientConfigBuilder.withPath("").withIndexConfig(
indexConfigBuilder.withIndexType(HoodieIndex.IndexType.HBASE).build()).build();
assertTrue(HoodieIndex.createIndex(config, null) instanceof HBaseIndex);
config = clientConfigBuilder.withPath("").withIndexConfig(
indexConfigBuilder.withIndexType(HoodieIndex.IndexType.INMEMORY).build()).build();
config = clientConfigBuilder.withPath("")
.withIndexConfig(indexConfigBuilder.withIndexType(HoodieIndex.IndexType.INMEMORY).build()).build();
assertTrue(HoodieIndex.createIndex(config, null) instanceof InMemoryHashIndex);
config = clientConfigBuilder.withPath("")
.withIndexConfig(indexConfigBuilder.withIndexType(HoodieIndex.IndexType.BLOOM).build())
.build();
.withIndexConfig(indexConfigBuilder.withIndexType(HoodieIndex.IndexType.BLOOM).build()).build();
assertTrue(HoodieIndex.createIndex(config, null) instanceof HoodieBloomIndex);
}
}

View File

@@ -98,31 +98,33 @@ public class TestHoodieBloomIndex {
@Test
public void testLoadUUIDsInMemory() throws IOException {
// Create one RDD of hoodie record
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
HoodieRecord record1 = new HoodieRecord(
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
rowChange1);
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
HoodieRecord record2 = new HoodieRecord(
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
rowChange2);
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
HoodieRecord record3 = new HoodieRecord(
new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
rowChange3);
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
HoodieRecord record4 = new HoodieRecord(
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()),
rowChange4);
JavaRDD<HoodieRecord> recordRDD = jsc
.parallelize(Arrays.asList(record1, record2, record3, record4));
JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4));
// Load to memory
Map<String, Iterable<String>> map = recordRDD
.mapToPair(record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey()))
.groupByKey().collectAsMap();
Map<String, Iterable<String>> map = recordRDD.mapToPair(
record -> new Tuple2<>(record.getPartitionPath(), record.getRecordKey())).groupByKey().collectAsMap();
assertEquals(map.size(), 2);
List<String> list1 = Lists.newArrayList(map.get("2016/01/31"));
List<String> list2 = Lists.newArrayList(map.get("2015/01/31"));
@@ -132,44 +134,40 @@ public class TestHoodieBloomIndex {
@Test
public void testLoadInvolvedFiles() throws IOException {
HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
.withPath(basePath)
.build();
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
HoodieBloomIndex index = new HoodieBloomIndex(config, jsc);
// Create some partitions, and put some files
// "2016/01/21": 0 file
// "2016/04/01": 1 file (2_0_20160401010101.parquet)
// "2015/03/12": 3 files (1_0_20150312101010.parquet, 3_0_20150312101010.parquet, 4_0_20150312101010.parquet)
// "2015/03/12": 3 files (1_0_20150312101010.parquet, 3_0_20150312101010.parquet,
// 4_0_20150312101010.parquet)
new File(basePath + "/2016/01/21").mkdirs();
new File(basePath + "/2016/04/01").mkdirs();
new File(basePath + "/2015/03/12").mkdirs();
TestRawTripPayload rowChange1 = new TestRawTripPayload(
"{\"_row_key\":\"000\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
HoodieRecord record1 = new HoodieRecord(
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
rowChange1);
TestRawTripPayload rowChange2 = new TestRawTripPayload(
"{\"_row_key\":\"001\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
HoodieRecord record2 = new HoodieRecord(
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
rowChange2);
TestRawTripPayload rowChange3 = new TestRawTripPayload(
"{\"_row_key\":\"002\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
HoodieRecord record3 = new HoodieRecord(
new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
rowChange3);
TestRawTripPayload rowChange4 = new TestRawTripPayload(
"{\"_row_key\":\"003\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}");
HoodieRecord record4 = new HoodieRecord(
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()),
rowChange4);
writeParquetFile("2016/04/01", "2_0_20160401010101.parquet", Lists.newArrayList(), schema, null,
writeParquetFile("2016/04/01", "2_0_20160401010101.parquet", Lists.newArrayList(), schema, null, false);
writeParquetFile("2015/03/12", "1_0_20150312101010.parquet", Lists.newArrayList(), schema, null, false);
writeParquetFile("2015/03/12", "3_0_20150312101010.parquet", Arrays.asList(record1), schema, null, false);
writeParquetFile("2015/03/12", "4_0_20150312101010.parquet", Arrays.asList(record2, record3, record4), schema, null,
false);
writeParquetFile("2015/03/12", "1_0_20150312101010.parquet", Lists.newArrayList(), schema, null,
false);
writeParquetFile("2015/03/12", "3_0_20150312101010.parquet", Arrays.asList(record1), schema,
null, false);
writeParquetFile("2015/03/12", "4_0_20150312101010.parquet",
Arrays.asList(record2, record3, record4), schema, null, false);
List<String> partitions = Arrays.asList("2016/01/21", "2016/04/01", "2015/03/12");
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
@@ -198,51 +196,32 @@ public class TestHoodieBloomIndex {
List<Tuple2<String, BloomIndexFileInfo>> expected = Arrays.asList(
new Tuple2<>("2016/04/01", new BloomIndexFileInfo("2_0_20160401010101.parquet")),
new Tuple2<>("2015/03/12", new BloomIndexFileInfo("1_0_20150312101010.parquet")),
new Tuple2<>("2015/03/12",
new BloomIndexFileInfo("3_0_20150312101010.parquet", "000", "000")),
new Tuple2<>("2015/03/12",
new BloomIndexFileInfo("4_0_20150312101010.parquet", "001", "003"))
);
new Tuple2<>("2015/03/12", new BloomIndexFileInfo("3_0_20150312101010.parquet", "000", "000")),
new Tuple2<>("2015/03/12", new BloomIndexFileInfo("4_0_20150312101010.parquet", "001", "003")));
assertEquals(expected, filesList);
}
@Test
public void testRangePruning() {
HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
.withPath(basePath)
.build();
HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
HoodieBloomIndex index = new HoodieBloomIndex(config, jsc);
final Map<String, List<BloomIndexFileInfo>> partitionToFileIndexInfo = new HashMap<>();
partitionToFileIndexInfo.put("2017/10/22", Arrays.asList(
new BloomIndexFileInfo("f1"),
new BloomIndexFileInfo("f2", "000", "000"),
new BloomIndexFileInfo("f3", "001", "003"),
new BloomIndexFileInfo("f4", "002", "007"),
new BloomIndexFileInfo("f5", "009", "010")
));
partitionToFileIndexInfo.put("2017/10/22", Arrays.asList(new BloomIndexFileInfo("f1"),
new BloomIndexFileInfo("f2", "000", "000"), new BloomIndexFileInfo("f3", "001", "003"),
new BloomIndexFileInfo("f4", "002", "007"), new BloomIndexFileInfo("f5", "009", "010")));
JavaPairRDD<String, String> partitionRecordKeyPairRDD = jsc
.parallelize(Arrays.asList(
new Tuple2<>("2017/10/22", "003"),
new Tuple2<>("2017/10/22", "002"),
new Tuple2<>("2017/10/22", "005"),
new Tuple2<>("2017/10/22", "004")
))
.mapToPair(t -> t);
JavaPairRDD<String, String> partitionRecordKeyPairRDD = jsc.parallelize(Arrays.asList(
new Tuple2<>("2017/10/22", "003"), new Tuple2<>("2017/10/22", "002"), new Tuple2<>("2017/10/22", "005"),
new Tuple2<>("2017/10/22", "004"))).mapToPair(t -> t);
List<Tuple2<String, Tuple2<String, HoodieKey>>> comparisonKeyList = index
.explodeRecordRDDWithFileComparisons(partitionToFileIndexInfo, partitionRecordKeyPairRDD)
.collect();
List<Tuple2<String, Tuple2<String, HoodieKey>>> comparisonKeyList = index.explodeRecordRDDWithFileComparisons(
partitionToFileIndexInfo, partitionRecordKeyPairRDD).collect();
assertEquals(10, comparisonKeyList.size());
Map<String, List<String>> recordKeyToFileComps = comparisonKeyList.stream()
.collect(Collectors.groupingBy(
t -> t._2()._2().getRecordKey(),
Collectors.mapping(t -> t._2()._1().split("#")[0], Collectors.toList()
)
));
Map<String, List<String>> recordKeyToFileComps = comparisonKeyList.stream().collect(Collectors.groupingBy(
t -> t._2()._2().getRecordKey(), Collectors.mapping(t -> t._2()._1().split("#")[0], Collectors.toList())));
assertEquals(4, recordKeyToFileComps.size());
assertEquals(Arrays.asList("f1", "f3", "f4"), recordKeyToFileComps.get("002"));
@@ -252,32 +231,35 @@ public class TestHoodieBloomIndex {
}
@Test
public void testCheckUUIDsAgainstOneFile()
throws IOException, InterruptedException, ClassNotFoundException {
public void testCheckUUIDsAgainstOneFile() throws IOException, InterruptedException, ClassNotFoundException {
// Create some records to use
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":32}";
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":32}";
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
HoodieRecord record1 = new HoodieRecord(
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
rowChange1);
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
HoodieRecord record2 = new HoodieRecord(
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
rowChange2);
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
HoodieRecord record3 = new HoodieRecord(
new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
rowChange3);
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
HoodieRecord record4 = new HoodieRecord(
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()),
rowChange4);
// We write record1, record2 to a parquet file, but the bloom filter contains (record1, record2, record3).
// We write record1, record2 to a parquet file, but the bloom filter contains (record1,
// record2, record3).
BloomFilter filter = new BloomFilter(10000, 0.0000001);
filter.add(record3.getRecordKey());
String filename = writeParquetFile("2016/01/31", Arrays.asList(record1, record2), schema,
filter, true);
String filename = writeParquetFile("2016/01/31", Arrays.asList(record1, record2), schema, filter, true);
// The bloom filter contains 3 records
assertTrue(filter.mightContain(record1.getRecordKey()));
@@ -286,17 +268,16 @@ public class TestHoodieBloomIndex {
assertFalse(filter.mightContain(record4.getRecordKey()));
// Compare with file
List<String> uuids = Arrays.asList(record1.getRecordKey(), record2.getRecordKey(),
record3.getRecordKey(), record4.getRecordKey());
List<String> uuids = Arrays.asList(record1.getRecordKey(), record2.getRecordKey(), record3.getRecordKey(),
record4.getRecordKey());
List<String> results = HoodieBloomIndexCheckFunction
.checkCandidatesAgainstFile(jsc.hadoopConfiguration(), uuids,
List<String> results = HoodieBloomIndexCheckFunction.checkCandidatesAgainstFile(jsc.hadoopConfiguration(), uuids,
new Path(basePath + "/2016/01/31/" + filename));
assertEquals(results.size(), 2);
assertTrue(results.get(0).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")
|| results.get(1).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0"));
assertTrue(results.get(0).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0")
|| results.get(1).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0"));
assertTrue(results.get(0).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0") || results.get(1).equals(
"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0"));
assertTrue(results.get(0).equals("2eb5b87b-1feu-4edd-87b4-6ec96dc405a0") || results.get(1).equals(
"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0"));
// TODO(vc): Need more coverage on actual filenames
//assertTrue(results.get(0)._2().equals(filename));
//assertTrue(results.get(1)._2().equals(filename));
@@ -317,8 +298,7 @@ public class TestHoodieBloomIndex {
try {
bloomIndex.tagLocation(recordRDD, table);
} catch (IllegalArgumentException e) {
fail(
"EmptyRDD should not result in IllegalArgumentException: Positive number of slices required");
fail("EmptyRDD should not result in IllegalArgumentException: Positive number of slices " + "required");
}
}
@@ -327,24 +307,27 @@ public class TestHoodieBloomIndex {
public void testTagLocation() throws Exception {
// We have some records to be tagged (two different partitions)
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
HoodieRecord record1 = new HoodieRecord(
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
rowChange1);
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
HoodieRecord record2 = new HoodieRecord(
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
rowChange2);
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
HoodieRecord record3 = new HoodieRecord(
new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3);
HoodieRecord record3 = new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
rowChange3);
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
HoodieRecord record4 = new HoodieRecord(
new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()), rowChange4);
JavaRDD<HoodieRecord> recordRDD = jsc
.parallelize(Arrays.asList(record1, record2, record3, record4));
HoodieRecord record4 = new HoodieRecord(new HoodieKey(rowChange4.getRowKey(), rowChange4.getPartitionPath()),
rowChange4);
JavaRDD<HoodieRecord> recordRDD = jsc.parallelize(Arrays.asList(record1, record2, record3, record4));
// Also create the metadata and config
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
@@ -389,10 +372,14 @@ public class TestHoodieBloomIndex {
public void testCheckExists() throws Exception {
// We have some records to be tagged (two different partitions)
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
String recordStr3 = "{\"_row_key\":\"3eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
String recordStr4 = "{\"_row_key\":\"4eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2015-01-31T03:16:41.415Z\",\"number\":32}";
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
HoodieKey key1 = new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath());
HoodieRecord record1 = new HoodieRecord(key1, rowChange1);
@@ -414,8 +401,7 @@ public class TestHoodieBloomIndex {
// Let's tag
HoodieBloomIndex bloomIndex = new HoodieBloomIndex(config, jsc);
JavaPairRDD<HoodieKey, Optional<String>> taggedRecordRDD = bloomIndex
.fetchRecordLocation(keysRDD, table);
JavaPairRDD<HoodieKey, Optional<String>> taggedRecordRDD = bloomIndex.fetchRecordLocation(keysRDD, table);
// Should not find any files
for (Tuple2<HoodieKey, Optional<String>> record : taggedRecordRDD.collect()) {
@@ -456,16 +442,18 @@ public class TestHoodieBloomIndex {
@Test
public void testBloomFilterFalseError() throws IOException, InterruptedException {
// We have two hoodie records
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
String recordStr1 = "{\"_row_key\":\"1eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
String recordStr2 = "{\"_row_key\":\"2eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
// We write record1 to a parquet file, using a bloom filter having both records
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
HoodieRecord record1 = new HoodieRecord(
new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1);
HoodieRecord record1 = new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
rowChange1);
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
HoodieRecord record2 = new HoodieRecord(
new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2);
HoodieRecord record2 = new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
rowChange2);
BloomFilter filter = new BloomFilter(10000, 0.0000001);
filter.add(record2.getRecordKey());
@@ -492,8 +480,8 @@ public class TestHoodieBloomIndex {
}
}
private String writeParquetFile(String partitionPath, List<HoodieRecord> records, Schema schema,
BloomFilter filter, boolean createCommitTime) throws IOException, InterruptedException {
private String writeParquetFile(String partitionPath, List<HoodieRecord> records, Schema schema, BloomFilter filter,
boolean createCommitTime) throws IOException, InterruptedException {
Thread.sleep(1000);
String commitTime = new SimpleDateFormat("yyyyMMddHHmmss").format(new Date());
String fileId = UUID.randomUUID().toString();
@@ -502,15 +490,14 @@ public class TestHoodieBloomIndex {
return writeParquetFile(partitionPath, filename, records, schema, filter, createCommitTime);
}
private String writeParquetFile(String partitionPath, String filename, List<HoodieRecord> records,
Schema schema,
private String writeParquetFile(String partitionPath, String filename, List<HoodieRecord> records, Schema schema,
BloomFilter filter, boolean createCommitTime) throws IOException {
if (filter == null) {
filter = new BloomFilter(10000, 0.0000001);
}
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(
new AvroSchemaConverter().convert(schema), schema, filter);
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema,
filter);
String commitTime = FSUtils.getCommitTime(filename);
HoodieParquetConfig config = new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP,
ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024,
@@ -525,9 +512,7 @@ public class TestHoodieBloomIndex {
for (HoodieRecord record : records) {
GenericRecord avroRecord = (GenericRecord) record.getData().getInsertValue(schema).get();
HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, commitTime, "" + seqId++);
HoodieAvroUtils
.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(),
filename);
HoodieAvroUtils.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(), filename);
writer.writeAvro(record.getRecordKey(), avroRecord);
filter.add(record.getRecordKey());
}
@@ -536,9 +521,7 @@ public class TestHoodieBloomIndex {
if (createCommitTime) {
// Also make sure the commit is valid
new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME).mkdirs();
new File(
basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitTime + ".commit")
.createNewFile();
new File(basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitTime + ".commit").createNewFile();
}
return filename;
}

View File

@@ -77,7 +77,8 @@ public class TestHoodieCommitArchiveLog {
public void testArchiveDatasetWithArchival() throws IOException {
HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder().withPath(basePath)
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
.withCompactionConfig(HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 4).build())
.withCompactionConfig(
HoodieCompactionConfig.newBuilder().archiveCommitsWith(2, 4).build())
.forTable("test-trip-table").build();
HoodieTestUtils.init(hadoopConf, basePath);
HoodieTestDataGenerator.createCommitFile(basePath, "100");
@@ -88,8 +89,7 @@ public class TestHoodieCommitArchiveLog {
HoodieTestDataGenerator.createCommitFile(basePath, "105");
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(fs.getConf(), basePath);
HoodieTimeline timeline =
metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
assertEquals("Loaded 6 commits and the count should match", 6, timeline.countInstants());
@@ -103,8 +103,7 @@ public class TestHoodieCommitArchiveLog {
HoodieTestUtils.createInflightCleanFiles(basePath, "106", "107");
//reload the timeline and get all the commmits before archive
timeline = metaClient.getActiveTimeline().reload().getAllCommitsTimeline()
.filterCompletedInstants();
timeline = metaClient.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants();
List<HoodieInstant> originalCommits = timeline.getInstants().collect(Collectors.toList());
assertEquals("Loaded 6 commits and the count should match", 12, timeline.countInstants());
@@ -118,13 +117,12 @@ public class TestHoodieCommitArchiveLog {
assertTrue(archiveLog.archiveIfRequired());
//reload the timeline and remove the remaining commits
timeline = metaClient.getActiveTimeline().reload().getAllCommitsTimeline()
.filterCompletedInstants();
timeline = metaClient.getActiveTimeline().reload().getAllCommitsTimeline().filterCompletedInstants();
originalCommits.removeAll(timeline.getInstants().collect(Collectors.toList()));
//read the file
HoodieLogFormat.Reader reader = HoodieLogFormat
.newReader(fs, new HoodieLogFile(new Path(basePath + "/.hoodie/.commits_.archive.1")),
HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(fs,
new HoodieLogFile(new Path(basePath + "/.hoodie/.commits_.archive.1")),
HoodieArchivedMetaEntry.getClassSchema());
int archivedRecordsCount = 0;
@@ -137,8 +135,7 @@ public class TestHoodieCommitArchiveLog {
assertEquals("Archived and read records for each block are same", 8, records.size());
archivedRecordsCount += records.size();
}
assertEquals("Total archived records and total read records are the same count", 8,
archivedRecordsCount);
assertEquals("Total archived records and total read records are the same count", 8, archivedRecordsCount);
//make sure the archived commits are the same as the (originalcommits - commitsleft)
List<String> readCommits = readRecords.stream().map(r -> (GenericRecord) r).map(r -> {
@@ -146,10 +143,8 @@ public class TestHoodieCommitArchiveLog {
}).collect(Collectors.toList());
Collections.sort(readCommits);
assertEquals(
"Read commits map should match the originalCommits - commitsLoadedFromArchival",
originalCommits.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList()),
readCommits);
assertEquals("Read commits map should match the originalCommits - commitsLoadedFromArchival",
originalCommits.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList()), readCommits);
// verify in-flight instants after archive
verifyInflightInstants(metaClient, 3);
@@ -168,15 +163,12 @@ public class TestHoodieCommitArchiveLog {
HoodieTestDataGenerator.createCommitFile(basePath, "102");
HoodieTestDataGenerator.createCommitFile(basePath, "103");
HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline()
.filterCompletedInstants();
HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
assertEquals("Loaded 4 commits and the count should match", 4, timeline.countInstants());
boolean result = archiveLog.archiveIfRequired();
assertTrue(result);
timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline()
.filterCompletedInstants();
assertEquals("Should not archive commits when maxCommitsToKeep is 5", 4,
timeline.countInstants());
timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants();
assertEquals("Should not archive commits when maxCommitsToKeep is 5", 4, timeline.countInstants());
}
@Test
@@ -194,21 +186,15 @@ public class TestHoodieCommitArchiveLog {
HoodieTestDataGenerator.createCommitFile(basePath, "104");
HoodieTestDataGenerator.createCommitFile(basePath, "105");
HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline()
.filterCompletedInstants();
HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
assertEquals("Loaded 6 commits and the count should match", 6, timeline.countInstants());
boolean result = archiveLog.archiveIfRequired();
assertTrue(result);
timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline()
.filterCompletedInstants();
assertTrue("Archived commits should always be safe",
timeline.containsOrBeforeTimelineStarts("100"));
assertTrue("Archived commits should always be safe",
timeline.containsOrBeforeTimelineStarts("101"));
assertTrue("Archived commits should always be safe",
timeline.containsOrBeforeTimelineStarts("102"));
assertTrue("Archived commits should always be safe",
timeline.containsOrBeforeTimelineStarts("103"));
timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants();
assertTrue("Archived commits should always be safe", timeline.containsOrBeforeTimelineStarts("100"));
assertTrue("Archived commits should always be safe", timeline.containsOrBeforeTimelineStarts("101"));
assertTrue("Archived commits should always be safe", timeline.containsOrBeforeTimelineStarts("102"));
assertTrue("Archived commits should always be safe", timeline.containsOrBeforeTimelineStarts("103"));
}
@Test
@@ -227,16 +213,14 @@ public class TestHoodieCommitArchiveLog {
HoodieTestDataGenerator.createCommitFile(basePath, "104");
HoodieTestDataGenerator.createCommitFile(basePath, "105");
HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline()
.filterCompletedInstants();
HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
assertEquals("Loaded 6 commits and the count should match", 6, timeline.countInstants());
boolean result = archiveLog.archiveIfRequired();
assertTrue(result);
timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline()
.filterCompletedInstants();
timeline = metaClient.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants();
assertEquals(
"Since we have a savepoint at 101, we should never archive any commit after 101 (we only archive 100)",
5, timeline.countInstants());
"Since we have a savepoint at 101, we should never archive any commit after 101 (we only " + "archive 100)", 5,
timeline.countInstants());
assertTrue("Archived commits should always be safe",
timeline.containsInstant(new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, "101")));
assertTrue("Archived commits should always be safe",
@@ -248,7 +232,7 @@ public class TestHoodieCommitArchiveLog {
private void verifyInflightInstants(HoodieTableMetaClient metaClient, int expectedTotalInstants) {
HoodieTimeline timeline = metaClient.getActiveTimeline().reload()
.getTimelineOfActions(Sets.newHashSet(HoodieTimeline.CLEAN_ACTION)).filterInflights();
assertEquals("Loaded inflight clean actions and the count should match",
expectedTotalInstants, timeline.countInstants());
assertEquals("Loaded inflight clean actions and the count should match", expectedTotalInstants,
timeline.countInstants());
}
}

View File

@@ -93,32 +93,27 @@ public class TestHoodieCompactor {
}
private HoodieWriteConfig.Builder getConfigBuilder() {
return HoodieWriteConfig.newBuilder().withPath(basePath)
.withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
.withCompactionConfig(
HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024)
.withInlineCompaction(false).build())
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build())
.forTable("test-trip-table").withIndexConfig(
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build());
return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)
.withParallelism(2, 2).withCompactionConfig(
HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024).withInlineCompaction(false)
.build()).withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024).build())
.forTable("test-trip-table")
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build());
}
@Test(expected = IllegalArgumentException.class)
public void testCompactionOnCopyOnWriteFail() throws Exception {
HoodieTestUtils.initTableType(hadoopConf, basePath, HoodieTableType.COPY_ON_WRITE);
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
basePath);
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
HoodieTable table = HoodieTable.getHoodieTable(metaClient, getConfig());
compactor.compact(jsc, getConfig(), table, HoodieActiveTimeline.createNewCommitTime());
}
@Test
public void testCompactionEmpty() throws Exception {
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
basePath);
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
HoodieWriteConfig config = getConfig();
HoodieTable table = HoodieTable
.getHoodieTable(metaClient, config);
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);
HoodieWriteClient writeClient = new HoodieWriteClient(jsc, config);
String newCommitTime = writeClient.startCommit();
@@ -126,10 +121,9 @@ public class TestHoodieCompactor {
JavaRDD<HoodieRecord> recordsRDD = jsc.parallelize(records, 1);
writeClient.insert(recordsRDD, newCommitTime).collect();
JavaRDD<WriteStatus> result =
compactor.compact(jsc, getConfig(), table, HoodieActiveTimeline.createNewCommitTime());
assertTrue("If there is nothing to compact, result will be empty",
result.isEmpty());
JavaRDD<WriteStatus> result = compactor
.compact(jsc, getConfig(), table, HoodieActiveTimeline.createNewCommitTime());
assertTrue("If there is nothing to compact, result will be empty", result.isEmpty());
}
@Test
@@ -145,8 +139,7 @@ public class TestHoodieCompactor {
List<WriteStatus> statuses = writeClient.insert(recordsRDD, newCommitTime).collect();
// Update all the 100 records
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
basePath);
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);
newCommitTime = "101";
@@ -159,19 +152,16 @@ public class TestHoodieCompactor {
// Write them to corresponding avro logfiles
HoodieTestUtils
.writeRecordsToLogFiles(fs, metaClient.getBasePath(), HoodieTestDataGenerator.avroSchema,
updatedRecords);
.writeRecordsToLogFiles(fs, metaClient.getBasePath(), HoodieTestDataGenerator.avroSchema, updatedRecords);
// Verify that all data file has one log file
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
table = HoodieTable.getHoodieTable(metaClient, config);
for (String partitionPath : dataGen.getPartitionPaths()) {
List<FileSlice> groupedLogFiles =
table.getRTFileSystemView().getLatestFileSlices(partitionPath)
.collect(Collectors.toList());
List<FileSlice> groupedLogFiles = table.getRTFileSystemView().getLatestFileSlices(partitionPath)
.collect(Collectors.toList());
for (FileSlice fileSlice : groupedLogFiles) {
assertEquals("There should be 1 log file written for every data file", 1,
fileSlice.getLogFiles().count());
assertEquals("There should be 1 log file written for every data file", 1, fileSlice.getLogFiles().count());
}
}
@@ -179,18 +169,19 @@ public class TestHoodieCompactor {
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
table = HoodieTable.getHoodieTable(metaClient, config);
JavaRDD<WriteStatus> result =
compactor.compact(jsc, getConfig(), table, HoodieActiveTimeline.createNewCommitTime());
JavaRDD<WriteStatus> result = compactor
.compact(jsc, getConfig(), table, HoodieActiveTimeline.createNewCommitTime());
// Verify that all partition paths are present in the WriteStatus result
for (String partitionPath : dataGen.getPartitionPaths()) {
List<WriteStatus> writeStatuses = result.collect();
assertTrue(writeStatuses.stream()
.filter(writeStatus -> writeStatus.getStat().getPartitionPath()
.contentEquals(partitionPath)).count() > 0);
.filter(writeStatus -> writeStatus.getStat().getPartitionPath().contentEquals(partitionPath))
.count() > 0);
}
}
// TODO - after modifying HoodieReadClient to support realtime tables - add more tests to make sure the data read is the updated data (compaction correctness)
// TODO - after modifying HoodieReadClient to support realtime tables - add more tests to make
// sure the data read is the updated data (compaction correctness)
// TODO - add more test cases for compactions after a failed commit/compaction
}

View File

@@ -16,6 +16,9 @@
package com.uber.hoodie.io.strategy;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import com.beust.jcommander.internal.Lists;
import com.google.common.collect.Maps;
import com.uber.hoodie.config.HoodieCompactionConfig;
@@ -25,20 +28,16 @@ import com.uber.hoodie.io.compact.strategy.BoundedIOCompactionStrategy;
import com.uber.hoodie.io.compact.strategy.DayBasedCompactionStrategy;
import com.uber.hoodie.io.compact.strategy.LogFileSizeBasedCompactionStrategy;
import com.uber.hoodie.io.compact.strategy.UnBoundedCompactionStrategy;
import org.junit.Test;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.stream.Collectors;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import org.junit.Test;
public class TestHoodieCompactionStrategy {
private static final long MB = 1024 * 1024L;
private String [] partitionPaths = {"2017/01/01", "2017/01/02", "2017/01/03"};
private String[] partitionPaths = {"2017/01/01", "2017/01/02", "2017/01/03"};
@Test
public void testUnBounded() {
@@ -48,9 +47,8 @@ public class TestHoodieCompactionStrategy {
sizesMap.put(100 * MB, Lists.newArrayList(MB));
sizesMap.put(90 * MB, Lists.newArrayList(1024 * MB));
UnBoundedCompactionStrategy strategy = new UnBoundedCompactionStrategy();
HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp")
.withCompactionConfig(
HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy).build()).build();
HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp").withCompactionConfig(
HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy).build()).build();
List<CompactionOperation> operations = createCompactionOperations(writeConfig, sizesMap);
List<CompactionOperation> returned = strategy.orderAndFilter(writeConfig, operations);
assertEquals("UnBounded should not re-order or filter", operations, returned);
@@ -64,23 +62,19 @@ public class TestHoodieCompactionStrategy {
sizesMap.put(100 * MB, Lists.newArrayList(MB));
sizesMap.put(90 * MB, Lists.newArrayList(1024 * MB));
BoundedIOCompactionStrategy strategy = new BoundedIOCompactionStrategy();
HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp")
.withCompactionConfig(
HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy)
.withTargetIOPerCompactionInMB(400).build()).build();
HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp").withCompactionConfig(
HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy).withTargetIOPerCompactionInMB(400).build())
.build();
List<CompactionOperation> operations = createCompactionOperations(writeConfig, sizesMap);
List<CompactionOperation> returned = strategy.orderAndFilter(writeConfig, operations);
assertTrue("BoundedIOCompaction should have resulted in fewer compactions",
returned.size() < operations.size());
assertEquals("BoundedIOCompaction should have resulted in 2 compactions being chosen",
2, returned.size());
assertTrue("BoundedIOCompaction should have resulted in fewer compactions", returned.size() < operations.size());
assertEquals("BoundedIOCompaction should have resulted in 2 compactions being chosen", 2, returned.size());
// Total size of all the log files
Long returnedSize = returned.stream()
.map(s -> s.getMetrics().get(BoundedIOCompactionStrategy.TOTAL_IO_MB)).map(s -> (Long) s)
.reduce((size1, size2) -> size1 + size2).orElse(0L);
assertEquals("Should chose the first 2 compactions which should result in a total IO of 690 MB",
610, (long) returnedSize);
Long returnedSize = returned.stream().map(s -> s.getMetrics().get(BoundedIOCompactionStrategy.TOTAL_IO_MB))
.map(s -> (Long) s).reduce((size1, size2) -> size1 + size2).orElse(0L);
assertEquals("Should chose the first 2 compactions which should result in a total IO of 690 MB", 610,
(long) returnedSize);
}
@Test
@@ -91,23 +85,20 @@ public class TestHoodieCompactionStrategy {
sizesMap.put(100 * MB, Lists.newArrayList(MB));
sizesMap.put(90 * MB, Lists.newArrayList(1024 * MB));
LogFileSizeBasedCompactionStrategy strategy = new LogFileSizeBasedCompactionStrategy();
HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp")
.withCompactionConfig(
HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy)
.withTargetIOPerCompactionInMB(400).build()).build();
HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp").withCompactionConfig(
HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy).withTargetIOPerCompactionInMB(400).build())
.build();
List<CompactionOperation> operations = createCompactionOperations(writeConfig, sizesMap);
List<CompactionOperation> returned = strategy.orderAndFilter(writeConfig, operations);
assertTrue("LogFileSizeBasedCompactionStrategy should have resulted in fewer compactions",
returned.size() < operations.size());
assertEquals("LogFileSizeBasedCompactionStrategy should have resulted in 1 compaction",
1, returned.size());
assertEquals("LogFileSizeBasedCompactionStrategy should have resulted in 1 compaction", 1, returned.size());
// Total size of all the log files
Long returnedSize = returned.stream()
.map(s -> s.getMetrics().get(BoundedIOCompactionStrategy.TOTAL_IO_MB)).map(s -> (Long) s)
.reduce((size1, size2) -> size1 + size2).orElse(0L);
assertEquals("Should chose the first 2 compactions which should result in a total IO of 690 MB",
1204, (long) returnedSize);
Long returnedSize = returned.stream().map(s -> s.getMetrics().get(BoundedIOCompactionStrategy.TOTAL_IO_MB))
.map(s -> (Long) s).reduce((size1, size2) -> size1 + size2).orElse(0L);
assertEquals("Should chose the first 2 compactions which should result in a total IO of 690 MB", 1204,
(long) returnedSize);
}
@Test
@@ -118,20 +109,18 @@ public class TestHoodieCompactionStrategy {
sizesMap.put(100 * MB, Lists.newArrayList(MB));
sizesMap.put(90 * MB, Lists.newArrayList(1024 * MB));
DayBasedCompactionStrategy strategy = new DayBasedCompactionStrategy();
HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp")
.withCompactionConfig(
HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy)
.withTargetIOPerCompactionInMB(400).build()).build();
HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp").withCompactionConfig(
HoodieCompactionConfig.newBuilder().withCompactionStrategy(strategy).withTargetIOPerCompactionInMB(400).build())
.build();
List<CompactionOperation> operations = createCompactionOperations(writeConfig, sizesMap);
List<CompactionOperation> returned = strategy.orderAndFilter(writeConfig, operations);
assertTrue("DayBasedCompactionStrategy should have resulted in fewer compactions",
returned.size() < operations.size());
returned.size() < operations.size());
int comparision = strategy.getComparator().compare(returned.get(returned.size() - 1), returned.get(0));
// Either the partition paths are sorted in descending order or they are equal
assertTrue("DayBasedCompactionStrategy should sort partitions in descending order",
comparision >= 0);
assertTrue("DayBasedCompactionStrategy should sort partitions in descending order", comparision >= 0);
}
private List<CompactionOperation> createCompactionOperations(HoodieWriteConfig config,
@@ -139,9 +128,8 @@ public class TestHoodieCompactionStrategy {
List<CompactionOperation> operations = Lists.newArrayList(sizesMap.size());
sizesMap.forEach((k, v) -> {
operations.add(new CompactionOperation(TestHoodieDataFile.newDataFile(k),
partitionPaths[new Random().nextInt(partitionPaths.length - 1)],
v.stream().map(TestHoodieLogFile::newLogFile).collect(
Collectors.toList()), config));
partitionPaths[new Random().nextInt(partitionPaths.length - 1)],
v.stream().map(TestHoodieLogFile::newLogFile).collect(Collectors.toList()), config));
});
return operations;
}

View File

@@ -28,6 +28,10 @@ public class TestHoodieDataFile extends HoodieDataFile {
this.size = size;
}
public static HoodieDataFile newDataFile(long size) {
return new TestHoodieDataFile(size);
}
@Override
public String getPath() {
return "/tmp/test";
@@ -43,13 +47,8 @@ public class TestHoodieDataFile extends HoodieDataFile {
return "100";
}
@Override
public long getFileSize() {
return size;
}
public static HoodieDataFile newDataFile(long size) {
return new TestHoodieDataFile(size);
}
}

View File

@@ -29,6 +29,10 @@ public class TestHoodieLogFile extends HoodieLogFile {
this.size = size;
}
public static HoodieLogFile newLogFile(long size) {
return new TestHoodieLogFile(size);
}
@Override
public Path getPath() {
return new Path("/tmp/test-log");
@@ -38,8 +42,4 @@ public class TestHoodieLogFile extends HoodieLogFile {
public Optional<Long> getFileSize() {
return Optional.of(size);
}
public static HoodieLogFile newLogFile(long size) {
return new TestHoodieLogFile(size);
}
}

View File

@@ -40,7 +40,6 @@ public class TestHoodieMetrics {
@Test
public void testRegisterGauge() {
metrics.registerGauge("metric1", 123L);
assertTrue(Metrics.getInstance().getRegistry().getGauges().get("metric1").getValue().toString()
.equals("123"));
assertTrue(Metrics.getInstance().getRegistry().getGauges().get("metric1").getValue().toString().equals("123"));
}
}

View File

@@ -89,14 +89,13 @@ public class TestCopyOnWriteTable {
String commitTime = HoodieTestUtils.makeNewCommitTime();
HoodieWriteConfig config = makeHoodieClientConfig();
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
basePath);
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);
HoodieCreateHandle io = new HoodieCreateHandle(config, commitTime, table, partitionPath);
Path newPath = io.makeNewPath(record.getPartitionPath(), unitNumber, fileName);
assertTrue(newPath.toString().equals(this.basePath + "/" + partitionPath + "/" + FSUtils
.makeDataFileName(commitTime, unitNumber, fileName)));
assertTrue(newPath.toString().equals(
this.basePath + "/" + partitionPath + "/" + FSUtils.makeDataFileName(commitTime, unitNumber, fileName)));
}
private HoodieWriteConfig makeHoodieClientConfig() throws Exception {
@@ -105,8 +104,7 @@ public class TestCopyOnWriteTable {
private HoodieWriteConfig.Builder makeHoodieClientConfigBuilder() throws Exception {
// Prepare the AvroParquetIO
String schemaStr = IOUtils
.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8");
String schemaStr = IOUtils.toString(getClass().getResourceAsStream("/exampleSchema.txt"), "UTF-8");
return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(schemaStr);
}
@@ -122,28 +120,27 @@ public class TestCopyOnWriteTable {
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
// Get some records belong to the same partition (2016/01/31)
String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
String recordStr4 = "{\"_row_key\":\"8eb5b87d-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":51}";
String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
String recordStr4 = "{\"_row_key\":\"8eb5b87d-1fej-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":51}";
List<HoodieRecord> records = new ArrayList<>();
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
records.add(
new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
rowChange1));
records.add(new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
records.add(
new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
rowChange2));
records.add(new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
records.add(
new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
rowChange3));
records.add(new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));
// Insert new records
HoodieClientTestUtils.collectStatuses(table.handleInsert(firstCommitTime, records.iterator()));
// We should have a parquet file generated (TODO: better control # files after we revise AvroParquetIO)
// We should have a parquet file generated (TODO: better control # files after we revise
// AvroParquetIO)
File parquetFile = null;
for (File file : new File(this.basePath + partitionPath).listFiles()) {
if (file.getName().endsWith(".parquet")) {
@@ -155,18 +152,17 @@ public class TestCopyOnWriteTable {
// Read out the bloom filter and make sure filter can answer record exist or not
Path parquetFilePath = new Path(parquetFile.getAbsolutePath());
BloomFilter filter = ParquetUtils
.readBloomFilterFromParquetMetadata(jsc.hadoopConfiguration(), parquetFilePath);
BloomFilter filter = ParquetUtils.readBloomFilterFromParquetMetadata(jsc.hadoopConfiguration(), parquetFilePath);
for (HoodieRecord record : records) {
assertTrue(filter.mightContain(record.getRecordKey()));
}
// Create a commit file
new File(this.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/"
+ FSUtils.getCommitTime(parquetFile.getName()) + ".commit").createNewFile();
new File(
this.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + FSUtils.getCommitTime(parquetFile.getName())
+ ".commit").createNewFile();
// Read the parquet file, check the record content
List<GenericRecord> fileRecords = ParquetUtils
.readAvroRecords(jsc.hadoopConfiguration(), parquetFilePath);
List<GenericRecord> fileRecords = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), parquetFilePath);
GenericRecord newRecord;
int index = 0;
for (GenericRecord record : fileRecords) {
@@ -175,13 +171,12 @@ public class TestCopyOnWriteTable {
}
// We update the 1st record & add a new record
String updateRecordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
String updateRecordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
TestRawTripPayload updateRowChanges1 = new TestRawTripPayload(updateRecordStr1);
HoodieRecord updatedRecord1 = new HoodieRecord(
new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()),
updateRowChanges1);
updatedRecord1.setCurrentLocation(
new HoodieRecordLocation(null, FSUtils.getFileId(parquetFile.getName())));
new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()), updateRowChanges1);
updatedRecord1.setCurrentLocation(new HoodieRecordLocation(null, FSUtils.getFileId(parquetFile.getName())));
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
HoodieRecord insertedRecord1 = new HoodieRecord(
@@ -201,10 +196,9 @@ public class TestCopyOnWriteTable {
File updatedParquetFile = null;
for (File file : new File(basePath + "/2016/01/31").listFiles()) {
if (file.getName().endsWith(".parquet")) {
if (FSUtils.getFileId(file.getName())
.equals(FSUtils.getFileId(parquetFile.getName())) &&
HoodieTimeline.compareTimestamps(FSUtils.getCommitTime(file.getName()),
FSUtils.getCommitTime(parquetFile.getName()), HoodieTimeline.GREATER)) {
if (FSUtils.getFileId(file.getName()).equals(FSUtils.getFileId(parquetFile.getName()))
&& HoodieTimeline.compareTimestamps(FSUtils.getCommitTime(file.getName()),
FSUtils.getCommitTime(parquetFile.getName()), HoodieTimeline.GREATER)) {
updatedParquetFile = file;
break;
}
@@ -213,8 +207,8 @@ public class TestCopyOnWriteTable {
assertTrue(updatedParquetFile != null);
// Check whether the record has been updated
Path updatedParquetFilePath = new Path(updatedParquetFile.getAbsolutePath());
BloomFilter updatedFilter = ParquetUtils
.readBloomFilterFromParquetMetadata(jsc.hadoopConfiguration(), updatedParquetFilePath);
BloomFilter updatedFilter = ParquetUtils.readBloomFilterFromParquetMetadata(jsc.hadoopConfiguration(),
updatedParquetFilePath);
for (HoodieRecord record : records) {
// No change to the _row_key
assertTrue(updatedFilter.mightContain(record.getRecordKey()));
@@ -223,8 +217,7 @@ public class TestCopyOnWriteTable {
assertTrue(updatedFilter.mightContain(insertedRecord1.getRecordKey()));
records.add(insertedRecord1);// add this so it can further check below
ParquetReader updatedReader = ParquetReader
.builder(new AvroReadSupport<>(), updatedParquetFilePath).build();
ParquetReader updatedReader = ParquetReader.builder(new AvroReadSupport<>(), updatedParquetFilePath).build();
index = 0;
while ((newRecord = (GenericRecord) updatedReader.read()) != null) {
assertTrue(newRecord.get("_row_key").toString().equals(records.get(index).getRecordKey()));
@@ -246,13 +239,9 @@ public class TestCopyOnWriteTable {
List<HoodieRecord> records = new ArrayList<>();
for (int i = 0; i < n; i++) {
String recordStr = String.format("{\"_row_key\":\"%s\",\"time\":\"%s\",\"number\":%d}",
UUID.randomUUID().toString(),
time,
i);
UUID.randomUUID().toString(), time, i);
TestRawTripPayload rowChange = new TestRawTripPayload(recordStr);
records.add(new HoodieRecord(
new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()),
rowChange));
records.add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange));
}
return records;
}
@@ -261,31 +250,28 @@ public class TestCopyOnWriteTable {
@Test
public void testMetadataAggregateFromWriteStatus() throws Exception {
// Prepare the AvroParquetIO
HoodieWriteConfig config = makeHoodieClientConfigBuilder()
.withWriteStatusClass(MetadataMergeWriteStatus.class).build();
HoodieWriteConfig config = makeHoodieClientConfigBuilder().withWriteStatusClass(MetadataMergeWriteStatus.class)
.build();
String firstCommitTime = HoodieTestUtils.makeNewCommitTime();
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
// Get some records belong to the same partition (2016/01/31)
String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
String recordStr1 = "{\"_row_key\":\"8eb5b87a-1feh-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":12}";
String recordStr2 = "{\"_row_key\":\"8eb5b87b-1feu-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:20:41.415Z\",\"number\":100}";
String recordStr3 = "{\"_row_key\":\"8eb5b87c-1fej-4edd-87b4-6ec96dc405a0\","
+ "\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":15}";
List<HoodieRecord> records = new ArrayList<>();
TestRawTripPayload rowChange1 = new TestRawTripPayload(recordStr1);
records.add(
new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()),
rowChange1));
records.add(new HoodieRecord(new HoodieKey(rowChange1.getRowKey(), rowChange1.getPartitionPath()), rowChange1));
TestRawTripPayload rowChange2 = new TestRawTripPayload(recordStr2);
records.add(
new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()),
rowChange2));
records.add(new HoodieRecord(new HoodieKey(rowChange2.getRowKey(), rowChange2.getPartitionPath()), rowChange2));
TestRawTripPayload rowChange3 = new TestRawTripPayload(recordStr3);
records.add(
new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()),
rowChange3));
records.add(new HoodieRecord(new HoodieKey(rowChange3.getRowKey(), rowChange3.getPartitionPath()), rowChange3));
// Insert new records
List<WriteStatus> writeStatuses = HoodieClientTestUtils
@@ -293,7 +279,8 @@ public class TestCopyOnWriteTable {
Map<String, String> allWriteStatusMergedMetadataMap = MetadataMergeWriteStatus
.mergeMetadataForWriteStatuses(writeStatuses);
assertTrue(allWriteStatusMergedMetadataMap.containsKey("InputRecordCount_1506582000"));
// For metadata key InputRecordCount_1506582000, value is 2 for each record. So sum of this should be 2 * 3
// For metadata key InputRecordCount_1506582000, value is 2 for each record. So sum of this
// should be 2 * 3
assertEquals("6", allWriteStatusMergedMetadataMap.get("InputRecordCount_1506582000"));
}
@@ -314,26 +301,19 @@ public class TestCopyOnWriteTable {
List<WriteStatus> statuses = HoodieClientTestUtils
.collectStatuses(table.handleInsert(commitTime, records.iterator()));
WriteStatus status = statuses.get(0);
Path partialFile = new Path(String.format("%s/%s/%s",
basePath,
status.getPartitionPath(),
FSUtils.makeDataFileName(commitTime, 0, status.getFileId()))
);
Path partialFile = new Path(String.format("%s/%s/%s", basePath, status.getPartitionPath(),
FSUtils.makeDataFileName(commitTime, 0, status.getFileId())));
assertTrue(fs.exists(partialFile));
// When we retry
records = newHoodieRecords(10, "2016-01-31T03:16:41.415Z");
records.addAll(newHoodieRecords(1, "2016-02-01T03:16:41.415Z"));
statuses = HoodieClientTestUtils
.collectStatuses(table.handleInsert(commitTime, records.iterator()));
statuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator()));
status = statuses.get(0);
Path retriedFIle = new Path(String.format("%s/%s/%s",
basePath,
status.getPartitionPath(),
FSUtils.makeDataFileName(commitTime, 0, status.getFileId()))
);
Path retriedFIle = new Path(String.format("%s/%s/%s", basePath, status.getPartitionPath(),
FSUtils.makeDataFileName(commitTime, 0, status.getFileId())));
assertTrue(fs.exists(retriedFIle));
assertFalse(fs.exists(partialFile));
}
@@ -371,8 +351,7 @@ public class TestCopyOnWriteTable {
records.addAll(newHoodieRecords(1, "2016-02-02T03:16:41.415Z"));
// Insert new records
returnedStatuses = HoodieClientTestUtils
.collectStatuses(table.handleInsert(commitTime, records.iterator()));
returnedStatuses = HoodieClientTestUtils.collectStatuses(table.handleInsert(commitTime, records.iterator()));
assertEquals(3, returnedStatuses.size());
assertEquals("2016/01/31", returnedStatuses.get(0).getPartitionPath());
@@ -389,8 +368,8 @@ public class TestCopyOnWriteTable {
@Test
public void testFileSizeUpsertRecords() throws Exception {
HoodieWriteConfig config = makeHoodieClientConfigBuilder().withStorageConfig(
HoodieStorageConfig.newBuilder().limitFileSize(64 * 1024).parquetBlockSize(64 * 1024)
.parquetPageSize(64 * 1024).build()).build();
HoodieStorageConfig.newBuilder().limitFileSize(64 * 1024).parquetBlockSize(64 * 1024).parquetPageSize(64 * 1024)
.build()).build();
String commitTime = HoodieTestUtils.makeNewCommitTime();
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
@@ -398,12 +377,11 @@ public class TestCopyOnWriteTable {
List<HoodieRecord> records = new ArrayList<>();
// Approx 1150 records are written for block size of 64KB
for (int i = 0; i < 2000; i++) {
String recordStr = "{\"_row_key\":\"" + UUID.randomUUID().toString()
+ "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i + "}";
String recordStr =
"{\"_row_key\":\"" + UUID.randomUUID().toString() + "\",\"time\":\"2016-01-31T03:16:41.415Z\",\"number\":" + i
+ "}";
TestRawTripPayload rowChange = new TestRawTripPayload(recordStr);
records
.add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()),
rowChange));
records.add(new HoodieRecord(new HoodieKey(rowChange.getRowKey(), rowChange.getPartitionPath()), rowChange));
}
// Insert new records
@@ -412,39 +390,30 @@ public class TestCopyOnWriteTable {
// Check the updated file
int counts = 0;
for (File file : new File(basePath + "/2016/01/31").listFiles()) {
if (file.getName().endsWith(".parquet") && FSUtils.getCommitTime(file.getName())
.equals(commitTime)) {
if (file.getName().endsWith(".parquet") && FSUtils.getCommitTime(file.getName()).equals(commitTime)) {
System.out.println(file.getName() + "-" + file.length());
counts++;
}
}
assertEquals(
"If the number of records are more than 1150, then there should be a new file", 3,
counts);
assertEquals("If the number of records are more than 1150, then there should be a new file", 3, counts);
}
private List<HoodieCopyOnWriteTable.InsertBucket> testUpsertPartitioner(int smallFileSize,
int numInserts,
int numUpdates,
int fileSize,
boolean autoSplitInserts) throws Exception {
final String TEST_PARTITION_PATH = "2016/09/26";
HoodieWriteConfig config = makeHoodieClientConfigBuilder()
.withCompactionConfig(HoodieCompactionConfig.newBuilder()
.compactionSmallFileSize(smallFileSize).insertSplitSize(100)
.autoTuneInsertSplits(autoSplitInserts).build())
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1000 * 1024).build())
.build();
private List<HoodieCopyOnWriteTable.InsertBucket> testUpsertPartitioner(int smallFileSize, int numInserts,
int numUpdates, int fileSize, boolean autoSplitInserts) throws Exception {
final String testPartitionPath = "2016/09/26";
HoodieWriteConfig config = makeHoodieClientConfigBuilder().withCompactionConfig(
HoodieCompactionConfig.newBuilder().compactionSmallFileSize(smallFileSize).insertSplitSize(100)
.autoTuneInsertSplits(autoSplitInserts).build()).withStorageConfig(
HoodieStorageConfig.newBuilder().limitFileSize(1000 * 1024).build()).build();
HoodieClientTestUtils.fakeCommitFile(basePath, "001");
HoodieClientTestUtils.fakeDataFile(basePath, TEST_PARTITION_PATH, "001", "file1", fileSize);
HoodieClientTestUtils.fakeDataFile(basePath, testPartitionPath, "001", "file1", fileSize);
HoodieTableMetaClient metadata = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
HoodieCopyOnWriteTable table = new HoodieCopyOnWriteTable(config, metadata);
HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(
new String[]{TEST_PARTITION_PATH});
HoodieTestDataGenerator dataGenerator = new HoodieTestDataGenerator(new String[] {testPartitionPath});
List<HoodieRecord> insertRecords = dataGenerator.generateInserts("001", numInserts);
List<HoodieRecord> updateRecords = dataGenerator.generateUpdates("001", numUpdates);
for (HoodieRecord updateRec : updateRecords) {
@@ -454,8 +423,8 @@ public class TestCopyOnWriteTable {
records.addAll(insertRecords);
records.addAll(updateRecords);
WorkloadProfile profile = new WorkloadProfile(jsc.parallelize(records));
HoodieCopyOnWriteTable.UpsertPartitioner partitioner = (HoodieCopyOnWriteTable.UpsertPartitioner)
table.getUpsertPartitioner(profile);
HoodieCopyOnWriteTable.UpsertPartitioner partitioner =
(HoodieCopyOnWriteTable.UpsertPartitioner) table.getUpsertPartitioner(profile);
assertEquals("Should have 3 partitions", 3, partitioner.numPartitions());
assertEquals("Bucket 0 is UPDATE", HoodieCopyOnWriteTable.BucketType.UPDATE,
@@ -464,40 +433,35 @@ public class TestCopyOnWriteTable {
partitioner.getBucketInfo(1).bucketType);
assertEquals("Bucket 2 is INSERT", HoodieCopyOnWriteTable.BucketType.INSERT,
partitioner.getBucketInfo(2).bucketType);
assertEquals("Update record should have gone to the 1 update partiton", 0,
partitioner.getPartition(new Tuple2<>(updateRecords.get(0).getKey(),
Option.apply(updateRecords.get(0).getCurrentLocation()))));
return partitioner.getInsertBuckets(TEST_PARTITION_PATH);
assertEquals("Update record should have gone to the 1 update partiton", 0, partitioner.getPartition(
new Tuple2<>(updateRecords.get(0).getKey(), Option.apply(updateRecords.get(0).getCurrentLocation()))));
return partitioner.getInsertBuckets(testPartitionPath);
}
@Test
public void testUpsertPartitioner() throws Exception {
// Inserts + Updates... Check all updates go together & inserts subsplit
List<HoodieCopyOnWriteTable.InsertBucket> insertBuckets = testUpsertPartitioner(0, 200, 100,
1024, false);
List<HoodieCopyOnWriteTable.InsertBucket> insertBuckets = testUpsertPartitioner(0, 200, 100, 1024, false);
assertEquals("Total of 2 insert buckets", 2, insertBuckets.size());
}
@Test
public void testUpsertPartitionerWithSmallInsertHandling() throws Exception {
// Inserts + Updates .. Check updates go together & inserts subsplit, after expanding smallest file
List<HoodieCopyOnWriteTable.InsertBucket> insertBuckets = testUpsertPartitioner(1000 * 1024,
400, 100, 800 * 1024, false);
// Inserts + Updates .. Check updates go together & inserts subsplit, after expanding
// smallest file
List<HoodieCopyOnWriteTable.InsertBucket> insertBuckets = testUpsertPartitioner(1000 * 1024, 400, 100, 800 * 1024,
false);
assertEquals("Total of 3 insert buckets", 3, insertBuckets.size());
assertEquals("First insert bucket must be same as update bucket", 0,
insertBuckets.get(0).bucketNumber);
assertEquals("First insert bucket should have weight 0.5", 0.5, insertBuckets.get(0).weight,
0.01);
assertEquals("First insert bucket must be same as update bucket", 0, insertBuckets.get(0).bucketNumber);
assertEquals("First insert bucket should have weight 0.5", 0.5, insertBuckets.get(0).weight, 0.01);
// Now with insert split size auto tuned
insertBuckets = testUpsertPartitioner(1000 * 1024, 2400, 100, 800 * 1024, true);
assertEquals("Total of 3 insert buckets", 3, insertBuckets.size());
assertEquals("First insert bucket must be same as update bucket", 0,
insertBuckets.get(0).bucketNumber);
assertEquals("First insert bucket should have weight 0.5", 200.0 / 2400,
insertBuckets.get(0).weight, 0.01);
assertEquals("First insert bucket must be same as update bucket", 0, insertBuckets.get(0).bucketNumber);
assertEquals("First insert bucket should have weight 0.5", 200.0 / 2400, insertBuckets.get(0).weight, 0.01);
}
@After

View File

@@ -18,7 +18,6 @@
package com.uber.hoodie.table;
import static com.uber.hoodie.common.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
@@ -76,15 +75,14 @@ import org.junit.rules.TemporaryFolder;
public class TestMergeOnReadTable {
private transient JavaSparkContext jsc = null;
private transient SQLContext sqlContext;
private static String basePath = null;
//NOTE : Be careful in using DFS (FileSystem.class) vs LocalFs(RawLocalFileSystem.class)
//The implementation and gurantees of many API's differ, for example check rename(src,dst)
private static MiniDFSCluster dfsCluster;
private static DistributedFileSystem dfs;
private static HdfsTestService hdfsTestService;
private transient JavaSparkContext jsc = null;
private transient SQLContext sqlContext;
@AfterClass
public static void cleanUp() throws Exception {
@@ -92,13 +90,15 @@ public class TestMergeOnReadTable {
hdfsTestService.stop();
dfsCluster.shutdown();
}
// Need to closeAll to clear FileSystem.Cache, required because DFS and LocalFS used in the same JVM
// Need to closeAll to clear FileSystem.Cache, required because DFS and LocalFS used in the
// same JVM
FileSystem.closeAll();
}
@BeforeClass
public static void setUpDFS() throws IOException {
// Need to closeAll to clear FileSystem.Cache, required because DFS and LocalFS used in the same JVM
// Need to closeAll to clear FileSystem.Cache, required because DFS and LocalFS used in the
// same JVM
FileSystem.closeAll();
if (hdfsTestService == null) {
hdfsTestService = new HdfsTestService();
@@ -111,8 +111,7 @@ public class TestMergeOnReadTable {
@Before
public void init() throws IOException {
// Initialize a local spark env
jsc = new JavaSparkContext(
HoodieClientTestUtils.getSparkConfForTest("TestHoodieMergeOnReadTable"));
jsc = new JavaSparkContext(HoodieClientTestUtils.getSparkConfForTest("TestHoodieMergeOnReadTable"));
// Create a temp folder as the base path
TemporaryFolder folder = new TemporaryFolder();
@@ -154,28 +153,23 @@ public class TestMergeOnReadTable {
List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
assertNoWriteErrors(statuses);
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
cfg.getBasePath());
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
Optional<HoodieInstant> deltaCommit =
metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
Optional<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
assertTrue(deltaCommit.isPresent());
assertEquals("Delta commit should be 001", "001", deltaCommit.get().getTimestamp());
Optional<HoodieInstant> commit =
metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
Optional<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
assertFalse(commit.isPresent());
FileStatus[] allFiles = HoodieTestUtils
.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient,
hoodieTable.getCommitTimeline().filterCompletedInstants(), allFiles);
Stream<HoodieDataFile> dataFilesToRead = roView.getLatestDataFiles();
assertTrue(!dataFilesToRead.findAny().isPresent());
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(),
allFiles);
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), allFiles);
dataFilesToRead = roView.getLatestDataFiles();
assertTrue("RealtimeTableView should list the parquet files we wrote in the delta commit",
dataFilesToRead.findAny().isPresent());
@@ -209,21 +203,17 @@ public class TestMergeOnReadTable {
client.compact(compactionCommitTime);
allFiles = HoodieTestUtils.listAllDataFilesInPath(dfs, cfg.getBasePath());
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(),
allFiles);
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), allFiles);
dataFilesToRead = roView.getLatestDataFiles();
assertTrue(dataFilesToRead.findAny().isPresent());
// verify that there is a commit
HoodieTable table = HoodieTable.getHoodieTable(
new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath(), true),
getConfig(false));
new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath(), true), getConfig(false));
HoodieTimeline timeline = table.getCommitTimeline().filterCompletedInstants();
assertEquals("Expecting a single commit.", 1,
timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
assertEquals("Expecting a single commit.", 1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants());
String latestCompactionCommitTime = timeline.lastInstant().get().getTimestamp();
assertTrue(HoodieTimeline
.compareTimestamps("000", latestCompactionCommitTime, HoodieTimeline.LESSER));
assertTrue(HoodieTimeline.compareTimestamps("000", latestCompactionCommitTime, HoodieTimeline.LESSER));
assertEquals("Must contain 200 records", 200,
HoodieClientTestUtils.readSince(basePath, sqlContext, timeline, "000").count());
@@ -232,8 +222,7 @@ public class TestMergeOnReadTable {
// Check if record level metadata is aggregated properly at the end of write.
@Test
public void testMetadataAggregateFromWriteStatus() throws Exception {
HoodieWriteConfig cfg = getConfigBuilder(false)
.withWriteStatusClass(MetadataMergeWriteStatus.class).build();
HoodieWriteConfig cfg = getConfigBuilder(false).withWriteStatusClass(MetadataMergeWriteStatus.class).build();
HoodieWriteClient client = new HoodieWriteClient(jsc, cfg);
String newCommitTime = "001";
@@ -248,7 +237,8 @@ public class TestMergeOnReadTable {
Map<String, String> allWriteStatusMergedMetadataMap = MetadataMergeWriteStatus
.mergeMetadataForWriteStatuses(statuses);
assertTrue(allWriteStatusMergedMetadataMap.containsKey("InputRecordCount_1506582000"));
// For metadata key InputRecordCount_1506582000, value is 2 for each record. So sum of this should be 2 * records.size()
// For metadata key InputRecordCount_1506582000, value is 2 for each record. So sum of this
// should be 2 * records.size()
assertEquals(String.valueOf(2 * records.size()),
allWriteStatusMergedMetadataMap.get("InputRecordCount_1506582000"));
}
@@ -271,28 +261,23 @@ public class TestMergeOnReadTable {
List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
assertNoWriteErrors(statuses);
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
cfg.getBasePath());
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
Optional<HoodieInstant> deltaCommit =
metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
Optional<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
assertTrue(deltaCommit.isPresent());
assertEquals("Delta commit should be 001", "001", deltaCommit.get().getTimestamp());
Optional<HoodieInstant> commit =
metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
Optional<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
assertFalse(commit.isPresent());
FileStatus[] allFiles = HoodieTestUtils
.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient,
hoodieTable.getCommitTimeline().filterCompletedInstants(), allFiles);
Stream<HoodieDataFile> dataFilesToRead = roView.getLatestDataFiles();
assertTrue(!dataFilesToRead.findAny().isPresent());
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(),
allFiles);
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), allFiles);
dataFilesToRead = roView.getLatestDataFiles();
assertTrue("RealtimeTableView should list the parquet files we wrote in the delta commit",
dataFilesToRead.findAny().isPresent());
@@ -329,15 +314,12 @@ public class TestMergeOnReadTable {
assertFalse(commit.isPresent());
allFiles = HoodieTestUtils.listAllDataFilesInPath(dfs, cfg.getBasePath());
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(),
allFiles);
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), allFiles);
dataFilesToRead = roView.getLatestDataFiles();
assertTrue(dataFilesToRead.findAny().isPresent());
List<String> dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath())
.collect(Collectors.toList());
List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils
.getRecordsUsingInputFormat(dataFiles, basePath);
List<String> dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()).collect(Collectors.toList());
List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles, basePath);
//Wrote 40 records and deleted 20 records, so remaining 40-20 = 20
assertEquals("Must contain 20 records", 20, recordsRead.size());
}
@@ -365,10 +347,8 @@ public class TestMergeOnReadTable {
//verify there are no errors
assertNoWriteErrors(statuses);
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
cfg.getBasePath());
Optional<HoodieInstant> commit =
metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
Optional<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
assertTrue(commit.isPresent());
assertEquals("commit should be 001", "001", commit.get().getTimestamp());
@@ -391,10 +371,8 @@ public class TestMergeOnReadTable {
client.rollback(newCommitTime);
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
HoodieTable hoodieTable = HoodieTable
.getHoodieTable(metaClient, cfg);
FileStatus[] allFiles = HoodieTestUtils
.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
HoodieTableFileSystemView roView = new HoodieTableFileSystemView(metaClient,
hoodieTable.getCompletedCommitTimeline(), allFiles);
@@ -428,28 +406,23 @@ public class TestMergeOnReadTable {
List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
assertNoWriteErrors(statuses);
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
cfg.getBasePath());
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
Optional<HoodieInstant> deltaCommit =
metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
Optional<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
assertTrue(deltaCommit.isPresent());
assertEquals("Delta commit should be 001", "001", deltaCommit.get().getTimestamp());
Optional<HoodieInstant> commit =
metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
Optional<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
assertFalse(commit.isPresent());
FileStatus[] allFiles = HoodieTestUtils
.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient,
hoodieTable.getCommitTimeline().filterCompletedInstants(), allFiles);
Stream<HoodieDataFile> dataFilesToRead = roView.getLatestDataFiles();
assertTrue(!dataFilesToRead.findAny().isPresent());
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(),
allFiles);
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), allFiles);
dataFilesToRead = roView.getLatestDataFiles();
assertTrue("RealtimeTableView should list the parquet files we wrote in the delta commit",
dataFilesToRead.findAny().isPresent());
@@ -473,10 +446,8 @@ public class TestMergeOnReadTable {
commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
assertFalse(commit.isPresent());
List<String> dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath())
.collect(Collectors.toList());
List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils
.getRecordsUsingInputFormat(dataFiles, basePath);
List<String> dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()).collect(Collectors.toList());
List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles, basePath);
assertEquals(recordsRead.size(), 200);
@@ -485,8 +456,7 @@ public class TestMergeOnReadTable {
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(),
allFiles);
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), allFiles);
dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()).collect(Collectors.toList());
recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles, basePath);
@@ -512,11 +482,10 @@ public class TestMergeOnReadTable {
allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCommitsTimeline(),
allFiles);
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCommitsTimeline(), allFiles);
final String compactedCommitTime = metaClient.getActiveTimeline().reload()
.getCommitsTimeline().lastInstant().get().getTimestamp();
final String compactedCommitTime = metaClient.getActiveTimeline().reload().getCommitsTimeline().lastInstant().get()
.getTimestamp();
assertTrue(roView.getLatestDataFiles().filter(file -> {
if (compactedCommitTime.equals(file.getCommitTime())) {
@@ -531,8 +500,7 @@ public class TestMergeOnReadTable {
allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCommitsTimeline(),
allFiles);
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCommitsTimeline(), allFiles);
assertFalse(roView.getLatestDataFiles().filter(file -> {
if (compactedCommitTime.equals(file.getCommitTime())) {
@@ -564,30 +532,28 @@ public class TestMergeOnReadTable {
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
HoodieTable hoodieTable = HoodieTable.getHoodieTable(metaClient, cfg);
Optional<HoodieInstant> deltaCommit =
metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
Optional<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
assertTrue(deltaCommit.isPresent());
assertEquals("Delta commit should be 001", "001", deltaCommit.get().getTimestamp());
Optional<HoodieInstant> commit =
metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
Optional<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
assertFalse(commit.isPresent());
FileStatus[] allFiles = HoodieTestUtils
.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
TableFileSystemView.ReadOptimizedView roView = new HoodieTableFileSystemView(metaClient,
hoodieTable.getCommitsTimeline().filterCompletedInstants(), allFiles);
Stream<HoodieDataFile> dataFilesToRead = roView.getLatestDataFiles();
Map<String, Long> parquetFileIdToSize = dataFilesToRead.collect(Collectors.toMap(HoodieDataFile::getFileId, HoodieDataFile::getFileSize));
Map<String, Long> parquetFileIdToSize = dataFilesToRead.collect(
Collectors.toMap(HoodieDataFile::getFileId, HoodieDataFile::getFileSize));
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(),
allFiles);
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitTimeline(), allFiles);
dataFilesToRead = roView.getLatestDataFiles();
assertTrue("RealtimeTableView should list the parquet files we wrote in the delta commit",
dataFilesToRead.findAny().isPresent());
/**
* Write 2 (only updates + inserts, written to .log file + correction of existing parquet file size)
* Write 2 (only updates + inserts, written to .log file + correction of existing parquet
* file size)
*/
newCommitTime = "002";
client.startCommitWithTime(newCommitTime);
@@ -608,18 +574,17 @@ public class TestMergeOnReadTable {
assertFalse(commit.isPresent());
allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getActiveTimeline().reload()
.getCommitsTimeline().filterCompletedInstants(), allFiles);
roView = new HoodieTableFileSystemView(metaClient,
hoodieTable.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants(), allFiles);
dataFilesToRead = roView.getLatestDataFiles();
Map<String, Long> parquetFileIdToNewSize = dataFilesToRead.collect(Collectors.toMap(HoodieDataFile::getFileId, HoodieDataFile::getFileSize));
Map<String, Long> parquetFileIdToNewSize = dataFilesToRead.collect(
Collectors.toMap(HoodieDataFile::getFileId, HoodieDataFile::getFileSize));
assertTrue(parquetFileIdToNewSize.entrySet().stream()
.filter(entry -> parquetFileIdToSize.get(entry.getKey()) < entry.getValue()).count() > 0);
List<String> dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath())
.collect(Collectors.toList());
List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils
.getRecordsUsingInputFormat(dataFiles, basePath);
List<String> dataFiles = roView.getLatestDataFiles().map(hf -> hf.getPath()).collect(Collectors.toList());
List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles, basePath);
//Wrote 20 records in 2 batches
assertEquals("Must contain 40 records", 40, recordsRead.size());
}
@@ -639,8 +604,7 @@ public class TestMergeOnReadTable {
List<WriteStatus> statuses = writeClient.insert(recordsRDD, newCommitTime).collect();
// Update all the 100 records
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(),
basePath);
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
HoodieTable table = HoodieTable.getHoodieTable(metaClient, config);
newCommitTime = "101";
@@ -653,19 +617,17 @@ public class TestMergeOnReadTable {
// Write them to corresponding avro logfiles
HoodieTestUtils
.writeRecordsToLogFiles(metaClient.getFs(), metaClient.getBasePath(),
HoodieTestDataGenerator.avroSchema, updatedRecords);
.writeRecordsToLogFiles(metaClient.getFs(), metaClient.getBasePath(), HoodieTestDataGenerator.avroSchema,
updatedRecords);
// Verify that all data file has one log file
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
table = HoodieTable.getHoodieTable(metaClient, config);
for (String partitionPath : dataGen.getPartitionPaths()) {
List<FileSlice> groupedLogFiles =
table.getRTFileSystemView().getLatestFileSlices(partitionPath)
.collect(Collectors.toList());
List<FileSlice> groupedLogFiles = table.getRTFileSystemView().getLatestFileSlices(partitionPath)
.collect(Collectors.toList());
for (FileSlice fileSlice : groupedLogFiles) {
assertEquals("There should be 1 log file written for every data file", 1,
fileSlice.getLogFiles().count());
assertEquals("There should be 1 log file written for every data file", 1, fileSlice.getLogFiles().count());
}
}
@@ -674,31 +636,27 @@ public class TestMergeOnReadTable {
table = HoodieTable.getHoodieTable(metaClient, config);
String commitTime = writeClient.startCompaction();
JavaRDD<WriteStatus> result =
writeClient.compact(commitTime);
JavaRDD<WriteStatus> result = writeClient.compact(commitTime);
// Verify that recently written compacted data file has no log file
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), basePath);
table = HoodieTable.getHoodieTable(metaClient, config);
HoodieActiveTimeline timeline = metaClient.getActiveTimeline();
assertTrue("Compaction commit should be > than last insert",
HoodieTimeline.compareTimestamps(timeline.lastInstant().get().getTimestamp(), newCommitTime,
HoodieTimeline.GREATER));
assertTrue("Compaction commit should be > than last insert", HoodieTimeline.compareTimestamps(
timeline.lastInstant().get().getTimestamp(), newCommitTime, HoodieTimeline.GREATER));
for (String partitionPath : dataGen.getPartitionPaths()) {
List<FileSlice> groupedLogFiles = table.getRTFileSystemView()
.getLatestFileSlices(partitionPath)
List<FileSlice> groupedLogFiles = table.getRTFileSystemView().getLatestFileSlices(partitionPath)
.collect(Collectors.toList());
for (FileSlice slice : groupedLogFiles) {
assertTrue(
"After compaction there should be no log files visiable on a Realtime view",
assertTrue("After compaction there should be no log files visiable on a Realtime view",
slice.getLogFiles().collect(Collectors.toList()).isEmpty());
}
List<WriteStatus> writeStatuses = result.collect();
assertTrue(writeStatuses.stream()
.filter(writeStatus -> writeStatus.getStat().getPartitionPath()
.contentEquals(partitionPath)).count() > 0);
.filter(writeStatus -> writeStatus.getStat().getPartitionPath().contentEquals(partitionPath))
.count() > 0);
}
}
@@ -707,16 +665,13 @@ public class TestMergeOnReadTable {
}
private HoodieWriteConfig.Builder getConfigBuilder(Boolean autoCommit) {
return HoodieWriteConfig.newBuilder().withPath(basePath)
.withSchema(TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
.withAutoCommit(autoCommit)
.withAssumeDatePartitioning(true)
.withCompactionConfig(
HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024)
.withInlineCompaction(false).withMaxNumDeltaCommitsBeforeCompaction(1).build())
return HoodieWriteConfig.newBuilder().withPath(basePath).withSchema(TRIP_EXAMPLE_SCHEMA).withParallelism(2, 2)
.withAutoCommit(autoCommit).withAssumeDatePartitioning(true).withCompactionConfig(
HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024).withInlineCompaction(false)
.withMaxNumDeltaCommitsBeforeCompaction(1).build())
.withStorageConfig(HoodieStorageConfig.newBuilder().limitFileSize(1024 * 1024 * 1024).build())
.forTable("test-trip-table").withIndexConfig(
HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build());
.forTable("test-trip-table")
.withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build());
}
private void assertNoWriteErrors(List<WriteStatus> statuses) {