[HUDI-687] Stop incremental reader on RO table when there is a pending compaction (#1396)
This commit is contained in:
@@ -20,6 +20,7 @@ package org.apache.hudi.common;
|
|||||||
|
|
||||||
import org.apache.hudi.avro.HoodieAvroUtils;
|
import org.apache.hudi.avro.HoodieAvroUtils;
|
||||||
import org.apache.hudi.common.model.HoodieTestUtils;
|
import org.apache.hudi.common.model.HoodieTestUtils;
|
||||||
|
import org.apache.hudi.hadoop.HoodieParquetInputFormat;
|
||||||
import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat;
|
import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat;
|
||||||
|
|
||||||
import org.apache.avro.Schema;
|
import org.apache.avro.Schema;
|
||||||
@@ -51,29 +52,36 @@ public class HoodieMergeOnReadTestUtils {
|
|||||||
public static List<GenericRecord> getRecordsUsingInputFormat(List<String> inputPaths, String basePath,
|
public static List<GenericRecord> getRecordsUsingInputFormat(List<String> inputPaths, String basePath,
|
||||||
Configuration conf) {
|
Configuration conf) {
|
||||||
JobConf jobConf = new JobConf(conf);
|
JobConf jobConf = new JobConf(conf);
|
||||||
|
return getRecordsUsingInputFormat(inputPaths, basePath, jobConf, new HoodieParquetRealtimeInputFormat());
|
||||||
|
}
|
||||||
|
|
||||||
|
public static List<GenericRecord> getRecordsUsingInputFormat(List<String> inputPaths,
|
||||||
|
String basePath,
|
||||||
|
JobConf jobConf,
|
||||||
|
HoodieParquetInputFormat inputFormat) {
|
||||||
Schema schema = HoodieAvroUtils.addMetadataFields(
|
Schema schema = HoodieAvroUtils.addMetadataFields(
|
||||||
new Schema.Parser().parse(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA));
|
new Schema.Parser().parse(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA));
|
||||||
HoodieParquetRealtimeInputFormat inputFormat = new HoodieParquetRealtimeInputFormat();
|
|
||||||
setPropsForInputFormat(inputFormat, jobConf, schema, basePath);
|
setPropsForInputFormat(inputFormat, jobConf, schema, basePath);
|
||||||
return inputPaths.stream().map(path -> {
|
return inputPaths.stream().map(path -> {
|
||||||
setInputPath(jobConf, path);
|
setInputPath(jobConf, path);
|
||||||
List<GenericRecord> records = new ArrayList<>();
|
List<GenericRecord> records = new ArrayList<>();
|
||||||
try {
|
try {
|
||||||
List<InputSplit> splits = Arrays.asList(inputFormat.getSplits(jobConf, 1));
|
List<InputSplit> splits = Arrays.asList(inputFormat.getSplits(jobConf, 1));
|
||||||
RecordReader recordReader = inputFormat.getRecordReader(splits.get(0), jobConf, null);
|
for (InputSplit split : splits) {
|
||||||
Void key = (Void) recordReader.createKey();
|
RecordReader recordReader = inputFormat.getRecordReader(split, jobConf, null);
|
||||||
ArrayWritable writable = (ArrayWritable) recordReader.createValue();
|
Void key = (Void) recordReader.createKey();
|
||||||
while (recordReader.next(key, writable)) {
|
ArrayWritable writable = (ArrayWritable) recordReader.createValue();
|
||||||
GenericRecordBuilder newRecord = new GenericRecordBuilder(schema);
|
while (recordReader.next(key, writable)) {
|
||||||
// writable returns an array with [field1, field2, _hoodie_commit_time,
|
GenericRecordBuilder newRecord = new GenericRecordBuilder(schema);
|
||||||
// _hoodie_commit_seqno]
|
// writable returns an array with [field1, field2, _hoodie_commit_time,
|
||||||
Writable[] values = writable.get();
|
// _hoodie_commit_seqno]
|
||||||
final int[] fieldIndex = {0};
|
Writable[] values = writable.get();
|
||||||
assert schema.getFields().size() <= values.length;
|
assert schema.getFields().size() <= values.length;
|
||||||
schema.getFields().forEach(field -> {
|
schema.getFields().forEach(field -> {
|
||||||
newRecord.set(field, values[fieldIndex[0]++]);
|
newRecord.set(field, values[field.pos()]);
|
||||||
});
|
});
|
||||||
records.add(newRecord.build());
|
records.add(newRecord.build());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} catch (IOException ie) {
|
} catch (IOException ie) {
|
||||||
ie.printStackTrace();
|
ie.printStackTrace();
|
||||||
@@ -85,7 +93,7 @@ public class HoodieMergeOnReadTestUtils {
|
|||||||
}).orElse(new ArrayList<GenericRecord>());
|
}).orElse(new ArrayList<GenericRecord>());
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void setPropsForInputFormat(HoodieParquetRealtimeInputFormat inputFormat, JobConf jobConf,
|
private static void setPropsForInputFormat(HoodieParquetInputFormat inputFormat, JobConf jobConf,
|
||||||
Schema schema, String basePath) {
|
Schema schema, String basePath) {
|
||||||
List<Schema.Field> fields = schema.getFields();
|
List<Schema.Field> fields = schema.getFields();
|
||||||
String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
|
String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
|
||||||
|
|||||||
@@ -18,6 +18,7 @@
|
|||||||
|
|
||||||
package org.apache.hudi.table;
|
package org.apache.hudi.table;
|
||||||
|
|
||||||
|
import org.apache.hudi.client.HoodieWriteClient;
|
||||||
import org.apache.hudi.client.WriteStatus;
|
import org.apache.hudi.client.WriteStatus;
|
||||||
import org.apache.hudi.common.HoodieClientTestHarness;
|
import org.apache.hudi.common.HoodieClientTestHarness;
|
||||||
import org.apache.hudi.common.HoodieClientTestUtils;
|
import org.apache.hudi.common.HoodieClientTestUtils;
|
||||||
@@ -29,9 +30,9 @@ import org.apache.hudi.common.fs.FSUtils;
|
|||||||
import org.apache.hudi.common.model.HoodieKey;
|
import org.apache.hudi.common.model.HoodieKey;
|
||||||
import org.apache.hudi.common.model.HoodieRecord;
|
import org.apache.hudi.common.model.HoodieRecord;
|
||||||
import org.apache.hudi.common.model.HoodieRecordLocation;
|
import org.apache.hudi.common.model.HoodieRecordLocation;
|
||||||
|
import org.apache.hudi.common.model.HoodieTableType;
|
||||||
import org.apache.hudi.common.model.HoodieTestUtils;
|
import org.apache.hudi.common.model.HoodieTestUtils;
|
||||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
|
||||||
import org.apache.hudi.common.util.FileIOUtils;
|
import org.apache.hudi.common.util.FileIOUtils;
|
||||||
import org.apache.hudi.common.util.Option;
|
import org.apache.hudi.common.util.Option;
|
||||||
import org.apache.hudi.common.util.ParquetUtils;
|
import org.apache.hudi.common.util.ParquetUtils;
|
||||||
@@ -39,11 +40,17 @@ import org.apache.hudi.common.util.collection.Pair;
|
|||||||
import org.apache.hudi.config.HoodieCompactionConfig;
|
import org.apache.hudi.config.HoodieCompactionConfig;
|
||||||
import org.apache.hudi.config.HoodieStorageConfig;
|
import org.apache.hudi.config.HoodieStorageConfig;
|
||||||
import org.apache.hudi.config.HoodieWriteConfig;
|
import org.apache.hudi.config.HoodieWriteConfig;
|
||||||
|
import org.apache.hudi.hadoop.HoodieHiveUtil;
|
||||||
|
import org.apache.hudi.hadoop.HoodieParquetInputFormat;
|
||||||
import org.apache.hudi.io.HoodieCreateHandle;
|
import org.apache.hudi.io.HoodieCreateHandle;
|
||||||
import org.apache.hudi.table.HoodieCopyOnWriteTable.UpsertPartitioner;
|
import org.apache.hudi.table.HoodieCopyOnWriteTable.UpsertPartitioner;
|
||||||
|
|
||||||
import org.apache.avro.generic.GenericRecord;
|
import org.apache.avro.generic.GenericRecord;
|
||||||
|
import org.apache.hadoop.fs.FileStatus;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.apache.hadoop.mapred.FileInputFormat;
|
||||||
|
import org.apache.hadoop.mapred.JobConf;
|
||||||
|
|
||||||
import org.apache.log4j.LogManager;
|
import org.apache.log4j.LogManager;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.apache.parquet.avro.AvroReadSupport;
|
import org.apache.parquet.avro.AvroReadSupport;
|
||||||
@@ -64,7 +71,6 @@ import java.util.UUID;
|
|||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
import static org.junit.Assert.assertEquals;
|
import static org.junit.Assert.assertEquals;
|
||||||
import static org.junit.Assert.assertNotNull;
|
|
||||||
import static org.junit.Assert.assertTrue;
|
import static org.junit.Assert.assertTrue;
|
||||||
import static org.mockito.Mockito.mock;
|
import static org.mockito.Mockito.mock;
|
||||||
import static org.mockito.Mockito.when;
|
import static org.mockito.Mockito.when;
|
||||||
@@ -129,6 +135,8 @@ public class TestCopyOnWriteTable extends HoodieClientTestHarness {
|
|||||||
// Prepare the AvroParquetIO
|
// Prepare the AvroParquetIO
|
||||||
HoodieWriteConfig config = makeHoodieClientConfig();
|
HoodieWriteConfig config = makeHoodieClientConfig();
|
||||||
String firstCommitTime = HoodieTestUtils.makeNewCommitTime();
|
String firstCommitTime = HoodieTestUtils.makeNewCommitTime();
|
||||||
|
HoodieWriteClient writeClient = new HoodieWriteClient(jsc, config);
|
||||||
|
writeClient.startCommitWithTime(firstCommitTime);
|
||||||
metaClient = HoodieTableMetaClient.reload(metaClient);
|
metaClient = HoodieTableMetaClient.reload(metaClient);
|
||||||
|
|
||||||
String partitionPath = "/2016/01/31";
|
String partitionPath = "/2016/01/31";
|
||||||
@@ -154,30 +162,17 @@ public class TestCopyOnWriteTable extends HoodieClientTestHarness {
|
|||||||
|
|
||||||
// Insert new records
|
// Insert new records
|
||||||
final HoodieCopyOnWriteTable cowTable = table;
|
final HoodieCopyOnWriteTable cowTable = table;
|
||||||
jsc.parallelize(Arrays.asList(1)).map(x -> {
|
writeClient.insert(jsc.parallelize(records, 1), firstCommitTime);
|
||||||
return cowTable.handleInsert(firstCommitTime, FSUtils.createNewFileIdPfx(), records.iterator());
|
|
||||||
}).map(x -> HoodieClientTestUtils.collectStatuses(x)).collect();
|
|
||||||
|
|
||||||
// We should have a parquet file generated (TODO: better control # files after we revise
|
FileStatus[] allFiles = getIncrementalFiles(partitionPath, "0", -1);
|
||||||
// AvroParquetIO)
|
assertEquals(1, allFiles.length);
|
||||||
File parquetFile = null;
|
|
||||||
for (File file : new File(this.basePath + partitionPath).listFiles()) {
|
|
||||||
if (file.getName().endsWith(".parquet")) {
|
|
||||||
parquetFile = file;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
assertTrue(parquetFile != null);
|
|
||||||
|
|
||||||
// Read out the bloom filter and make sure filter can answer record exist or not
|
// Read out the bloom filter and make sure filter can answer record exist or not
|
||||||
Path parquetFilePath = new Path(parquetFile.getAbsolutePath());
|
Path parquetFilePath = allFiles[0].getPath();
|
||||||
BloomFilter filter = ParquetUtils.readBloomFilterFromParquetMetadata(jsc.hadoopConfiguration(), parquetFilePath);
|
BloomFilter filter = ParquetUtils.readBloomFilterFromParquetMetadata(jsc.hadoopConfiguration(), parquetFilePath);
|
||||||
for (HoodieRecord record : records) {
|
for (HoodieRecord record : records) {
|
||||||
assertTrue(filter.mightContain(record.getRecordKey()));
|
assertTrue(filter.mightContain(record.getRecordKey()));
|
||||||
}
|
}
|
||||||
// Create a commit file
|
|
||||||
new File(this.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/"
|
|
||||||
+ FSUtils.getCommitTime(parquetFile.getName()) + ".commit").createNewFile();
|
|
||||||
|
|
||||||
// Read the parquet file, check the record content
|
// Read the parquet file, check the record content
|
||||||
List<GenericRecord> fileRecords = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), parquetFilePath);
|
List<GenericRecord> fileRecords = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), parquetFilePath);
|
||||||
@@ -194,9 +189,6 @@ public class TestCopyOnWriteTable extends HoodieClientTestHarness {
|
|||||||
TestRawTripPayload updateRowChanges1 = new TestRawTripPayload(updateRecordStr1);
|
TestRawTripPayload updateRowChanges1 = new TestRawTripPayload(updateRecordStr1);
|
||||||
HoodieRecord updatedRecord1 = new HoodieRecord(
|
HoodieRecord updatedRecord1 = new HoodieRecord(
|
||||||
new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()), updateRowChanges1);
|
new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()), updateRowChanges1);
|
||||||
updatedRecord1.unseal();
|
|
||||||
updatedRecord1.setCurrentLocation(new HoodieRecordLocation(null, FSUtils.getFileId(parquetFile.getName())));
|
|
||||||
updatedRecord1.seal();
|
|
||||||
|
|
||||||
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
||||||
HoodieRecord insertedRecord1 =
|
HoodieRecord insertedRecord1 =
|
||||||
@@ -207,27 +199,16 @@ public class TestCopyOnWriteTable extends HoodieClientTestHarness {
|
|||||||
Thread.sleep(1000);
|
Thread.sleep(1000);
|
||||||
String newCommitTime = HoodieTestUtils.makeNewCommitTime();
|
String newCommitTime = HoodieTestUtils.makeNewCommitTime();
|
||||||
metaClient = HoodieTableMetaClient.reload(metaClient);
|
metaClient = HoodieTableMetaClient.reload(metaClient);
|
||||||
final HoodieCopyOnWriteTable newTable = (HoodieCopyOnWriteTable) HoodieTable.create(metaClient, config, jsc);
|
writeClient.startCommitWithTime(newCommitTime);
|
||||||
List<WriteStatus> statuses = jsc.parallelize(Arrays.asList(1)).map(x -> {
|
List<WriteStatus> statuses = writeClient.upsert(jsc.parallelize(updatedRecords), newCommitTime).collect();
|
||||||
return newTable.handleUpdate(newCommitTime, updatedRecord1.getPartitionPath(),
|
|
||||||
updatedRecord1.getCurrentLocation().getFileId(), updatedRecords.iterator());
|
allFiles = getIncrementalFiles(partitionPath, firstCommitTime, -1);
|
||||||
}).flatMap(x -> HoodieClientTestUtils.collectStatuses(x).iterator()).collect();
|
assertEquals(1, allFiles.length);
|
||||||
|
// verify new incremental file group is same as the previous one
|
||||||
|
assertEquals(FSUtils.getFileId(parquetFilePath.getName()), FSUtils.getFileId(allFiles[0].getPath().getName()));
|
||||||
|
|
||||||
// Check the updated file
|
|
||||||
File updatedParquetFile = null;
|
|
||||||
for (File file : new File(basePath + "/2016/01/31").listFiles()) {
|
|
||||||
if (file.getName().endsWith(".parquet")) {
|
|
||||||
if (FSUtils.getFileId(file.getName()).equals(FSUtils.getFileId(parquetFile.getName()))
|
|
||||||
&& HoodieTimeline.compareTimestamps(FSUtils.getCommitTime(file.getName()),
|
|
||||||
FSUtils.getCommitTime(parquetFile.getName()), HoodieTimeline.GREATER)) {
|
|
||||||
updatedParquetFile = file;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
assertNotNull(updatedParquetFile);
|
|
||||||
// Check whether the record has been updated
|
// Check whether the record has been updated
|
||||||
Path updatedParquetFilePath = new Path(updatedParquetFile.getAbsolutePath());
|
Path updatedParquetFilePath = allFiles[0].getPath();
|
||||||
BloomFilter updatedFilter =
|
BloomFilter updatedFilter =
|
||||||
ParquetUtils.readBloomFilterFromParquetMetadata(jsc.hadoopConfiguration(), updatedParquetFilePath);
|
ParquetUtils.readBloomFilterFromParquetMetadata(jsc.hadoopConfiguration(), updatedParquetFilePath);
|
||||||
for (HoodieRecord record : records) {
|
for (HoodieRecord record : records) {
|
||||||
@@ -254,6 +235,32 @@ public class TestCopyOnWriteTable extends HoodieClientTestHarness {
|
|||||||
assertEquals(4, writeStatus.getStat().getNumWrites());// 3 rewritten records + 1 new record
|
assertEquals(4, writeStatus.getStat().getNumWrites());// 3 rewritten records + 1 new record
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private FileStatus[] getIncrementalFiles(String partitionPath, String startCommitTime, int numCommitsToPull)
|
||||||
|
throws Exception {
|
||||||
|
// initialize parquet input format
|
||||||
|
HoodieParquetInputFormat hoodieInputFormat = new HoodieParquetInputFormat();
|
||||||
|
JobConf jobConf = new JobConf(jsc.hadoopConfiguration());
|
||||||
|
hoodieInputFormat.setConf(jobConf);
|
||||||
|
HoodieTestUtils.init(jsc.hadoopConfiguration(), basePath, HoodieTableType.COPY_ON_WRITE);
|
||||||
|
setupIncremental(jobConf, startCommitTime, numCommitsToPull);
|
||||||
|
FileInputFormat.setInputPaths(jobConf, basePath + partitionPath);
|
||||||
|
return hoodieInputFormat.listStatus(jobConf);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void setupIncremental(JobConf jobConf, String startCommit, int numberOfCommitsToPull) {
|
||||||
|
String modePropertyName =
|
||||||
|
String.format(HoodieHiveUtil.HOODIE_CONSUME_MODE_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME);
|
||||||
|
jobConf.set(modePropertyName, HoodieHiveUtil.INCREMENTAL_SCAN_MODE);
|
||||||
|
|
||||||
|
String startCommitTimestampName =
|
||||||
|
String.format(HoodieHiveUtil.HOODIE_START_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME);
|
||||||
|
jobConf.set(startCommitTimestampName, startCommit);
|
||||||
|
|
||||||
|
String maxCommitPulls =
|
||||||
|
String.format(HoodieHiveUtil.HOODIE_MAX_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME);
|
||||||
|
jobConf.setInt(maxCommitPulls, numberOfCommitsToPull);
|
||||||
|
}
|
||||||
|
|
||||||
private List<HoodieRecord> newHoodieRecords(int n, String time) throws Exception {
|
private List<HoodieRecord> newHoodieRecords(int n, String time) throws Exception {
|
||||||
List<HoodieRecord> records = new ArrayList<>();
|
List<HoodieRecord> records = new ArrayList<>();
|
||||||
for (int i = 0; i < n; i++) {
|
for (int i = 0; i < n; i++) {
|
||||||
|
|||||||
@@ -18,6 +18,8 @@
|
|||||||
|
|
||||||
package org.apache.hudi.table;
|
package org.apache.hudi.table;
|
||||||
|
|
||||||
|
import org.apache.hadoop.mapred.FileInputFormat;
|
||||||
|
import org.apache.hadoop.mapred.JobConf;
|
||||||
import org.apache.hudi.client.HoodieReadClient;
|
import org.apache.hudi.client.HoodieReadClient;
|
||||||
import org.apache.hudi.client.HoodieWriteClient;
|
import org.apache.hudi.client.HoodieWriteClient;
|
||||||
import org.apache.hudi.client.WriteStatus;
|
import org.apache.hudi.client.WriteStatus;
|
||||||
@@ -50,6 +52,9 @@ import org.apache.hudi.config.HoodieCompactionConfig;
|
|||||||
import org.apache.hudi.config.HoodieIndexConfig;
|
import org.apache.hudi.config.HoodieIndexConfig;
|
||||||
import org.apache.hudi.config.HoodieStorageConfig;
|
import org.apache.hudi.config.HoodieStorageConfig;
|
||||||
import org.apache.hudi.config.HoodieWriteConfig;
|
import org.apache.hudi.config.HoodieWriteConfig;
|
||||||
|
import org.apache.hudi.hadoop.HoodieHiveUtil;
|
||||||
|
import org.apache.hudi.hadoop.HoodieParquetInputFormat;
|
||||||
|
import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat;
|
||||||
import org.apache.hudi.index.HoodieIndex;
|
import org.apache.hudi.index.HoodieIndex;
|
||||||
import org.apache.hudi.index.HoodieIndex.IndexType;
|
import org.apache.hudi.index.HoodieIndex.IndexType;
|
||||||
|
|
||||||
@@ -70,6 +75,7 @@ import java.util.Arrays;
|
|||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Set;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
@@ -80,6 +86,12 @@ import static org.junit.Assert.assertTrue;
|
|||||||
|
|
||||||
public class TestMergeOnReadTable extends HoodieClientTestHarness {
|
public class TestMergeOnReadTable extends HoodieClientTestHarness {
|
||||||
|
|
||||||
|
private HoodieParquetInputFormat roInputFormat;
|
||||||
|
private JobConf roJobConf;
|
||||||
|
|
||||||
|
private HoodieParquetRealtimeInputFormat rtInputFormat;
|
||||||
|
private JobConf rtJobConf;
|
||||||
|
|
||||||
@Before
|
@Before
|
||||||
public void init() throws IOException {
|
public void init() throws IOException {
|
||||||
initDFS();
|
initDFS();
|
||||||
@@ -89,6 +101,15 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness {
|
|||||||
dfs.mkdirs(new Path(basePath));
|
dfs.mkdirs(new Path(basePath));
|
||||||
HoodieTestUtils.init(jsc.hadoopConfiguration(), basePath, HoodieTableType.MERGE_ON_READ);
|
HoodieTestUtils.init(jsc.hadoopConfiguration(), basePath, HoodieTableType.MERGE_ON_READ);
|
||||||
initTestDataGenerator();
|
initTestDataGenerator();
|
||||||
|
|
||||||
|
// initialize parquet input format
|
||||||
|
roInputFormat = new HoodieParquetInputFormat();
|
||||||
|
roJobConf = new JobConf(jsc.hadoopConfiguration());
|
||||||
|
roInputFormat.setConf(roJobConf);
|
||||||
|
|
||||||
|
rtInputFormat = new HoodieParquetRealtimeInputFormat();
|
||||||
|
rtJobConf = new JobConf(jsc.hadoopConfiguration());
|
||||||
|
rtInputFormat.setConf(rtJobConf);
|
||||||
}
|
}
|
||||||
|
|
||||||
@After
|
@After
|
||||||
@@ -114,63 +135,23 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness {
|
|||||||
client.startCommitWithTime(newCommitTime);
|
client.startCommitWithTime(newCommitTime);
|
||||||
|
|
||||||
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 200);
|
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 200);
|
||||||
JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
|
insertAndGetFilePaths(records, client, cfg, newCommitTime);
|
||||||
|
|
||||||
List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
|
|
||||||
assertNoWriteErrors(statuses);
|
|
||||||
|
|
||||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
|
||||||
HoodieTable hoodieTable = HoodieTable.create(metaClient, cfg, jsc);
|
|
||||||
|
|
||||||
Option<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
|
|
||||||
assertTrue(deltaCommit.isPresent());
|
|
||||||
Assert.assertEquals("Delta commit should be 001", "001", deltaCommit.get().getTimestamp());
|
|
||||||
|
|
||||||
Option<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
|
|
||||||
assertFalse(commit.isPresent());
|
|
||||||
|
|
||||||
FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
|
|
||||||
BaseFileOnlyView roView =
|
|
||||||
new HoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles);
|
|
||||||
Stream<HoodieBaseFile> dataFilesToRead = roView.getLatestBaseFiles();
|
|
||||||
assertTrue(!dataFilesToRead.findAny().isPresent());
|
|
||||||
|
|
||||||
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
|
|
||||||
dataFilesToRead = roView.getLatestBaseFiles();
|
|
||||||
assertTrue("should list the parquet files we wrote in the delta commit",
|
|
||||||
dataFilesToRead.findAny().isPresent());
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Write 2 (updates)
|
* Write 2 (updates)
|
||||||
*/
|
*/
|
||||||
newCommitTime = "004";
|
newCommitTime = "004";
|
||||||
client.startCommitWithTime(newCommitTime);
|
client.startCommitWithTime(newCommitTime);
|
||||||
|
|
||||||
records = dataGen.generateUpdates(newCommitTime, 100);
|
records = dataGen.generateUpdates(newCommitTime, 100);
|
||||||
Map<HoodieKey, HoodieRecord> recordsMap = new HashMap<>();
|
updateAndGetFilePaths(records, client, cfg, newCommitTime);
|
||||||
for (HoodieRecord rec : records) {
|
|
||||||
if (!recordsMap.containsKey(rec.getKey())) {
|
|
||||||
recordsMap.put(rec.getKey(), rec);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
|
|
||||||
// Verify there are no errors
|
|
||||||
assertNoWriteErrors(statuses);
|
|
||||||
metaClient = HoodieTableMetaClient.reload(metaClient);
|
|
||||||
deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant();
|
|
||||||
assertTrue(deltaCommit.isPresent());
|
|
||||||
assertEquals("Latest Delta commit should be 004", "004", deltaCommit.get().getTimestamp());
|
|
||||||
|
|
||||||
commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
|
|
||||||
assertFalse(commit.isPresent());
|
|
||||||
|
|
||||||
String compactionCommitTime = client.scheduleCompaction(Option.empty()).get().toString();
|
String compactionCommitTime = client.scheduleCompaction(Option.empty()).get().toString();
|
||||||
client.compact(compactionCommitTime);
|
client.compact(compactionCommitTime);
|
||||||
|
|
||||||
allFiles = HoodieTestUtils.listAllDataFilesInPath(dfs, cfg.getBasePath());
|
FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(dfs, cfg.getBasePath());
|
||||||
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
|
HoodieTable hoodieTable = HoodieTable.create(metaClient, cfg, jsc);
|
||||||
dataFilesToRead = roView.getLatestBaseFiles();
|
HoodieTableFileSystemView roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
|
||||||
|
Stream<HoodieBaseFile> dataFilesToRead = roView.getLatestBaseFiles();
|
||||||
assertTrue(dataFilesToRead.findAny().isPresent());
|
assertTrue(dataFilesToRead.findAny().isPresent());
|
||||||
|
|
||||||
// verify that there is a commit
|
// verify that there is a commit
|
||||||
@@ -186,6 +167,101 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// test incremental read does not go past compaction instant for RO views
|
||||||
|
// For RT views, incremental read can go past compaction
|
||||||
|
@Test
|
||||||
|
public void testIncrementalReadsWithCompaction() throws Exception {
|
||||||
|
String partitionPath = "2020/02/20"; // use only one partition for this test
|
||||||
|
dataGen = new HoodieTestDataGenerator(new String[] { partitionPath });
|
||||||
|
HoodieWriteConfig cfg = getConfig(true);
|
||||||
|
try (HoodieWriteClient client = getWriteClient(cfg);) {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Write 1 (only inserts)
|
||||||
|
*/
|
||||||
|
String commitTime1 = "001";
|
||||||
|
client.startCommitWithTime(commitTime1);
|
||||||
|
|
||||||
|
List<HoodieRecord> records001 = dataGen.generateInserts(commitTime1, 200);
|
||||||
|
insertAndGetFilePaths(records001, client, cfg, commitTime1);
|
||||||
|
|
||||||
|
// verify only one parquet file shows up with commit time 001
|
||||||
|
FileStatus[] incrementalROFiles = getROIncrementalFiles(partitionPath, true);
|
||||||
|
validateIncrementalFiles(partitionPath, 1, incrementalROFiles, roInputFormat,
|
||||||
|
roJobConf,200, commitTime1);
|
||||||
|
Path firstFilePath = incrementalROFiles[0].getPath();
|
||||||
|
|
||||||
|
FileStatus[] incrementalRTFiles = getRTIncrementalFiles(partitionPath);
|
||||||
|
validateIncrementalFiles(partitionPath, 1, incrementalRTFiles, rtInputFormat,
|
||||||
|
rtJobConf,200, commitTime1);
|
||||||
|
assertEquals(firstFilePath, incrementalRTFiles[0].getPath());
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Write 2 (updates)
|
||||||
|
*/
|
||||||
|
String updateTime = "004";
|
||||||
|
client.startCommitWithTime(updateTime);
|
||||||
|
List<HoodieRecord> records004 = dataGen.generateUpdates(updateTime, 100);
|
||||||
|
updateAndGetFilePaths(records004, client, cfg, updateTime);
|
||||||
|
|
||||||
|
// verify RO incremental reads - only one parquet file shows up because updates to into log files
|
||||||
|
incrementalROFiles = getROIncrementalFiles(partitionPath, false);
|
||||||
|
validateIncrementalFiles(partitionPath, 1, incrementalROFiles, roInputFormat,
|
||||||
|
roJobConf, 200, commitTime1);
|
||||||
|
assertEquals(firstFilePath, incrementalROFiles[0].getPath());
|
||||||
|
|
||||||
|
// verify RT incremental reads includes updates also
|
||||||
|
incrementalRTFiles = getRTIncrementalFiles(partitionPath);
|
||||||
|
validateIncrementalFiles(partitionPath, 1, incrementalRTFiles, rtInputFormat,
|
||||||
|
rtJobConf, 200, commitTime1, updateTime);
|
||||||
|
|
||||||
|
// request compaction, but do not perform compaction
|
||||||
|
String compactionCommitTime = "005";
|
||||||
|
client.scheduleCompactionAtInstant("005", Option.empty());
|
||||||
|
|
||||||
|
// verify RO incremental reads - only one parquet file shows up because updates go into log files
|
||||||
|
incrementalROFiles = getROIncrementalFiles(partitionPath, true);
|
||||||
|
validateIncrementalFiles(partitionPath,1, incrementalROFiles, roInputFormat,
|
||||||
|
roJobConf, 200, commitTime1);
|
||||||
|
|
||||||
|
// verify RT incremental reads includes updates also
|
||||||
|
incrementalRTFiles = getRTIncrementalFiles(partitionPath);
|
||||||
|
validateIncrementalFiles(partitionPath, 1, incrementalRTFiles, rtInputFormat,
|
||||||
|
rtJobConf, 200, commitTime1, updateTime);
|
||||||
|
|
||||||
|
// write 3 - more inserts
|
||||||
|
String insertsTime = "006";
|
||||||
|
List<HoodieRecord> records006 = dataGen.generateInserts(insertsTime, 200);
|
||||||
|
client.startCommitWithTime(insertsTime);
|
||||||
|
insertAndGetFilePaths(records006, client, cfg, insertsTime);
|
||||||
|
|
||||||
|
incrementalROFiles = getROIncrementalFiles(partitionPath, true);
|
||||||
|
assertEquals(firstFilePath, incrementalROFiles[0].getPath());
|
||||||
|
// verify 006 does not show up in RO mode because of pending compaction
|
||||||
|
validateIncrementalFiles(partitionPath, 1, incrementalROFiles, roInputFormat,
|
||||||
|
roJobConf, 200, commitTime1);
|
||||||
|
|
||||||
|
// verify that if stopAtCompaction is disabled, inserts from "insertsTime" show up
|
||||||
|
incrementalROFiles = getROIncrementalFiles(partitionPath, false);
|
||||||
|
validateIncrementalFiles(partitionPath,2, incrementalROFiles, roInputFormat,
|
||||||
|
roJobConf, 400, commitTime1, insertsTime);
|
||||||
|
|
||||||
|
// verify 006 shows up in RT views
|
||||||
|
incrementalRTFiles = getRTIncrementalFiles(partitionPath);
|
||||||
|
validateIncrementalFiles(partitionPath, 2, incrementalRTFiles, rtInputFormat,
|
||||||
|
rtJobConf, 400, commitTime1, updateTime, insertsTime);
|
||||||
|
|
||||||
|
// perform the scheduled compaction
|
||||||
|
client.compact(compactionCommitTime);
|
||||||
|
|
||||||
|
incrementalROFiles = getROIncrementalFiles(partitionPath, "002", -1, true);
|
||||||
|
assertTrue(incrementalROFiles.length == 2);
|
||||||
|
// verify 006 shows up because of pending compaction
|
||||||
|
validateIncrementalFiles(partitionPath, 2, incrementalROFiles, roInputFormat,
|
||||||
|
roJobConf, 400, commitTime1, compactionCommitTime, insertsTime);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Check if record level metadata is aggregated properly at the end of write.
|
// Check if record level metadata is aggregated properly at the end of write.
|
||||||
@Test
|
@Test
|
||||||
public void testMetadataAggregateFromWriteStatus() throws Exception {
|
public void testMetadataAggregateFromWriteStatus() throws Exception {
|
||||||
@@ -1309,4 +1385,115 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness {
|
|||||||
assertFalse("Errors found in write of " + status.getFileId(), status.hasErrors());
|
assertFalse("Errors found in write of " + status.getFileId(), status.hasErrors());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private FileStatus[] insertAndGetFilePaths(List<HoodieRecord> records, HoodieWriteClient client,
|
||||||
|
HoodieWriteConfig cfg, String commitTime) throws IOException {
|
||||||
|
JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
|
||||||
|
|
||||||
|
List<WriteStatus> statuses = client.insert(writeRecords, commitTime).collect();
|
||||||
|
assertNoWriteErrors(statuses);
|
||||||
|
|
||||||
|
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
||||||
|
HoodieTable hoodieTable = HoodieTable.create(metaClient, cfg, jsc);
|
||||||
|
|
||||||
|
Option<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant();
|
||||||
|
assertTrue(deltaCommit.isPresent());
|
||||||
|
Assert.assertEquals("Delta commit should be specified value", commitTime, deltaCommit.get().getTimestamp());
|
||||||
|
|
||||||
|
Option<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().lastInstant();
|
||||||
|
assertFalse(commit.isPresent());
|
||||||
|
|
||||||
|
FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
|
||||||
|
BaseFileOnlyView roView =
|
||||||
|
new HoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles);
|
||||||
|
Stream<HoodieBaseFile> dataFilesToRead = roView.getLatestBaseFiles();
|
||||||
|
assertTrue(!dataFilesToRead.findAny().isPresent());
|
||||||
|
|
||||||
|
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
|
||||||
|
dataFilesToRead = roView.getLatestBaseFiles();
|
||||||
|
assertTrue("should list the parquet files we wrote in the delta commit",
|
||||||
|
dataFilesToRead.findAny().isPresent());
|
||||||
|
return allFiles;
|
||||||
|
}
|
||||||
|
|
||||||
|
private FileStatus[] updateAndGetFilePaths(List<HoodieRecord> records, HoodieWriteClient client,
|
||||||
|
HoodieWriteConfig cfg, String commitTime) throws IOException {
|
||||||
|
Map<HoodieKey, HoodieRecord> recordsMap = new HashMap<>();
|
||||||
|
for (HoodieRecord rec : records) {
|
||||||
|
if (!recordsMap.containsKey(rec.getKey())) {
|
||||||
|
recordsMap.put(rec.getKey(), rec);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
List<WriteStatus> statuses = client.upsert(jsc.parallelize(records, 1), commitTime).collect();
|
||||||
|
// Verify there are no errors
|
||||||
|
assertNoWriteErrors(statuses);
|
||||||
|
metaClient = HoodieTableMetaClient.reload(metaClient);
|
||||||
|
Option<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant();
|
||||||
|
assertTrue(deltaCommit.isPresent());
|
||||||
|
assertEquals("Latest Delta commit should match specified time",
|
||||||
|
commitTime, deltaCommit.get().getTimestamp());
|
||||||
|
|
||||||
|
Option<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
|
||||||
|
assertFalse(commit.isPresent());
|
||||||
|
return HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
|
||||||
|
}
|
||||||
|
|
||||||
|
private FileStatus[] getROIncrementalFiles(String partitionPath, boolean stopAtCompaction)
|
||||||
|
throws Exception {
|
||||||
|
return getROIncrementalFiles(partitionPath, "000", -1, stopAtCompaction);
|
||||||
|
}
|
||||||
|
|
||||||
|
private FileStatus[] getROIncrementalFiles(String partitionPath, String startCommitTime, int numCommitsToPull, boolean stopAtCompaction)
|
||||||
|
throws Exception {
|
||||||
|
HoodieTestUtils.init(jsc.hadoopConfiguration(), basePath, HoodieTableType.MERGE_ON_READ);
|
||||||
|
setupIncremental(roJobConf, startCommitTime, numCommitsToPull, stopAtCompaction);
|
||||||
|
FileInputFormat.setInputPaths(roJobConf, basePath + "/" + partitionPath);
|
||||||
|
return roInputFormat.listStatus(roJobConf);
|
||||||
|
}
|
||||||
|
|
||||||
|
private FileStatus[] getRTIncrementalFiles(String partitionPath)
|
||||||
|
throws Exception {
|
||||||
|
return getRTIncrementalFiles(partitionPath, "000", -1);
|
||||||
|
}
|
||||||
|
|
||||||
|
private FileStatus[] getRTIncrementalFiles(String partitionPath, String startCommitTime, int numCommitsToPull)
|
||||||
|
throws Exception {
|
||||||
|
HoodieTestUtils.init(jsc.hadoopConfiguration(), basePath, HoodieTableType.MERGE_ON_READ);
|
||||||
|
setupIncremental(rtJobConf, startCommitTime, numCommitsToPull, false);
|
||||||
|
FileInputFormat.setInputPaths(rtJobConf, basePath + "/" + partitionPath);
|
||||||
|
return rtInputFormat.listStatus(rtJobConf);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void setupIncremental(JobConf jobConf, String startCommit, int numberOfCommitsToPull, boolean stopAtCompaction) {
|
||||||
|
String modePropertyName =
|
||||||
|
String.format(HoodieHiveUtil.HOODIE_CONSUME_MODE_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME);
|
||||||
|
jobConf.set(modePropertyName, HoodieHiveUtil.INCREMENTAL_SCAN_MODE);
|
||||||
|
|
||||||
|
String startCommitTimestampName =
|
||||||
|
String.format(HoodieHiveUtil.HOODIE_START_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME);
|
||||||
|
jobConf.set(startCommitTimestampName, startCommit);
|
||||||
|
|
||||||
|
String maxCommitPulls =
|
||||||
|
String.format(HoodieHiveUtil.HOODIE_MAX_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME);
|
||||||
|
jobConf.setInt(maxCommitPulls, numberOfCommitsToPull);
|
||||||
|
|
||||||
|
String stopAtCompactionPropName =
|
||||||
|
String.format(HoodieHiveUtil.HOODIE_STOP_AT_COMPACTION_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME);
|
||||||
|
jobConf.setBoolean(stopAtCompactionPropName, stopAtCompaction);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void validateIncrementalFiles(String partitionPath, int expectedNumFiles,
|
||||||
|
FileStatus[] files, HoodieParquetInputFormat inputFormat,
|
||||||
|
JobConf jobConf, int expectedRecords, String... expectedCommits) {
|
||||||
|
|
||||||
|
assertEquals(expectedNumFiles, files.length);
|
||||||
|
Set<String> expectedCommitsSet = Arrays.asList(expectedCommits).stream().collect(Collectors.toSet());
|
||||||
|
List<GenericRecord> records = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(
|
||||||
|
Arrays.asList(basePath + "/" + partitionPath), basePath, jobConf, inputFormat);
|
||||||
|
assertEquals(expectedRecords, records.size());
|
||||||
|
Set<String> actualCommits = records.stream().map(r ->
|
||||||
|
r.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString()).collect(Collectors.toSet());
|
||||||
|
assertEquals(expectedCommitsSet, actualCommits);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -112,7 +112,7 @@ public class HoodieDefaultTimeline implements HoodieTimeline {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public HoodieTimeline getCommitsAndCompactionTimeline() {
|
public HoodieDefaultTimeline getCommitsAndCompactionTimeline() {
|
||||||
Set<String> validActions = CollectionUtils.createSet(COMMIT_ACTION, DELTA_COMMIT_ACTION, COMPACTION_ACTION);
|
Set<String> validActions = CollectionUtils.createSet(COMMIT_ACTION, DELTA_COMMIT_ACTION, COMPACTION_ACTION);
|
||||||
return new HoodieDefaultTimeline(instants.stream().filter(s -> validActions.contains(s.getAction())), details);
|
return new HoodieDefaultTimeline(instants.stream().filter(s -> validActions.contains(s.getAction())), details);
|
||||||
}
|
}
|
||||||
@@ -136,6 +136,13 @@ public class HoodieDefaultTimeline implements HoodieTimeline {
|
|||||||
details);
|
details);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public HoodieDefaultTimeline findInstantsBefore(String instantTime) {
|
||||||
|
return new HoodieDefaultTimeline(instants.stream()
|
||||||
|
.filter(s -> HoodieTimeline.compareTimestamps(s.getTimestamp(), instantTime, LESSER)),
|
||||||
|
details);
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public HoodieTimeline filter(Predicate<HoodieInstant> filter) {
|
public HoodieTimeline filter(Predicate<HoodieInstant> filter) {
|
||||||
return new HoodieDefaultTimeline(instants.stream().filter(filter), details);
|
return new HoodieDefaultTimeline(instants.stream().filter(filter), details);
|
||||||
|
|||||||
@@ -143,6 +143,11 @@ public interface HoodieTimeline extends Serializable {
|
|||||||
*/
|
*/
|
||||||
HoodieTimeline findInstantsAfter(String instantTime, int numCommits);
|
HoodieTimeline findInstantsAfter(String instantTime, int numCommits);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Create a new Timeline with all instants before specified time.
|
||||||
|
*/
|
||||||
|
HoodieTimeline findInstantsBefore(String instantTime);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Custom Filter of Instants.
|
* Custom Filter of Instants.
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -18,6 +18,7 @@
|
|||||||
|
|
||||||
package org.apache.hudi.common.table.timeline;
|
package org.apache.hudi.common.table.timeline;
|
||||||
|
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
import org.apache.hudi.common.HoodieCommonTestHarness;
|
import org.apache.hudi.common.HoodieCommonTestHarness;
|
||||||
import org.apache.hudi.common.model.HoodieTestUtils;
|
import org.apache.hudi.common.model.HoodieTestUtils;
|
||||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||||
@@ -25,8 +26,6 @@ import org.apache.hudi.common.table.timeline.HoodieInstant.State;
|
|||||||
import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion;
|
import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion;
|
||||||
import org.apache.hudi.common.util.CollectionUtils;
|
import org.apache.hudi.common.util.CollectionUtils;
|
||||||
import org.apache.hudi.common.util.Option;
|
import org.apache.hudi.common.util.Option;
|
||||||
|
|
||||||
import org.apache.hadoop.fs.Path;
|
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
import org.junit.Rule;
|
import org.junit.Rule;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
@@ -153,10 +152,15 @@ public class TestHoodieActiveTimeline extends HoodieCommonTestHarness {
|
|||||||
public void testTimelineOperations() {
|
public void testTimelineOperations() {
|
||||||
timeline = new MockHoodieTimeline(Stream.of("01", "03", "05", "07", "09", "11", "13", "15", "17", "19"),
|
timeline = new MockHoodieTimeline(Stream.of("01", "03", "05", "07", "09", "11", "13", "15", "17", "19"),
|
||||||
Stream.of("21", "23"));
|
Stream.of("21", "23"));
|
||||||
HoodieTestUtils.assertStreamEquals("", Stream.of("05", "07", "09", "11"), timeline.getCommitTimeline()
|
HoodieTestUtils.assertStreamEquals("findInstantsInRange should return 4 instants", Stream.of("05", "07", "09", "11"),
|
||||||
.filterCompletedInstants().findInstantsInRange("04", "11").getInstants().map(HoodieInstant::getTimestamp));
|
timeline.getCommitTimeline().filterCompletedInstants().findInstantsInRange("04", "11")
|
||||||
HoodieTestUtils.assertStreamEquals("", Stream.of("09", "11"), timeline.getCommitTimeline().filterCompletedInstants()
|
.getInstants().map(HoodieInstant::getTimestamp));
|
||||||
.findInstantsAfter("07", 2).getInstants().map(HoodieInstant::getTimestamp));
|
HoodieTestUtils.assertStreamEquals("findInstantsAfter 07 should return 2 instants", Stream.of("09", "11"),
|
||||||
|
timeline.getCommitTimeline().filterCompletedInstants().findInstantsAfter("07", 2)
|
||||||
|
.getInstants().map(HoodieInstant::getTimestamp));
|
||||||
|
HoodieTestUtils.assertStreamEquals("findInstantsBefore 07 should return 3 instants", Stream.of("01", "03", "05"),
|
||||||
|
timeline.getCommitTimeline().filterCompletedInstants().findInstantsBefore("07")
|
||||||
|
.getInstants().map(HoodieInstant::getTimestamp));
|
||||||
assertFalse(timeline.empty());
|
assertFalse(timeline.empty());
|
||||||
assertFalse(timeline.getCommitTimeline().filterPendingExcludingCompaction().empty());
|
assertFalse(timeline.getCommitTimeline().filterPendingExcludingCompaction().empty());
|
||||||
assertEquals("", 12, timeline.countInstants());
|
assertEquals("", 12, timeline.countInstants());
|
||||||
|
|||||||
@@ -38,6 +38,23 @@ public class HoodieHiveUtil {
|
|||||||
public static final String HOODIE_CONSUME_MODE_PATTERN = "hoodie.%s.consume.mode";
|
public static final String HOODIE_CONSUME_MODE_PATTERN = "hoodie.%s.consume.mode";
|
||||||
public static final String HOODIE_START_COMMIT_PATTERN = "hoodie.%s.consume.start.timestamp";
|
public static final String HOODIE_START_COMMIT_PATTERN = "hoodie.%s.consume.start.timestamp";
|
||||||
public static final String HOODIE_MAX_COMMIT_PATTERN = "hoodie.%s.consume.max.commits";
|
public static final String HOODIE_MAX_COMMIT_PATTERN = "hoodie.%s.consume.max.commits";
|
||||||
|
/*
|
||||||
|
* Boolean property to stop incremental reader when there is a pending compaction.
|
||||||
|
* This is needed to prevent certain race conditions with RO views of MOR tables. only applicable for RO views.
|
||||||
|
*
|
||||||
|
* example timeline:
|
||||||
|
*
|
||||||
|
* t0 -> create bucket1.parquet
|
||||||
|
* t1 -> create and append updates bucket1.log
|
||||||
|
* t2 -> request compaction
|
||||||
|
* t3 -> create bucket2.parquet
|
||||||
|
*
|
||||||
|
* if compaction at t2 takes a long time, incremental readers on RO tables can move to t3 and would skip updates in t1
|
||||||
|
*
|
||||||
|
* To workaround this problem, we want to stop returning data belonging to commits > t2.
|
||||||
|
* After compaction is complete, incremental reader would see updates in t2, t3, so on.
|
||||||
|
*/
|
||||||
|
public static final String HOODIE_STOP_AT_COMPACTION_PATTERN = "hoodie.%s.ro.stop.at.compaction";
|
||||||
public static final String INCREMENTAL_SCAN_MODE = "INCREMENTAL";
|
public static final String INCREMENTAL_SCAN_MODE = "INCREMENTAL";
|
||||||
public static final String SNAPSHOT_SCAN_MODE = "SNAPSHOT";
|
public static final String SNAPSHOT_SCAN_MODE = "SNAPSHOT";
|
||||||
public static final String DEFAULT_SCAN_MODE = SNAPSHOT_SCAN_MODE;
|
public static final String DEFAULT_SCAN_MODE = SNAPSHOT_SCAN_MODE;
|
||||||
@@ -46,6 +63,13 @@ public class HoodieHiveUtil {
|
|||||||
public static final int DEFAULT_LEVELS_TO_BASEPATH = 3;
|
public static final int DEFAULT_LEVELS_TO_BASEPATH = 3;
|
||||||
public static final Pattern HOODIE_CONSUME_MODE_PATTERN_STRING = Pattern.compile("hoodie\\.(.*)\\.consume\\.mode");
|
public static final Pattern HOODIE_CONSUME_MODE_PATTERN_STRING = Pattern.compile("hoodie\\.(.*)\\.consume\\.mode");
|
||||||
|
|
||||||
|
public static boolean stopAtCompaction(JobContext job, String tableName) {
|
||||||
|
String compactionPropName = String.format(HOODIE_STOP_AT_COMPACTION_PATTERN, tableName);
|
||||||
|
boolean stopAtCompaction = job.getConfiguration().getBoolean(compactionPropName, true);
|
||||||
|
LOG.info("Read stop at compaction - " + stopAtCompaction);
|
||||||
|
return stopAtCompaction;
|
||||||
|
}
|
||||||
|
|
||||||
public static Integer readMaxCommits(JobContext job, String tableName) {
|
public static Integer readMaxCommits(JobContext job, String tableName) {
|
||||||
String maxCommitName = String.format(HOODIE_MAX_COMMIT_PATTERN, tableName);
|
String maxCommitName = String.format(HOODIE_MAX_COMMIT_PATTERN, tableName);
|
||||||
int maxCommits = job.getConfiguration().getInt(maxCommitName, DEFAULT_MAX_COMMITS);
|
int maxCommits = job.getConfiguration().getInt(maxCommitName, DEFAULT_MAX_COMMITS);
|
||||||
|
|||||||
@@ -18,17 +18,6 @@
|
|||||||
|
|
||||||
package org.apache.hudi.hadoop;
|
package org.apache.hudi.hadoop;
|
||||||
|
|
||||||
import org.apache.hudi.common.model.HoodieBaseFile;
|
|
||||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
|
||||||
import org.apache.hudi.common.model.HoodiePartitionMetadata;
|
|
||||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
|
||||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
|
||||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
|
||||||
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
|
|
||||||
import org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView;
|
|
||||||
import org.apache.hudi.common.util.StringUtils;
|
|
||||||
import org.apache.hudi.exception.HoodieIOException;
|
|
||||||
|
|
||||||
import org.apache.hadoop.conf.Configurable;
|
import org.apache.hadoop.conf.Configurable;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.fs.FileStatus;
|
import org.apache.hadoop.fs.FileStatus;
|
||||||
@@ -42,6 +31,18 @@ import org.apache.hadoop.mapred.JobConf;
|
|||||||
import org.apache.hadoop.mapred.RecordReader;
|
import org.apache.hadoop.mapred.RecordReader;
|
||||||
import org.apache.hadoop.mapred.Reporter;
|
import org.apache.hadoop.mapred.Reporter;
|
||||||
import org.apache.hadoop.mapreduce.Job;
|
import org.apache.hadoop.mapreduce.Job;
|
||||||
|
import org.apache.hudi.common.model.HoodieBaseFile;
|
||||||
|
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||||
|
import org.apache.hudi.common.model.HoodiePartitionMetadata;
|
||||||
|
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||||
|
import org.apache.hudi.common.table.timeline.HoodieDefaultTimeline;
|
||||||
|
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||||
|
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||||
|
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
|
||||||
|
import org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView;
|
||||||
|
import org.apache.hudi.common.util.Option;
|
||||||
|
import org.apache.hudi.common.util.StringUtils;
|
||||||
|
import org.apache.hudi.exception.HoodieIOException;
|
||||||
import org.apache.log4j.LogManager;
|
import org.apache.log4j.LogManager;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
|
|
||||||
@@ -118,6 +119,36 @@ public class HoodieParquetInputFormat extends MapredParquetInputFormat implement
|
|||||||
return returns.toArray(new FileStatus[returns.size()]);
|
return returns.toArray(new FileStatus[returns.size()]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Filter any specific instants that we do not want to process.
|
||||||
|
* example timeline:
|
||||||
|
*
|
||||||
|
* t0 -> create bucket1.parquet
|
||||||
|
* t1 -> create and append updates bucket1.log
|
||||||
|
* t2 -> request compaction
|
||||||
|
* t3 -> create bucket2.parquet
|
||||||
|
*
|
||||||
|
* if compaction at t2 takes a long time, incremental readers on RO tables can move to t3 and would skip updates in t1
|
||||||
|
*
|
||||||
|
* To workaround this problem, we want to stop returning data belonging to commits > t2.
|
||||||
|
* After compaction is complete, incremental reader would see updates in t2, t3, so on.
|
||||||
|
*/
|
||||||
|
protected HoodieDefaultTimeline filterInstantsTimeline(HoodieDefaultTimeline timeline) {
|
||||||
|
HoodieDefaultTimeline commitsAndCompactionTimeline = timeline.getCommitsAndCompactionTimeline();
|
||||||
|
Option<HoodieInstant> pendingCompactionInstant = commitsAndCompactionTimeline.filterPendingCompactionTimeline().firstInstant();
|
||||||
|
if (pendingCompactionInstant.isPresent()) {
|
||||||
|
HoodieDefaultTimeline instantsTimeline = commitsAndCompactionTimeline.findInstantsBefore(pendingCompactionInstant.get().getTimestamp());
|
||||||
|
int numCommitsFilteredByCompaction = commitsAndCompactionTimeline.getCommitsTimeline().countInstants()
|
||||||
|
- instantsTimeline.getCommitsTimeline().countInstants();
|
||||||
|
LOG.info("Earliest pending compaction instant is: " + pendingCompactionInstant.get().getTimestamp()
|
||||||
|
+ " skipping " + numCommitsFilteredByCompaction + " commits");
|
||||||
|
|
||||||
|
return instantsTimeline;
|
||||||
|
} else {
|
||||||
|
return timeline;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Achieves listStatus functionality for an incrementally queried table. Instead of listing all
|
* Achieves listStatus functionality for an incrementally queried table. Instead of listing all
|
||||||
* partitions and then filtering based on the commits of interest, this logic first extracts the
|
* partitions and then filtering based on the commits of interest, this logic first extracts the
|
||||||
@@ -126,10 +157,18 @@ public class HoodieParquetInputFormat extends MapredParquetInputFormat implement
|
|||||||
private List<FileStatus> listStatusForIncrementalMode(
|
private List<FileStatus> listStatusForIncrementalMode(
|
||||||
JobConf job, HoodieTableMetaClient tableMetaClient, List<Path> inputPaths) throws IOException {
|
JobConf job, HoodieTableMetaClient tableMetaClient, List<Path> inputPaths) throws IOException {
|
||||||
String tableName = tableMetaClient.getTableConfig().getTableName();
|
String tableName = tableMetaClient.getTableConfig().getTableName();
|
||||||
HoodieTimeline timeline = tableMetaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
|
Job jobContext = Job.getInstance(job);
|
||||||
String lastIncrementalTs = HoodieHiveUtil.readStartCommitTime(Job.getInstance(job), tableName);
|
HoodieDefaultTimeline baseTimeline;
|
||||||
|
if (HoodieHiveUtil.stopAtCompaction(jobContext, tableName)) {
|
||||||
|
baseTimeline = filterInstantsTimeline(tableMetaClient.getActiveTimeline());
|
||||||
|
} else {
|
||||||
|
baseTimeline = tableMetaClient.getActiveTimeline();
|
||||||
|
}
|
||||||
|
|
||||||
|
HoodieTimeline timeline = baseTimeline.getCommitsTimeline().filterCompletedInstants();
|
||||||
|
String lastIncrementalTs = HoodieHiveUtil.readStartCommitTime(jobContext, tableName);
|
||||||
// Total number of commits to return in this batch. Set this to -1 to get all the commits.
|
// Total number of commits to return in this batch. Set this to -1 to get all the commits.
|
||||||
Integer maxCommits = HoodieHiveUtil.readMaxCommits(Job.getInstance(job), tableName);
|
Integer maxCommits = HoodieHiveUtil.readMaxCommits(jobContext, tableName);
|
||||||
LOG.info("Last Incremental timestamp was set as " + lastIncrementalTs);
|
LOG.info("Last Incremental timestamp was set as " + lastIncrementalTs);
|
||||||
List<HoodieInstant> commitsToCheck = timeline.findInstantsAfter(lastIncrementalTs, maxCommits)
|
List<HoodieInstant> commitsToCheck = timeline.findInstantsAfter(lastIncrementalTs, maxCommits)
|
||||||
.getInstants().collect(Collectors.toList());
|
.getInstants().collect(Collectors.toList());
|
||||||
|
|||||||
@@ -23,6 +23,7 @@ import org.apache.hudi.common.model.FileSlice;
|
|||||||
import org.apache.hudi.common.model.HoodieLogFile;
|
import org.apache.hudi.common.model.HoodieLogFile;
|
||||||
import org.apache.hudi.common.model.HoodieRecord;
|
import org.apache.hudi.common.model.HoodieRecord;
|
||||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||||
|
import org.apache.hudi.common.table.timeline.HoodieDefaultTimeline;
|
||||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||||
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
|
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;
|
||||||
@@ -159,6 +160,12 @@ public class HoodieParquetRealtimeInputFormat extends HoodieParquetInputFormat i
|
|||||||
return super.listStatus(job);
|
return super.listStatus(job);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
protected HoodieDefaultTimeline filterInstantsTimeline(HoodieDefaultTimeline timeline) {
|
||||||
|
// no specific filtering for Realtime format
|
||||||
|
return timeline;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Add a field to the existing fields projected.
|
* Add a field to the existing fields projected.
|
||||||
*/
|
*/
|
||||||
|
|||||||
@@ -50,8 +50,13 @@ public class InputFormatTestUtil {
|
|||||||
basePath.create();
|
basePath.create();
|
||||||
HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.getRoot().toString());
|
HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.getRoot().toString());
|
||||||
File partitionPath = basePath.newFolder("2016", "05", "01");
|
File partitionPath = basePath.newFolder("2016", "05", "01");
|
||||||
|
return simulateInserts(partitionPath, "fileId1", numberOfFiles, commitNumber);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static File simulateInserts(File partitionPath, String fileId, int numberOfFiles, String commitNumber)
|
||||||
|
throws IOException {
|
||||||
for (int i = 0; i < numberOfFiles; i++) {
|
for (int i = 0; i < numberOfFiles; i++) {
|
||||||
File dataFile = new File(partitionPath, FSUtils.makeDataFileName(commitNumber, TEST_WRITE_TOKEN, "fileid" + i));
|
File dataFile = new File(partitionPath, FSUtils.makeDataFileName(commitNumber, TEST_WRITE_TOKEN, fileId + i));
|
||||||
dataFile.createNewFile();
|
dataFile.createNewFile();
|
||||||
}
|
}
|
||||||
return partitionPath;
|
return partitionPath;
|
||||||
|
|||||||
@@ -18,6 +18,7 @@
|
|||||||
|
|
||||||
package org.apache.hudi.hadoop;
|
package org.apache.hudi.hadoop;
|
||||||
|
|
||||||
|
import org.apache.hudi.avro.model.HoodieCompactionPlan;
|
||||||
import org.apache.hudi.common.fs.FSUtils;
|
import org.apache.hudi.common.fs.FSUtils;
|
||||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||||
import org.apache.hudi.common.model.HoodieTestUtils;
|
import org.apache.hudi.common.model.HoodieTestUtils;
|
||||||
@@ -32,6 +33,11 @@ import org.apache.hadoop.mapred.InputSplit;
|
|||||||
import org.apache.hadoop.mapred.JobConf;
|
import org.apache.hadoop.mapred.JobConf;
|
||||||
import org.apache.hadoop.mapred.RecordReader;
|
import org.apache.hadoop.mapred.RecordReader;
|
||||||
import org.apache.hadoop.mapreduce.Job;
|
import org.apache.hadoop.mapreduce.Job;
|
||||||
|
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||||
|
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
|
||||||
|
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||||
|
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||||
|
import org.apache.hudi.common.table.timeline.TimelineMetadataUtils;
|
||||||
import org.junit.Before;
|
import org.junit.Before;
|
||||||
import org.junit.Rule;
|
import org.junit.Rule;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
@@ -41,9 +47,11 @@ import java.io.File;
|
|||||||
import java.io.FileOutputStream;
|
import java.io.FileOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import static org.junit.Assert.assertEquals;
|
import static org.junit.Assert.assertEquals;
|
||||||
|
import static org.junit.Assert.assertFalse;
|
||||||
import static org.junit.Assert.assertTrue;
|
import static org.junit.Assert.assertTrue;
|
||||||
|
|
||||||
public class TestHoodieParquetInputFormat {
|
public class TestHoodieParquetInputFormat {
|
||||||
@@ -61,6 +69,67 @@ public class TestHoodieParquetInputFormat {
|
|||||||
@Rule
|
@Rule
|
||||||
public TemporaryFolder basePath = new TemporaryFolder();
|
public TemporaryFolder basePath = new TemporaryFolder();
|
||||||
|
|
||||||
|
// Verify that HoodieParquetInputFormat does not return instants after pending compaction
|
||||||
|
@Test
|
||||||
|
public void testPendingCompactionWithActiveCommits() throws IOException {
|
||||||
|
// setup 4 sample instants in timeline
|
||||||
|
List<HoodieInstant> instants = new ArrayList<>();
|
||||||
|
HoodieInstant t1 = new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "1");
|
||||||
|
HoodieInstant t2 = new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.DELTA_COMMIT_ACTION, "2");
|
||||||
|
HoodieInstant t3 = new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "3");
|
||||||
|
HoodieInstant t4 = new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, "4");
|
||||||
|
HoodieInstant t5 = new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "5");
|
||||||
|
HoodieInstant t6 = new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, "6");
|
||||||
|
|
||||||
|
instants.add(t1);
|
||||||
|
instants.add(t2);
|
||||||
|
instants.add(t3);
|
||||||
|
instants.add(t4);
|
||||||
|
instants.add(t5);
|
||||||
|
instants.add(t6);
|
||||||
|
HoodieTableMetaClient metaClient = HoodieTestUtils.init(basePath.getRoot().getAbsolutePath().toString());
|
||||||
|
HoodieActiveTimeline timeline = new HoodieActiveTimeline(metaClient);
|
||||||
|
timeline.setInstants(instants);
|
||||||
|
|
||||||
|
// Verify getCommitsTimelineBeforePendingCompaction does not return instants after first compaction instant
|
||||||
|
HoodieTimeline filteredTimeline = new HoodieParquetInputFormat().filterInstantsTimeline(timeline);
|
||||||
|
assertTrue(filteredTimeline.containsInstant(t1));
|
||||||
|
assertTrue(filteredTimeline.containsInstant(t2));
|
||||||
|
assertFalse(filteredTimeline.containsInstant(t3));
|
||||||
|
assertFalse(filteredTimeline.containsInstant(t4));
|
||||||
|
assertFalse(filteredTimeline.containsInstant(t5));
|
||||||
|
assertFalse(filteredTimeline.containsInstant(t6));
|
||||||
|
|
||||||
|
|
||||||
|
// remove compaction instant and setup timeline again
|
||||||
|
instants.remove(t3);
|
||||||
|
timeline = new HoodieActiveTimeline(metaClient);
|
||||||
|
timeline.setInstants(instants);
|
||||||
|
filteredTimeline = new HoodieParquetInputFormat().filterInstantsTimeline(timeline);
|
||||||
|
|
||||||
|
// verify all remaining instants are returned.
|
||||||
|
assertTrue(filteredTimeline.containsInstant(t1));
|
||||||
|
assertTrue(filteredTimeline.containsInstant(t2));
|
||||||
|
assertFalse(filteredTimeline.containsInstant(t3));
|
||||||
|
assertTrue(filteredTimeline.containsInstant(t4));
|
||||||
|
assertFalse(filteredTimeline.containsInstant(t5));
|
||||||
|
assertFalse(filteredTimeline.containsInstant(t6));
|
||||||
|
|
||||||
|
// remove remaining compaction instant and setup timeline again
|
||||||
|
instants.remove(t5);
|
||||||
|
timeline = new HoodieActiveTimeline(metaClient);
|
||||||
|
timeline.setInstants(instants);
|
||||||
|
filteredTimeline = new HoodieParquetInputFormat().filterInstantsTimeline(timeline);
|
||||||
|
|
||||||
|
// verify all remaining instants are returned.
|
||||||
|
assertTrue(filteredTimeline.containsInstant(t1));
|
||||||
|
assertTrue(filteredTimeline.containsInstant(t2));
|
||||||
|
assertFalse(filteredTimeline.containsInstant(t3));
|
||||||
|
assertTrue(filteredTimeline.containsInstant(t4));
|
||||||
|
assertFalse(filteredTimeline.containsInstant(t5));
|
||||||
|
assertTrue(filteredTimeline.containsInstant(t6));
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testInputFormatLoad() throws IOException {
|
public void testInputFormatLoad() throws IOException {
|
||||||
// initial commit
|
// initial commit
|
||||||
@@ -133,6 +202,22 @@ public class TestHoodieParquetInputFormat {
|
|||||||
fileOutputStream.close();
|
fileOutputStream.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private File createCompactionFile(TemporaryFolder basePath, String commitTime)
|
||||||
|
throws IOException {
|
||||||
|
File file = new File(basePath.getRoot().toString() + "/.hoodie/",
|
||||||
|
HoodieTimeline.makeRequestedCompactionFileName(commitTime));
|
||||||
|
assertTrue(file.createNewFile());
|
||||||
|
FileOutputStream os = new FileOutputStream(file);
|
||||||
|
try {
|
||||||
|
HoodieCompactionPlan compactionPlan = HoodieCompactionPlan.newBuilder().setVersion(2).build();
|
||||||
|
// Write empty commit metadata
|
||||||
|
os.write(TimelineMetadataUtils.serializeCompactionPlan(compactionPlan).get());
|
||||||
|
return file;
|
||||||
|
} finally {
|
||||||
|
os.close();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testIncrementalWithMultipleCommits() throws IOException {
|
public void testIncrementalWithMultipleCommits() throws IOException {
|
||||||
// initial commit
|
// initial commit
|
||||||
@@ -228,6 +313,43 @@ public class TestHoodieParquetInputFormat {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// test incremental read does not go past compaction instant for RO views
|
||||||
|
@Test
|
||||||
|
public void testIncrementalWithPendingCompaction() throws IOException {
|
||||||
|
// initial commit
|
||||||
|
File partitionDir = InputFormatTestUtil.prepareTable(basePath, 10, "100");
|
||||||
|
createCommitFile(basePath, "100", "2016/05/01");
|
||||||
|
|
||||||
|
// simulate compaction requested at 300
|
||||||
|
File compactionFile = createCompactionFile(basePath, "300");
|
||||||
|
|
||||||
|
// write inserts into new bucket
|
||||||
|
InputFormatTestUtil.simulateInserts(partitionDir, "fileId2", 10, "400");
|
||||||
|
createCommitFile(basePath, "400", "2016/05/01");
|
||||||
|
|
||||||
|
// Add the paths
|
||||||
|
FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());
|
||||||
|
InputFormatTestUtil.setupIncremental(jobConf, "0", -1);
|
||||||
|
FileStatus[] files = inputFormat.listStatus(jobConf);
|
||||||
|
assertEquals("Pulling all commit from beginning, should not return instants after begin compaction",
|
||||||
|
10, files.length);
|
||||||
|
ensureFilesInCommit("Pulling all commit from beginning, should not return instants after begin compaction",
|
||||||
|
files, "100", 10);
|
||||||
|
|
||||||
|
// delete compaction and verify inserts show up
|
||||||
|
compactionFile.delete();
|
||||||
|
InputFormatTestUtil.setupIncremental(jobConf, "0", -1);
|
||||||
|
files = inputFormat.listStatus(jobConf);
|
||||||
|
assertEquals("after deleting compaction, should get all inserted files",
|
||||||
|
20, files.length);
|
||||||
|
|
||||||
|
ensureFilesInCommit("Pulling all commit from beginning, should return instants before requested compaction",
|
||||||
|
files, "100", 10);
|
||||||
|
ensureFilesInCommit("Pulling all commit from beginning, should return instants after requested compaction",
|
||||||
|
files, "400", 10);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
private void ensureRecordsInCommit(String msg, String commit, int expectedNumberOfRecordsInCommit,
|
private void ensureRecordsInCommit(String msg, String commit, int expectedNumberOfRecordsInCommit,
|
||||||
int totalExpected) throws IOException {
|
int totalExpected) throws IOException {
|
||||||
int actualCount = 0;
|
int actualCount = 0;
|
||||||
|
|||||||
Reference in New Issue
Block a user