[HUDI-687] Stop incremental reader on RO table when there is a pending compaction (#1396)
This commit is contained in:
@@ -20,6 +20,7 @@ package org.apache.hudi.common;
|
||||
|
||||
import org.apache.hudi.avro.HoodieAvroUtils;
|
||||
import org.apache.hudi.common.model.HoodieTestUtils;
|
||||
import org.apache.hudi.hadoop.HoodieParquetInputFormat;
|
||||
import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
@@ -51,29 +52,36 @@ public class HoodieMergeOnReadTestUtils {
|
||||
public static List<GenericRecord> getRecordsUsingInputFormat(List<String> inputPaths, String basePath,
|
||||
Configuration conf) {
|
||||
JobConf jobConf = new JobConf(conf);
|
||||
return getRecordsUsingInputFormat(inputPaths, basePath, jobConf, new HoodieParquetRealtimeInputFormat());
|
||||
}
|
||||
|
||||
public static List<GenericRecord> getRecordsUsingInputFormat(List<String> inputPaths,
|
||||
String basePath,
|
||||
JobConf jobConf,
|
||||
HoodieParquetInputFormat inputFormat) {
|
||||
Schema schema = HoodieAvroUtils.addMetadataFields(
|
||||
new Schema.Parser().parse(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA));
|
||||
HoodieParquetRealtimeInputFormat inputFormat = new HoodieParquetRealtimeInputFormat();
|
||||
setPropsForInputFormat(inputFormat, jobConf, schema, basePath);
|
||||
return inputPaths.stream().map(path -> {
|
||||
setInputPath(jobConf, path);
|
||||
List<GenericRecord> records = new ArrayList<>();
|
||||
try {
|
||||
List<InputSplit> splits = Arrays.asList(inputFormat.getSplits(jobConf, 1));
|
||||
RecordReader recordReader = inputFormat.getRecordReader(splits.get(0), jobConf, null);
|
||||
Void key = (Void) recordReader.createKey();
|
||||
ArrayWritable writable = (ArrayWritable) recordReader.createValue();
|
||||
while (recordReader.next(key, writable)) {
|
||||
GenericRecordBuilder newRecord = new GenericRecordBuilder(schema);
|
||||
// writable returns an array with [field1, field2, _hoodie_commit_time,
|
||||
// _hoodie_commit_seqno]
|
||||
Writable[] values = writable.get();
|
||||
final int[] fieldIndex = {0};
|
||||
assert schema.getFields().size() <= values.length;
|
||||
schema.getFields().forEach(field -> {
|
||||
newRecord.set(field, values[fieldIndex[0]++]);
|
||||
});
|
||||
records.add(newRecord.build());
|
||||
for (InputSplit split : splits) {
|
||||
RecordReader recordReader = inputFormat.getRecordReader(split, jobConf, null);
|
||||
Void key = (Void) recordReader.createKey();
|
||||
ArrayWritable writable = (ArrayWritable) recordReader.createValue();
|
||||
while (recordReader.next(key, writable)) {
|
||||
GenericRecordBuilder newRecord = new GenericRecordBuilder(schema);
|
||||
// writable returns an array with [field1, field2, _hoodie_commit_time,
|
||||
// _hoodie_commit_seqno]
|
||||
Writable[] values = writable.get();
|
||||
assert schema.getFields().size() <= values.length;
|
||||
schema.getFields().forEach(field -> {
|
||||
newRecord.set(field, values[field.pos()]);
|
||||
});
|
||||
records.add(newRecord.build());
|
||||
}
|
||||
}
|
||||
} catch (IOException ie) {
|
||||
ie.printStackTrace();
|
||||
@@ -85,7 +93,7 @@ public class HoodieMergeOnReadTestUtils {
|
||||
}).orElse(new ArrayList<GenericRecord>());
|
||||
}
|
||||
|
||||
private static void setPropsForInputFormat(HoodieParquetRealtimeInputFormat inputFormat, JobConf jobConf,
|
||||
private static void setPropsForInputFormat(HoodieParquetInputFormat inputFormat, JobConf jobConf,
|
||||
Schema schema, String basePath) {
|
||||
List<Schema.Field> fields = schema.getFields();
|
||||
String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
|
||||
package org.apache.hudi.table;
|
||||
|
||||
import org.apache.hudi.client.HoodieWriteClient;
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
import org.apache.hudi.common.HoodieClientTestHarness;
|
||||
import org.apache.hudi.common.HoodieClientTestUtils;
|
||||
@@ -29,9 +30,9 @@ import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordLocation;
|
||||
import org.apache.hudi.common.model.HoodieTableType;
|
||||
import org.apache.hudi.common.model.HoodieTestUtils;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.util.FileIOUtils;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.ParquetUtils;
|
||||
@@ -39,11 +40,17 @@ import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.config.HoodieCompactionConfig;
|
||||
import org.apache.hudi.config.HoodieStorageConfig;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.hadoop.HoodieHiveUtil;
|
||||
import org.apache.hudi.hadoop.HoodieParquetInputFormat;
|
||||
import org.apache.hudi.io.HoodieCreateHandle;
|
||||
import org.apache.hudi.table.HoodieCopyOnWriteTable.UpsertPartitioner;
|
||||
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.mapred.FileInputFormat;
|
||||
import org.apache.hadoop.mapred.JobConf;
|
||||
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
import org.apache.parquet.avro.AvroReadSupport;
|
||||
@@ -64,7 +71,6 @@ import java.util.UUID;
|
||||
import scala.Tuple2;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertNotNull;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.mockito.Mockito.mock;
|
||||
import static org.mockito.Mockito.when;
|
||||
@@ -129,6 +135,8 @@ public class TestCopyOnWriteTable extends HoodieClientTestHarness {
|
||||
// Prepare the AvroParquetIO
|
||||
HoodieWriteConfig config = makeHoodieClientConfig();
|
||||
String firstCommitTime = HoodieTestUtils.makeNewCommitTime();
|
||||
HoodieWriteClient writeClient = new HoodieWriteClient(jsc, config);
|
||||
writeClient.startCommitWithTime(firstCommitTime);
|
||||
metaClient = HoodieTableMetaClient.reload(metaClient);
|
||||
|
||||
String partitionPath = "/2016/01/31";
|
||||
@@ -154,30 +162,17 @@ public class TestCopyOnWriteTable extends HoodieClientTestHarness {
|
||||
|
||||
// Insert new records
|
||||
final HoodieCopyOnWriteTable cowTable = table;
|
||||
jsc.parallelize(Arrays.asList(1)).map(x -> {
|
||||
return cowTable.handleInsert(firstCommitTime, FSUtils.createNewFileIdPfx(), records.iterator());
|
||||
}).map(x -> HoodieClientTestUtils.collectStatuses(x)).collect();
|
||||
writeClient.insert(jsc.parallelize(records, 1), firstCommitTime);
|
||||
|
||||
// We should have a parquet file generated (TODO: better control # files after we revise
|
||||
// AvroParquetIO)
|
||||
File parquetFile = null;
|
||||
for (File file : new File(this.basePath + partitionPath).listFiles()) {
|
||||
if (file.getName().endsWith(".parquet")) {
|
||||
parquetFile = file;
|
||||
break;
|
||||
}
|
||||
}
|
||||
assertTrue(parquetFile != null);
|
||||
FileStatus[] allFiles = getIncrementalFiles(partitionPath, "0", -1);
|
||||
assertEquals(1, allFiles.length);
|
||||
|
||||
// Read out the bloom filter and make sure filter can answer record exist or not
|
||||
Path parquetFilePath = new Path(parquetFile.getAbsolutePath());
|
||||
Path parquetFilePath = allFiles[0].getPath();
|
||||
BloomFilter filter = ParquetUtils.readBloomFilterFromParquetMetadata(jsc.hadoopConfiguration(), parquetFilePath);
|
||||
for (HoodieRecord record : records) {
|
||||
assertTrue(filter.mightContain(record.getRecordKey()));
|
||||
}
|
||||
// Create a commit file
|
||||
new File(this.basePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/"
|
||||
+ FSUtils.getCommitTime(parquetFile.getName()) + ".commit").createNewFile();
|
||||
|
||||
// Read the parquet file, check the record content
|
||||
List<GenericRecord> fileRecords = ParquetUtils.readAvroRecords(jsc.hadoopConfiguration(), parquetFilePath);
|
||||
@@ -194,9 +189,6 @@ public class TestCopyOnWriteTable extends HoodieClientTestHarness {
|
||||
TestRawTripPayload updateRowChanges1 = new TestRawTripPayload(updateRecordStr1);
|
||||
HoodieRecord updatedRecord1 = new HoodieRecord(
|
||||
new HoodieKey(updateRowChanges1.getRowKey(), updateRowChanges1.getPartitionPath()), updateRowChanges1);
|
||||
updatedRecord1.unseal();
|
||||
updatedRecord1.setCurrentLocation(new HoodieRecordLocation(null, FSUtils.getFileId(parquetFile.getName())));
|
||||
updatedRecord1.seal();
|
||||
|
||||
TestRawTripPayload rowChange4 = new TestRawTripPayload(recordStr4);
|
||||
HoodieRecord insertedRecord1 =
|
||||
@@ -207,27 +199,16 @@ public class TestCopyOnWriteTable extends HoodieClientTestHarness {
|
||||
Thread.sleep(1000);
|
||||
String newCommitTime = HoodieTestUtils.makeNewCommitTime();
|
||||
metaClient = HoodieTableMetaClient.reload(metaClient);
|
||||
final HoodieCopyOnWriteTable newTable = (HoodieCopyOnWriteTable) HoodieTable.create(metaClient, config, jsc);
|
||||
List<WriteStatus> statuses = jsc.parallelize(Arrays.asList(1)).map(x -> {
|
||||
return newTable.handleUpdate(newCommitTime, updatedRecord1.getPartitionPath(),
|
||||
updatedRecord1.getCurrentLocation().getFileId(), updatedRecords.iterator());
|
||||
}).flatMap(x -> HoodieClientTestUtils.collectStatuses(x).iterator()).collect();
|
||||
writeClient.startCommitWithTime(newCommitTime);
|
||||
List<WriteStatus> statuses = writeClient.upsert(jsc.parallelize(updatedRecords), newCommitTime).collect();
|
||||
|
||||
allFiles = getIncrementalFiles(partitionPath, firstCommitTime, -1);
|
||||
assertEquals(1, allFiles.length);
|
||||
// verify new incremental file group is same as the previous one
|
||||
assertEquals(FSUtils.getFileId(parquetFilePath.getName()), FSUtils.getFileId(allFiles[0].getPath().getName()));
|
||||
|
||||
// Check the updated file
|
||||
File updatedParquetFile = null;
|
||||
for (File file : new File(basePath + "/2016/01/31").listFiles()) {
|
||||
if (file.getName().endsWith(".parquet")) {
|
||||
if (FSUtils.getFileId(file.getName()).equals(FSUtils.getFileId(parquetFile.getName()))
|
||||
&& HoodieTimeline.compareTimestamps(FSUtils.getCommitTime(file.getName()),
|
||||
FSUtils.getCommitTime(parquetFile.getName()), HoodieTimeline.GREATER)) {
|
||||
updatedParquetFile = file;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
assertNotNull(updatedParquetFile);
|
||||
// Check whether the record has been updated
|
||||
Path updatedParquetFilePath = new Path(updatedParquetFile.getAbsolutePath());
|
||||
Path updatedParquetFilePath = allFiles[0].getPath();
|
||||
BloomFilter updatedFilter =
|
||||
ParquetUtils.readBloomFilterFromParquetMetadata(jsc.hadoopConfiguration(), updatedParquetFilePath);
|
||||
for (HoodieRecord record : records) {
|
||||
@@ -254,6 +235,32 @@ public class TestCopyOnWriteTable extends HoodieClientTestHarness {
|
||||
assertEquals(4, writeStatus.getStat().getNumWrites());// 3 rewritten records + 1 new record
|
||||
}
|
||||
|
||||
private FileStatus[] getIncrementalFiles(String partitionPath, String startCommitTime, int numCommitsToPull)
|
||||
throws Exception {
|
||||
// initialize parquet input format
|
||||
HoodieParquetInputFormat hoodieInputFormat = new HoodieParquetInputFormat();
|
||||
JobConf jobConf = new JobConf(jsc.hadoopConfiguration());
|
||||
hoodieInputFormat.setConf(jobConf);
|
||||
HoodieTestUtils.init(jsc.hadoopConfiguration(), basePath, HoodieTableType.COPY_ON_WRITE);
|
||||
setupIncremental(jobConf, startCommitTime, numCommitsToPull);
|
||||
FileInputFormat.setInputPaths(jobConf, basePath + partitionPath);
|
||||
return hoodieInputFormat.listStatus(jobConf);
|
||||
}
|
||||
|
||||
private void setupIncremental(JobConf jobConf, String startCommit, int numberOfCommitsToPull) {
|
||||
String modePropertyName =
|
||||
String.format(HoodieHiveUtil.HOODIE_CONSUME_MODE_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME);
|
||||
jobConf.set(modePropertyName, HoodieHiveUtil.INCREMENTAL_SCAN_MODE);
|
||||
|
||||
String startCommitTimestampName =
|
||||
String.format(HoodieHiveUtil.HOODIE_START_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME);
|
||||
jobConf.set(startCommitTimestampName, startCommit);
|
||||
|
||||
String maxCommitPulls =
|
||||
String.format(HoodieHiveUtil.HOODIE_MAX_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME);
|
||||
jobConf.setInt(maxCommitPulls, numberOfCommitsToPull);
|
||||
}
|
||||
|
||||
private List<HoodieRecord> newHoodieRecords(int n, String time) throws Exception {
|
||||
List<HoodieRecord> records = new ArrayList<>();
|
||||
for (int i = 0; i < n; i++) {
|
||||
|
||||
@@ -18,6 +18,8 @@
|
||||
|
||||
package org.apache.hudi.table;
|
||||
|
||||
import org.apache.hadoop.mapred.FileInputFormat;
|
||||
import org.apache.hadoop.mapred.JobConf;
|
||||
import org.apache.hudi.client.HoodieReadClient;
|
||||
import org.apache.hudi.client.HoodieWriteClient;
|
||||
import org.apache.hudi.client.WriteStatus;
|
||||
@@ -50,6 +52,9 @@ import org.apache.hudi.config.HoodieCompactionConfig;
|
||||
import org.apache.hudi.config.HoodieIndexConfig;
|
||||
import org.apache.hudi.config.HoodieStorageConfig;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.hadoop.HoodieHiveUtil;
|
||||
import org.apache.hudi.hadoop.HoodieParquetInputFormat;
|
||||
import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat;
|
||||
import org.apache.hudi.index.HoodieIndex;
|
||||
import org.apache.hudi.index.HoodieIndex.IndexType;
|
||||
|
||||
@@ -70,6 +75,7 @@ import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
@@ -80,6 +86,12 @@ import static org.junit.Assert.assertTrue;
|
||||
|
||||
public class TestMergeOnReadTable extends HoodieClientTestHarness {
|
||||
|
||||
private HoodieParquetInputFormat roInputFormat;
|
||||
private JobConf roJobConf;
|
||||
|
||||
private HoodieParquetRealtimeInputFormat rtInputFormat;
|
||||
private JobConf rtJobConf;
|
||||
|
||||
@Before
|
||||
public void init() throws IOException {
|
||||
initDFS();
|
||||
@@ -89,6 +101,15 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness {
|
||||
dfs.mkdirs(new Path(basePath));
|
||||
HoodieTestUtils.init(jsc.hadoopConfiguration(), basePath, HoodieTableType.MERGE_ON_READ);
|
||||
initTestDataGenerator();
|
||||
|
||||
// initialize parquet input format
|
||||
roInputFormat = new HoodieParquetInputFormat();
|
||||
roJobConf = new JobConf(jsc.hadoopConfiguration());
|
||||
roInputFormat.setConf(roJobConf);
|
||||
|
||||
rtInputFormat = new HoodieParquetRealtimeInputFormat();
|
||||
rtJobConf = new JobConf(jsc.hadoopConfiguration());
|
||||
rtInputFormat.setConf(rtJobConf);
|
||||
}
|
||||
|
||||
@After
|
||||
@@ -114,63 +135,23 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness {
|
||||
client.startCommitWithTime(newCommitTime);
|
||||
|
||||
List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 200);
|
||||
JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
|
||||
|
||||
List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
|
||||
assertNoWriteErrors(statuses);
|
||||
|
||||
HoodieTableMetaClient metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
||||
HoodieTable hoodieTable = HoodieTable.create(metaClient, cfg, jsc);
|
||||
|
||||
Option<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
|
||||
assertTrue(deltaCommit.isPresent());
|
||||
Assert.assertEquals("Delta commit should be 001", "001", deltaCommit.get().getTimestamp());
|
||||
|
||||
Option<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
|
||||
assertFalse(commit.isPresent());
|
||||
|
||||
FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
|
||||
BaseFileOnlyView roView =
|
||||
new HoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles);
|
||||
Stream<HoodieBaseFile> dataFilesToRead = roView.getLatestBaseFiles();
|
||||
assertTrue(!dataFilesToRead.findAny().isPresent());
|
||||
|
||||
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
|
||||
dataFilesToRead = roView.getLatestBaseFiles();
|
||||
assertTrue("should list the parquet files we wrote in the delta commit",
|
||||
dataFilesToRead.findAny().isPresent());
|
||||
insertAndGetFilePaths(records, client, cfg, newCommitTime);
|
||||
|
||||
/**
|
||||
* Write 2 (updates)
|
||||
*/
|
||||
newCommitTime = "004";
|
||||
client.startCommitWithTime(newCommitTime);
|
||||
|
||||
records = dataGen.generateUpdates(newCommitTime, 100);
|
||||
Map<HoodieKey, HoodieRecord> recordsMap = new HashMap<>();
|
||||
for (HoodieRecord rec : records) {
|
||||
if (!recordsMap.containsKey(rec.getKey())) {
|
||||
recordsMap.put(rec.getKey(), rec);
|
||||
}
|
||||
}
|
||||
|
||||
statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
|
||||
// Verify there are no errors
|
||||
assertNoWriteErrors(statuses);
|
||||
metaClient = HoodieTableMetaClient.reload(metaClient);
|
||||
deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant();
|
||||
assertTrue(deltaCommit.isPresent());
|
||||
assertEquals("Latest Delta commit should be 004", "004", deltaCommit.get().getTimestamp());
|
||||
|
||||
commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
|
||||
assertFalse(commit.isPresent());
|
||||
updateAndGetFilePaths(records, client, cfg, newCommitTime);
|
||||
|
||||
String compactionCommitTime = client.scheduleCompaction(Option.empty()).get().toString();
|
||||
client.compact(compactionCommitTime);
|
||||
|
||||
allFiles = HoodieTestUtils.listAllDataFilesInPath(dfs, cfg.getBasePath());
|
||||
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
|
||||
dataFilesToRead = roView.getLatestBaseFiles();
|
||||
FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(dfs, cfg.getBasePath());
|
||||
HoodieTable hoodieTable = HoodieTable.create(metaClient, cfg, jsc);
|
||||
HoodieTableFileSystemView roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
|
||||
Stream<HoodieBaseFile> dataFilesToRead = roView.getLatestBaseFiles();
|
||||
assertTrue(dataFilesToRead.findAny().isPresent());
|
||||
|
||||
// verify that there is a commit
|
||||
@@ -186,6 +167,101 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness {
|
||||
}
|
||||
}
|
||||
|
||||
// test incremental read does not go past compaction instant for RO views
|
||||
// For RT views, incremental read can go past compaction
|
||||
@Test
|
||||
public void testIncrementalReadsWithCompaction() throws Exception {
|
||||
String partitionPath = "2020/02/20"; // use only one partition for this test
|
||||
dataGen = new HoodieTestDataGenerator(new String[] { partitionPath });
|
||||
HoodieWriteConfig cfg = getConfig(true);
|
||||
try (HoodieWriteClient client = getWriteClient(cfg);) {
|
||||
|
||||
/**
|
||||
* Write 1 (only inserts)
|
||||
*/
|
||||
String commitTime1 = "001";
|
||||
client.startCommitWithTime(commitTime1);
|
||||
|
||||
List<HoodieRecord> records001 = dataGen.generateInserts(commitTime1, 200);
|
||||
insertAndGetFilePaths(records001, client, cfg, commitTime1);
|
||||
|
||||
// verify only one parquet file shows up with commit time 001
|
||||
FileStatus[] incrementalROFiles = getROIncrementalFiles(partitionPath, true);
|
||||
validateIncrementalFiles(partitionPath, 1, incrementalROFiles, roInputFormat,
|
||||
roJobConf,200, commitTime1);
|
||||
Path firstFilePath = incrementalROFiles[0].getPath();
|
||||
|
||||
FileStatus[] incrementalRTFiles = getRTIncrementalFiles(partitionPath);
|
||||
validateIncrementalFiles(partitionPath, 1, incrementalRTFiles, rtInputFormat,
|
||||
rtJobConf,200, commitTime1);
|
||||
assertEquals(firstFilePath, incrementalRTFiles[0].getPath());
|
||||
|
||||
/**
|
||||
* Write 2 (updates)
|
||||
*/
|
||||
String updateTime = "004";
|
||||
client.startCommitWithTime(updateTime);
|
||||
List<HoodieRecord> records004 = dataGen.generateUpdates(updateTime, 100);
|
||||
updateAndGetFilePaths(records004, client, cfg, updateTime);
|
||||
|
||||
// verify RO incremental reads - only one parquet file shows up because updates to into log files
|
||||
incrementalROFiles = getROIncrementalFiles(partitionPath, false);
|
||||
validateIncrementalFiles(partitionPath, 1, incrementalROFiles, roInputFormat,
|
||||
roJobConf, 200, commitTime1);
|
||||
assertEquals(firstFilePath, incrementalROFiles[0].getPath());
|
||||
|
||||
// verify RT incremental reads includes updates also
|
||||
incrementalRTFiles = getRTIncrementalFiles(partitionPath);
|
||||
validateIncrementalFiles(partitionPath, 1, incrementalRTFiles, rtInputFormat,
|
||||
rtJobConf, 200, commitTime1, updateTime);
|
||||
|
||||
// request compaction, but do not perform compaction
|
||||
String compactionCommitTime = "005";
|
||||
client.scheduleCompactionAtInstant("005", Option.empty());
|
||||
|
||||
// verify RO incremental reads - only one parquet file shows up because updates go into log files
|
||||
incrementalROFiles = getROIncrementalFiles(partitionPath, true);
|
||||
validateIncrementalFiles(partitionPath,1, incrementalROFiles, roInputFormat,
|
||||
roJobConf, 200, commitTime1);
|
||||
|
||||
// verify RT incremental reads includes updates also
|
||||
incrementalRTFiles = getRTIncrementalFiles(partitionPath);
|
||||
validateIncrementalFiles(partitionPath, 1, incrementalRTFiles, rtInputFormat,
|
||||
rtJobConf, 200, commitTime1, updateTime);
|
||||
|
||||
// write 3 - more inserts
|
||||
String insertsTime = "006";
|
||||
List<HoodieRecord> records006 = dataGen.generateInserts(insertsTime, 200);
|
||||
client.startCommitWithTime(insertsTime);
|
||||
insertAndGetFilePaths(records006, client, cfg, insertsTime);
|
||||
|
||||
incrementalROFiles = getROIncrementalFiles(partitionPath, true);
|
||||
assertEquals(firstFilePath, incrementalROFiles[0].getPath());
|
||||
// verify 006 does not show up in RO mode because of pending compaction
|
||||
validateIncrementalFiles(partitionPath, 1, incrementalROFiles, roInputFormat,
|
||||
roJobConf, 200, commitTime1);
|
||||
|
||||
// verify that if stopAtCompaction is disabled, inserts from "insertsTime" show up
|
||||
incrementalROFiles = getROIncrementalFiles(partitionPath, false);
|
||||
validateIncrementalFiles(partitionPath,2, incrementalROFiles, roInputFormat,
|
||||
roJobConf, 400, commitTime1, insertsTime);
|
||||
|
||||
// verify 006 shows up in RT views
|
||||
incrementalRTFiles = getRTIncrementalFiles(partitionPath);
|
||||
validateIncrementalFiles(partitionPath, 2, incrementalRTFiles, rtInputFormat,
|
||||
rtJobConf, 400, commitTime1, updateTime, insertsTime);
|
||||
|
||||
// perform the scheduled compaction
|
||||
client.compact(compactionCommitTime);
|
||||
|
||||
incrementalROFiles = getROIncrementalFiles(partitionPath, "002", -1, true);
|
||||
assertTrue(incrementalROFiles.length == 2);
|
||||
// verify 006 shows up because of pending compaction
|
||||
validateIncrementalFiles(partitionPath, 2, incrementalROFiles, roInputFormat,
|
||||
roJobConf, 400, commitTime1, compactionCommitTime, insertsTime);
|
||||
}
|
||||
}
|
||||
|
||||
// Check if record level metadata is aggregated properly at the end of write.
|
||||
@Test
|
||||
public void testMetadataAggregateFromWriteStatus() throws Exception {
|
||||
@@ -1309,4 +1385,115 @@ public class TestMergeOnReadTable extends HoodieClientTestHarness {
|
||||
assertFalse("Errors found in write of " + status.getFileId(), status.hasErrors());
|
||||
}
|
||||
}
|
||||
|
||||
private FileStatus[] insertAndGetFilePaths(List<HoodieRecord> records, HoodieWriteClient client,
|
||||
HoodieWriteConfig cfg, String commitTime) throws IOException {
|
||||
JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
|
||||
|
||||
List<WriteStatus> statuses = client.insert(writeRecords, commitTime).collect();
|
||||
assertNoWriteErrors(statuses);
|
||||
|
||||
metaClient = new HoodieTableMetaClient(jsc.hadoopConfiguration(), cfg.getBasePath());
|
||||
HoodieTable hoodieTable = HoodieTable.create(metaClient, cfg, jsc);
|
||||
|
||||
Option<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant();
|
||||
assertTrue(deltaCommit.isPresent());
|
||||
Assert.assertEquals("Delta commit should be specified value", commitTime, deltaCommit.get().getTimestamp());
|
||||
|
||||
Option<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().lastInstant();
|
||||
assertFalse(commit.isPresent());
|
||||
|
||||
FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
|
||||
BaseFileOnlyView roView =
|
||||
new HoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles);
|
||||
Stream<HoodieBaseFile> dataFilesToRead = roView.getLatestBaseFiles();
|
||||
assertTrue(!dataFilesToRead.findAny().isPresent());
|
||||
|
||||
roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
|
||||
dataFilesToRead = roView.getLatestBaseFiles();
|
||||
assertTrue("should list the parquet files we wrote in the delta commit",
|
||||
dataFilesToRead.findAny().isPresent());
|
||||
return allFiles;
|
||||
}
|
||||
|
||||
private FileStatus[] updateAndGetFilePaths(List<HoodieRecord> records, HoodieWriteClient client,
|
||||
HoodieWriteConfig cfg, String commitTime) throws IOException {
|
||||
Map<HoodieKey, HoodieRecord> recordsMap = new HashMap<>();
|
||||
for (HoodieRecord rec : records) {
|
||||
if (!recordsMap.containsKey(rec.getKey())) {
|
||||
recordsMap.put(rec.getKey(), rec);
|
||||
}
|
||||
}
|
||||
|
||||
List<WriteStatus> statuses = client.upsert(jsc.parallelize(records, 1), commitTime).collect();
|
||||
// Verify there are no errors
|
||||
assertNoWriteErrors(statuses);
|
||||
metaClient = HoodieTableMetaClient.reload(metaClient);
|
||||
Option<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant();
|
||||
assertTrue(deltaCommit.isPresent());
|
||||
assertEquals("Latest Delta commit should match specified time",
|
||||
commitTime, deltaCommit.get().getTimestamp());
|
||||
|
||||
Option<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
|
||||
assertFalse(commit.isPresent());
|
||||
return HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
|
||||
}
|
||||
|
||||
private FileStatus[] getROIncrementalFiles(String partitionPath, boolean stopAtCompaction)
|
||||
throws Exception {
|
||||
return getROIncrementalFiles(partitionPath, "000", -1, stopAtCompaction);
|
||||
}
|
||||
|
||||
private FileStatus[] getROIncrementalFiles(String partitionPath, String startCommitTime, int numCommitsToPull, boolean stopAtCompaction)
|
||||
throws Exception {
|
||||
HoodieTestUtils.init(jsc.hadoopConfiguration(), basePath, HoodieTableType.MERGE_ON_READ);
|
||||
setupIncremental(roJobConf, startCommitTime, numCommitsToPull, stopAtCompaction);
|
||||
FileInputFormat.setInputPaths(roJobConf, basePath + "/" + partitionPath);
|
||||
return roInputFormat.listStatus(roJobConf);
|
||||
}
|
||||
|
||||
private FileStatus[] getRTIncrementalFiles(String partitionPath)
|
||||
throws Exception {
|
||||
return getRTIncrementalFiles(partitionPath, "000", -1);
|
||||
}
|
||||
|
||||
private FileStatus[] getRTIncrementalFiles(String partitionPath, String startCommitTime, int numCommitsToPull)
|
||||
throws Exception {
|
||||
HoodieTestUtils.init(jsc.hadoopConfiguration(), basePath, HoodieTableType.MERGE_ON_READ);
|
||||
setupIncremental(rtJobConf, startCommitTime, numCommitsToPull, false);
|
||||
FileInputFormat.setInputPaths(rtJobConf, basePath + "/" + partitionPath);
|
||||
return rtInputFormat.listStatus(rtJobConf);
|
||||
}
|
||||
|
||||
private void setupIncremental(JobConf jobConf, String startCommit, int numberOfCommitsToPull, boolean stopAtCompaction) {
|
||||
String modePropertyName =
|
||||
String.format(HoodieHiveUtil.HOODIE_CONSUME_MODE_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME);
|
||||
jobConf.set(modePropertyName, HoodieHiveUtil.INCREMENTAL_SCAN_MODE);
|
||||
|
||||
String startCommitTimestampName =
|
||||
String.format(HoodieHiveUtil.HOODIE_START_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME);
|
||||
jobConf.set(startCommitTimestampName, startCommit);
|
||||
|
||||
String maxCommitPulls =
|
||||
String.format(HoodieHiveUtil.HOODIE_MAX_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME);
|
||||
jobConf.setInt(maxCommitPulls, numberOfCommitsToPull);
|
||||
|
||||
String stopAtCompactionPropName =
|
||||
String.format(HoodieHiveUtil.HOODIE_STOP_AT_COMPACTION_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME);
|
||||
jobConf.setBoolean(stopAtCompactionPropName, stopAtCompaction);
|
||||
}
|
||||
|
||||
private void validateIncrementalFiles(String partitionPath, int expectedNumFiles,
|
||||
FileStatus[] files, HoodieParquetInputFormat inputFormat,
|
||||
JobConf jobConf, int expectedRecords, String... expectedCommits) {
|
||||
|
||||
assertEquals(expectedNumFiles, files.length);
|
||||
Set<String> expectedCommitsSet = Arrays.asList(expectedCommits).stream().collect(Collectors.toSet());
|
||||
List<GenericRecord> records = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(
|
||||
Arrays.asList(basePath + "/" + partitionPath), basePath, jobConf, inputFormat);
|
||||
assertEquals(expectedRecords, records.size());
|
||||
Set<String> actualCommits = records.stream().map(r ->
|
||||
r.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString()).collect(Collectors.toSet());
|
||||
assertEquals(expectedCommitsSet, actualCommits);
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user