1
0

[HUDI-1445] Refactor AbstractHoodieLogRecordScanner to use Builder (#2313)

This commit is contained in:
Danny Chan
2020-12-10 20:02:02 +08:00
committed by GitHub
parent bd9cceccb5
commit 4bc45a391a
11 changed files with 431 additions and 82 deletions

View File

@@ -460,9 +460,20 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
writer.close();
// scan all log blocks (across multiple log files)
HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath,
logFiles.stream().map(logFile -> logFile.getPath().toString()).collect(Collectors.toList()), schema, "100",
10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH);
HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
.withFileSystem(fs)
.withBasePath(basePath)
.withLogFilePaths(
logFiles.stream()
.map(logFile -> logFile.getPath().toString()).collect(Collectors.toList()))
.withReaderSchema(schema)
.withLatestInstantTime("100")
.withMaxMemorySizeInBytes(10240L)
.withReadBlocksLazily(readBlocksLazily)
.withReverseReader(false)
.withBufferSize(bufferSize)
.withSpillableMapBasePath(BASE_OUTPUT_PATH)
.build();
List<IndexedRecord> scannedRecords = new ArrayList<>();
for (HoodieRecord record : scanner) {
@@ -601,8 +612,18 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100")
.map(s -> s.getPath().toString()).collect(Collectors.toList());
HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "100",
10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH);
HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
.withFileSystem(fs)
.withBasePath(basePath)
.withLogFilePaths(allLogFiles)
.withReaderSchema(schema)
.withLatestInstantTime("100")
.withMaxMemorySizeInBytes(10240L)
.withReadBlocksLazily(readBlocksLazily)
.withReverseReader(false)
.withBufferSize(bufferSize)
.withSpillableMapBasePath(BASE_OUTPUT_PATH)
.build();
assertEquals(200, scanner.getTotalLogRecords());
Set<String> readKeys = new HashSet<>(200);
scanner.forEach(s -> readKeys.add(s.getKey().getRecordKey()));
@@ -663,8 +684,18 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100")
.map(s -> s.getPath().toString()).collect(Collectors.toList());
HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "102",
10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH);
HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
.withFileSystem(fs)
.withBasePath(basePath)
.withLogFilePaths(allLogFiles)
.withReaderSchema(schema)
.withLatestInstantTime("102")
.withMaxMemorySizeInBytes(10240L)
.withReadBlocksLazily(readBlocksLazily)
.withReverseReader(false)
.withBufferSize(bufferSize)
.withSpillableMapBasePath(BASE_OUTPUT_PATH)
.build();
assertEquals(200, scanner.getTotalLogRecords(), "We read 200 records from 2 write batches");
Set<String> readKeys = new HashSet<>(200);
scanner.forEach(s -> readKeys.add(s.getKey().getRecordKey()));
@@ -742,8 +773,18 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100")
.map(s -> s.getPath().toString()).collect(Collectors.toList());
HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "103",
10240L, true, false, bufferSize, BASE_OUTPUT_PATH);
HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
.withFileSystem(fs)
.withBasePath(basePath)
.withLogFilePaths(allLogFiles)
.withReaderSchema(schema)
.withLatestInstantTime("103")
.withMaxMemorySizeInBytes(10240L)
.withReadBlocksLazily(true)
.withReverseReader(false)
.withBufferSize(bufferSize)
.withSpillableMapBasePath(BASE_OUTPUT_PATH)
.build();
assertEquals(200, scanner.getTotalLogRecords(), "We would read 200 records");
Set<String> readKeys = new HashSet<>(200);
scanner.forEach(s -> readKeys.add(s.getKey().getRecordKey()));
@@ -802,8 +843,18 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100")
.map(s -> s.getPath().toString()).collect(Collectors.toList());
HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "102",
10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH);
HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
.withFileSystem(fs)
.withBasePath(basePath)
.withLogFilePaths(allLogFiles)
.withReaderSchema(schema)
.withLatestInstantTime("102")
.withMaxMemorySizeInBytes(10240L)
.withReadBlocksLazily(readBlocksLazily)
.withReverseReader(false)
.withBufferSize(bufferSize)
.withSpillableMapBasePath(BASE_OUTPUT_PATH)
.build();
assertEquals(200, scanner.getTotalLogRecords(), "We still would read 200 records");
final List<String> readKeys = new ArrayList<>(200);
final List<Boolean> emptyPayloads = new ArrayList<>();
@@ -833,8 +884,18 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
writer.appendBlock(commandBlock);
readKeys.clear();
scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "101", 10240L, readBlocksLazily,
false, bufferSize, BASE_OUTPUT_PATH);
scanner = HoodieMergedLogRecordScanner.newBuilder()
.withFileSystem(fs)
.withBasePath(basePath)
.withLogFilePaths(allLogFiles)
.withReaderSchema(schema)
.withLatestInstantTime("101")
.withMaxMemorySizeInBytes(10240L)
.withReadBlocksLazily(readBlocksLazily)
.withReverseReader(false)
.withBufferSize(bufferSize)
.withSpillableMapBasePath(BASE_OUTPUT_PATH)
.build();
scanner.forEach(s -> readKeys.add(s.getKey().getRecordKey()));
assertEquals(200, readKeys.size(), "Stream collect should return all 200 records after rollback of delete");
}
@@ -898,8 +959,18 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
.map(s -> s.getPath().toString()).collect(Collectors.toList());
// all data must be rolled back before merge
HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "100",
10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH);
HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
.withFileSystem(fs)
.withBasePath(basePath)
.withLogFilePaths(allLogFiles)
.withReaderSchema(schema)
.withLatestInstantTime("100")
.withMaxMemorySizeInBytes(10240L)
.withReadBlocksLazily(readBlocksLazily)
.withReverseReader(false)
.withBufferSize(bufferSize)
.withSpillableMapBasePath(BASE_OUTPUT_PATH)
.build();
assertEquals(0, scanner.getTotalLogRecords(), "We would have scanned 0 records because of rollback");
final List<String> readKeys = new ArrayList<>();
@@ -949,8 +1020,18 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100")
.map(s -> s.getPath().toString()).collect(Collectors.toList());
HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "100",
10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH);
HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
.withFileSystem(fs)
.withBasePath(basePath)
.withLogFilePaths(allLogFiles)
.withReaderSchema(schema)
.withLatestInstantTime("100")
.withMaxMemorySizeInBytes(10240L)
.withReadBlocksLazily(readBlocksLazily)
.withReverseReader(false)
.withBufferSize(bufferSize)
.withSpillableMapBasePath(BASE_OUTPUT_PATH)
.build();
assertEquals(0, scanner.getTotalLogRecords(), "We would read 0 records");
}
@@ -983,8 +1064,18 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100")
.map(s -> s.getPath().toString()).collect(Collectors.toList());
HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "100",
10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH);
HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
.withFileSystem(fs)
.withBasePath(basePath)
.withLogFilePaths(allLogFiles)
.withReaderSchema(schema)
.withLatestInstantTime("100")
.withMaxMemorySizeInBytes(10240L)
.withReadBlocksLazily(readBlocksLazily)
.withReverseReader(false)
.withBufferSize(bufferSize)
.withSpillableMapBasePath(BASE_OUTPUT_PATH)
.build();
assertEquals(100, scanner.getTotalLogRecords(), "We still would read 100 records");
final List<String> readKeys = new ArrayList<>(100);
scanner.forEach(s -> readKeys.add(s.getKey().getRecordKey()));
@@ -1036,8 +1127,18 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100")
.map(s -> s.getPath().toString()).collect(Collectors.toList());
HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "101",
10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH);
HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
.withFileSystem(fs)
.withBasePath(basePath)
.withLogFilePaths(allLogFiles)
.withReaderSchema(schema)
.withLatestInstantTime("101")
.withMaxMemorySizeInBytes(10240L)
.withReadBlocksLazily(readBlocksLazily)
.withReverseReader(false)
.withBufferSize(bufferSize)
.withSpillableMapBasePath(BASE_OUTPUT_PATH)
.build();
assertEquals(0, scanner.getTotalLogRecords(), "We would read 0 records");
}
@@ -1126,8 +1227,18 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100")
.map(s -> s.getPath().toString()).collect(Collectors.toList());
HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema, "101",
10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH);
HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
.withFileSystem(fs)
.withBasePath(basePath)
.withLogFilePaths(allLogFiles)
.withReaderSchema(schema)
.withLatestInstantTime("101")
.withMaxMemorySizeInBytes(10240L)
.withReadBlocksLazily(readBlocksLazily)
.withReverseReader(false)
.withBufferSize(bufferSize)
.withSpillableMapBasePath(BASE_OUTPUT_PATH)
.build();
assertEquals(0, scanner.getTotalLogRecords(), "We would read 0 records");
}
@@ -1183,8 +1294,18 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
List<String> allLogFiles = FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1",
HoodieLogFile.DELTA_EXTENSION, "100").map(s -> s.getPath().toString()).collect(Collectors.toList());
HoodieMergedLogRecordScanner scanner = new HoodieMergedLogRecordScanner(fs, basePath, allLogFiles, schema,
"100", 10240L, readBlocksLazily, false, bufferSize, BASE_OUTPUT_PATH);
HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
.withFileSystem(fs)
.withBasePath(basePath)
.withLogFilePaths(allLogFiles)
.withReaderSchema(schema)
.withLatestInstantTime("100")
.withMaxMemorySizeInBytes(10240L)
.withReadBlocksLazily(readBlocksLazily)
.withReverseReader(false)
.withBufferSize(bufferSize)
.withSpillableMapBasePath(BASE_OUTPUT_PATH)
.build();
assertEquals(Math.max(numRecordsInLog1, numRecordsInLog2), scanner.getNumMergedRecordsInLog(),
"We would read 100 records");