[HUDI-684] Introduced abstraction for writing and reading different types of base file formats. (#1687)
Notable changes:
1. HoodieFileWriter and HoodieFileReader abstractions for writer/reader side of a base file format
2. HoodieDataBlock abstraction for creating specific data blocks for base file formats (e.g. Parquet has HoodieAvroDataBlock)
3. All hardcoded references to Parquet / Parquet-based classes have been abstracted to call methods which accept a base file format
4. HiveSyncTool accepts the base file format as a CLI parameter
5. HoodieDeltaStreamer accepts the base file format as a CLI parameter
6. HoodieSparkSqlWriter accepts the base file format as a parameter
This commit is contained in:
@@ -21,6 +21,7 @@ package org.apache.hudi.hadoop;
|
||||
import org.apache.hudi.avro.model.HoodieCompactionPlan;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||
import org.apache.hudi.common.model.HoodieFileFormat;
|
||||
import org.apache.hudi.common.model.HoodieWriteStat;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
|
||||
@@ -59,6 +60,8 @@ public class TestHoodieParquetInputFormat {
|
||||
|
||||
private HoodieParquetInputFormat inputFormat;
|
||||
private JobConf jobConf;
|
||||
private final HoodieFileFormat baseFileFormat = HoodieFileFormat.PARQUET;
|
||||
private final String baseFileExtension = baseFileFormat.getFileExtension();
|
||||
|
||||
public static void ensureFilesInCommit(String msg, FileStatus[] files, String commit, int expected) {
|
||||
int count = 0;
|
||||
@@ -145,7 +148,7 @@ public class TestHoodieParquetInputFormat {
|
||||
@Test
|
||||
public void testInputFormatLoad() throws IOException {
|
||||
// initial commit
|
||||
File partitionDir = InputFormatTestUtil.prepareTable(basePath, 10, "100");
|
||||
File partitionDir = InputFormatTestUtil.prepareTable(basePath, baseFileFormat, 10, "100");
|
||||
InputFormatTestUtil.commit(basePath, "100");
|
||||
|
||||
// Add the paths
|
||||
@@ -161,7 +164,7 @@ public class TestHoodieParquetInputFormat {
|
||||
@Test
|
||||
public void testInputFormatUpdates() throws IOException {
|
||||
// initial commit
|
||||
File partitionDir = InputFormatTestUtil.prepareTable(basePath, 10, "100");
|
||||
File partitionDir = InputFormatTestUtil.prepareTable(basePath, baseFileFormat, 10, "100");
|
||||
InputFormatTestUtil.commit(basePath, "100");
|
||||
|
||||
// Add the paths
|
||||
@@ -171,7 +174,7 @@ public class TestHoodieParquetInputFormat {
|
||||
assertEquals(10, files.length);
|
||||
|
||||
// update files
|
||||
InputFormatTestUtil.simulateUpdates(partitionDir, "100", 5, "200", true);
|
||||
InputFormatTestUtil.simulateUpdates(partitionDir, baseFileExtension, "100", 5, "200", true);
|
||||
// Before the commit
|
||||
files = inputFormat.listStatus(jobConf);
|
||||
assertEquals(10, files.length);
|
||||
@@ -188,7 +191,7 @@ public class TestHoodieParquetInputFormat {
|
||||
@Test
|
||||
public void testInputFormatWithCompaction() throws IOException {
|
||||
// initial commit
|
||||
File partitionDir = InputFormatTestUtil.prepareTable(basePath, 10, "100");
|
||||
File partitionDir = InputFormatTestUtil.prepareTable(basePath, baseFileFormat, 10, "100");
|
||||
InputFormatTestUtil.commit(basePath, "100");
|
||||
|
||||
// Add the paths
|
||||
@@ -204,7 +207,7 @@ public class TestHoodieParquetInputFormat {
|
||||
createCompactionFile(basePath, "125");
|
||||
|
||||
// add inserts after compaction timestamp
|
||||
InputFormatTestUtil.simulateInserts(partitionDir, "fileId2", 5, "200");
|
||||
InputFormatTestUtil.simulateInserts(partitionDir, baseFileExtension, "fileId2", 5, "200");
|
||||
InputFormatTestUtil.commit(basePath, "200");
|
||||
|
||||
// verify snapshot reads show all new inserts even though there is pending compaction
|
||||
@@ -221,7 +224,7 @@ public class TestHoodieParquetInputFormat {
|
||||
@Test
|
||||
public void testIncrementalSimple() throws IOException {
|
||||
// initial commit
|
||||
File partitionDir = InputFormatTestUtil.prepareTable(basePath, 10, "100");
|
||||
File partitionDir = InputFormatTestUtil.prepareTable(basePath, baseFileFormat, 10, "100");
|
||||
createCommitFile(basePath, "100", "2016/05/01");
|
||||
|
||||
// Add the paths
|
||||
@@ -266,25 +269,25 @@ public class TestHoodieParquetInputFormat {
|
||||
@Test
|
||||
public void testIncrementalWithMultipleCommits() throws IOException {
|
||||
// initial commit
|
||||
File partitionDir = InputFormatTestUtil.prepareTable(basePath, 10, "100");
|
||||
File partitionDir = InputFormatTestUtil.prepareTable(basePath, baseFileFormat, 10, "100");
|
||||
createCommitFile(basePath, "100", "2016/05/01");
|
||||
|
||||
// Add the paths
|
||||
FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());
|
||||
// update files
|
||||
InputFormatTestUtil.simulateUpdates(partitionDir, "100", 5, "200", false);
|
||||
InputFormatTestUtil.simulateUpdates(partitionDir, baseFileExtension, "100", 5, "200", false);
|
||||
createCommitFile(basePath, "200", "2016/05/01");
|
||||
|
||||
InputFormatTestUtil.simulateUpdates(partitionDir, "100", 4, "300", false);
|
||||
InputFormatTestUtil.simulateUpdates(partitionDir, baseFileExtension, "100", 4, "300", false);
|
||||
createCommitFile(basePath, "300", "2016/05/01");
|
||||
|
||||
InputFormatTestUtil.simulateUpdates(partitionDir, "100", 3, "400", false);
|
||||
InputFormatTestUtil.simulateUpdates(partitionDir, baseFileExtension, "100", 3, "400", false);
|
||||
createCommitFile(basePath, "400", "2016/05/01");
|
||||
|
||||
InputFormatTestUtil.simulateUpdates(partitionDir, "100", 2, "500", false);
|
||||
InputFormatTestUtil.simulateUpdates(partitionDir, baseFileExtension, "100", 2, "500", false);
|
||||
createCommitFile(basePath, "500", "2016/05/01");
|
||||
|
||||
InputFormatTestUtil.simulateUpdates(partitionDir, "100", 1, "600", false);
|
||||
InputFormatTestUtil.simulateUpdates(partitionDir, baseFileExtension, "100", 1, "600", false);
|
||||
createCommitFile(basePath, "600", "2016/05/01");
|
||||
|
||||
InputFormatTestUtil.setupIncremental(jobConf, "100", 1);
|
||||
@@ -364,14 +367,14 @@ public class TestHoodieParquetInputFormat {
|
||||
@Test
|
||||
public void testIncrementalWithPendingCompaction() throws IOException {
|
||||
// initial commit
|
||||
File partitionDir = InputFormatTestUtil.prepareTable(basePath, 10, "100");
|
||||
File partitionDir = InputFormatTestUtil.prepareTable(basePath, baseFileFormat, 10, "100");
|
||||
createCommitFile(basePath, "100", "2016/05/01");
|
||||
|
||||
// simulate compaction requested at 300
|
||||
File compactionFile = createCompactionFile(basePath, "300");
|
||||
|
||||
// write inserts into new bucket
|
||||
InputFormatTestUtil.simulateInserts(partitionDir, "fileId2", 10, "400");
|
||||
InputFormatTestUtil.simulateInserts(partitionDir, baseFileExtension, "fileId2", 10, "400");
|
||||
createCommitFile(basePath, "400", "2016/05/01");
|
||||
|
||||
// Add the paths
|
||||
|
||||
@@ -20,8 +20,10 @@ package org.apache.hudi.hadoop.testutils;
|
||||
|
||||
import org.apache.hudi.avro.HoodieAvroUtils;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.HoodieFileFormat;
|
||||
import org.apache.hudi.common.model.HoodieLogFile;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieTableType;
|
||||
import org.apache.hudi.common.table.log.HoodieLogFormat;
|
||||
import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock;
|
||||
import org.apache.hudi.common.table.log.block.HoodieCommandBlock;
|
||||
@@ -59,25 +61,29 @@ public class InputFormatTestUtil {
|
||||
|
||||
private static String TEST_WRITE_TOKEN = "1-0-1";
|
||||
|
||||
public static File prepareTable(java.nio.file.Path basePath, int numberOfFiles, String commitNumber)
|
||||
public static File prepareTable(java.nio.file.Path basePath, HoodieFileFormat baseFileFormat, int numberOfFiles,
|
||||
String commitNumber)
|
||||
throws IOException {
|
||||
HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString());
|
||||
HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString(), HoodieTableType.COPY_ON_WRITE,
|
||||
baseFileFormat);
|
||||
java.nio.file.Path partitionPath = basePath.resolve(Paths.get("2016", "05", "01"));
|
||||
Files.createDirectories(partitionPath);
|
||||
return simulateInserts(partitionPath.toFile(), "fileId1", numberOfFiles, commitNumber);
|
||||
return simulateInserts(partitionPath.toFile(), baseFileFormat.getFileExtension(), "fileId1", numberOfFiles,
|
||||
commitNumber);
|
||||
}
|
||||
|
||||
public static File simulateInserts(File partitionPath, String fileId, int numberOfFiles, String commitNumber)
|
||||
public static File simulateInserts(File partitionPath, String baseFileExtension, String fileId, int numberOfFiles,
|
||||
String commitNumber)
|
||||
throws IOException {
|
||||
for (int i = 0; i < numberOfFiles; i++) {
|
||||
Files.createFile(partitionPath.toPath()
|
||||
.resolve(FSUtils.makeDataFileName(commitNumber, TEST_WRITE_TOKEN, fileId + i)));
|
||||
.resolve(FSUtils.makeDataFileName(commitNumber, TEST_WRITE_TOKEN, fileId + i, baseFileExtension)));
|
||||
}
|
||||
return partitionPath;
|
||||
}
|
||||
|
||||
public static void simulateUpdates(File directory, final String originalCommit, int numberOfFilesUpdated,
|
||||
String newCommit, boolean randomize) throws IOException {
|
||||
public static void simulateUpdates(File directory, String baseFileExtension, final String originalCommit,
|
||||
int numberOfFilesUpdated, String newCommit, boolean randomize) throws IOException {
|
||||
List<File> dataFiles = Arrays.asList(Objects.requireNonNull(directory.listFiles((dir, name) -> {
|
||||
String commitTs = FSUtils.getCommitTime(name);
|
||||
return originalCommit.equals(commitTs);
|
||||
@@ -88,7 +94,8 @@ public class InputFormatTestUtil {
|
||||
List<File> toUpdateList = dataFiles.subList(0, Math.min(numberOfFilesUpdated, dataFiles.size()));
|
||||
for (File file : toUpdateList) {
|
||||
String fileId = FSUtils.getFileId(file.getName());
|
||||
Files.createFile(directory.toPath().resolve(FSUtils.makeDataFileName(newCommit, TEST_WRITE_TOKEN, fileId)));
|
||||
Files.createFile(directory.toPath().resolve(FSUtils.makeDataFileName(newCommit, TEST_WRITE_TOKEN, fileId,
|
||||
baseFileExtension)));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user