Reformatting code per Google Code Style all over

commit e45679f5e2 (parent 5a62480a92), committed by vinoth chandar
@@ -17,22 +17,23 @@
package com.uber.hoodie.hadoop;

import static org.junit.Assert.assertTrue;

import java.lang.annotation.Annotation;
import org.junit.Test;

public class AnnotationTest {

  @Test
  public void testAnnotation() {
    assertTrue(HoodieInputFormat.class.isAnnotationPresent(UseFileSplitsFromInputFormat.class));
    Annotation[] annotations = HoodieInputFormat.class.getAnnotations();
    boolean found = false;
    for (Annotation annotation : annotations) {
      if ("UseFileSplitsFromInputFormat".equals(annotation.annotationType().getSimpleName())) {
        found = true;
      }
    }
    assertTrue(found);
  }
}
@@ -16,222 +16,235 @@
package com.uber.hoodie.hadoop;

import static org.junit.Assert.assertEquals;

import com.uber.hoodie.common.util.FSUtils;
import java.io.File;
import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;

public class HoodieInputFormatTest {

  private HoodieInputFormat inputFormat;
  private JobConf jobConf;

  @Before
  public void setUp() {
    inputFormat = new HoodieInputFormat();
    jobConf = new JobConf();
    inputFormat.setConf(jobConf);
  }

  @Rule
  public TemporaryFolder basePath = new TemporaryFolder();

  @Test
  public void testInputFormatLoad() throws IOException {
    // initial commit
    File partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100");
    InputFormatTestUtil.commit(basePath, "100");

    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    InputSplit[] inputSplits = inputFormat.getSplits(jobConf, 10);
    assertEquals(10, inputSplits.length);

    FileStatus[] files = inputFormat.listStatus(jobConf);
    assertEquals(10, files.length);
  }

  @Test
  public void testInputFormatUpdates() throws IOException {
    // initial commit
    File partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100");
    InputFormatTestUtil.commit(basePath, "100");

    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    FileStatus[] files = inputFormat.listStatus(jobConf);
    assertEquals(10, files.length);

    // update files
    InputFormatTestUtil.simulateUpdates(partitionDir, "100", 5, "200", true);
    // Before the commit
    files = inputFormat.listStatus(jobConf);
    assertEquals(10, files.length);
    ensureFilesInCommit(
        "Commit 200 has not been committed. We should not see files from this commit", files,
        "200", 0);
    InputFormatTestUtil.commit(basePath, "200");
    files = inputFormat.listStatus(jobConf);
    assertEquals(10, files.length);
    ensureFilesInCommit(
        "5 files have been updated to commit 200. We should see 5 files from commit 200 and 5 files from 100 commit",
        files, "200", 5);
    ensureFilesInCommit(
        "5 files have been updated to commit 200. We should see 5 files from commit 100 and 5 files from 200 commit",
        files, "100", 5);
  }

  @Test
  public void testIncrementalSimple() throws IOException {
    // initial commit
    File partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100");
    InputFormatTestUtil.commit(basePath, "100");

    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    InputFormatTestUtil.setupIncremental(jobConf, "100", 1);

    FileStatus[] files = inputFormat.listStatus(jobConf);
    assertEquals(
        "We should exclude commit 100 when returning incremental pull with start commit time as 100",
        0, files.length);
  }

  @Test
  public void testIncrementalWithMultipleCommits() throws IOException {
    // initial commit
    File partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100");
    InputFormatTestUtil.commit(basePath, "100");
    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());
    // update files
    InputFormatTestUtil.simulateUpdates(partitionDir, "100", 5, "200", false);
    InputFormatTestUtil.commit(basePath, "200");

    InputFormatTestUtil.simulateUpdates(partitionDir, "100", 4, "300", false);
    InputFormatTestUtil.commit(basePath, "300");

    InputFormatTestUtil.simulateUpdates(partitionDir, "100", 3, "400", false);
    InputFormatTestUtil.commit(basePath, "400");

    InputFormatTestUtil.simulateUpdates(partitionDir, "100", 2, "500", false);
    InputFormatTestUtil.commit(basePath, "500");

    InputFormatTestUtil.simulateUpdates(partitionDir, "100", 1, "600", false);
    InputFormatTestUtil.commit(basePath, "600");

    InputFormatTestUtil.setupIncremental(jobConf, "100", 1);
    FileStatus[] files = inputFormat.listStatus(jobConf);
    assertEquals("Pulling 1 commit from 100, should get us the 5 files committed at 200", 5,
        files.length);
    ensureFilesInCommit("Pulling 1 commit from 100, should get us the 5 files committed at 200",
        files, "200", 5);

    InputFormatTestUtil.setupIncremental(jobConf, "100", 3);
    files = inputFormat.listStatus(jobConf);

    assertEquals(
        "Pulling 3 commits from 100, should get us the 3 files from 400 commit, 1 file from 300 commit and 1 file from 200 commit",
        5, files.length);
    ensureFilesInCommit("Pulling 3 commits from 100, should get us the 3 files from 400 commit",
        files, "400", 3);
    ensureFilesInCommit("Pulling 3 commits from 100, should get us the 1 files from 300 commit",
        files, "300", 1);
    ensureFilesInCommit("Pulling 3 commits from 100, should get us the 1 files from 200 commit",
        files, "200", 1);

    InputFormatTestUtil.setupIncremental(jobConf, "100", HoodieHiveUtil.MAX_COMMIT_ALL);
    files = inputFormat.listStatus(jobConf);

    assertEquals(
        "Pulling all commits from 100, should get us the 1 file from each of 200,300,400,500,400 commits",
        5, files.length);
    ensureFilesInCommit(
        "Pulling all commits from 100, should get us the 1 files from 600 commit", files, "600",
        1);
    ensureFilesInCommit(
        "Pulling all commits from 100, should get us the 1 files from 500 commit", files, "500",
        1);
    ensureFilesInCommit(
        "Pulling all commits from 100, should get us the 1 files from 400 commit", files, "400",
        1);
    ensureFilesInCommit(
        "Pulling all commits from 100, should get us the 1 files from 300 commit", files, "300",
        1);
    ensureFilesInCommit(
        "Pulling all commits from 100, should get us the 1 files from 200 commit", files, "200",
        1);
  }

  //TODO enable this after enabling predicate pushdown
  public void testPredicatePushDown() throws IOException {
    // initial commit
    Schema schema = InputFormatTestUtil.readSchema("/sample1.avro");
    String commit1 = "20160628071126";
    File partitionDir =
        InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, 10, commit1);
    InputFormatTestUtil.commit(basePath, commit1);
    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());
    // check whether we have 10 records at this point
    ensureRecordsInCommit("We need to have 10 records at this point for commit " + commit1, commit1,
        10, 10);

    // update 2 records in the original parquet file and save it as commit 200
    String commit2 = "20160629193623";
    InputFormatTestUtil.simulateParquetUpdates(partitionDir, schema, commit1, 10, 2, commit2);
    InputFormatTestUtil.commit(basePath, commit2);

    InputFormatTestUtil.setupIncremental(jobConf, commit1, 1);
    // check whether we have 2 records at this point
    ensureRecordsInCommit(
        "We need to have 2 records that was modified at commit " + commit2 + " and no more",
        commit2, 2, 2);
    // Make sure we have the 10 records if we roll back the stattime
    InputFormatTestUtil.setupIncremental(jobConf, "0", 2);
    ensureRecordsInCommit(
        "We need to have 8 records that was modified at commit " + commit1 + " and no more",
        commit1, 8, 10);
    ensureRecordsInCommit(
        "We need to have 2 records that was modified at commit " + commit2 + " and no more",
        commit2, 2, 10);
  }

  private void ensureRecordsInCommit(String msg, String commit,
      int expectedNumberOfRecordsInCommit, int totalExpected) throws IOException {
    int actualCount = 0;
    int totalCount = 0;
    InputSplit[] splits = inputFormat.getSplits(jobConf, 1);
    for (InputSplit split : splits) {
      RecordReader<Void, ArrayWritable>
          recordReader = inputFormat.getRecordReader(split, jobConf, null);
      Void key = recordReader.createKey();
      ArrayWritable writable = recordReader.createValue();

      while (recordReader.next(key, writable)) {
        // writable returns an array with [field1, field2, _hoodie_commit_time, _hoodie_commit_seqno]
        // Take the commit time and compare with the one we are interested in
        if (commit.equals((writable.get()[2]).toString())) {
          actualCount++;
        }
        totalCount++;
      }
    }
    assertEquals(msg, expectedNumberOfRecordsInCommit, actualCount);
    assertEquals(msg, totalExpected, totalCount);
  }

  public static void ensureFilesInCommit(String msg, FileStatus[] files, String commit,
      int expected) {
    int count = 0;
    for (FileStatus file : files) {
      String commitTs = FSUtils.getCommitTime(file.getPath().getName());
      if (commit.equals(commitTs)) {
        count++;
      }
    }
    assertEquals(msg, expected, count);
  }
}
@@ -16,20 +16,10 @@
package com.uber.hoodie.hadoop;

import com.uber.hoodie.avro.MercifulJsonConverter;
import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieTestUtils;
import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.SchemaTestUtil;
import com.uber.hoodie.common.util.TestRecord;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.parquet.avro.AvroParquetWriter;
import org.junit.rules.TemporaryFolder;

import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
@@ -37,123 +27,139 @@ import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.parquet.avro.AvroParquetWriter;
import org.junit.rules.TemporaryFolder;

public class InputFormatTestUtil {

  public static File prepareDataset(TemporaryFolder basePath, int numberOfFiles,
      String commitNumber) throws IOException {
    basePath.create();
    HoodieTestUtils.init(basePath.getRoot().toString());
    File partitionPath = basePath.newFolder("2016", "05", "01");
    for (int i = 0; i < numberOfFiles; i++) {
      File dataFile =
          new File(partitionPath, FSUtils.makeDataFileName(commitNumber, 1, "fileid" + i));
      dataFile.createNewFile();
    }
    return partitionPath;
  }

  public static void simulateUpdates(File directory, final String originalCommit,
      int numberOfFilesUpdated,
      String newCommit, boolean randomize) throws IOException {
    List<File> dataFiles = Arrays.asList(directory.listFiles(new FilenameFilter() {
      @Override
      public boolean accept(File dir, String name) {
        String commitTs = FSUtils.getCommitTime(name);
        return originalCommit.equals(commitTs);
      }
    }));
    if (randomize) {
      Collections.shuffle(dataFiles);
    }
    List<File> toUpdateList =
        dataFiles.subList(0, Math.min(numberOfFilesUpdated, dataFiles.size()));
    for (File file : toUpdateList) {
      String fileId = FSUtils.getFileId(file.getName());
      File dataFile = new File(directory, FSUtils.makeDataFileName(newCommit, 1, fileId));
      dataFile.createNewFile();
    }
  }

  public static void commit(TemporaryFolder basePath, String commitNumber) throws IOException {
    // create the commit
    new File(basePath.getRoot().toString() + "/.hoodie/", commitNumber + ".commit").createNewFile();
  }

  public static void setupIncremental(JobConf jobConf, String startCommit,
      int numberOfCommitsToPull) {
    String modePropertyName = String.format(HoodieHiveUtil.HOODIE_CONSUME_MODE_PATTERN,
        HoodieTestUtils.RAW_TRIPS_TEST_NAME);
    jobConf.set(modePropertyName, HoodieHiveUtil.INCREMENTAL_SCAN_MODE);

    String startCommitTimestampName = String
        .format(HoodieHiveUtil.HOODIE_START_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME);
    jobConf.set(startCommitTimestampName, startCommit);

    String maxCommitPulls = String
        .format(HoodieHiveUtil.HOODIE_MAX_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME);
    jobConf.setInt(maxCommitPulls, numberOfCommitsToPull);
  }

  public static Schema readSchema(String location) throws IOException {
    return new Schema.Parser().parse(InputFormatTestUtil.class.getResourceAsStream(location));
  }

  public static File prepareParquetDataset(TemporaryFolder basePath, Schema schema,
      int numberOfFiles, int numberOfRecords,
      String commitNumber) throws IOException {
    basePath.create();
    HoodieTestUtils.init(basePath.getRoot().toString());
    File partitionPath = basePath.newFolder("2016", "05", "01");
    AvroParquetWriter parquetWriter;
    for (int i = 0; i < numberOfFiles; i++) {
      String fileId = FSUtils.makeDataFileName(commitNumber, 1, "fileid" + i);
      File dataFile =
          new File(partitionPath, fileId);
      // dataFile.createNewFile();
      parquetWriter = new AvroParquetWriter(new Path(dataFile.getAbsolutePath()),
          schema);
      try {
        for (GenericRecord record : generateAvroRecords(schema, numberOfRecords, commitNumber,
            fileId)) {
          parquetWriter.write(record);
        }
      } finally {
        parquetWriter.close();
      }
    }
    return partitionPath;
  }

  private static Iterable<? extends GenericRecord> generateAvroRecords(Schema schema,
      int numberOfRecords, String commitTime, String fileId) throws IOException {
    List<GenericRecord> records = new ArrayList<>(numberOfRecords);
    for (int i = 0; i < numberOfRecords; i++) {
      records.add(SchemaTestUtil.generateAvroRecordFromJson(schema, i, commitTime, fileId));
    }
    return records;
  }

  public static void simulateParquetUpdates(File directory, Schema schema, String originalCommit,
      int totalNumberOfRecords, int numberOfRecordsToUpdate,
      String newCommit) throws IOException {
    File fileToUpdate = directory.listFiles(new FilenameFilter() {
      @Override
      public boolean accept(File dir, String name) {
        return name.endsWith("parquet");
      }
    })[0];
    String fileId = FSUtils.getFileId(fileToUpdate.getName());
    File dataFile = new File(directory, FSUtils.makeDataFileName(newCommit, 1, fileId));
    AvroParquetWriter parquetWriter = new AvroParquetWriter(new Path(dataFile.getAbsolutePath()),
        schema);
    try {
      for (GenericRecord record : generateAvroRecords(schema, totalNumberOfRecords,
          originalCommit, fileId)) {
        if (numberOfRecordsToUpdate > 0) {
          // update this record
          record.put(HoodieRecord.COMMIT_TIME_METADATA_FIELD, newCommit);
          String oldSeqNo = (String) record.get(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD);
          record.put(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD,
              oldSeqNo.replace(originalCommit, newCommit));
          numberOfRecordsToUpdate--;
        }
        parquetWriter.write(record);
      }
    } finally {
      parquetWriter.close();
    }
  }
}
@@ -15,63 +15,66 @@
*/
package com.uber.hoodie.hadoop;

import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

import com.uber.hoodie.common.model.HoodieTestUtils;
import com.uber.hoodie.common.table.HoodieTableMetaClient;
import java.io.File;
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;

/**
 */
public class TestHoodieROTablePathFilter {

  @Test
  public void testHoodiePaths() throws IOException {
    // Create a temp folder as the base path
    HoodieTableMetaClient metaClient = HoodieTestUtils.initOnTemp();
    String basePath = metaClient.getBasePath();

    HoodieTestUtils.createCommitFiles(basePath, "001", "002");
    HoodieTestUtils.createInflightCommitFiles(basePath, "003");

    HoodieTestUtils.createDataFile(basePath, "2017/01/01", "001", "f1");
    HoodieTestUtils.createDataFile(basePath, "2017/01/01", "001", "f2");
    HoodieTestUtils.createDataFile(basePath, "2017/01/01", "001", "f3");
    HoodieTestUtils.createDataFile(basePath, "2017/01/01", "002", "f2");
    HoodieTestUtils.createDataFile(basePath, "2017/01/01", "003", "f3");

    HoodieROTablePathFilter pathFilter = new HoodieROTablePathFilter();
    Path partitionPath = new Path("file://" + basePath + File.separator + "2017/01/01");
    assertTrue("Directories should be accepted", pathFilter.accept(partitionPath));

    assertTrue(pathFilter.accept(new Path(
        "file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "001", "f1"))));
    assertFalse(pathFilter.accept(new Path(
        "file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "001", "f2"))));
    assertTrue(pathFilter.accept(new Path(
        "file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "001", "f3"))));
    assertTrue(pathFilter.accept(new Path(
        "file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "002", "f2"))));
    assertFalse(pathFilter.accept(new Path(
        "file:///" + HoodieTestUtils.getDataFilePath(basePath, "2017/01/01", "003", "f3"))));
  }

  @Test
  public void testNonHoodiePaths() throws IOException {
    TemporaryFolder folder = new TemporaryFolder();
    folder.create();
    String basePath = folder.getRoot().getAbsolutePath();
    HoodieROTablePathFilter pathFilter = new HoodieROTablePathFilter();

    String path = basePath + File.separator + "nonhoodiefolder";
    new File(path).mkdirs();
    assertTrue(pathFilter.accept(new Path("file:///" + path)));

    path = basePath + File.separator + "nonhoodiefolder/somefile";
    new File(path).createNewFile();
    assertTrue(pathFilter.accept(new Path("file:///" + path)));
  }
}
@@ -19,6 +19,8 @@
package com.uber.hoodie.hadoop.realtime;

import static org.junit.Assert.assertTrue;

import com.google.common.collect.Maps;
import com.uber.hoodie.common.model.HoodieLogFile;
import com.uber.hoodie.common.model.HoodieTableType;
@@ -30,6 +32,13 @@ import com.uber.hoodie.common.util.FSUtils;
import com.uber.hoodie.common.util.HoodieAvroUtils;
import com.uber.hoodie.common.util.SchemaTestUtil;
import com.uber.hoodie.hadoop.InputFormatTestUtil;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;
import org.apache.hadoop.fs.Path;
@@ -52,211 +61,234 @@ import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;

public class HoodieRealtimeRecordReaderTest {

  private JobConf jobConf;

  @Before
  public void setUp() {
    jobConf = new JobConf();
  }

  @Rule
  public TemporaryFolder basePath = new TemporaryFolder();

  private HoodieLogFormat.Writer writeLogFile(File partitionDir, Schema schema, String fileId,
      String baseCommit, String newCommit, int numberOfRecords)
      throws InterruptedException, IOException {
    HoodieLogFormat.Writer writer = HoodieLogFormat.newWriterBuilder()
        .onParentPath(new Path(partitionDir.getPath()))
        .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId(fileId)
        .overBaseCommit(baseCommit).withFs(FSUtils.getFs()).build();
    List<IndexedRecord> records = new ArrayList<>();
    for (int i = 0; i < numberOfRecords; i++) {
      records.add(SchemaTestUtil.generateAvroRecordFromJson(schema, i, newCommit, "fileid0"));
    }
    Schema writeSchema = records.get(0).getSchema();
    Map<HoodieLogBlock.LogMetadataType, String> metadata = Maps.newHashMap();
    metadata.put(HoodieLogBlock.LogMetadataType.INSTANT_TIME, newCommit);
    HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, writeSchema, metadata);
    writer = writer.appendBlock(dataBlock);
    long size = writer.getCurrentSize();
    return writer;
  }

  @Test
  public void testReader() throws Exception {
    // initial commit
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
    HoodieTestUtils
        .initTableType(basePath.getRoot().getAbsolutePath(), HoodieTableType.MERGE_ON_READ);
    String commitTime = "100";
    File partitionDir = InputFormatTestUtil
        .prepareParquetDataset(basePath, schema, 1, 100, commitTime);
    InputFormatTestUtil.commit(basePath, commitTime);
    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    // update files or generate new log file
    String newCommitTime = "101";
    HoodieLogFormat.Writer writer = writeLogFile(partitionDir, schema, "fileid0", commitTime,
        newCommitTime, 100);
    long size = writer.getCurrentSize();
    writer.close();
    assertTrue("block - size should be > 0", size > 0);

    //create a split with baseFile (parquet file written earlier) and new log file(s)
    String logFilePath = writer.getLogFile().getPath().toString();
    HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(new FileSplit(new Path(partitionDir
        + "/fileid0_1_" + commitTime + ".parquet"), 0, 1, jobConf), basePath.getRoot().getPath(),
        Arrays.asList(logFilePath), newCommitTime);

    //create a RecordReader to be used by HoodieRealtimeRecordReader
    RecordReader<Void, ArrayWritable> reader =
        new MapredParquetInputFormat().
            getRecordReader(new FileSplit(split.getPath(), 0,
                FSUtils.getFs().getLength(split.getPath()), (String[]) null), jobConf, null);
    JobConf jobConf = new JobConf();
    List<Schema.Field> fields = schema.getFields();
    String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
    String postions = fields.stream().map(f -> String.valueOf(f.pos()))
        .collect(Collectors.joining(","));
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
    jobConf.set("partition_columns", "datestr");

    //validate record reader compaction
    HoodieRealtimeRecordReader recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader);

    //use reader to read base Parquet File and log file, merge in flight and return latest commit
    //here all 100 records should be updated, see above
    Void key = recordReader.createKey();
    ArrayWritable value = recordReader.createValue();
    while (recordReader.next(key, value)) {
      Writable[] values = value.get();
      //check if the record written is with latest commit, here "101"
      Assert.assertEquals(values[0].toString(), newCommitTime);
      key = recordReader.createKey();
      value = recordReader.createValue();
    }
  }

  @Test
  public void testReaderWithNestedAndComplexSchema() throws Exception {
    // initial commit
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getComplexEvolvedSchema());
    HoodieTestUtils
        .initTableType(basePath.getRoot().getAbsolutePath(), HoodieTableType.MERGE_ON_READ);
    String commitTime = "100";
    int numberOfRecords = 100;
    int numberOfLogRecords = numberOfRecords / 2;
    File partitionDir = InputFormatTestUtil
        .prepareParquetDataset(basePath, schema, 1, numberOfRecords, commitTime);
    InputFormatTestUtil.commit(basePath, commitTime);
    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());

    // update files or generate new log file
    String newCommitTime = "101";
    HoodieLogFormat.Writer writer = writeLogFile(partitionDir, schema, "fileid0", commitTime,
        newCommitTime, numberOfLogRecords);
    long size = writer.getCurrentSize();
    writer.close();
    assertTrue("block - size should be > 0", size > 0);

    //create a split with baseFile (parquet file written earlier) and new log file(s)
    String logFilePath = writer.getLogFile().getPath().toString();
    HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(new FileSplit(new Path(partitionDir
        + "/fileid0_1_" + commitTime + ".parquet"), 0, 1, jobConf), basePath.getRoot().getPath(),
        Arrays.asList(logFilePath), newCommitTime);

    //create a RecordReader to be used by HoodieRealtimeRecordReader
    RecordReader<Void, ArrayWritable> reader =
        new MapredParquetInputFormat().
            getRecordReader(new FileSplit(split.getPath(), 0,
                FSUtils.getFs().getLength(split.getPath()), (String[]) null), jobConf, null);
    JobConf jobConf = new JobConf();
    List<Schema.Field> fields = schema.getFields();

    String names = fields.stream().map(f -> f.name()).collect(Collectors.joining(","));
    String positions = fields.stream().map(f -> String.valueOf(f.pos()))
        .collect(Collectors.joining(","));
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_NAMES_CONF_STR, names);
    jobConf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, positions);
    jobConf.set("partition_columns", "datestr");

    // validate record reader compaction
    HoodieRealtimeRecordReader recordReader = new HoodieRealtimeRecordReader(split, jobConf,
        reader);

    // use reader to read base Parquet File and log file, merge in flight and return latest commit
    // here the first 50 records should be updated, see above
    Void key = recordReader.createKey();
    ArrayWritable value = recordReader.createValue();
    int numRecordsRead = 0;
    while (recordReader.next(key, value)) {
      int currentRecordNo = numRecordsRead;
      ++numRecordsRead;
      Writable[] values = value.get();
      String recordCommitTime;
      //check if the record written is with latest commit, here "101"
      if (numRecordsRead > numberOfLogRecords) {
        recordCommitTime = commitTime;
      } else {
        recordCommitTime = newCommitTime;
      }
      String recordCommitTimeSuffix = "@" + recordCommitTime;

      Assert.assertEquals(values[0].toString(), recordCommitTime);
      key = recordReader.createKey();
      value = recordReader.createValue();

      // Assert type STRING
      Assert.assertEquals("test value for field: field1", values[5].toString(),
          "field" + currentRecordNo);
      Assert.assertEquals("test value for field: field2", values[6].toString(),
          "field" + currentRecordNo + recordCommitTimeSuffix);
      Assert.assertEquals("test value for field: name", values[7].toString(),
          "name" + currentRecordNo);

      // Assert type INT
      IntWritable intWritable = (IntWritable) values[8];
      Assert.assertEquals("test value for field: favoriteIntNumber", intWritable.get(),
          currentRecordNo + recordCommitTime.hashCode());

      // Assert type LONG
      LongWritable longWritable = (LongWritable) values[9];
      Assert.assertEquals("test value for field: favoriteNumber", longWritable.get(),
          currentRecordNo + recordCommitTime.hashCode());

      // Assert type FLOAT
      FloatWritable floatWritable = (FloatWritable) values[10];
      Assert.assertEquals("test value for field: favoriteFloatNumber", floatWritable.get(),
          (float) ((currentRecordNo + recordCommitTime.hashCode()) / 1024.0), 0);

      // Assert type DOUBLE
      DoubleWritable doubleWritable = (DoubleWritable) values[11];
      Assert.assertEquals("test value for field: favoriteDoubleNumber", doubleWritable.get(),
          (currentRecordNo + recordCommitTime.hashCode()) / 1024.0, 0);

      // Assert type MAP
      ArrayWritable mapItem = (ArrayWritable) values[12];
      Writable[] mapItemValues = mapItem.get();
      ArrayWritable mapItemValue1 = (ArrayWritable) mapItemValues[0];
      ArrayWritable mapItemValue2 = (ArrayWritable) mapItemValues[1];
      Assert.assertEquals("test value for field: tags", mapItemValue1.get()[0].toString(),
          "mapItem1");
      Assert.assertEquals("test value for field: tags", mapItemValue2.get()[0].toString(),
          "mapItem2");
      ArrayWritable mapItemValue1value = (ArrayWritable) mapItemValue1.get()[1];
      ArrayWritable mapItemValue2value = (ArrayWritable) mapItemValue2.get()[1];
      Assert.assertEquals("test value for field: tags", mapItemValue1value.get().length, 2);
      Assert.assertEquals("test value for field: tags", mapItemValue2value.get().length, 2);
      Assert.assertEquals("test value for field: tags[\"mapItem1\"].item1",
          mapItemValue1value.get()[0].toString(), "item" + currentRecordNo);
      Assert.assertEquals("test value for field: tags[\"mapItem2\"].item1",
          mapItemValue2value.get()[0].toString(), "item2" + currentRecordNo);
      Assert.assertEquals("test value for field: tags[\"mapItem1\"].item2",
          mapItemValue1value.get()[1].toString(),
          "item" + currentRecordNo + recordCommitTimeSuffix);
      Assert.assertEquals("test value for field: tags[\"mapItem2\"].item2",
          mapItemValue2value.get()[1].toString(),
          "item2" + currentRecordNo + recordCommitTimeSuffix);

      // Assert type RECORD
      ArrayWritable recordItem = (ArrayWritable) values[13];
      Writable[] nestedRecord = recordItem.get();
      Assert.assertEquals("test value for field: testNestedRecord.isAdmin",
          ((BooleanWritable) nestedRecord[0]).get(), false);
      Assert
          .assertEquals("test value for field: testNestedRecord.userId", nestedRecord[1].toString(),
              "UserId" + currentRecordNo + recordCommitTimeSuffix);

      // Assert type ARRAY
      ArrayWritable arrayValue = (ArrayWritable) values[14];
      Writable[] arrayValues = arrayValue.get();
      for (int i = 0; i < arrayValues.length; i++) {
        Assert.assertEquals("test value for field: stringArray", arrayValues[i].toString(),
            "stringArray" + i + recordCommitTimeSuffix);
      }
    }
  }
}
@@ -13,11 +13,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#

log4j.rootLogger=WARN, A1
log4j.category.com.uber=INFO
log4j.category.org.apache.parquet.hadoop=WARN

# A1 is set to be a ConsoleAppender.
log4j.appender.A1=org.apache.log4j.ConsoleAppender
# A1 uses PatternLayout.