Adding hoodie-hadoop-mr module with HoodieInputFormat

Prasanna Rajaperumal
2016-12-16 19:29:53 -08:00
parent 8e80c8d2ea
commit 61200b1207
7 changed files with 806 additions and 6 deletions

HoodieInputFormatTest.java

@@ -0,0 +1,237 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.hadoop;

import com.uber.hoodie.common.util.FSUtils;
import org.apache.avro.Schema;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.io.ArrayWritable;
import org.apache.hadoop.mapred.*;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TemporaryFolder;

import java.io.File;
import java.io.IOException;

import static org.junit.Assert.assertEquals;
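
/**
 * Tests for HoodieInputFormat: split and file listing over a simulated
 * dataset, filtering of files from uncommitted commits, and incremental
 * pulls configured through JobConf properties.
 */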
public class HoodieInputFormatTest {
private HoodieInputFormat inputFormat;
private JobConf jobConf;
@Before public void setUp() {
inputFormat = new HoodieInputFormat();
jobConf = new JobConf();
inputFormat.setConf(jobConf);
}
@Rule public TemporaryFolder basePath = new TemporaryFolder();
@Test public void testInputFormatLoad() throws IOException {
// initial commit
File partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100");
InputFormatTestUtil.commit(basePath, "100");
// Add the paths
FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());
InputSplit[] inputSplits = inputFormat.getSplits(jobConf, 10);
assertEquals(10, inputSplits.length);
FileStatus[] files = inputFormat.listStatus(jobConf);
assertEquals(10, files.length);
}
@Test public void testInputFormatUpdates() throws IOException {
// initial commit
File partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100");
InputFormatTestUtil.commit(basePath, "100");
// Add the paths
FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());
FileStatus[] files = inputFormat.listStatus(jobConf);
assertEquals(10, files.length);
// update files
InputFormatTestUtil.simulateUpdates(partitionDir, "100", 5, "200", true);
// Before the commit
files = inputFormat.listStatus(jobConf);
assertEquals(10, files.length);
ensureFilesInCommit(
"Commit 200 is not yet complete, so we should see no files from it", files,
"200", 0);
InputFormatTestUtil.commit(basePath, "200");
files = inputFormat.listStatus(jobConf);
assertEquals(10, files.length);
ensureFilesInCommit(
"5 files have been updated to commit 200; we should see 5 files from commit 200",
files, "200", 5);
ensureFilesInCommit(
"5 files have been updated to commit 200; we should still see 5 files from commit 100",
files, "100", 5);
}
@Test public void testIncrementalSimple() throws IOException {
// initial commit
File partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100");
InputFormatTestUtil.commit(basePath, "100");
// Add the paths
FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());
InputFormatTestUtil.setupIncremental(jobConf, "100", 1);
FileStatus[] files = inputFormat.listStatus(jobConf);
assertEquals(
"An incremental pull with start commit time 100 should exclude commit 100 itself",
0, files.length);
}
@Test public void testIncrementalWithMultipleCommits() throws IOException {
// initial commit
File partitionDir = InputFormatTestUtil.prepareDataset(basePath, 10, "100");
InputFormatTestUtil.commit(basePath, "100");
// Add the paths
FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());
// update files
InputFormatTestUtil.simulateUpdates(partitionDir, "100", 5, "200", false);
InputFormatTestUtil.commit(basePath, "200");
InputFormatTestUtil.simulateUpdates(partitionDir, "100", 4, "300", false);
InputFormatTestUtil.commit(basePath, "300");
InputFormatTestUtil.simulateUpdates(partitionDir, "100", 3, "400", false);
InputFormatTestUtil.commit(basePath, "400");
InputFormatTestUtil.simulateUpdates(partitionDir, "100", 2, "500", false);
InputFormatTestUtil.commit(basePath, "500");
InputFormatTestUtil.simulateUpdates(partitionDir, "100", 1, "600", false);
InputFormatTestUtil.commit(basePath, "600");
InputFormatTestUtil.setupIncremental(jobConf, "100", 1);
FileStatus[] files = inputFormat.listStatus(jobConf);
assertEquals("Pulling 1 commit from 100, should get us the 5 files committed at 200", 5,
files.length);
ensureFilesInCommit("Pulling 1 commit from 100, should get us the 5 files committed at 200",
files, "200", 5);
InputFormatTestUtil.setupIncremental(jobConf, "100", 3);
files = inputFormat.listStatus(jobConf);
assertEquals(
"Pulling 3 commits from 100 should get us the 3 files from the 400 commit, 1 file from the 300 commit and 1 file from the 200 commit",
5, files.length);
ensureFilesInCommit("Pulling 3 commits from 100 should get us the 3 files from the 400 commit",
files, "400", 3);
ensureFilesInCommit("Pulling 3 commits from 100 should get us the 1 file from the 300 commit",
files, "300", 1);
ensureFilesInCommit("Pulling 3 commits from 100 should get us the 1 file from the 200 commit",
files, "200", 1);
InputFormatTestUtil.setupIncremental(jobConf, "100", HoodieHiveUtil.MAX_COMMIT_ALL);
files = inputFormat.listStatus(jobConf);
assertEquals(
"Pulling all commits from 100 should get us 1 file from each of the 200, 300, 400, 500 and 600 commits",
5, files.length);
ensureFilesInCommit(
"Pulling all commits from 100 should get us the 1 file from the 600 commit", files, "600",
1);
ensureFilesInCommit(
"Pulling all commits from 100 should get us the 1 file from the 500 commit", files, "500",
1);
ensureFilesInCommit(
"Pulling all commits from 100 should get us the 1 file from the 400 commit", files, "400",
1);
ensureFilesInCommit(
"Pulling all commits from 100 should get us the 1 file from the 300 commit", files, "300",
1);
ensureFilesInCommit(
"Pulling all commits from 100 should get us the 1 file from the 200 commit", files, "200",
1);
}
// TODO: enable this test (add @Test) once predicate pushdown is supported
public void testPredicatePushDown() throws IOException {
// initial commit
Schema schema = InputFormatTestUtil.readSchema("/sample1.avro");
String commit1 = "20160628071126";
File partitionDir =
InputFormatTestUtil.prepareParquetDataset(basePath, schema, 1, 10, commit1);
InputFormatTestUtil.commit(basePath, commit1);
// Add the paths
FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());
// check whether we have 10 records at this point
ensureRecordsInCommit("We need to have 10 records at this point for commit " + commit1, commit1, 10, 10);
// update 2 records in the original parquet file and save them as a new commit
String commit2 = "20160629193623";
InputFormatTestUtil.simulateParquetUpdates(partitionDir, schema, commit1, 10, 2, commit2);
InputFormatTestUtil.commit(basePath, commit2);
InputFormatTestUtil.setupIncremental(jobConf, commit1, 1);
// check whether we have 2 records at this point
ensureRecordsInCommit(
"We need to have 2 records that were modified at commit " + commit2 + " and no more", commit2, 2, 2);
// Make sure we see all 10 records if we roll back the start time
InputFormatTestUtil.setupIncremental(jobConf, "0", 2);
ensureRecordsInCommit(
"We need to have 8 records that were modified at commit " + commit1 + " and no more", commit1, 8, 10);
ensureRecordsInCommit(
"We need to have 2 records that were modified at commit " + commit2 + " and no more", commit2, 2, 10);
}
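
/**
 * Reads all records back through the input format and asserts that exactly
 * expectedNumberOfRecordsInCommit of them carry the given commit time in the
 * _hoodie_commit_time metadata field, out of totalExpected records overall.
 */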
private void ensureRecordsInCommit(String msg, String commit,
int expectedNumberOfRecordsInCommit, int totalExpected) throws IOException {
int actualCount = 0;
int totalCount = 0;
InputSplit[] splits = inputFormat.getSplits(jobConf, 1);
for (InputSplit split : splits) {
RecordReader<Void, ArrayWritable> recordReader =
inputFormat.getRecordReader(split, jobConf, null);
Void key = recordReader.createKey();
ArrayWritable writable = recordReader.createValue();
while (recordReader.next(key, writable)) {
// writable returns an array with [field1, field2, _hoodie_commit_time, _hoodie_commit_seqno]
// Take the commit time and compare with the one we are interested in
if (commit.equals((writable.get()[2]).toString())) {
actualCount++;
}
totalCount++;
}
}
assertEquals(msg, expectedNumberOfRecordsInCommit, actualCount);
assertEquals(msg, totalExpected, totalCount);
}
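
/**
 * Asserts that exactly the expected number of the listed files belong to the
 * given commit, based on the commit time encoded in each file name.
 */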
public static void ensureFilesInCommit(String msg, FileStatus[] files, String commit,
int expected) {
int count = 0;
for (FileStatus file : files) {
String commitTs = FSUtils.getCommitTime(file.getPath().getName());
if (commit.equals(commitTs)) {
count++;
}
}
assertEquals(msg, expected, count);
}
}

InputFormatTestUtil.java

@@ -0,0 +1,165 @@
/*
* Copyright (c) 2016 Uber Technologies, Inc. (hoodie-dev-group@uber.com)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.uber.hoodie.hadoop;

import com.uber.hoodie.common.model.HoodieRecord;
import com.uber.hoodie.common.model.HoodieTestUtils;
import com.uber.hoodie.common.util.FSUtils;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.generic.GenericRecordBuilder;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.parquet.avro.AvroParquetWriter;
import org.junit.rules.TemporaryFolder;

import java.io.File;
import java.io.FilenameFilter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
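
/**
 * Helpers that simulate a Hoodie dataset on the local file system: empty or
 * parquet-backed data files in a single partition, commit marker files, and
 * JobConf settings for incremental pulls.
 */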
public class InputFormatTestUtil {
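/**
 * Creates numberOfFiles empty data files named for commitNumber inside a
 * single 2016/05/01 partition and returns the partition directory.
 */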
public static File prepareDataset(TemporaryFolder basePath, int numberOfFiles,
String commitNumber) throws IOException {
basePath.create();
HoodieTestUtils.initializeHoodieDirectory(basePath.getRoot().toString());
File partitionPath = basePath.newFolder("2016", "05", "01");
for (int i = 0; i < numberOfFiles; i++) {
File dataFile =
new File(partitionPath, FSUtils.makeDataFileName(commitNumber, 1, "fileid" + i));
dataFile.createNewFile();
}
return partitionPath;
}
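
/**
 * Simulates updates by creating new empty data files that reuse the file ids
 * of files from originalCommit under the new commit time. When randomize is
 * set, the files to update are picked at random.
 */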
public static void simulateUpdates(File directory, final String originalCommit, int numberOfFilesUpdated,
String newCommit, boolean randomize) throws IOException {
List<File> dataFiles = Arrays.asList(directory.listFiles(new FilenameFilter() {
@Override public boolean accept(File dir, String name) {
String commitTs = FSUtils.getCommitTime(name);
return originalCommit.equals(commitTs);
}
}));
if (randomize) {
Collections.shuffle(dataFiles);
}
List<File> toUpdateList =
dataFiles.subList(0, Math.min(numberOfFilesUpdated, dataFiles.size()));
for (File file : toUpdateList) {
String fileId = FSUtils.getFileId(file.getName());
File dataFile = new File(directory, FSUtils.makeDataFileName(newCommit, 1, fileId));
dataFile.createNewFile();
}
}
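
/**
 * Marks a commit as complete by creating an empty commitNumber.commit marker
 * file under the .hoodie metadata directory.
 */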
public static void commit(TemporaryFolder basePath, String commitNumber) throws IOException {
// create the commit
new File(basePath.getRoot().toString() + "/.hoodie/", commitNumber + ".commit").createNewFile();
}
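
/**
 * Switches the table to incremental scan mode and records the start commit
 * and the maximum number of commits to pull in the JobConf.
 */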
public static void setupIncremental(JobConf jobConf, String startCommit, int numberOfCommitsToPull) {
String modePropertyName = String.format(HoodieHiveUtil.HOODIE_CONSUME_MODE_PATTERN,
HoodieTestUtils.RAW_TRIPS_TEST_NAME);
jobConf.set(modePropertyName, HoodieHiveUtil.INCREMENTAL_SCAN_MODE);
String startCommitTimestampName = String.format(HoodieHiveUtil.HOODIE_START_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME);
jobConf.set(startCommitTimestampName, startCommit);
String maxCommitPulls = String.format(HoodieHiveUtil.HOODIE_MAX_COMMIT_PATTERN, HoodieTestUtils.RAW_TRIPS_TEST_NAME);
jobConf.setInt(maxCommitPulls, numberOfCommitsToPull);
}
public static Schema readSchema(String location) throws IOException {
return new Schema.Parser().parse(InputFormatTestUtil.class.getResourceAsStream(location));
}
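
/**
 * Writes numberOfFiles parquet files, each holding numberOfRecords generated
 * records stamped with commitNumber, into a single 2016/05/01 partition.
 */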
public static File prepareParquetDataset(TemporaryFolder basePath, Schema schema, int numberOfFiles, int numberOfRecords,
String commitNumber) throws IOException {
basePath.create();
HoodieTestUtils.initializeHoodieDirectory(basePath.getRoot().toString());
File partitionPath = basePath.newFolder("2016", "05", "01");
for (int i = 0; i < numberOfFiles; i++) {
File dataFile =
new File(partitionPath, FSUtils.makeDataFileName(commitNumber, 1, "fileid" + i));
AvroParquetWriter<GenericRecord> parquetWriter =
new AvroParquetWriter<GenericRecord>(new Path(dataFile.getAbsolutePath()), schema);
try {
for (GenericRecord record : generateAvroRecords(schema, numberOfRecords, commitNumber)) {
parquetWriter.write(record);
}
} finally {
parquetWriter.close();
}
}
return partitionPath;
}
private static Iterable<? extends GenericRecord> generateAvroRecords(Schema schema, int numberOfRecords, String commitTime) {
List<GenericRecord> records = new ArrayList<>(numberOfRecords);
for (int i = 0; i < numberOfRecords; i++) {
records.add(generateAvroRecord(schema, i, commitTime));
}
return records;
}
private static GenericRecord generateAvroRecord(Schema schema, int recordNumber,
String commitTime) {
return new GenericRecordBuilder(schema).set("field1", "field" + recordNumber)
.set("field2", "field" + recordNumber)
.set(HoodieRecord.COMMIT_TIME_METADATA_FIELD, commitTime)
.set(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD, commitTime + "_" + recordNumber).build();
}
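
/**
 * Rewrites the first parquet file in the directory under a new commit time,
 * updating the commit metadata of the first numberOfRecordsToUpdate records
 * and carrying the remaining records over unchanged.
 */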
public static void simulateParquetUpdates(File directory, Schema schema, String originalCommit,
int totalNumberOfRecords, int numberOfRecordsToUpdate,
String newCommit) throws IOException {
File fileToUpdate = directory.listFiles(new FilenameFilter() {
@Override public boolean accept(File dir, String name) {
return name.endsWith("parquet");
}
})[0];
String fileId = FSUtils.getFileId(fileToUpdate.getName());
File dataFile = new File(directory, FSUtils.makeDataFileName(newCommit, 1, fileId));
AvroParquetWriter<GenericRecord> parquetWriter =
new AvroParquetWriter<GenericRecord>(new Path(dataFile.getAbsolutePath()), schema);
try {
for (GenericRecord record : generateAvroRecords(schema, totalNumberOfRecords,
originalCommit)) {
if (numberOfRecordsToUpdate > 0) {
// update this record
record.put(HoodieRecord.COMMIT_TIME_METADATA_FIELD, newCommit);
String oldSeqNo = (String) record.get(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD);
record.put(HoodieRecord.COMMIT_SEQNO_METADATA_FIELD,
oldSeqNo.replace(originalCommit, newCommit));
numberOfRecordsToUpdate--;
}
parquetWriter.write(record);
}
} finally {
parquetWriter.close();
}
}
}