[HUDI-684] Introduced abstraction for writing and reading different types of base file formats. (#1687)
Notable changes:
1. HoodieFileWriter and HoodieFileReader abstractions for the writer and reader side of a base file format (a short usage sketch follows this list)
2. HoodieDataBlock abstraction for creating format-specific data blocks for base file formats (e.g. Parquet uses HoodieAvroDataBlock)
3. All hardcoded references to Parquet / Parquet-based classes have been abstracted into methods that accept a base file format
4. HiveSyncTool accepts the base file format as a CLI parameter
5. HoodieDeltaStreamer accepts the base file format as a CLI parameter
6. HoodieSparkSqlWriter accepts the base file format as a parameter
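For orientation, a minimal sketch of how the new abstractions are used on the write and read paths. The factory and reader/writer calls mirror the ones introduced in the hunks below; variable names such as instantTime, path, hoodieTable, config, writerSchema, sparkTaskContextSupplier and baseFile are illustrative placeholders (not part of the change), and the snippet assumes the classes imported in those hunks.

// Sketch only: variable names are placeholders; the calls mirror the diff that follows.
// Writer side: the factory resolves the concrete writer from the base file extension
// (only Parquet is wired up so far and yields a HoodieParquetWriter).
HoodieFileWriter<IndexedRecord> fileWriter =
    HoodieFileWriterFactory.getFileWriter(instantTime, path, hoodieTable, config, writerSchema, sparkTaskContextSupplier);
fileWriter.writeAvroWithMetadata(recordWithMetadataInSchema, hoodieRecord);
fileWriter.close();

// Reader side: format-specific metadata reads go through HoodieFileReader instead of ParquetUtils.
HoodieFileReader reader =
    HoodieFileReaderFactory.getFileReader(hoodieTable.getHadoopConf(), new Path(baseFile.getPath()));
BloomFilter bloomFilter = reader.readBloomFilter();
String[] minMaxKeys = reader.readMinMaxRecordKeys();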
@@ -30,8 +30,8 @@ import org.apache.hudi.common.table.TableSchemaResolver;
 import org.apache.hudi.common.table.log.HoodieLogFormat;
 import org.apache.hudi.common.table.log.HoodieLogFormat.Reader;
 import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner;
-import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock;
 import org.apache.hudi.common.table.log.block.HoodieCorruptBlock;
+import org.apache.hudi.common.table.log.block.HoodieDataBlock;
 import org.apache.hudi.common.table.log.block.HoodieLogBlock;
 import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType;
 import org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType;
@@ -118,8 +118,8 @@ public class HoodieLogFileCommand implements CommandMarker {
             dummyInstantTimeCount++;
             instantTime = "dummy_instant_time_" + dummyInstantTimeCount;
           }
-          if (n instanceof HoodieAvroDataBlock) {
-            recordCount = ((HoodieAvroDataBlock) n).getRecords().size();
+          if (n instanceof HoodieDataBlock) {
+            recordCount = ((HoodieDataBlock) n).getRecords().size();
           }
         }
         if (commitCountAndMetadata.containsKey(instantTime)) {
@@ -215,8 +215,8 @@ public class HoodieLogFileCommand implements CommandMarker {
       // read the avro blocks
       while (reader.hasNext()) {
         HoodieLogBlock n = reader.next();
-        if (n instanceof HoodieAvroDataBlock) {
-          HoodieAvroDataBlock blk = (HoodieAvroDataBlock) n;
+        if (n instanceof HoodieDataBlock) {
+          HoodieDataBlock blk = (HoodieDataBlock) n;
           List<IndexedRecord> records = blk.getRecords();
           for (IndexedRecord record : records) {
             if (allRecords.size() < limit) {
@@ -34,7 +34,7 @@ import org.apache.hudi.common.model.HoodieWriteStat;
 import org.apache.hudi.common.model.HoodieWriteStat.RuntimeStats;
 import org.apache.hudi.common.table.log.HoodieLogFormat;
 import org.apache.hudi.common.table.log.HoodieLogFormat.Writer;
-import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock;
+import org.apache.hudi.common.table.log.block.HoodieDataBlock;
 import org.apache.hudi.common.table.log.block.HoodieDeleteBlock;
 import org.apache.hudi.common.table.log.block.HoodieLogBlock;
 import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType;
@@ -207,7 +207,7 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload> extends HoodieWri
         header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, instantTime);
         header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, writerSchema.toString());
         if (recordList.size() > 0) {
-          writer = writer.appendBlock(new HoodieAvroDataBlock(recordList, header));
+          writer = writer.appendBlock(HoodieDataBlock.getBlock(hoodieTable.getLogDataBlockFormat(), recordList, header));
           recordList.clear();
         }
         if (keysToDelete.size() > 0) {
@@ -30,8 +30,7 @@ import org.apache.hudi.common.model.HoodieWriteStat.RuntimeStats;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.exception.HoodieInsertException;
-import org.apache.hudi.io.storage.HoodieStorageWriter;
-import org.apache.hudi.io.storage.HoodieStorageWriterFactory;
+import org.apache.hudi.io.storage.HoodieFileWriter;
 import org.apache.hudi.table.HoodieTable;
 
 import org.apache.avro.generic.GenericRecord;
@@ -47,7 +46,7 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieWri
 
   private static final Logger LOG = LogManager.getLogger(HoodieCreateHandle.class);
 
-  private final HoodieStorageWriter<IndexedRecord> storageWriter;
+  private final HoodieFileWriter<IndexedRecord> fileWriter;
   private final Path path;
   private long recordsWritten = 0;
   private long insertRecordsWritten = 0;
@@ -68,8 +67,8 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieWri
           new Path(config.getBasePath()), FSUtils.getPartitionPath(config.getBasePath(), partitionPath));
       partitionMetadata.trySave(getPartitionId());
       createMarkerFile(partitionPath);
-      this.storageWriter =
-          HoodieStorageWriterFactory.getStorageWriter(instantTime, path, hoodieTable, config, writerSchema, this.sparkTaskContextSupplier);
+      this.fileWriter = createNewFileWriter(instantTime, path, hoodieTable, config, writerSchema,
+          this.sparkTaskContextSupplier);
     } catch (IOException e) {
       throw new HoodieInsertException("Failed to initialize HoodieStorageWriter for path " + path, e);
     }
@@ -88,7 +87,7 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieWri
 
   @Override
   public boolean canWrite(HoodieRecord record) {
-    return storageWriter.canWrite() && record.getPartitionPath().equals(writeStatus.getPartitionPath());
+    return fileWriter.canWrite() && record.getPartitionPath().equals(writeStatus.getPartitionPath());
   }
 
   /**
@@ -101,7 +100,7 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieWri
       if (avroRecord.isPresent()) {
         // Convert GenericRecord to GenericRecord with hoodie commit metadata in schema
         IndexedRecord recordWithMetadataInSchema = rewriteRecord((GenericRecord) avroRecord.get());
-        storageWriter.writeAvroWithMetadata(recordWithMetadataInSchema, record);
+        fileWriter.writeAvroWithMetadata(recordWithMetadataInSchema, record);
         // update the new location of record, so we know where to find it next
         record.unseal();
         record.setNewLocation(new HoodieRecordLocation(instantTime, writeStatus.getFileId()));
@@ -156,7 +155,7 @@ public class HoodieCreateHandle<T extends HoodieRecordPayload> extends HoodieWri
         .info("Closing the file " + writeStatus.getFileId() + " as we are done with all the records " + recordsWritten);
     try {
 
-      storageWriter.close();
+      fileWriter.close();
 
       HoodieWriteStat stat = new HoodieWriteStat();
       stat.setPartitionPath(writeStatus.getPartitionPath());
@@ -23,7 +23,6 @@ import org.apache.hudi.common.model.HoodieBaseFile;
 import org.apache.hudi.common.model.HoodieRecordPayload;
 import org.apache.hudi.common.model.HoodieTableType;
 import org.apache.hudi.common.util.HoodieTimer;
-import org.apache.hudi.common.util.ParquetUtils;
 import org.apache.hudi.common.util.collection.Pair;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.exception.HoodieIndexException;
@@ -34,6 +33,7 @@ import org.apache.hadoop.fs.Path;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
 
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.List;
@@ -61,23 +61,26 @@ public class HoodieKeyLookupHandle<T extends HoodieRecordPayload> extends Hoodie
     this.candidateRecordKeys = new ArrayList<>();
     this.totalKeysChecked = 0;
     HoodieTimer timer = new HoodieTimer().startTimer();
-    this.bloomFilter = ParquetUtils.readBloomFilterFromParquetMetadata(hoodieTable.getHadoopConf(),
-        new Path(getLatestDataFile().getPath()));
+    try {
+      this.bloomFilter = createNewFileReader().readBloomFilter();
+    } catch (IOException e) {
+      throw new HoodieIndexException(String.format("Error reading bloom filter from %s: %s", partitionPathFilePair, e));
+    }
     LOG.info(String.format("Read bloom filter from %s in %d ms", partitionPathFilePair, timer.endTimer()));
   }
 
   /**
    * Given a list of row keys and one file, return only row keys existing in that file.
    */
-  public static List<String> checkCandidatesAgainstFile(Configuration configuration, List<String> candidateRecordKeys,
+  public List<String> checkCandidatesAgainstFile(Configuration configuration, List<String> candidateRecordKeys,
       Path filePath) throws HoodieIndexException {
     List<String> foundRecordKeys = new ArrayList<>();
     try {
       // Load all rowKeys from the file, to double-confirm
       if (!candidateRecordKeys.isEmpty()) {
         HoodieTimer timer = new HoodieTimer().startTimer();
-        Set<String> fileRowKeys =
-            ParquetUtils.filterParquetRowKeys(configuration, filePath, new HashSet<>(candidateRecordKeys));
+        Set<String> fileRowKeys = createNewFileReader().filterRowKeys(new HashSet<>(candidateRecordKeys));
         foundRecordKeys.addAll(fileRowKeys);
         LOG.info(String.format("Checked keys against file %s, in %d ms. #candidates (%d) #found (%d)", filePath,
             timer.endTimer(), candidateRecordKeys.size(), foundRecordKeys.size()));
@@ -37,8 +37,7 @@ import org.apache.hudi.common.util.collection.ExternalSpillableMap;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.exception.HoodieIOException;
 import org.apache.hudi.exception.HoodieUpsertException;
-import org.apache.hudi.io.storage.HoodieStorageWriter;
-import org.apache.hudi.io.storage.HoodieStorageWriterFactory;
+import org.apache.hudi.io.storage.HoodieFileWriter;
 import org.apache.hudi.table.HoodieTable;
 
 import org.apache.avro.Schema;
@@ -61,7 +60,7 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
 
   private Map<String, HoodieRecord<T>> keyToNewRecords;
   private Set<String> writtenRecordKeys;
-  private HoodieStorageWriter<IndexedRecord> storageWriter;
+  private HoodieFileWriter<IndexedRecord> fileWriter;
   private Path newFilePath;
   private Path oldFilePath;
   private long recordsWritten = 0;
@@ -115,7 +114,7 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
 
       oldFilePath = new Path(config.getBasePath() + "/" + partitionPath + "/" + latestValidFilePath);
       String relativePath = new Path((partitionPath.isEmpty() ? "" : partitionPath + "/")
-          + FSUtils.makeDataFileName(instantTime, writeToken, fileId)).toString();
+          + FSUtils.makeDataFileName(instantTime, writeToken, fileId, hoodieTable.getBaseFileExtension())).toString();
       newFilePath = new Path(config.getBasePath(), relativePath);
 
       LOG.info(String.format("Merging new data into oldPath %s, as newPath %s", oldFilePath.toString(),
@@ -131,8 +130,8 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
       createMarkerFile(partitionPath);
 
       // Create the writer for writing the new version file
-      storageWriter =
-          HoodieStorageWriterFactory.getStorageWriter(instantTime, newFilePath, hoodieTable, config, writerSchema, sparkTaskContextSupplier);
+      fileWriter = createNewFileWriter(instantTime, newFilePath, hoodieTable, config, writerSchema, sparkTaskContextSupplier);
     } catch (IOException io) {
       LOG.error("Error in update task at commit " + instantTime, io);
       writeStatus.setGlobalError(io);
@@ -190,7 +189,7 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
       if (indexedRecord.isPresent()) {
         // Convert GenericRecord to GenericRecord with hoodie commit metadata in schema
         IndexedRecord recordWithMetadataInSchema = rewriteRecord((GenericRecord) indexedRecord.get());
-        storageWriter.writeAvroWithMetadata(recordWithMetadataInSchema, hoodieRecord);
+        fileWriter.writeAvroWithMetadata(recordWithMetadataInSchema, hoodieRecord);
         recordsWritten++;
       } else {
         recordsDeleted++;
@@ -243,7 +242,7 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
     String errMsg = "Failed to merge old record into new file for key " + key + " from old file " + getOldFilePath()
         + " to new file " + newFilePath;
     try {
-      storageWriter.writeAvro(key, oldRecord);
+      fileWriter.writeAvro(key, oldRecord);
     } catch (ClassCastException e) {
       LOG.error("Schema mismatch when rewriting old record " + oldRecord + " from file " + getOldFilePath()
           + " to file " + newFilePath + " with writerSchema " + writerSchema.toString(true));
@@ -277,8 +276,8 @@ public class HoodieMergeHandle<T extends HoodieRecordPayload> extends HoodieWrit
       keyToNewRecords.clear();
       writtenRecordKeys.clear();
 
-      if (storageWriter != null) {
-        storageWriter.close();
+      if (fileWriter != null) {
+        fileWriter.close();
       }
 
       long fileSizeInBytes = FSUtils.getFileSize(fs, newFilePath);
@@ -18,14 +18,12 @@
 
 package org.apache.hudi.io;
 
-import org.apache.hudi.common.model.HoodieBaseFile;
 import org.apache.hudi.common.model.HoodieRecordPayload;
-import org.apache.hudi.common.util.ParquetUtils;
 import org.apache.hudi.common.util.collection.Pair;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.table.HoodieTable;
 
-import org.apache.hadoop.fs.Path;
+import java.io.IOException;
 
 /**
  * Extract range information for a given file slice.
@@ -37,8 +35,7 @@ public class HoodieRangeInfoHandle<T extends HoodieRecordPayload> extends Hoodie
     super(config, null, hoodieTable, partitionPathFilePair);
   }
 
-  public String[] getMinMaxKeys() {
-    HoodieBaseFile dataFile = getLatestDataFile();
-    return ParquetUtils.readMinMaxRecordKeys(hoodieTable.getHadoopConf(), new Path(dataFile.getPath()));
+  public String[] getMinMaxKeys() throws IOException {
+    return createNewFileReader().readMinMaxRecordKeys();
   }
 }
@@ -22,9 +22,14 @@ import org.apache.hudi.common.model.HoodieBaseFile;
 import org.apache.hudi.common.model.HoodieRecordPayload;
 import org.apache.hudi.common.util.collection.Pair;
 import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.io.storage.HoodieFileReader;
+import org.apache.hudi.io.storage.HoodieFileReaderFactory;
 import org.apache.hudi.table.HoodieTable;
 
+import java.io.IOException;
+
 import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
 
 /**
  * Base class for read operations done logically on the file group.
@@ -56,4 +61,9 @@ public abstract class HoodieReadHandle<T extends HoodieRecordPayload> extends Ho
     return hoodieTable.getBaseFileOnlyView()
         .getLatestBaseFile(partitionPathFilePair.getLeft(), partitionPathFilePair.getRight()).get();
   }
+
+  protected HoodieFileReader createNewFileReader() throws IOException {
+    return HoodieFileReaderFactory.getFileReader(hoodieTable.getHadoopConf(),
+        new Path(getLatestDataFile().getPath()));
+  }
 }
@@ -30,6 +30,8 @@ import org.apache.hudi.common.util.ReflectionUtils;
 import org.apache.hudi.config.HoodieWriteConfig;
 import org.apache.hudi.exception.HoodieException;
 import org.apache.hudi.exception.HoodieIOException;
+import org.apache.hudi.io.storage.HoodieFileWriter;
+import org.apache.hudi.io.storage.HoodieFileWriterFactory;
 import org.apache.hudi.table.HoodieTable;
 
 import org.apache.avro.Schema;
@@ -86,7 +88,8 @@ public abstract class HoodieWriteHandle<T extends HoodieRecordPayload> extends H
       throw new HoodieIOException("Failed to make dir " + path, e);
     }
 
-    return new Path(path.toString(), FSUtils.makeDataFileName(instantTime, writeToken, fileId));
+    return new Path(path.toString(), FSUtils.makeDataFileName(instantTime, writeToken, fileId,
+        hoodieTable.getMetaClient().getTableConfig().getBaseFileFormat().getFileExtension()));
   }
 
   /**
@@ -180,4 +183,9 @@ public abstract class HoodieWriteHandle<T extends HoodieRecordPayload> extends H
   protected long getAttemptId() {
     return sparkTaskContextSupplier.getAttemptIdSupplier().get();
   }
+
+  protected HoodieFileWriter createNewFileWriter(String instantTime, Path path, HoodieTable<T> hoodieTable,
+      HoodieWriteConfig config, Schema schema, SparkTaskContextSupplier sparkTaskContextSupplier) throws IOException {
+    return HoodieFileWriterFactory.getFileWriter(instantTime, path, hoodieTable, config, schema, sparkTaskContextSupplier);
+  }
 }
@@ -24,7 +24,7 @@ import org.apache.avro.generic.IndexedRecord;
 
 import java.io.IOException;
 
-public interface HoodieStorageWriter<R extends IndexedRecord> {
+public interface HoodieFileWriter<R extends IndexedRecord> {
 
   void writeAvroWithMetadata(R newRecord, HoodieRecord record) throws IOException;
 
@@ -34,23 +34,21 @@ import org.apache.parquet.avro.AvroSchemaConverter;
 
 import java.io.IOException;
 
-import static org.apache.hudi.common.model.HoodieFileFormat.HOODIE_LOG;
 import static org.apache.hudi.common.model.HoodieFileFormat.PARQUET;
 
-public class HoodieStorageWriterFactory {
+public class HoodieFileWriterFactory {
 
-  public static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieStorageWriter<R> getStorageWriter(
+  public static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieFileWriter<R> getFileWriter(
       String instantTime, Path path, HoodieTable<T> hoodieTable, HoodieWriteConfig config, Schema schema,
       SparkTaskContextSupplier sparkTaskContextSupplier) throws IOException {
-    final String name = path.getName();
-    final String extension = FSUtils.isLogFile(path) ? HOODIE_LOG.getFileExtension() : FSUtils.getFileExtension(name);
+    final String extension = FSUtils.getFileExtension(path.getName());
     if (PARQUET.getFileExtension().equals(extension)) {
-      return newParquetStorageWriter(instantTime, path, config, schema, hoodieTable, sparkTaskContextSupplier);
+      return newParquetFileWriter(instantTime, path, config, schema, hoodieTable, sparkTaskContextSupplier);
     }
     throw new UnsupportedOperationException(extension + " format not supported yet.");
   }
 
-  private static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieStorageWriter<R> newParquetStorageWriter(
+  private static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieFileWriter<R> newParquetFileWriter(
      String instantTime, Path path, HoodieWriteConfig config, Schema schema, HoodieTable hoodieTable,
      SparkTaskContextSupplier sparkTaskContextSupplier) throws IOException {
     BloomFilter filter = BloomFilterFactory
@@ -29,7 +29,6 @@ import org.apache.hudi.common.model.HoodieRecordPayload;
 import org.apache.avro.Schema;
 import org.apache.avro.generic.GenericRecord;
 import org.apache.avro.generic.IndexedRecord;
-import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
 import org.apache.parquet.hadoop.ParquetFileWriter;
 import org.apache.parquet.hadoop.ParquetWriter;
@@ -42,7 +41,7 @@ import java.util.concurrent.atomic.AtomicLong;
  * the current file can take more records with the <code>canWrite()</code>
  */
 public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends IndexedRecord>
-    extends ParquetWriter<IndexedRecord> implements HoodieStorageWriter<R> {
+    extends ParquetWriter<IndexedRecord> implements HoodieFileWriter<R> {
 
   private static AtomicLong recordIndex = new AtomicLong(1);
 
@@ -51,7 +50,6 @@ public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends Indexe
   private final long maxFileSize;
   private final HoodieAvroWriteSupport writeSupport;
   private final String instantTime;
-  private final Schema schema;
   private final SparkTaskContextSupplier sparkTaskContextSupplier;
 
   public HoodieParquetWriter(String instantTime, Path file, HoodieParquetConfig parquetConfig,
@@ -60,10 +58,10 @@ public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends Indexe
         ParquetFileWriter.Mode.CREATE, parquetConfig.getWriteSupport(), parquetConfig.getCompressionCodecName(),
         parquetConfig.getBlockSize(), parquetConfig.getPageSize(), parquetConfig.getPageSize(),
         ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED,
-        ParquetWriter.DEFAULT_WRITER_VERSION, registerFileSystem(file, parquetConfig.getHadoopConf()));
+        ParquetWriter.DEFAULT_WRITER_VERSION, FSUtils.registerFileSystem(file, parquetConfig.getHadoopConf()));
     this.file = HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf());
     this.fs =
-        (HoodieWrapperFileSystem) this.file.getFileSystem(registerFileSystem(file, parquetConfig.getHadoopConf()));
+        (HoodieWrapperFileSystem) this.file.getFileSystem(FSUtils.registerFileSystem(file, parquetConfig.getHadoopConf()));
     // We cannot accurately measure the snappy compressed output file size. We are choosing a
     // conservative 10%
     // TODO - compute this compression ratio dynamically by looking at the bytes written to the
@@ -72,18 +70,9 @@ public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends Indexe
         + Math.round(parquetConfig.getMaxFileSize() * parquetConfig.getCompressionRatio());
     this.writeSupport = parquetConfig.getWriteSupport();
     this.instantTime = instantTime;
-    this.schema = schema;
     this.sparkTaskContextSupplier = sparkTaskContextSupplier;
   }
 
-  public static Configuration registerFileSystem(Path file, Configuration conf) {
-    Configuration returnConf = new Configuration(conf);
-    String scheme = FSUtils.getFs(file.toString(), conf).getScheme();
-    returnConf.set("fs." + HoodieWrapperFileSystem.getHoodieScheme(scheme) + ".impl",
-        HoodieWrapperFileSystem.class.getName());
-    return returnConf;
-  }
-
   @Override
   public void writeAvroWithMetadata(R avroRecord, HoodieRecord record) throws IOException {
     String seqId =
@@ -27,7 +27,6 @@ import org.apache.hudi.avro.model.HoodieRestoreMetadata;
 import org.apache.hudi.avro.model.HoodieRollbackMetadata;
 import org.apache.hudi.avro.model.HoodieSavepointMetadata;
 import org.apache.hudi.client.WriteStatus;
-import org.apache.hudi.client.utils.ParquetReaderIterator;
 import org.apache.hudi.common.model.HoodieBaseFile;
 import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.common.model.HoodieRecord;
@@ -44,6 +43,8 @@ import org.apache.hudi.exception.HoodieUpsertException;
 import org.apache.hudi.execution.SparkBoundedInMemoryExecutor;
 import org.apache.hudi.io.HoodieCreateHandle;
 import org.apache.hudi.io.HoodieMergeHandle;
+import org.apache.hudi.io.storage.HoodieFileReader;
+import org.apache.hudi.io.storage.HoodieFileReaderFactory;
 import org.apache.hudi.table.action.clean.CleanActionExecutor;
 import org.apache.hudi.table.action.HoodieWriteMetadata;
 import org.apache.hudi.table.action.commit.BulkInsertCommitActionExecutor;
@@ -58,9 +59,6 @@ import org.apache.hudi.table.action.rollback.CopyOnWriteRollbackActionExecutor;
 import org.apache.hudi.table.action.savepoint.SavepointActionExecutor;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
-import org.apache.parquet.avro.AvroParquetReader;
-import org.apache.parquet.avro.AvroReadSupport;
-import org.apache.parquet.hadoop.ParquetReader;
 import org.apache.spark.api.java.JavaRDD;
 import org.apache.spark.api.java.JavaSparkContext;
 
@@ -150,11 +148,13 @@ public class HoodieCopyOnWriteTable<T extends HoodieRecordPayload> extends Hoodi
       throw new HoodieUpsertException(
           "Error in finding the old file path at commit " + instantTime + " for fileId: " + fileId);
     } else {
-      AvroReadSupport.setAvroReadSchema(getHadoopConf(), upsertHandle.getWriterSchema());
       BoundedInMemoryExecutor<GenericRecord, GenericRecord, Void> wrapper = null;
-      try (ParquetReader<IndexedRecord> reader =
-          AvroParquetReader.<IndexedRecord>builder(upsertHandle.getOldFilePath()).withConf(getHadoopConf()).build()) {
-        wrapper = new SparkBoundedInMemoryExecutor(config, new ParquetReaderIterator(reader),
+      HoodieFileReader<IndexedRecord> storageReader =
+          HoodieFileReaderFactory.getFileReader(getHadoopConf(), upsertHandle.getOldFilePath());
+      try {
+        wrapper =
+            new SparkBoundedInMemoryExecutor(config, storageReader.getRecordIterator(upsertHandle.getWriterSchema()),
             new UpdateHandler(upsertHandle), x -> x);
         wrapper.execute();
       } catch (Exception e) {
@@ -34,12 +34,14 @@ import org.apache.hudi.common.fs.ConsistencyGuard;
 import org.apache.hudi.common.fs.ConsistencyGuard.FileVisibility;
 import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.fs.FailSafeConsistencyGuard;
+import org.apache.hudi.common.model.HoodieFileFormat;
 import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.model.HoodieRecordPayload;
 import org.apache.hudi.common.model.HoodieWriteStat;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.common.table.TableSchemaResolver;
+import org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType;
 import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
 import org.apache.hudi.common.table.timeline.HoodieInstant;
 import org.apache.hudi.common.table.timeline.HoodieTimeline;
@@ -310,7 +312,7 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
 
   /**
    * Schedule compaction for the instant time.
    *
    * @param jsc Spark Context
    * @param instantTime Instant Time for scheduling compaction
    * @param extraMetadata additional metadata to write into plan
@@ -381,7 +383,7 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
 
   /**
    * Delete Marker directory corresponding to an instant.
    *
    * @param instantTs Instant Time
    */
   public void deleteMarkerDir(String instantTs) {
@@ -422,9 +424,11 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
       return;
     }
 
-    List<String> invalidDataPaths = FSUtils.getAllDataFilesForMarkers(fs, basePath, instantTs, markerDir.toString());
+    final String baseFileExtension = getBaseFileFormat().getFileExtension();
+    List<String> invalidDataPaths = FSUtils.getAllDataFilesForMarkers(fs, basePath, instantTs, markerDir.toString(),
+        baseFileExtension);
     List<String> validDataPaths = stats.stream().map(w -> String.format("%s/%s", basePath, w.getPath()))
-        .filter(p -> p.endsWith(".parquet")).collect(Collectors.toList());
+        .filter(p -> p.endsWith(baseFileExtension)).collect(Collectors.toList());
     // Contains list of partially created files. These needs to be cleaned up.
     invalidDataPaths.removeAll(validDataPaths);
     if (!invalidDataPaths.isEmpty()) {
@@ -478,7 +482,7 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
 
   /**
    * Ensures all files passed either appear or disappear.
    *
    * @param jsc JavaSparkContext
    * @param groupByPartition Files grouped by partition
    * @param visibility Appear/Disappear
@@ -562,4 +566,26 @@ public abstract class HoodieTable<T extends HoodieRecordPayload> implements Seri
       throw new HoodieInsertException("Failed insert schema compability check.", e);
     }
   }
+
+  public HoodieFileFormat getBaseFileFormat() {
+    return metaClient.getTableConfig().getBaseFileFormat();
+  }
+
+  public HoodieFileFormat getLogFileFormat() {
+    return metaClient.getTableConfig().getLogFileFormat();
+  }
+
+  public HoodieLogBlockType getLogDataBlockFormat() {
+    switch (getBaseFileFormat()) {
+      case PARQUET:
+        return HoodieLogBlockType.AVRO_DATA_BLOCK;
+      default:
+        throw new HoodieException("Base file format " + getBaseFileFormat()
+            + " does not have associated log block format");
+    }
+  }
+
+  public String getBaseFileExtension() {
+    return getBaseFileFormat().getFileExtension();
+  }
 }
@@ -19,7 +19,6 @@
 package org.apache.hudi.table.action.commit;
 
 import org.apache.hudi.client.WriteStatus;
-import org.apache.hudi.client.utils.ParquetReaderIterator;
 import org.apache.hudi.common.model.HoodieBaseFile;
 import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.model.HoodieRecordPayload;
@@ -32,6 +31,8 @@ import org.apache.hudi.exception.HoodieUpsertException;
 import org.apache.hudi.execution.LazyInsertIterable;
 import org.apache.hudi.execution.SparkBoundedInMemoryExecutor;
 import org.apache.hudi.io.HoodieMergeHandle;
+import org.apache.hudi.io.storage.HoodieFileReader;
+import org.apache.hudi.io.storage.HoodieFileReaderFactory;
 import org.apache.hudi.table.HoodieTable;
 import org.apache.hudi.table.WorkloadProfile;
 
@@ -39,9 +40,6 @@ import org.apache.avro.generic.GenericRecord;
 import org.apache.avro.generic.IndexedRecord;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
-import org.apache.parquet.avro.AvroParquetReader;
-import org.apache.parquet.avro.AvroReadSupport;
-import org.apache.parquet.hadoop.ParquetReader;
 import org.apache.spark.Partitioner;
 import org.apache.spark.api.java.JavaSparkContext;
 
@@ -89,11 +87,12 @@ public abstract class CommitActionExecutor<T extends HoodieRecordPayload<T>>
       throw new HoodieUpsertException(
           "Error in finding the old file path at commit " + instantTime + " for fileId: " + fileId);
     } else {
-      AvroReadSupport.setAvroReadSchema(table.getHadoopConf(), upsertHandle.getWriterSchema());
       BoundedInMemoryExecutor<GenericRecord, GenericRecord, Void> wrapper = null;
-      try (ParquetReader<IndexedRecord> reader =
-          AvroParquetReader.<IndexedRecord>builder(upsertHandle.getOldFilePath()).withConf(table.getHadoopConf()).build()) {
-        wrapper = new SparkBoundedInMemoryExecutor(config, new ParquetReaderIterator(reader),
+      try {
+        HoodieFileReader<IndexedRecord> storageReader =
+            HoodieFileReaderFactory.getFileReader(table.getHadoopConf(), upsertHandle.getOldFilePath());
+        wrapper =
+            new SparkBoundedInMemoryExecutor(config, storageReader.getRecordIterator(upsertHandle.getWriterSchema()),
             new UpdateHandler(upsertHandle), x -> x);
         wrapper.execute();
       } catch (Exception e) {
@@ -83,7 +83,7 @@ public abstract class DeltaCommitActionExecutor<T extends HoodieRecordPayload<T>
   @Override
   public Iterator<List<WriteStatus>> handleInsert(String idPfx, Iterator<HoodieRecord<T>> recordItr)
       throws Exception {
-    // If canIndexLogFiles, write inserts to log files else write inserts to parquet files
+    // If canIndexLogFiles, write inserts to log files else write inserts to base files
     if (table.getIndex().canIndexLogFiles()) {
       return new LazyInsertIterable<>(recordItr, config, instantTime, (HoodieTable<T>)table, idPfx,
           sparkTaskContextSupplier, new AppendHandleFactory<>());
@@ -71,8 +71,9 @@ public class RollbackHelper implements Serializable {
    */
   public List<HoodieRollbackStat> performRollback(JavaSparkContext jsc, HoodieInstant instantToRollback, List<RollbackRequest> rollbackRequests) {
 
+    String basefileExtension = metaClient.getTableConfig().getBaseFileFormat().getFileExtension();
     SerializablePathFilter filter = (path) -> {
-      if (path.toString().contains(".parquet")) {
+      if (path.toString().contains(basefileExtension)) {
         String fileCommitTime = FSUtils.getCommitTime(path.getName());
         return instantToRollback.getTimestamp().equals(fileCommitTime);
       } else if (path.toString().contains(".log")) {
@@ -184,8 +185,9 @@ public class RollbackHelper implements Serializable {
       Map<FileStatus, Boolean> results, String commit, String partitionPath) throws IOException {
     LOG.info("Cleaning path " + partitionPath);
     FileSystem fs = metaClient.getFs();
+    String basefileExtension = metaClient.getTableConfig().getBaseFileFormat().getFileExtension();
     PathFilter filter = (path) -> {
-      if (path.toString().contains(".parquet")) {
+      if (path.toString().contains(basefileExtension)) {
         String fileCommitTime = FSUtils.getCommitTime(path.getName());
         return commit.equals(fileCommitTime);
       }
@@ -257,7 +257,11 @@ public class TestHoodieBloomIndex extends HoodieClientTestHarness {
     List<String> uuids =
         Arrays.asList(record1.getRecordKey(), record2.getRecordKey(), record3.getRecordKey(), record4.getRecordKey());
 
-    List<String> results = HoodieKeyLookupHandle.checkCandidatesAgainstFile(hadoopConf, uuids,
+    HoodieWriteConfig config = HoodieWriteConfig.newBuilder().withPath(basePath).build();
+    HoodieTable table = HoodieTable.create(metaClient, config, hadoopConf);
+    HoodieKeyLookupHandle keyHandle = new HoodieKeyLookupHandle<>(config, table,
+        Pair.of("2016/01/31/", FSUtils.getFileId(filename)));
+    List<String> results = keyHandle.checkCandidatesAgainstFile(hadoopConf, uuids,
         new Path(basePath + "/2016/01/31/" + filename));
     assertEquals(results.size(), 2);
     assertTrue(results.get(0).equals("1eb5b87a-1feh-4edd-87b4-6ec96dc405a0")
@@ -34,26 +34,26 @@ import static org.junit.jupiter.api.Assertions.assertThrows;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 
 /**
- * Tests for {@link HoodieStorageWriterFactory}.
+ * Tests for {@link HoodieFileWriterFactory}.
  */
-public class TestHoodieStorageWriterFactory extends HoodieClientTestBase {
+public class TestHoodieFileWriterFactory extends HoodieClientTestBase {
 
   @Test
-  public void testGetStorageWriter() throws IOException {
+  public void testGetFileWriter() throws IOException {
     // parquet file format.
     final String instantTime = "100";
     final Path parquetPath = new Path(basePath + "/partition/path/f1_1-0-1_000.parquet");
     final HoodieWriteConfig cfg = getConfig();
     HoodieTable table = HoodieTable.create(metaClient, cfg, hadoopConf);
     SparkTaskContextSupplier supplier = new SparkTaskContextSupplier();
-    HoodieStorageWriter<IndexedRecord> parquetWriter = HoodieStorageWriterFactory.getStorageWriter(instantTime,
+    HoodieFileWriter<IndexedRecord> parquetWriter = HoodieFileWriterFactory.getFileWriter(instantTime,
         parquetPath, table, cfg, HoodieTestDataGenerator.AVRO_SCHEMA, supplier);
     assertTrue(parquetWriter instanceof HoodieParquetWriter);
 
     // other file format exception.
     final Path logPath = new Path(basePath + "/partition/path/f.b51192a8-574b-4a85-b246-bcfec03ac8bf_100.log.2_1-0-1");
     final Throwable thrown = assertThrows(UnsupportedOperationException.class, () -> {
-      HoodieStorageWriter<IndexedRecord> logWriter = HoodieStorageWriterFactory.getStorageWriter(instantTime, logPath,
+      HoodieFileWriter<IndexedRecord> logWriter = HoodieFileWriterFactory.getFileWriter(instantTime, logPath,
          table, cfg, HoodieTestDataGenerator.AVRO_SCHEMA, supplier);
     }, "should fail since log storage writer is not supported yet.");
     assertTrue(thrown.getMessage().contains("format not supported yet."));
@@ -24,6 +24,7 @@ import org.apache.hudi.client.WriteStatus;
 import org.apache.hudi.common.model.FileSlice;
 import org.apache.hudi.common.model.HoodieBaseFile;
 import org.apache.hudi.common.model.HoodieCommitMetadata;
+import org.apache.hudi.common.model.HoodieFileFormat;
 import org.apache.hudi.common.model.HoodieFileGroup;
 import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.common.model.HoodieRecord;
@@ -46,7 +47,9 @@ import org.apache.hudi.config.HoodieCompactionConfig;
 import org.apache.hudi.config.HoodieIndexConfig;
 import org.apache.hudi.config.HoodieStorageConfig;
 import org.apache.hudi.config.HoodieWriteConfig;
+import org.apache.hudi.exception.HoodieIOException;
 import org.apache.hudi.hadoop.utils.HoodieHiveUtils;
+import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils;
 import org.apache.hudi.hadoop.HoodieParquetInputFormat;
 import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat;
 import org.apache.hudi.index.HoodieIndex;
@@ -66,9 +69,9 @@ import org.apache.hadoop.mapred.FileInputFormat;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.spark.api.java.JavaRDD;
 import org.junit.jupiter.api.AfterEach;
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.io.TempDir;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.MethodSource;

 import java.io.File;
 import java.io.IOException;
@@ -90,37 +93,32 @@ import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertTrue;

 public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
-private HoodieParquetInputFormat roSnapshotInputFormat;
 private JobConf roSnapshotJobConf;

-private HoodieParquetInputFormat roInputFormat;
 private JobConf roJobConf;

-private HoodieParquetRealtimeInputFormat rtInputFormat;
 private JobConf rtJobConf;

-@BeforeEach
+@TempDir
-public void init() throws IOException {
+public java.nio.file.Path tempFolder;
+private HoodieFileFormat baseFileFormat;

+static Stream<HoodieFileFormat> argumentsProvider() {
+return Stream.of(HoodieFileFormat.PARQUET);
+}

+public void init(HoodieFileFormat baseFileFormat) throws IOException {
+this.baseFileFormat = baseFileFormat;

 initDFS();
 initSparkContexts("TestHoodieMergeOnReadTable");
 hadoopConf.addResource(dfs.getConf());
 initPath();
 dfs.mkdirs(new Path(basePath));
-HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.MERGE_ON_READ);
+metaClient = HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.MERGE_ON_READ, baseFileFormat);
 initTestDataGenerator();

-// initialize parquet input format
+roSnapshotJobConf = new JobConf(hadoopConf);
-roSnapshotInputFormat = new HoodieParquetInputFormat();
-roSnapshotJobConf = new JobConf(jsc.hadoopConfiguration());
-roSnapshotInputFormat.setConf(roSnapshotJobConf);

-roInputFormat = new HoodieParquetInputFormat();
 roJobConf = new JobConf(hadoopConf);
-roInputFormat.setConf(roJobConf);

-rtInputFormat = new HoodieParquetRealtimeInputFormat();
 rtJobConf = new JobConf(hadoopConf);
-rtInputFormat.setConf(rtJobConf);
 }

 @AfterEach
@@ -128,8 +126,11 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
 cleanupResources();
 }

-@Test
+@ParameterizedTest
-public void testSimpleInsertAndUpdate() throws Exception {
+@MethodSource("argumentsProvider")
+public void testSimpleInsertAndUpdate(HoodieFileFormat baseFileFormat) throws Exception {
+init(baseFileFormat);

 HoodieWriteConfig cfg = getConfig(true);
 try (HoodieWriteClient client = getHoodieWriteClient(cfg);) {

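Every test in this class now follows the same JUnit 5 pattern: the base file format is supplied by the argumentsProvider() @MethodSource and forwarded to init(...). A minimal, self-contained sketch of that pattern follows (the class and test names are illustrative, not part of the patch); registering another HoodieFileFormat in the provider would run each test once per format:

    import java.util.stream.Stream;

    import org.apache.hudi.common.model.HoodieFileFormat;
    import org.junit.jupiter.params.ParameterizedTest;
    import org.junit.jupiter.params.provider.MethodSource;

    import static org.junit.jupiter.api.Assertions.assertNotNull;

    public class BaseFileFormatParameterizedTestSketch {

      // Mirrors the provider added in this patch; only PARQUET is registered for now.
      static Stream<HoodieFileFormat> argumentsProvider() {
        return Stream.of(HoodieFileFormat.PARQUET);
      }

      private HoodieFileFormat baseFileFormat;

      // Stand-in for the per-format setup done by TestHoodieMergeOnReadTable#init.
      private void init(HoodieFileFormat baseFileFormat) {
        this.baseFileFormat = baseFileFormat;
      }

      @ParameterizedTest
      @MethodSource("argumentsProvider")
      public void runsOncePerBaseFileFormat(HoodieFileFormat baseFileFormat) {
        init(baseFileFormat);
        assertNotNull(this.baseFileFormat);
      }
    }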
@@ -153,9 +154,10 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
 String compactionCommitTime = client.scheduleCompaction(Option.empty()).get().toString();
 client.compact(compactionCommitTime);

-FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(dfs, cfg.getBasePath());
+HoodieTable hoodieTable = HoodieTable.create(metaClient, cfg, hadoopConf);
-hoodieTable = HoodieTable.create(metaClient, cfg, hadoopConf);
+FileStatus[] allFiles = listAllDataFilesInPath(hoodieTable, cfg.getBasePath());
 tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
+HoodieTableFileSystemView roView = new HoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
 Stream<HoodieBaseFile> dataFilesToRead = tableView.getLatestBaseFiles();
 assertTrue(dataFilesToRead.findAny().isPresent());

@@ -174,8 +176,11 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {

 // test incremental read does not go past compaction instant for RO views
 // For RT views, incremental read can go past compaction
-@Test
+@ParameterizedTest
-public void testIncrementalReadsWithCompaction() throws Exception {
+@MethodSource("argumentsProvider")
+public void testIncrementalReadsWithCompaction(HoodieFileFormat baseFileFormat) throws Exception {
+init(baseFileFormat);

 String partitionPath = "2020/02/20"; // use only one partition for this test
 dataGen = new HoodieTestDataGenerator(new String[] { partitionPath });
 HoodieWriteConfig cfg = getConfig(true);
@@ -190,19 +195,17 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
 List<HoodieRecord> records001 = dataGen.generateInserts(commitTime1, 200);
 insertAndGetFilePaths(records001, client, cfg, commitTime1);

-// verify only one parquet file shows up with commit time 001
+// verify only one base file shows up with commit time 001
 FileStatus[] snapshotROFiles = getROSnapshotFiles(partitionPath);
-validateFiles(partitionPath,1, snapshotROFiles, roSnapshotInputFormat,
+validateFiles(partitionPath, 1, snapshotROFiles, false, roSnapshotJobConf, 200, commitTime1);
-roSnapshotJobConf,200, commitTime1);

 FileStatus[] incrementalROFiles = getROIncrementalFiles(partitionPath, true);
-validateFiles(partitionPath, 1, incrementalROFiles, roInputFormat,
+validateFiles(partitionPath, 1, incrementalROFiles, false, roJobConf, 200, commitTime1);
-roJobConf,200, commitTime1);
 Path firstFilePath = incrementalROFiles[0].getPath();

 FileStatus[] incrementalRTFiles = getRTIncrementalFiles(partitionPath);
-validateFiles(partitionPath, 1, incrementalRTFiles, rtInputFormat,
+validateFiles(partitionPath, 1, incrementalRTFiles, true, rtJobConf,200, commitTime1);
-rtJobConf,200, commitTime1);
 assertEquals(firstFilePath, incrementalRTFiles[0].getPath());

 /**
@@ -215,14 +218,12 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {

 // verify RO incremental reads - only one parquet file shows up because updates to into log files
 incrementalROFiles = getROIncrementalFiles(partitionPath, false);
-validateFiles(partitionPath, 1, incrementalROFiles, roInputFormat,
+validateFiles(partitionPath, 1, incrementalROFiles, false, roJobConf, 200, commitTime1);
-roJobConf, 200, commitTime1);
 assertEquals(firstFilePath, incrementalROFiles[0].getPath());

 // verify RT incremental reads includes updates also
 incrementalRTFiles = getRTIncrementalFiles(partitionPath);
-validateFiles(partitionPath, 1, incrementalRTFiles, rtInputFormat,
+validateFiles(partitionPath, 1, incrementalRTFiles, true, rtJobConf, 200, commitTime1, updateTime);
-rtJobConf, 200, commitTime1, updateTime);

 // request compaction, but do not perform compaction
 String compactionCommitTime = "005";
@@ -230,13 +231,11 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {

 // verify RO incremental reads - only one parquet file shows up because updates go into log files
 incrementalROFiles = getROIncrementalFiles(partitionPath, true);
-validateFiles(partitionPath,1, incrementalROFiles, roInputFormat,
+validateFiles(partitionPath,1, incrementalROFiles, false, roJobConf, 200, commitTime1);
-roJobConf, 200, commitTime1);

 // verify RT incremental reads includes updates also
 incrementalRTFiles = getRTIncrementalFiles(partitionPath);
-validateFiles(partitionPath, 1, incrementalRTFiles, rtInputFormat,
+validateFiles(partitionPath, 1, incrementalRTFiles, true, rtJobConf, 200, commitTime1, updateTime);
-rtJobConf, 200, commitTime1, updateTime);

 // write 3 - more inserts
 String insertsTime = "006";
@@ -246,44 +245,44 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {

 // verify new write shows up in snapshot mode even though there is pending compaction
 snapshotROFiles = getROSnapshotFiles(partitionPath);
-validateFiles(partitionPath, 2, snapshotROFiles, roSnapshotInputFormat,
+validateFiles(partitionPath, 2, snapshotROFiles, false, roSnapshotJobConf,400, commitTime1, insertsTime);
-roSnapshotJobConf,400, commitTime1, insertsTime);

 incrementalROFiles = getROIncrementalFiles(partitionPath, true);
 assertEquals(firstFilePath, incrementalROFiles[0].getPath());
 // verify 006 does not show up in RO mode because of pending compaction
-validateFiles(partitionPath, 1, incrementalROFiles, roInputFormat,
-roJobConf, 200, commitTime1);
+validateFiles(partitionPath, 1, incrementalROFiles, false, roJobConf, 200, commitTime1);

 // verify that if stopAtCompaction is disabled, inserts from "insertsTime" show up
 incrementalROFiles = getROIncrementalFiles(partitionPath, false);
-validateFiles(partitionPath,2, incrementalROFiles, roInputFormat,
+validateFiles(partitionPath,2, incrementalROFiles, false, roJobConf, 400, commitTime1, insertsTime);
-roJobConf, 400, commitTime1, insertsTime);

 // verify 006 shows up in RT views
 incrementalRTFiles = getRTIncrementalFiles(partitionPath);
-validateFiles(partitionPath, 2, incrementalRTFiles, rtInputFormat,
+validateFiles(partitionPath, 2, incrementalRTFiles, true, rtJobConf, 400, commitTime1, updateTime, insertsTime);
-rtJobConf, 400, commitTime1, updateTime, insertsTime);

 // perform the scheduled compaction
 client.compact(compactionCommitTime);

 // verify new write shows up in snapshot mode after compaction is complete
 snapshotROFiles = getROSnapshotFiles(partitionPath);
-validateFiles(partitionPath,2, snapshotROFiles, roSnapshotInputFormat,
+validateFiles(partitionPath,2, snapshotROFiles, false, roSnapshotJobConf,400, commitTime1, compactionCommitTime,
-roSnapshotJobConf,400, commitTime1, compactionCommitTime, insertsTime);
+insertsTime);

 incrementalROFiles = getROIncrementalFiles(partitionPath, "002", -1, true);
 assertTrue(incrementalROFiles.length == 2);
 // verify 006 shows up because of pending compaction
-validateFiles(partitionPath, 2, incrementalROFiles, roInputFormat,
+validateFiles(partitionPath, 2, incrementalROFiles, false, roJobConf, 400, commitTime1, compactionCommitTime,
-roJobConf, 400, commitTime1, compactionCommitTime, insertsTime);
+insertsTime);
 }
 }

 // Check if record level metadata is aggregated properly at the end of write.
-@Test
+@ParameterizedTest
-public void testMetadataAggregateFromWriteStatus() throws Exception {
+@MethodSource("argumentsProvider")
+public void testMetadataAggregateFromWriteStatus(HoodieFileFormat baseFileFormat) throws Exception {
+init(baseFileFormat);

 HoodieWriteConfig cfg = getConfigBuilder(false).withWriteStatusClass(MetadataMergeWriteStatus.class).build();
 try (HoodieWriteClient client = getHoodieWriteClient(cfg);) {

@@ -305,8 +304,11 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
 }
 }

-@Test
+@ParameterizedTest
-public void testSimpleInsertUpdateAndDelete() throws Exception {
+@MethodSource("argumentsProvider")
+public void testSimpleInsertUpdateAndDelete(HoodieFileFormat baseFileFormat) throws Exception {
+init(baseFileFormat);

 HoodieWriteConfig cfg = getConfig(true);
 try (HoodieWriteClient client = getHoodieWriteClient(cfg);) {

@@ -332,7 +334,7 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
 Option<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
 assertFalse(commit.isPresent());

-FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
+FileStatus[] allFiles = listAllDataFilesInPath(hoodieTable, cfg.getBasePath());
 tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles);
 Stream<HoodieBaseFile> dataFilesToRead = tableView.getLatestBaseFiles();
 assertFalse(dataFilesToRead.findAny().isPresent());
@@ -373,23 +375,25 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
 commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
 assertFalse(commit.isPresent());

-allFiles = HoodieTestUtils.listAllDataFilesInPath(dfs, cfg.getBasePath());
+allFiles = listAllDataFilesInPath(hoodieTable, cfg.getBasePath());
 tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
 dataFilesToRead = tableView.getLatestBaseFiles();
 assertTrue(dataFilesToRead.findAny().isPresent());

 List<String> dataFiles = tableView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList());
-List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles, basePath);
+List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf, dataFiles, basePath);
 // Wrote 20 records and deleted 20 records, so remaining 20-20 = 0
 assertEquals(0, recordsRead.size(), "Must contain 0 records");
 }
 }

-@Test
+@ParameterizedTest
-public void testCOWToMORConvertedTableRollback() throws Exception {
+@MethodSource("argumentsProvider")
+public void testCOWToMORConvertedTableRollback(HoodieFileFormat baseFileFormat) throws Exception {
+init(baseFileFormat);

 // Set TableType to COW
-HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.COPY_ON_WRITE);
+HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.COPY_ON_WRITE, baseFileFormat);

 HoodieWriteConfig cfg = getConfig(true);
 try (HoodieWriteClient client = getHoodieWriteClient(cfg);) {
@@ -425,14 +429,14 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
 assertNoWriteErrors(statuses);

 // Set TableType to MOR
-HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.MERGE_ON_READ);
+HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.MERGE_ON_READ, baseFileFormat);

 // rollback a COW commit when TableType is MOR
 client.rollback(newCommitTime);

 metaClient = HoodieTableMetaClient.reload(metaClient);
 HoodieTable hoodieTable = HoodieTable.create(metaClient, cfg, hadoopConf);
-FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
+FileStatus[] allFiles = listAllDataFilesInPath(hoodieTable, cfg.getBasePath());
 tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);

 final String absentCommit = newCommitTime;
@@ -440,8 +444,10 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
 }
 }

-@Test
+@ParameterizedTest
-public void testRollbackWithDeltaAndCompactionCommit() throws Exception {
+@MethodSource("argumentsProvider")
+public void testRollbackWithDeltaAndCompactionCommit(HoodieFileFormat baseFileFormat) throws Exception {
+init(baseFileFormat);

 HoodieWriteConfig cfg = getConfig(false);
 try (HoodieWriteClient client = getHoodieWriteClient(cfg);) {
@@ -471,7 +477,7 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
 Option<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
 assertFalse(commit.isPresent());

-FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
+FileStatus[] allFiles = listAllDataFilesInPath(hoodieTable, cfg.getBasePath());
 tableView =
 getHoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles);
 Stream<HoodieBaseFile> dataFilesToRead = tableView.getLatestBaseFiles();
@@ -495,7 +501,8 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
 copyOfRecords.addAll(dataGen.generateInserts(commitTime1, 200));

 List<String> dataFiles = tableView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList());
-List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles, basePath);
+List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf, dataFiles,
+basePath);
 assertEquals(recordsRead.size(), 200);

 statuses = secondClient.upsert(jsc.parallelize(copyOfRecords, 1), commitTime1).collect();
@@ -504,12 +511,12 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {

 // Test failed delta commit rollback
 secondClient.rollback(commitTime1);
-allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
+allFiles = listAllDataFilesInPath(hoodieTable, cfg.getBasePath());
-// After rollback, there should be no parquet file with the failed commit time
+// After rollback, there should be no base file with the failed commit time
 assertEquals(0, Arrays.stream(allFiles)
 .filter(file -> file.getPath().getName().contains(commitTime1)).count());
 dataFiles = tableView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList());
-recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles, basePath);
+recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf, dataFiles, basePath);
 assertEquals(200, recordsRead.size());
 }

@@ -525,7 +532,8 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
 copyOfRecords.addAll(dataGen.generateInserts(commitTime2, 200));

 List<String> dataFiles = tableView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList());
-List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles, basePath);
+List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf, dataFiles,
+basePath);
 assertEquals(200, recordsRead.size());

 writeRecords = jsc.parallelize(copyOfRecords, 1);
@@ -537,7 +545,7 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {

 // Test successful delta commit rollback
 thirdClient.rollback(commitTime2);
-allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
+allFiles = listAllDataFilesInPath(hoodieTable, cfg.getBasePath());
 // After rollback, there should be no parquet file with the failed commit time
 assertEquals(0, Arrays.stream(allFiles)
 .filter(file -> file.getPath().getName().contains(commitTime2)).count());
@@ -546,7 +554,7 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
 hoodieTable = HoodieTable.create(metaClient, cfg, hadoopConf);
 tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
 dataFiles = tableView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList());
-recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles, basePath);
+recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf, dataFiles, basePath);
 // check that the number of records read is still correct after rollback operation
 assertEquals(200, recordsRead.size());

@@ -569,7 +577,7 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
 JavaRDD<WriteStatus> ws = thirdClient.compact(compactionInstantTime);
 thirdClient.commitCompaction(compactionInstantTime, ws, Option.empty());

-allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
+allFiles = listAllDataFilesInPath(hoodieTable, cfg.getBasePath());
 metaClient = HoodieTableMetaClient.reload(metaClient);
 tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitsTimeline(), allFiles);

@@ -580,7 +588,7 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {

 thirdClient.rollback(compactedCommitTime);

-allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
+allFiles = listAllDataFilesInPath(hoodieTable, cfg.getBasePath());
 metaClient = HoodieTableMetaClient.reload(metaClient);
 tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitsTimeline(), allFiles);

@@ -589,8 +597,10 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
 }
 }

-@Test
+@ParameterizedTest
-public void testMultiRollbackWithDeltaAndCompactionCommit() throws Exception {
+@MethodSource("argumentsProvider")
+public void testMultiRollbackWithDeltaAndCompactionCommit(HoodieFileFormat baseFileFormat) throws Exception {
+init(baseFileFormat);

 HoodieWriteConfig cfg = getConfig(false);
 try (final HoodieWriteClient client = getHoodieWriteClient(cfg);) {
@@ -618,7 +628,7 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
 Option<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
 assertFalse(commit.isPresent());

-FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
+FileStatus[] allFiles = listAllDataFilesInPath(hoodieTable, cfg.getBasePath());
 tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles);
 Stream<HoodieBaseFile> dataFilesToRead = tableView.getLatestBaseFiles();
 assertFalse(dataFilesToRead.findAny().isPresent());
@@ -641,7 +651,8 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
 copyOfRecords.addAll(dataGen.generateInserts(newCommitTime, 200));

 List<String> dataFiles = tableView.getLatestBaseFiles().map(hf -> hf.getPath()).collect(Collectors.toList());
-List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles, basePath);
+List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf, dataFiles,
+basePath);
 assertEquals(200, recordsRead.size());

 statuses = nClient.upsert(jsc.parallelize(copyOfRecords, 1), newCommitTime).collect();
@@ -696,7 +707,7 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
 JavaRDD<WriteStatus> ws = client.compact(compactionInstantTime);
 client.commitCompaction(compactionInstantTime, ws, Option.empty());

-allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
+allFiles = listAllDataFilesInPath(hoodieTable, cfg.getBasePath());
 metaClient = HoodieTableMetaClient.reload(metaClient);
 tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitsTimeline(), allFiles);

@@ -724,7 +735,7 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
 client.restoreToInstant("000");

 metaClient = HoodieTableMetaClient.reload(metaClient);
-allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
+allFiles = listAllDataFilesInPath(hoodieTable, cfg.getBasePath());
 tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles);
 dataFilesToRead = tableView.getLatestBaseFiles();
 assertFalse(dataFilesToRead.findAny().isPresent());
@@ -751,8 +762,11 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
 .build();
 }

-@Test
+@ParameterizedTest
-public void testUpsertPartitioner() throws Exception {
+@MethodSource("argumentsProvider")
+public void testUpsertPartitioner(HoodieFileFormat baseFileFormat) throws Exception {
+init(baseFileFormat);

 HoodieWriteConfig cfg = getConfig(true);
 try (HoodieWriteClient client = getHoodieWriteClient(cfg);) {

@@ -778,7 +792,7 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
 Option<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
 assertFalse(commit.isPresent());

-FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
+FileStatus[] allFiles = listAllDataFilesInPath(hoodieTable, cfg.getBasePath());
 BaseFileOnlyView roView = getHoodieTableFileSystemView(metaClient,
 metaClient.getCommitsTimeline().filterCompletedInstants(), allFiles);
 Stream<HoodieBaseFile> dataFilesToRead = roView.getLatestBaseFiles();
@@ -812,7 +826,7 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
 commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
 assertFalse(commit.isPresent());

-allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
+allFiles = listAllDataFilesInPath(hoodieTable, cfg.getBasePath());
 roView = getHoodieTableFileSystemView(metaClient,
 hoodieTable.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants(), allFiles);
 dataFilesToRead = roView.getLatestBaseFiles();
@@ -823,14 +837,18 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
 assertTrue(parquetFileIdToNewSize.entrySet().stream().anyMatch(entry -> parquetFileIdToSize.get(entry.getKey()) < entry.getValue()));

 List<String> dataFiles = roView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList());
-List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(dataFiles, basePath);
+List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf, dataFiles,
+basePath);
 // Wrote 20 records in 2 batches
 assertEquals(40, recordsRead.size(), "Must contain 40 records");
 }
 }

-@Test
+@ParameterizedTest
-public void testLogFileCountsAfterCompaction() throws Exception {
+@MethodSource("argumentsProvider")
+public void testLogFileCountsAfterCompaction(HoodieFileFormat baseFileFormat) throws Exception {
+init(baseFileFormat);

 // insert 100 records
 HoodieWriteConfig config = getConfig(true);
 try (HoodieWriteClient writeClient = getHoodieWriteClient(config);) {
@@ -902,8 +920,11 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
 }
 }

-@Test
+@ParameterizedTest
-public void testSimpleInsertsGeneratedIntoLogFiles() throws Exception {
+@MethodSource("argumentsProvider")
+public void testSimpleInsertsGeneratedIntoLogFiles(HoodieFileFormat baseFileFormat) throws Exception {
+init(baseFileFormat);

 // insert 100 records
 // Setting IndexType to be InMemory to simulate Global Index nature
 HoodieWriteConfig config = getConfigBuilder(false, IndexType.INMEMORY).build();
@@ -939,8 +960,11 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
 }
 }

-@Test
+@ParameterizedTest
-public void testInsertsGeneratedIntoLogFilesRollback(@TempDir java.nio.file.Path tempFolder) throws Exception {
+@MethodSource("argumentsProvider")
+public void testInsertsGeneratedIntoLogFilesRollback(HoodieFileFormat baseFileFormat) throws Exception {
+init(baseFileFormat);

 // insert 100 records
 // Setting IndexType to be InMemory to simulate Global Index nature
 HoodieWriteConfig config = getConfigBuilder(false, IndexType.INMEMORY).build();
@@ -1010,8 +1034,11 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
 }
 }

-@Test
+@ParameterizedTest
-public void testInsertsGeneratedIntoLogFilesRollbackAfterCompaction() throws Exception {
+@MethodSource("argumentsProvider")
+public void testInsertsGeneratedIntoLogFilesRollbackAfterCompaction(HoodieFileFormat baseFileFormat) throws Exception {
+init(baseFileFormat);

 // insert 100 records
 // Setting IndexType to be InMemory to simulate Global Index nature
 HoodieWriteConfig config = getConfigBuilder(false, IndexType.INMEMORY).build();
@@ -1063,8 +1090,10 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
 /**
 * Test to ensure rolling stats are correctly written to metadata file.
 */
-@Test
+@ParameterizedTest
-public void testRollingStatsInMetadata() throws Exception {
+@MethodSource("argumentsProvider")
+public void testRollingStatsInMetadata(HoodieFileFormat baseFileFormat) throws Exception {
+init(baseFileFormat);

 HoodieWriteConfig cfg = getConfigBuilder(false, IndexType.INMEMORY).withAutoCommit(false).build();
 try (HoodieWriteClient client = getHoodieWriteClient(cfg);) {
@@ -1163,8 +1192,11 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
 /**
 * Test to ensure rolling stats are correctly written to the metadata file, identifies small files and corrects them.
 */
-@Test
+@ParameterizedTest
-public void testRollingStatsWithSmallFileHandling() throws Exception {
+@MethodSource("argumentsProvider")
+public void testRollingStatsWithSmallFileHandling(HoodieFileFormat baseFileFormat) throws Exception {
+init(baseFileFormat);

 HoodieWriteConfig cfg = getConfigBuilder(false, IndexType.INMEMORY).withAutoCommit(false).build();
 try (HoodieWriteClient client = getHoodieWriteClient(cfg);) {
 Map<String, Long> fileIdToInsertsMap = new HashMap<>();
@@ -1296,8 +1328,11 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
 /**
 * Test to validate invoking table.handleUpdate() with input records from multiple partitions will fail.
 */
-@Test
+@ParameterizedTest
-public void testHandleUpdateWithMultiplePartitions() throws Exception {
+@MethodSource("argumentsProvider")
+public void testHandleUpdateWithMultiplePartitions(HoodieFileFormat baseFileFormat) throws Exception {
+init(baseFileFormat);

 HoodieWriteConfig cfg = getConfig(true);
 try (HoodieWriteClient client = getHoodieWriteClient(cfg);) {

@@ -1323,7 +1358,7 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
 Option<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
 assertFalse(commit.isPresent());

-FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
+FileStatus[] allFiles = listAllDataFilesInPath(hoodieTable, cfg.getBasePath());
 BaseFileOnlyView roView =
 getHoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles);
 Stream<HoodieBaseFile> dataFilesToRead = roView.getLatestBaseFiles();
@@ -1401,7 +1436,7 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
 assertFalse(status.hasErrors(), "Errors found in write of " + status.getFileId());
 }
 }

 private FileStatus[] insertAndGetFilePaths(List<HoodieRecord> records, HoodieWriteClient client,
 HoodieWriteConfig cfg, String commitTime) throws IOException {
 JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
@@ -1419,7 +1454,7 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
 Option<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().lastInstant();
 assertFalse(commit.isPresent());

-FileStatus[] allFiles = HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
+FileStatus[] allFiles = listAllDataFilesInPath(hoodieTable, cfg.getBasePath());
 BaseFileOnlyView roView =
 getHoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles);
 Stream<HoodieBaseFile> dataFilesToRead = roView.getLatestBaseFiles();
@@ -1452,14 +1487,14 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {

 Option<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
 assertFalse(commit.isPresent());
-return HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), cfg.getBasePath());
+HoodieTable hoodieTable = HoodieTable.create(metaClient, cfg, hadoopConf);
+return listAllDataFilesInPath(hoodieTable, cfg.getBasePath());
 }

 private FileStatus[] getROSnapshotFiles(String partitionPath)
 throws Exception {
-HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.MERGE_ON_READ);
 FileInputFormat.setInputPaths(roSnapshotJobConf, basePath + "/" + partitionPath);
-return roSnapshotInputFormat.listStatus(roSnapshotJobConf);
+return listStatus(roSnapshotJobConf, false);
 }

 private FileStatus[] getROIncrementalFiles(String partitionPath, boolean stopAtCompaction)
@@ -1469,10 +1504,9 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {

 private FileStatus[] getROIncrementalFiles(String partitionPath, String startCommitTime, int numCommitsToPull, boolean stopAtCompaction)
 throws Exception {
-HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.MERGE_ON_READ);
 setupIncremental(roJobConf, startCommitTime, numCommitsToPull, stopAtCompaction);
 FileInputFormat.setInputPaths(roJobConf, Paths.get(basePath, partitionPath).toString());
-return roInputFormat.listStatus(roJobConf);
+return listStatus(roJobConf, false);
 }

 private FileStatus[] getRTIncrementalFiles(String partitionPath)
@@ -1482,10 +1516,9 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {

 private FileStatus[] getRTIncrementalFiles(String partitionPath, String startCommitTime, int numCommitsToPull)
 throws Exception {
-HoodieTestUtils.init(hadoopConf, basePath, HoodieTableType.MERGE_ON_READ);
 setupIncremental(rtJobConf, startCommitTime, numCommitsToPull, false);
 FileInputFormat.setInputPaths(rtJobConf, Paths.get(basePath, partitionPath).toString());
-return rtInputFormat.listStatus(rtJobConf);
+return listStatus(rtJobConf, true);
 }

 private void setupIncremental(JobConf jobConf, String startCommit, int numberOfCommitsToPull, boolean stopAtCompaction) {
@@ -1507,16 +1540,37 @@ public class TestHoodieMergeOnReadTable extends HoodieClientTestHarness {
 }

 private void validateFiles(String partitionPath, int expectedNumFiles,
-FileStatus[] files, HoodieParquetInputFormat inputFormat,
+FileStatus[] files, boolean realtime, JobConf jobConf,
-JobConf jobConf, int expectedRecords, String... expectedCommits) {
+int expectedRecords, String... expectedCommits) {

 assertEquals(expectedNumFiles, files.length);
 Set<String> expectedCommitsSet = Arrays.stream(expectedCommits).collect(Collectors.toSet());
-List<GenericRecord> records = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(
+List<GenericRecord> records = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf,
-Collections.singletonList(Paths.get(basePath, partitionPath).toString()), basePath, jobConf, inputFormat);
+Collections.singletonList(Paths.get(basePath, partitionPath).toString()), basePath, jobConf, realtime);
 assertEquals(expectedRecords, records.size());
 Set<String> actualCommits = records.stream().map(r ->
 r.get(HoodieRecord.COMMIT_TIME_METADATA_FIELD).toString()).collect(Collectors.toSet());
 assertEquals(expectedCommitsSet, actualCommits);
 }

+private FileStatus[] listAllDataFilesInPath(HoodieTable table, String basePath) throws IOException {
+return HoodieTestUtils.listAllDataFilesInPath(metaClient.getFs(), basePath, table.getBaseFileExtension());
+}

+private FileStatus[] listStatus(JobConf jobConf, boolean realtime) throws IOException {
+// This is required as Hoodie InputFormats do not extend a common base class and FileInputFormat's
+// listStatus() is protected.
+FileInputFormat inputFormat = HoodieInputFormatUtils.getInputFormat(baseFileFormat, realtime, jobConf);
+switch (baseFileFormat) {
+case PARQUET:
+if (realtime) {
+return ((HoodieParquetRealtimeInputFormat)inputFormat).listStatus(jobConf);
+} else {
+return ((HoodieParquetInputFormat)inputFormat).listStatus(jobConf);
+}
+default:
+throw new HoodieIOException("Hoodie InputFormat not implemented for base file format " + baseFileFormat);
+}
+}
 }

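The two private helpers added above remove the last Parquet-specific wiring from this test: file listing keys off the table's base file extension, and input-format listing is resolved through HoodieInputFormatUtils. An illustrative fragment (not in the patch) of using that resolution directly, assuming hadoopConf, basePath and a PARQUET baseFileFormat are already set up by init(...):

    JobConf jobConf = new JobConf(hadoopConf);
    FileInputFormat.setInputPaths(jobConf, basePath + "/2020/02/20");
    FileInputFormat inputFormat = HoodieInputFormatUtils.getInputFormat(baseFileFormat, false, jobConf);
    // for the PARQUET format with realtime == false this resolves to HoodieParquetInputFormat
    FileStatus[] snapshotFiles = ((HoodieParquetInputFormat) inputFormat).listStatus(jobConf);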
@@ -29,6 +29,7 @@ import org.apache.hudi.common.bloom.BloomFilterTypeCode;
 import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.model.HoodieBaseFile;
 import org.apache.hudi.common.model.HoodieCommitMetadata;
+import org.apache.hudi.common.model.HoodieFileFormat;
 import org.apache.hudi.common.model.HoodieKey;
 import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
@@ -181,15 +182,20 @@ public class HoodieClientTestUtils {
 /**
 * Obtain all new data written into the Hoodie table since the given timestamp.
 */
-public static Dataset<Row> readSince(String basePath, SQLContext sqlContext, HoodieTimeline commitTimeline,
+public static Dataset<Row> readSince(String basePath, SQLContext sqlContext,
-String lastCommitTime) {
+HoodieTimeline commitTimeline, String lastCommitTime) {
 List<HoodieInstant> commitsToReturn =
 commitTimeline.findInstantsAfter(lastCommitTime, Integer.MAX_VALUE).getInstants().collect(Collectors.toList());
 try {
 // Go over the commit metadata, and obtain the new files that need to be read.
 HashMap<String, String> fileIdToFullPath = getLatestFileIDsToFullPath(basePath, commitTimeline, commitsToReturn);
-return sqlContext.read().parquet(fileIdToFullPath.values().toArray(new String[fileIdToFullPath.size()]))
+String[] paths = fileIdToFullPath.values().toArray(new String[fileIdToFullPath.size()]);
-.filter(String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTime));
+Dataset<Row> rows = null;
+if (paths[0].endsWith(HoodieFileFormat.PARQUET.getFileExtension())) {
+rows = sqlContext.read().parquet(paths);
+}

+return rows.filter(String.format("%s >'%s'", HoodieRecord.COMMIT_TIME_METADATA_FIELD, lastCommitTime));
 } catch (IOException e) {
 throw new HoodieException("Error pulling data incrementally from commitTimestamp :" + lastCommitTime, e);
 }
|
|||||||
@@ -19,18 +19,19 @@
 package org.apache.hudi.testutils;
 
 import org.apache.hudi.avro.HoodieAvroUtils;
+import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.common.testutils.HoodieTestUtils;
-import org.apache.hudi.hadoop.HoodieParquetInputFormat;
-import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat;
+import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils;
 
 import org.apache.avro.Schema;
 import org.apache.avro.generic.GenericRecord;
 import org.apache.avro.generic.GenericRecordBuilder;
+import org.apache.hadoop.conf.Configurable;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
 import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
 import org.apache.hadoop.io.ArrayWritable;
 import org.apache.hadoop.io.Writable;
+import org.apache.hadoop.mapred.FileInputFormat;
 import org.apache.hadoop.mapred.InputSplit;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapred.RecordReader;
@@ -45,20 +46,19 @@ import java.util.stream.Collectors;
  * Utility methods to aid in testing MergeOnRead (workaround for HoodieReadClient for MOR).
  */
 public class HoodieMergeOnReadTestUtils {
-  public static List<GenericRecord> getRecordsUsingInputFormat(List<String> inputPaths, String basePath) {
-    return getRecordsUsingInputFormat(inputPaths, basePath, new Configuration());
+  public static List<GenericRecord> getRecordsUsingInputFormat(Configuration conf, List<String> inputPaths,
+      String basePath) {
+    return getRecordsUsingInputFormat(conf, inputPaths, basePath, new JobConf(conf), true);
   }
 
-  public static List<GenericRecord> getRecordsUsingInputFormat(List<String> inputPaths, String basePath,
-      Configuration conf) {
-    JobConf jobConf = new JobConf(conf);
-    return getRecordsUsingInputFormat(inputPaths, basePath, jobConf, new HoodieParquetRealtimeInputFormat());
-  }
-
-  public static List<GenericRecord> getRecordsUsingInputFormat(List<String> inputPaths,
+  public static List<GenericRecord> getRecordsUsingInputFormat(Configuration conf, List<String> inputPaths,
       String basePath,
       JobConf jobConf,
-      HoodieParquetInputFormat inputFormat) {
+      boolean realtime) {
+    HoodieTableMetaClient metaClient = new HoodieTableMetaClient(conf, basePath);
+    FileInputFormat inputFormat = HoodieInputFormatUtils.getInputFormat(metaClient.getTableConfig().getBaseFileFormat(),
+        realtime, jobConf);
+
     Schema schema = HoodieAvroUtils.addMetadataFields(
         new Schema.Parser().parse(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA));
     setPropsForInputFormat(inputFormat, jobConf, schema, basePath);
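For orientation, a minimal sketch of how a MOR test might now call the reworked helper. The base path, partition path and conf below are illustrative values only and are not part of this commit; the point is that the input format is resolved from the table's configured base file format instead of being hard-wired to the Parquet realtime input format.

    // Sketch (illustrative values, not from this commit).
    Configuration conf = new Configuration();
    String basePath = "/tmp/hoodie_mor_table";
    List<String> inputPaths = java.util.Collections.singletonList(basePath + "/2020/06/10");
    // The helper derives the input format from the table's base file format via its meta client.
    List<GenericRecord> records =
        HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(conf, inputPaths, basePath);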
@@ -93,8 +93,8 @@ public class HoodieMergeOnReadTestUtils {
     }).orElse(new ArrayList<GenericRecord>());
   }
 
-  private static void setPropsForInputFormat(HoodieParquetInputFormat inputFormat, JobConf jobConf,
-      Schema schema, String basePath) {
+  private static void setPropsForInputFormat(FileInputFormat inputFormat, JobConf jobConf, Schema schema,
+      String basePath) {
     List<Schema.Field> fields = schema.getFields();
     String names = fields.stream().map(f -> f.name().toString()).collect(Collectors.joining(","));
     String postions = fields.stream().map(f -> String.valueOf(f.pos())).collect(Collectors.joining(","));
@@ -116,7 +116,10 @@ public class HoodieMergeOnReadTestUtils {
     conf.set(ColumnProjectionUtils.READ_COLUMN_IDS_CONF_STR, postions);
     conf.set(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "datestr");
     conf.set(hive_metastoreConstants.META_TABLE_COLUMN_TYPES, hiveColumnTypes);
-    inputFormat.setConf(conf);
+    // Hoodie Input formats are also configurable
+    Configurable configurable = (Configurable)inputFormat;
+    configurable.setConf(conf);
     jobConf.addResource(conf);
   }
 
@@ -107,15 +107,21 @@ public class FSUtils {
     return String.format("%d-%d-%d", taskPartitionId, stageId, taskAttemptId);
   }
 
+  // TODO: this should be removed
   public static String makeDataFileName(String instantTime, String writeToken, String fileId) {
-    return String.format("%s_%s_%s.parquet", fileId, writeToken, instantTime);
+    return String.format("%s_%s_%s%s", fileId, writeToken, instantTime, HoodieFileFormat.PARQUET.getFileExtension());
+  }
+
+  public static String makeDataFileName(String instantTime, String writeToken, String fileId, String fileExtension) {
+    return String.format("%s_%s_%s%s", fileId, writeToken, instantTime, fileExtension);
   }
 
   public static String makeMarkerFile(String instantTime, String writeToken, String fileId) {
     return String.format("%s_%s_%s%s", fileId, writeToken, instantTime, HoodieTableMetaClient.MARKER_EXTN);
   }
 
-  public static String translateMarkerToDataPath(String basePath, String markerPath, String instantTs) {
+  public static String translateMarkerToDataPath(String basePath, String markerPath, String instantTs,
+      String baseFileExtension) {
     ValidationUtils.checkArgument(markerPath.endsWith(HoodieTableMetaClient.MARKER_EXTN));
     String markerRootPath = Path.getPathWithoutSchemeAndAuthority(
         new Path(String.format("%s/%s/%s", basePath, HoodieTableMetaClient.TEMPFOLDER_NAME, instantTs))).toString();
@@ -123,8 +129,7 @@ public class FSUtils {
     ValidationUtils.checkArgument(begin >= 0,
         "Not in marker dir. Marker Path=" + markerPath + ", Expected Marker Root=" + markerRootPath);
     String rPath = markerPath.substring(begin + markerRootPath.length() + 1);
-    return String.format("%s/%s%s", basePath, rPath.replace(HoodieTableMetaClient.MARKER_EXTN, ""),
-        HoodieFileFormat.PARQUET.getFileExtension());
+    return String.format("%s/%s%s", basePath, rPath.replace(HoodieTableMetaClient.MARKER_EXTN, ""), baseFileExtension);
   }
 
   public static String maskWithoutFileId(String instantTime, int taskPartitionId) {
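To illustrate the effect of the new overload (the file id, write token and instant below are example values, not taken from this commit): the base file extension is now a caller-supplied argument, so tables on a different base format get correctly named data files, while the legacy overload keeps its Parquet default.

    // Sketch: extension-aware data file naming (illustrative values).
    String fileId = "a1b2c3d4";
    String writeToken = "1-0-1";
    String instantTime = "20200610120000";

    // Legacy overload still assumes Parquet, e.g. a1b2c3d4_1-0-1_20200610120000.parquet
    String parquetName = FSUtils.makeDataFileName(instantTime, writeToken, fileId);

    // New overload takes the table's base file extension explicitly.
    String name = FSUtils.makeDataFileName(instantTime, writeToken, fileId,
        HoodieFileFormat.PARQUET.getFileExtension());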
@@ -195,12 +200,12 @@ public class FSUtils {
   }
 
   public static List<String> getAllDataFilesForMarkers(FileSystem fs, String basePath, String instantTs,
-      String markerDir) throws IOException {
+      String markerDir, String baseFileExtension) throws IOException {
     List<String> dataFiles = new LinkedList<>();
     processFiles(fs, markerDir, (status) -> {
       String pathStr = status.getPath().toString();
       if (pathStr.endsWith(HoodieTableMetaClient.MARKER_EXTN)) {
-        dataFiles.add(FSUtils.translateMarkerToDataPath(basePath, pathStr, instantTs));
+        dataFiles.add(FSUtils.translateMarkerToDataPath(basePath, pathStr, instantTs, baseFileExtension));
       }
       return true;
     }, false);
@@ -545,4 +550,13 @@ public class FSUtils {
         || inputStream.getWrappedStream().getClass().getCanonicalName()
             .equals("com.google.cloud.hadoop.fs.gcs.GoogleHadoopFSInputStream");
   }
 
+  public static Configuration registerFileSystem(Path file, Configuration conf) {
+    Configuration returnConf = new Configuration(conf);
+    String scheme = FSUtils.getFs(file.toString(), conf).getScheme();
+    returnConf.set("fs." + HoodieWrapperFileSystem.getHoodieScheme(scheme) + ".impl",
+        HoodieWrapperFileSystem.class.getName());
+    return returnConf;
+  }
+
 }
@@ -128,7 +128,8 @@ public class HoodieTableMetaClient implements Serializable {
     }
     this.timelineLayoutVersion = layoutVersion.orElseGet(() -> tableConfig.getTimelineLayoutVersion().get());
     this.loadActiveTimelineOnLoad = loadActiveTimelineOnLoad;
-    LOG.info("Finished Loading Table of type " + tableType + "(version=" + timelineLayoutVersion + ") from " + basePath);
+    LOG.info("Finished Loading Table of type " + tableType + "(version=" + timelineLayoutVersion + ", baseFileFormat="
+        + this.tableConfig.getBaseFileFormat() + ") from " + basePath);
     if (loadActiveTimelineOnLoad) {
       LOG.info("Loading Active commit timeline for " + basePath);
       getActiveTimeline();
@@ -299,12 +300,22 @@ public class HoodieTableMetaClient implements Serializable {
   }
 
   /**
-   * Helper method to initialize a table, with given basePath, tableType, name, archiveFolder.
+   * Helper method to initialize a table, with given basePath, tableType, name, archiveFolder, payloadClass.
    */
   public static HoodieTableMetaClient initTableType(Configuration hadoopConf, String basePath, String tableType,
       String tableName, String archiveLogFolder, String payloadClassName) throws IOException {
     return initTableType(hadoopConf, basePath, HoodieTableType.valueOf(tableType), tableName,
-        archiveLogFolder, payloadClassName, null);
+        archiveLogFolder, payloadClassName, null, null);
+  }
+
+  /**
+   * Helper method to initialize a table, with given basePath, tableType, name, archiveFolder, payloadClass and
+   * base file format.
+   */
+  public static HoodieTableMetaClient initTableType(Configuration hadoopConf, String basePath, String tableType,
+      String tableName, String archiveLogFolder, String payloadClassName, String baseFileFormat) throws IOException {
+    return initTableType(hadoopConf, basePath, HoodieTableType.valueOf(tableType), tableName,
+        archiveLogFolder, payloadClassName, null, baseFileFormat);
   }
 
   /**
@@ -312,12 +323,20 @@ public class HoodieTableMetaClient implements Serializable {
    */
   public static HoodieTableMetaClient initTableType(Configuration hadoopConf, String basePath,
       HoodieTableType tableType, String tableName, String payloadClassName) throws IOException {
-    return initTableType(hadoopConf, basePath, tableType, tableName, null, payloadClassName, null);
+    return initTableType(hadoopConf, basePath, tableType, tableName, null, payloadClassName, null, null);
+  }
+
+  public static HoodieTableMetaClient initTableType(Configuration hadoopConf, String basePath,
+      HoodieTableType tableType, String tableName,
+      String archiveLogFolder, String payloadClassName,
+      Integer timelineLayoutVersion) throws IOException {
+    return initTableType(hadoopConf, basePath, tableType, tableName, archiveLogFolder, payloadClassName,
+        timelineLayoutVersion, null);
   }
 
   public static HoodieTableMetaClient initTableType(Configuration hadoopConf, String basePath,
       HoodieTableType tableType, String tableName, String archiveLogFolder, String payloadClassName,
-      Integer timelineLayoutVersion) throws IOException {
+      Integer timelineLayoutVersion, String baseFileFormat) throws IOException {
     Properties properties = new Properties();
     properties.setProperty(HoodieTableConfig.HOODIE_TABLE_NAME_PROP_NAME, tableName);
     properties.setProperty(HoodieTableConfig.HOODIE_TABLE_TYPE_PROP_NAME, tableType.name());
@@ -332,6 +351,10 @@ public class HoodieTableMetaClient implements Serializable {
     if (null != timelineLayoutVersion) {
       properties.put(HoodieTableConfig.HOODIE_TIMELINE_LAYOUT_VERSION, String.valueOf(timelineLayoutVersion));
     }
+
+    if (null != baseFileFormat) {
+      properties.setProperty(HoodieTableConfig.HOODIE_BASE_FILE_FORMAT_PROP_NAME, baseFileFormat.toUpperCase());
+    }
     return HoodieTableMetaClient.initTableAndGetMetaClient(hadoopConf, basePath, properties);
   }
 
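A minimal sketch of how a caller might use the extended initTableType overload. The base path, table name and payload class below are illustrative assumptions, not values from this commit; only the PARQUET format is wired up elsewhere in this change, and the chosen format is persisted via HOODIE_BASE_FILE_FORMAT_PROP_NAME.

    // Sketch: initializing a table with an explicit base file format (illustrative values).
    Configuration hadoopConf = new Configuration();
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.initTableType(
        hadoopConf,
        "/tmp/hoodie_trips",                              // basePath (example)
        HoodieTableType.COPY_ON_WRITE.name(),             // tableType passed as a String
        "hoodie_trips",                                   // table name (example)
        "archived",                                       // archive log folder (example)
        "org.apache.hudi.common.model.HoodieAvroPayload", // payload class (example)
        HoodieFileFormat.PARQUET.name());                 // base file format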
@@ -31,7 +31,7 @@ import org.apache.hudi.common.model.HoodieFileFormat;
 import org.apache.hudi.common.model.HoodieLogFile;
 import org.apache.hudi.common.table.log.HoodieLogFormat;
 import org.apache.hudi.common.table.log.HoodieLogFormat.Reader;
-import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock;
+import org.apache.hudi.common.table.log.block.HoodieDataBlock;
 import org.apache.hudi.common.table.log.block.HoodieLogBlock;
 import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
 import org.apache.hudi.common.table.timeline.HoodieInstant;
@@ -356,20 +356,7 @@ public class TableSchemaResolver {
    * @return
    */
   public MessageType readSchemaFromLogFile(Path path) throws IOException {
-    FileSystem fs = metaClient.getRawFs();
-    Reader reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(path), null);
-    HoodieAvroDataBlock lastBlock = null;
-    while (reader.hasNext()) {
-      HoodieLogBlock block = reader.next();
-      if (block instanceof HoodieAvroDataBlock) {
-        lastBlock = (HoodieAvroDataBlock) block;
-      }
-    }
-    reader.close();
-    if (lastBlock != null) {
-      return new AvroSchemaConverter().convert(lastBlock.getSchema());
-    }
-    return null;
+    return readSchemaFromLogFile(metaClient.getRawFs(), path);
   }
 
   /**
@@ -394,11 +381,11 @@ public class TableSchemaResolver {
    */
   public static MessageType readSchemaFromLogFile(FileSystem fs, Path path) throws IOException {
     Reader reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(path), null);
-    HoodieAvroDataBlock lastBlock = null;
+    HoodieDataBlock lastBlock = null;
     while (reader.hasNext()) {
       HoodieLogBlock block = reader.next();
-      if (block instanceof HoodieAvroDataBlock) {
-        lastBlock = (HoodieAvroDataBlock) block;
+      if (block instanceof HoodieDataBlock) {
+        lastBlock = (HoodieDataBlock) block;
       }
     }
     reader.close();
@@ -25,6 +25,7 @@ import org.apache.hudi.common.model.HoodieRecordPayload;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock;
 import org.apache.hudi.common.table.log.block.HoodieCommandBlock;
+import org.apache.hudi.common.table.log.block.HoodieDataBlock;
 import org.apache.hudi.common.table.log.block.HoodieDeleteBlock;
 import org.apache.hudi.common.table.log.block.HoodieLogBlock;
 import org.apache.hudi.common.table.timeline.HoodieTimeline;
@@ -267,7 +268,7 @@ public abstract class AbstractHoodieLogRecordScanner {
    * Iterate over the GenericRecord in the block, read the hoodie key and partition path and call subclass processors to
    * handle it.
    */
-  private void processAvroDataBlock(HoodieAvroDataBlock dataBlock) throws Exception {
+  private void processDataBlock(HoodieDataBlock dataBlock) throws Exception {
     // TODO (NA) - Implement getRecordItr() in HoodieAvroDataBlock and use that here
     List<IndexedRecord> recs = dataBlock.getRecords();
     totalLogRecords.addAndGet(recs.size());
@@ -302,7 +303,7 @@ public abstract class AbstractHoodieLogRecordScanner {
       HoodieLogBlock lastBlock = lastBlocks.pollLast();
       switch (lastBlock.getBlockType()) {
         case AVRO_DATA_BLOCK:
-          processAvroDataBlock((HoodieAvroDataBlock) lastBlock);
+          processDataBlock((HoodieAvroDataBlock) lastBlock);
           break;
         case DELETE_BLOCK:
           Arrays.stream(((HoodieDeleteBlock) lastBlock).getKeysToDelete()).forEach(this::processNextDeletedKey);
@@ -193,7 +193,7 @@ public class HoodieLogFileReader implements HoodieLogFormat.Reader {
         if (nextBlockVersion.getVersion() == HoodieLogFormatVersion.DEFAULT_VERSION) {
           return HoodieAvroDataBlock.getBlock(content, readerSchema);
         } else {
-          return HoodieAvroDataBlock.getBlock(logFile, inputStream, Option.ofNullable(content), readBlockLazily,
+          return new HoodieAvroDataBlock(logFile, inputStream, Option.ofNullable(content), readBlockLazily,
               contentPosition, contentLength, blockEndPos, readerSchema, header, footer);
         }
       case DELETE_BLOCK:
@@ -22,7 +22,7 @@ import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.model.HoodieLogFile;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.common.table.log.HoodieLogFormat.Reader;
-import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock;
+import org.apache.hudi.common.table.log.block.HoodieDataBlock;
 import org.apache.hudi.common.table.log.block.HoodieLogBlock;
 import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType;
 import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
@@ -49,8 +49,8 @@ public class LogReaderUtils {
     HoodieTimeline completedTimeline = activeTimeline.getCommitsTimeline().filterCompletedInstants();
     while (reader.hasPrev()) {
       HoodieLogBlock block = reader.prev();
-      if (block instanceof HoodieAvroDataBlock) {
-        HoodieAvroDataBlock lastBlock = (HoodieAvroDataBlock) block;
+      if (block instanceof HoodieDataBlock) {
+        HoodieDataBlock lastBlock = (HoodieDataBlock) block;
         if (completedTimeline
             .containsOrBeforeTimelineStarts(lastBlock.getLogBlockHeader().get(HeaderMetadataType.INSTANT_TIME))) {
           writerSchema = new Schema.Parser().parse(lastBlock.getLogBlockHeader().get(HeaderMetadataType.SCHEMA));
@@ -36,8 +36,6 @@ import org.apache.avro.io.Encoder;
 import org.apache.avro.io.EncoderFactory;
 import org.apache.hadoop.fs.FSDataInputStream;
 
-import javax.annotation.Nonnull;
-
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.DataInputStream;
@@ -49,56 +47,42 @@ import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 
+import javax.annotation.Nonnull;
+
 /**
- * DataBlock contains a list of records serialized using Avro. The Datablock contains 1. Data Block version 2. Total
- * number of records in the block 3. Size of a record 4. Actual avro serialized content of the record
+ * HoodieAvroDataBlock contains a list of records serialized using Avro. It is used with the Parquet base file format.
 */
-public class HoodieAvroDataBlock extends HoodieLogBlock {
+public class HoodieAvroDataBlock extends HoodieDataBlock {
 
-  private List<IndexedRecord> records;
-  private Schema schema;
   private ThreadLocal<BinaryEncoder> encoderCache = new ThreadLocal<>();
   private ThreadLocal<BinaryDecoder> decoderCache = new ThreadLocal<>();
 
-  public HoodieAvroDataBlock(@Nonnull List<IndexedRecord> records, @Nonnull Map<HeaderMetadataType, String> header,
-      @Nonnull Map<HeaderMetadataType, String> footer) {
-    super(header, footer, Option.empty(), Option.empty(), null, false);
-    this.records = records;
-    this.schema = new Schema.Parser().parse(super.getLogBlockHeader().get(HeaderMetadataType.SCHEMA));
+  public HoodieAvroDataBlock(@Nonnull Map<HeaderMetadataType, String> logBlockHeader,
+      @Nonnull Map<HeaderMetadataType, String> logBlockFooter,
+      @Nonnull Option<HoodieLogBlockContentLocation> blockContentLocation, @Nonnull Option<byte[]> content,
+      FSDataInputStream inputStream, boolean readBlockLazily) {
+    super(logBlockHeader, logBlockFooter, blockContentLocation, content, inputStream, readBlockLazily);
+  }
+
+  public HoodieAvroDataBlock(HoodieLogFile logFile, FSDataInputStream inputStream, Option<byte[]> content,
+      boolean readBlockLazily, long position, long blockSize, long blockEndpos, Schema readerSchema,
+      Map<HeaderMetadataType, String> header, Map<HeaderMetadataType, String> footer) {
+    super(content, inputStream, readBlockLazily,
+        Option.of(new HoodieLogBlockContentLocation(logFile, position, blockSize, blockEndpos)), readerSchema, header,
+        footer);
   }
 
   public HoodieAvroDataBlock(@Nonnull List<IndexedRecord> records, @Nonnull Map<HeaderMetadataType, String> header) {
-    this(records, header, new HashMap<>());
-  }
-
-  private HoodieAvroDataBlock(Option<byte[]> content, @Nonnull FSDataInputStream inputStream, boolean readBlockLazily,
-      Option<HoodieLogBlockContentLocation> blockContentLocation, Schema readerSchema,
-      @Nonnull Map<HeaderMetadataType, String> headers, @Nonnull Map<HeaderMetadataType, String> footer) {
-    super(headers, footer, blockContentLocation, content, inputStream, readBlockLazily);
-    this.schema = readerSchema;
-  }
-
-  public static HoodieLogBlock getBlock(HoodieLogFile logFile, FSDataInputStream inputStream, Option<byte[]> content,
-      boolean readBlockLazily, long position, long blockSize, long blockEndpos, Schema readerSchema,
-      Map<HeaderMetadataType, String> header, Map<HeaderMetadataType, String> footer) {
-
-    return new HoodieAvroDataBlock(content, inputStream, readBlockLazily,
-        Option.of(new HoodieLogBlockContentLocation(logFile, position, blockSize, blockEndpos)), readerSchema, header,
-        footer);
+    super(records, header, new HashMap<>());
   }
 
   @Override
-  public byte[] getContentBytes() throws IOException {
-
-    // In case this method is called before realizing records from content
-    if (getContent().isPresent()) {
-      return getContent().get();
-    } else if (readBlockLazily && !getContent().isPresent() && records == null) {
-      // read block lazily
-      createRecordsFromContentBytes();
-    }
-
+  public HoodieLogBlockType getBlockType() {
+    return HoodieLogBlockType.AVRO_DATA_BLOCK;
+  }
+
+  @Override
+  protected byte[] serializeRecords() throws IOException {
     Schema schema = new Schema.Parser().parse(super.getLogBlockHeader().get(HeaderMetadataType.SCHEMA));
     GenericDatumWriter<IndexedRecord> writer = new GenericDatumWriter<>(schema);
     ByteArrayOutputStream baos = new ByteArrayOutputStream();
@@ -137,40 +121,10 @@ public class HoodieAvroDataBlock extends HoodieLogBlock {
     return baos.toByteArray();
   }
 
-  @Override
-  public HoodieLogBlockType getBlockType() {
-    return HoodieLogBlockType.AVRO_DATA_BLOCK;
-  }
-
-  public List<IndexedRecord> getRecords() {
-    if (records == null) {
-      try {
-        // in case records are absent, read content lazily and then convert to IndexedRecords
-        createRecordsFromContentBytes();
-      } catch (IOException io) {
-        throw new HoodieIOException("Unable to convert content bytes to records", io);
-      }
-    }
-    return records;
-  }
-
-  public Schema getSchema() {
-    // if getSchema was invoked before converting byte [] to records
-    if (records == null) {
-      getRecords();
-    }
-    return schema;
-  }
-
   // TODO (na) - Break down content into smaller chunks of byte [] to be GC as they are used
   // TODO (na) - Implement a recordItr instead of recordList
-  private void createRecordsFromContentBytes() throws IOException {
-    if (readBlockLazily && !getContent().isPresent()) {
-      // read log block contents from disk
-      inflate();
-    }
-
+  @Override
+  protected void deserializeRecords() throws IOException {
     SizeAwareDataInputStream dis =
         new SizeAwareDataInputStream(new DataInputStream(new ByteArrayInputStream(getContent().get())));
 
@@ -212,6 +166,9 @@ public class HoodieAvroDataBlock extends HoodieLogBlock {
 
   //----------------------------------------------------------------------------------------
   // DEPRECATED METHODS
+  //
+  // These methods were only supported by HoodieAvroDataBlock and have been deprecated. Hence,
+  // these are only implemented here even though they duplicate the code from HoodieAvroDataBlock.
   //----------------------------------------------------------------------------------------
 
   /**
@@ -230,7 +187,7 @@ public class HoodieAvroDataBlock extends HoodieLogBlock {
    * HoodieLogFormat V1.
    */
   @Deprecated
-  public static HoodieLogBlock getBlock(byte[] content, Schema readerSchema) throws IOException {
+  public static HoodieAvroDataBlock getBlock(byte[] content, Schema readerSchema) throws IOException {
 
     SizeAwareDataInputStream dis = new SizeAwareDataInputStream(new DataInputStream(new ByteArrayInputStream(content)));
 
@@ -302,5 +259,4 @@ public class HoodieAvroDataBlock extends HoodieLogBlock {
     output.close();
     return baos.toByteArray();
   }
-
 }
@@ -0,0 +1,132 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.common.table.log.block;
+
+import org.apache.hudi.common.util.Option;
+import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.exception.HoodieIOException;
+
+import org.apache.avro.Schema;
+import org.apache.avro.generic.IndexedRecord;
+import org.apache.hadoop.fs.FSDataInputStream;
+
+import javax.annotation.Nonnull;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * DataBlock contains a list of records serialized using formats compatible with the base file format.
+ * For each base file format there is a corresponding DataBlock format.
+ *
+ * The Datablock contains:
+ * 1. Data Block version
+ * 2. Total number of records in the block
+ * 3. Actual serialized content of the records
+ */
+public abstract class HoodieDataBlock extends HoodieLogBlock {
+
+  protected List<IndexedRecord> records;
+  protected Schema schema;
+
+  public HoodieDataBlock(@Nonnull Map<HeaderMetadataType, String> logBlockHeader,
+      @Nonnull Map<HeaderMetadataType, String> logBlockFooter,
+      @Nonnull Option<HoodieLogBlockContentLocation> blockContentLocation, @Nonnull Option<byte[]> content,
+      FSDataInputStream inputStream, boolean readBlockLazily) {
+    super(logBlockHeader, logBlockFooter, blockContentLocation, content, inputStream, readBlockLazily);
+  }
+
+  public HoodieDataBlock(@Nonnull List<IndexedRecord> records, @Nonnull Map<HeaderMetadataType, String> header,
+      @Nonnull Map<HeaderMetadataType, String> footer) {
+    super(header, footer, Option.empty(), Option.empty(), null, false);
+    this.records = records;
+    this.schema = new Schema.Parser().parse(super.getLogBlockHeader().get(HeaderMetadataType.SCHEMA));
+  }
+
+  public HoodieDataBlock(@Nonnull List<IndexedRecord> records, @Nonnull Map<HeaderMetadataType, String> header) {
+    this(records, header, new HashMap<>());
+  }
+
+  protected HoodieDataBlock(Option<byte[]> content, @Nonnull FSDataInputStream inputStream, boolean readBlockLazily,
+      Option<HoodieLogBlockContentLocation> blockContentLocation, Schema readerSchema,
+      @Nonnull Map<HeaderMetadataType, String> headers, @Nonnull Map<HeaderMetadataType, String> footer) {
+    super(headers, footer, blockContentLocation, content, inputStream, readBlockLazily);
+    this.schema = readerSchema;
+  }
+
+  public static HoodieLogBlock getBlock(HoodieLogBlockType logDataBlockFormat, List<IndexedRecord> recordList,
+      Map<HeaderMetadataType, String> header) {
+    switch (logDataBlockFormat) {
+      case AVRO_DATA_BLOCK:
+        return new HoodieAvroDataBlock(recordList, header);
+      default:
+        throw new HoodieException("Data block format " + logDataBlockFormat + " not implemented");
+    }
+  }
+
+  @Override
+  public byte[] getContentBytes() throws IOException {
+    // In case this method is called before realizing records from content
+    if (getContent().isPresent()) {
+      return getContent().get();
+    } else if (readBlockLazily && !getContent().isPresent() && records == null) {
+      // read block lazily
+      createRecordsFromContentBytes();
+    }
+
+    return serializeRecords();
+  }
+
+  public abstract HoodieLogBlockType getBlockType();
+
+  public List<IndexedRecord> getRecords() {
+    if (records == null) {
+      try {
+        // in case records are absent, read content lazily and then convert to IndexedRecords
+        createRecordsFromContentBytes();
+      } catch (IOException io) {
+        throw new HoodieIOException("Unable to convert content bytes to records", io);
+      }
+    }
+    return records;
+  }
+
+  public Schema getSchema() {
+    // if getSchema was invoked before converting byte [] to records
+    if (records == null) {
+      getRecords();
+    }
+    return schema;
+  }
+
+  private void createRecordsFromContentBytes() throws IOException {
+    if (readBlockLazily && !getContent().isPresent()) {
+      // read log block contents from disk
+      inflate();
+    }
+
+    deserializeRecords();
+  }
+
+  protected abstract byte[] serializeRecords() throws IOException;
+
+  protected abstract void deserializeRecords() throws IOException;
+}
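Callers that previously constructed HoodieAvroDataBlock directly can now go through the format-agnostic factory above. A minimal sketch, with an illustrative schema and record (not taken from this commit); only AVRO_DATA_BLOCK is wired up here, and the factory throws for any other block type.

    // Sketch: building a data block through HoodieDataBlock.getBlock (illustrative values).
    Schema schema = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"Sample\",\"fields\":[{\"name\":\"id\",\"type\":\"string\"}]}");
    GenericData.Record rec = new GenericData.Record(schema);
    rec.put("id", "key-1");
    List<IndexedRecord> records = java.util.Collections.<IndexedRecord>singletonList(rec);

    Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
    header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "20200610120000");
    header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());

    // Returns a HoodieAvroDataBlock for AVRO_DATA_BLOCK; other formats would add their own case.
    HoodieLogBlock dataBlock =
        HoodieDataBlock.getBlock(HoodieLogBlock.HoodieLogBlockType.AVRO_DATA_BLOCK, records, header);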
@@ -84,9 +84,7 @@ public abstract class HoodieLogBlock {
     throw new HoodieException("No implementation was provided");
   }
 
-  public HoodieLogBlockType getBlockType() {
-    throw new HoodieException("No implementation was provided");
-  }
+  public abstract HoodieLogBlockType getBlockType();
 
   public long getLogBlockLength() {
     throw new HoodieException("No implementation was provided");
@@ -121,7 +121,8 @@ public abstract class AbstractTableFileSystemView implements SyncableFileSystemV
       }
     });
     long storePartitionsTs = timer.endTimer();
-    LOG.info("addFilesToView: NumFiles=" + statuses.length + ", FileGroupsCreationTime=" + fgBuildTimeTakenMs
+    LOG.info("addFilesToView: NumFiles=" + statuses.length + ", NumFileGroups=" + fileGroups.size()
+        + ", FileGroupsCreationTime=" + fgBuildTimeTakenMs
         + ", StoreTimeTaken=" + storePartitionsTs);
     return fileGroups;
   }
@@ -16,7 +16,7 @@
  * limitations under the License.
  */
 
-package org.apache.hudi.client.utils;
+package org.apache.hudi.common.util;
 
 import org.apache.hudi.common.util.queue.BoundedInMemoryQueue;
 import org.apache.hudi.exception.HoodieIOException;
@@ -0,0 +1,44 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.io.storage;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.Set;
+
+import org.apache.avro.Schema;
+import org.apache.avro.generic.IndexedRecord;
+import org.apache.hudi.common.bloom.BloomFilter;
+
+public interface HoodieFileReader<R extends IndexedRecord> {
+
+  public String[] readMinMaxRecordKeys();
+
+  public BloomFilter readBloomFilter();
+
+  public Set<String> filterRowKeys(Set<String> candidateRowKeys);
+
+  public Iterator<R> getRecordIterator(Schema schema) throws IOException;
+
+  Schema getSchema();
+
+  void close();
+
+  long getTotalRecords();
+}
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.io.storage;
+
+import org.apache.hudi.common.fs.FSUtils;
+import org.apache.hudi.common.model.HoodieRecordPayload;
+
+import org.apache.avro.generic.IndexedRecord;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+
+import java.io.IOException;
+
+import static org.apache.hudi.common.model.HoodieFileFormat.PARQUET;
+
+public class HoodieFileReaderFactory {
+
+  public static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieFileReader<R> getFileReader(
+      Configuration conf, Path path) throws IOException {
+    final String extension = FSUtils.getFileExtension(path.toString());
+    if (PARQUET.getFileExtension().equals(extension)) {
+      return newParquetFileReader(conf, path);
+    }
+    throw new UnsupportedOperationException(extension + " format not supported yet.");
+  }
+
+  private static <T extends HoodieRecordPayload, R extends IndexedRecord> HoodieFileReader<R> newParquetFileReader(
+      Configuration conf, Path path) throws IOException {
+    return new HoodieParquetReader<>(conf, path);
+  }
+}
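A minimal sketch of how the new reader abstraction might be used to iterate the records of a base file. The path below is an illustrative value, the schema is read back from the file itself, and the snippet is assumed to run in a method that declares throws IOException.

    // Sketch: reading a base file through the format-agnostic reader (illustrative path).
    Configuration conf = new Configuration();
    Path baseFilePath = new Path("/tmp/hoodie_trips/2020/06/10/a1b2c3d4_1-0-1_20200610120000.parquet");

    HoodieFileReader<IndexedRecord> reader = HoodieFileReaderFactory.getFileReader(conf, baseFilePath);
    Schema schema = reader.getSchema();
    Iterator<IndexedRecord> it = reader.getRecordIterator(schema);
    while (it.hasNext()) {
      IndexedRecord record = it.next();
      // process record
    }
    reader.close();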
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.io.storage;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.Set;
+
+import org.apache.avro.Schema;
+import org.apache.avro.generic.IndexedRecord;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.apache.hudi.common.bloom.BloomFilter;
+import org.apache.hudi.common.util.ParquetReaderIterator;
+import org.apache.hudi.common.util.ParquetUtils;
+import org.apache.parquet.avro.AvroParquetReader;
+import org.apache.parquet.avro.AvroReadSupport;
+import org.apache.parquet.hadoop.ParquetReader;
+
+public class HoodieParquetReader<R extends IndexedRecord> implements HoodieFileReader {
+  private Path path;
+  private Configuration conf;
+
+  public HoodieParquetReader(Configuration configuration, Path path) {
+    this.conf = configuration;
+    this.path = path;
+  }
+
+  public String[] readMinMaxRecordKeys() {
+    return ParquetUtils.readMinMaxRecordKeys(conf, path);
+  }
+
+  @Override
+  public BloomFilter readBloomFilter() {
+    return ParquetUtils.readBloomFilterFromParquetMetadata(conf, path);
+  }
+
+  @Override
+  public Set<String> filterRowKeys(Set candidateRowKeys) {
+    return ParquetUtils.filterParquetRowKeys(conf, path, candidateRowKeys);
+  }
+
+  @Override
+  public Iterator<R> getRecordIterator(Schema schema) throws IOException {
+    AvroReadSupport.setAvroReadSchema(conf, schema);
+    ParquetReader<IndexedRecord> reader = AvroParquetReader.<IndexedRecord>builder(path).withConf(conf).build();
+    return new ParquetReaderIterator(reader);
+  }
+
+  @Override
+  public Schema getSchema() {
+    return ParquetUtils.readAvroSchema(conf, path);
+  }
+
+  @Override
+  public void close() {
+  }
+
+  @Override
+  public long getTotalRecords() {
+    // TODO Auto-generated method stub
+    return 0;
+  }
+}
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.common.functional;
+
+import org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType;
+
+/**
+ * Tests Avro log format {@link HoodieAvroDataBlock}.
+ */
+public class TestHoodieAvroLogFormat extends TestHoodieLogFormat {
+  public TestHoodieAvroLogFormat() {
+    super(HoodieLogBlockType.AVRO_DATA_BLOCK);
+  }
+}
@@ -32,6 +32,7 @@ import org.apache.hudi.common.table.log.HoodieLogFormat.Writer;
 import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner;
 import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock;
 import org.apache.hudi.common.table.log.block.HoodieCommandBlock;
+import org.apache.hudi.common.table.log.block.HoodieDataBlock;
 import org.apache.hudi.common.table.log.block.HoodieDeleteBlock;
 import org.apache.hudi.common.table.log.block.HoodieLogBlock;
 import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType;
@@ -81,12 +82,20 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
  * Tests hoodie log format {@link HoodieLogFormat}.
  */
 @SuppressWarnings("Duplicates")
-public class TestHoodieLogFormat extends HoodieCommonTestHarness {
+public abstract class TestHoodieLogFormat extends HoodieCommonTestHarness {
 
   private static String BASE_OUTPUT_PATH = "/tmp/";
   private FileSystem fs;
   private Path partitionPath;
   private int bufferSize = 4096;
+  private HoodieLogBlockType dataBlockType;
+
+  public TestHoodieLogFormat(HoodieLogBlockType dataBlockType) {
+    this.dataBlockType = dataBlockType;
+  }
+
+  private TestHoodieLogFormat() {
+  }
 
   @BeforeAll
   public static void setUpClass() throws IOException, InterruptedException {
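The hunks below call a getDataBlock(records, header) helper that is not shown in this excerpt. Assuming it simply delegates to the HoodieDataBlock factory introduced above using the subclass's configured block type, a plausible implementation would be:

    // Assumed helper (not part of this excerpt): builds a block of the configured data block type.
    private HoodieDataBlock getDataBlock(List<IndexedRecord> records, Map<HeaderMetadataType, String> header) {
      return (HoodieDataBlock) HoodieDataBlock.getBlock(dataBlockType, records, header);
    }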
@@ -133,7 +142,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 Map<HeaderMetadataType, String> header = new HashMap<>();
 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
-HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header);
+HoodieDataBlock dataBlock = getDataBlock(records, header);
 writer = writer.appendBlock(dataBlock);
 long size = writer.getCurrentSize();
 assertTrue(size > 0, "We just wrote a block - size should be > 0");
@@ -151,7 +160,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
-HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header);
+HoodieDataBlock dataBlock = getDataBlock(records, header);
 // Write out a block
 writer = writer.appendBlock(dataBlock);
 // Get the size of the block
@@ -164,7 +173,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).withSizeThreshold(size - 1).build();
 records = SchemaTestUtil.generateTestRecords(0, 100);
 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
-dataBlock = new HoodieAvroDataBlock(records, header);
+dataBlock = getDataBlock(records, header);
 writer = writer.appendBlock(dataBlock);
 assertEquals(0, writer.getCurrentSize(), "This should be a new log file and hence size should be 0");
 assertEquals(2, writer.getLogFile().getLogVersion(), "Version should be rolled to 2");
@@ -217,7 +226,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
-HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header);
+HoodieDataBlock dataBlock = getDataBlock(records, header);
 writer = writer.appendBlock(dataBlock);
 writer2 = writer2.appendBlock(dataBlock);
 writer.close();
@@ -235,7 +244,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
-HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header);
+HoodieDataBlock dataBlock = getDataBlock(records, header);
 writer = writer.appendBlock(dataBlock);
 long size1 = writer.getCurrentSize();
 writer.close();
@@ -245,7 +254,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
 records = SchemaTestUtil.generateTestRecords(0, 100);
 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
-dataBlock = new HoodieAvroDataBlock(records, header);
+dataBlock = getDataBlock(records, header);
 writer = writer.appendBlock(dataBlock);
 long size2 = writer.getCurrentSize();
 assertTrue(size2 > size1, "We just wrote a new block - size2 should be > size1");
@@ -259,7 +268,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
 records = SchemaTestUtil.generateTestRecords(0, 100);
 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
-dataBlock = new HoodieAvroDataBlock(records, header);
+dataBlock = getDataBlock(records, header);
 writer = writer.appendBlock(dataBlock);
 long size3 = writer.getCurrentSize();
 assertTrue(size3 > size2, "We just wrote a new block - size3 should be > size2");
@@ -289,7 +298,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 * dataBlock = new HoodieAvroDataBlock(records, header); writer = writer.appendBlock(dataBlock); long size1 =
 * writer.getCurrentSize(); // do not close this writer - this simulates a data note appending to a log dying
 * without closing the file // writer.close();
 *
 * writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath)
 * .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100")
 * .withFs(fs).build(); records = SchemaTestUtil.generateTestRecords(0, 100);
@@ -313,7 +322,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
-HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header);
+HoodieDataBlock dataBlock = getDataBlock(records, header);

 for (int i = 0; i < 2; i++) {
 HoodieLogFormat.newWriterBuilder().onParentPath(testPath)
@@ -338,15 +347,15 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
-HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header);
+HoodieDataBlock dataBlock = getDataBlock(records, header);
 writer = writer.appendBlock(dataBlock);
 writer.close();

 Reader reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema());
 assertTrue(reader.hasNext(), "We wrote a block, we should be able to read it");
 HoodieLogBlock nextBlock = reader.next();
-assertEquals(HoodieLogBlockType.AVRO_DATA_BLOCK, nextBlock.getBlockType(), "The next block should be a data block");
-HoodieAvroDataBlock dataBlockRead = (HoodieAvroDataBlock) nextBlock;
+assertEquals(dataBlockType, nextBlock.getBlockType(), "The next block should be a data block");
+HoodieDataBlock dataBlockRead = (HoodieDataBlock) nextBlock;
 assertEquals(copyOfRecords.size(), dataBlockRead.getRecords().size(),
 "Read records size should be equal to the written records size");
 assertEquals(copyOfRecords, dataBlockRead.getRecords(),
@@ -366,7 +375,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
-HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header);
+HoodieDataBlock dataBlock = getDataBlock(records1, header);
 writer = writer.appendBlock(dataBlock);
 writer.close();

@@ -377,7 +386,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 List<IndexedRecord> copyOfRecords2 = records2.stream()
 .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
-dataBlock = new HoodieAvroDataBlock(records2, header);
+dataBlock = getDataBlock(records2, header);
 writer = writer.appendBlock(dataBlock);
 writer.close();

@@ -389,14 +398,14 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 List<IndexedRecord> copyOfRecords3 = records3.stream()
 .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
-dataBlock = new HoodieAvroDataBlock(records3, header);
+dataBlock = getDataBlock(records3, header);
 writer = writer.appendBlock(dataBlock);
 writer.close();

 Reader reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema());
 assertTrue(reader.hasNext(), "First block should be available");
 HoodieLogBlock nextBlock = reader.next();
-HoodieAvroDataBlock dataBlockRead = (HoodieAvroDataBlock) nextBlock;
+HoodieDataBlock dataBlockRead = (HoodieDataBlock) nextBlock;
 assertEquals(copyOfRecords1.size(), dataBlockRead.getRecords().size(),
 "Read records size should be equal to the written records size");
 assertEquals(copyOfRecords1, dataBlockRead.getRecords(),
@@ -405,7 +414,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {

 reader.hasNext();
 nextBlock = reader.next();
-dataBlockRead = (HoodieAvroDataBlock) nextBlock;
+dataBlockRead = (HoodieDataBlock) nextBlock;
 assertEquals(copyOfRecords2.size(), dataBlockRead.getRecords().size(),
 "Read records size should be equal to the written records size");
 assertEquals(copyOfRecords2, dataBlockRead.getRecords(),
@@ -413,7 +422,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {

 reader.hasNext();
 nextBlock = reader.next();
-dataBlockRead = (HoodieAvroDataBlock) nextBlock;
+dataBlockRead = (HoodieDataBlock) nextBlock;
 assertEquals(copyOfRecords3.size(), dataBlockRead.getRecords().size(),
 "Read records size should be equal to the written records size");
 assertEquals(copyOfRecords3, dataBlockRead.getRecords(),
@@ -443,7 +452,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
 allRecords.add(copyOfRecords1);
 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
-HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header);
+HoodieDataBlock dataBlock = getDataBlock(records1, header);
 writer = writer.appendBlock(dataBlock);
 }
 writer.close();
@@ -472,7 +481,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
-HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header);
+HoodieDataBlock dataBlock = getDataBlock(records, header);
 writer = writer.appendBlock(dataBlock);
 writer.close();

@@ -498,7 +507,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
 records = SchemaTestUtil.generateTestRecords(0, 10);
 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
-dataBlock = new HoodieAvroDataBlock(records, header);
+dataBlock = getDataBlock(records, header);
 writer = writer.appendBlock(dataBlock);
 writer.close();

@@ -536,7 +545,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
 records = SchemaTestUtil.generateTestRecords(0, 100);
 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
-dataBlock = new HoodieAvroDataBlock(records, header);
+dataBlock = getDataBlock(records, header);
 writer = writer.appendBlock(dataBlock);
 writer.close();

@@ -574,7 +583,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
-HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header);
+HoodieDataBlock dataBlock = getDataBlock(records1, header);
 writer = writer.appendBlock(dataBlock);

 // Write 2
@@ -582,7 +591,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 List<IndexedRecord> copyOfRecords2 = records2.stream()
 .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
-dataBlock = new HoodieAvroDataBlock(records2, header);
+dataBlock = getDataBlock(records2, header);
 writer = writer.appendBlock(dataBlock);
 writer.close();

@@ -621,14 +630,14 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");

 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
-HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header);
+HoodieDataBlock dataBlock = getDataBlock(records1, header);
 writer = writer.appendBlock(dataBlock);

 // Write 2
 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "101");
 List<IndexedRecord> records2 = SchemaTestUtil.generateHoodieTestRecords(0, 100);
 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
-dataBlock = new HoodieAvroDataBlock(records2, header);
+dataBlock = getDataBlock(records2, header);
 writer = writer.appendBlock(dataBlock);

 // Rollback the last write
@@ -644,7 +653,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 List<IndexedRecord> copyOfRecords3 = records3.stream()
 .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
-dataBlock = new HoodieAvroDataBlock(records3, header);
+dataBlock = getDataBlock(records3, header);
 writer = writer.appendBlock(dataBlock);
 writer.close();

@@ -681,7 +690,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
-HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header);
+HoodieDataBlock dataBlock = getDataBlock(records1, header);
 writer = writer.appendBlock(dataBlock);
 writer.close();

@@ -723,7 +732,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());

 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
-dataBlock = new HoodieAvroDataBlock(records3, header);
+dataBlock = getDataBlock(records3, header);
 writer = writer.appendBlock(dataBlock);
 writer.close();

@@ -761,7 +770,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
-HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header);
+HoodieDataBlock dataBlock = getDataBlock(records1, header);
 writer = writer.appendBlock(dataBlock);

 // Write 2
@@ -769,7 +778,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 List<IndexedRecord> records2 = SchemaTestUtil.generateHoodieTestRecords(0, 100);
 List<IndexedRecord> copyOfRecords2 = records2.stream()
 .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
-dataBlock = new HoodieAvroDataBlock(records2, header);
+dataBlock = getDataBlock(records2, header);
 writer = writer.appendBlock(dataBlock);

 copyOfRecords1.addAll(copyOfRecords2);
@@ -849,13 +858,13 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, "100");

 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
-HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header);
+HoodieDataBlock dataBlock = getDataBlock(records1, header);
 writer = writer.appendBlock(dataBlock);

 // Write 2
 List<IndexedRecord> records2 = SchemaTestUtil.generateHoodieTestRecords(0, 100);
 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
-dataBlock = new HoodieAvroDataBlock(records2, header);
+dataBlock = getDataBlock(records2, header);
 writer = writer.appendBlock(dataBlock);

 // Delete 50 keys
@@ -916,7 +925,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
 header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, "100");
 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
-HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header);
+HoodieDataBlock dataBlock = getDataBlock(records1, header);
 writer = writer.appendBlock(dataBlock);

 // Delete 50 keys
@@ -958,7 +967,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
-HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header);
+HoodieDataBlock dataBlock = getDataBlock(records1, header);
 writer = writer.appendBlock(dataBlock);

 // Write invalid rollback for a failed write (possible for in-flight commits)
@@ -1000,7 +1009,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
 header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, "100");
 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
-HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header);
+HoodieDataBlock dataBlock = getDataBlock(records1, header);
 writer = writer.appendBlock(dataBlock);
 writer = writer.appendBlock(dataBlock);
 writer = writer.appendBlock(dataBlock);
@@ -1047,7 +1056,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
-HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header);
+HoodieDataBlock dataBlock = getDataBlock(records1, header);
 writer = writer.appendBlock(dataBlock);
 writer = writer.appendBlock(dataBlock);
 writer = writer.appendBlock(dataBlock);
@@ -1149,7 +1158,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
-HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records.subList(0, numRecordsInLog1), header);
+HoodieDataBlock dataBlock = getDataBlock(records.subList(0, numRecordsInLog1), header);
 writer = writer.appendBlock(dataBlock);
 // Get the size of the block
 long size = writer.getCurrentSize();
@@ -1163,7 +1172,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 Map<HoodieLogBlock.HeaderMetadataType, String> header2 = new HashMap<>();
 header2.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
 header2.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
-HoodieAvroDataBlock dataBlock2 = new HoodieAvroDataBlock(records2.subList(0, numRecordsInLog2), header2);
+HoodieDataBlock dataBlock2 = getDataBlock(records2.subList(0, numRecordsInLog2), header2);
 writer2 = writer2.appendBlock(dataBlock2);
 // Get the size of the block
 writer2.close();
@@ -1227,7 +1236,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
-HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header);
+HoodieDataBlock dataBlock = getDataBlock(records1, header);
 writer = writer.appendBlock(dataBlock);
 writer.close();

@@ -1237,7 +1246,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 List<IndexedRecord> records2 = SchemaTestUtil.generateTestRecords(0, 100);
 List<IndexedRecord> copyOfRecords2 = records2.stream()
 .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
-dataBlock = new HoodieAvroDataBlock(records2, header);
+dataBlock = getDataBlock(records2, header);
 writer = writer.appendBlock(dataBlock);
 writer.close();

@@ -1248,7 +1257,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 List<IndexedRecord> records3 = SchemaTestUtil.generateTestRecords(0, 100);
 List<IndexedRecord> copyOfRecords3 = records3.stream()
 .map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
-dataBlock = new HoodieAvroDataBlock(records3, header);
+dataBlock = getDataBlock(records3, header);
 writer = writer.appendBlock(dataBlock);
 writer.close();

@@ -1257,7 +1266,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {

 assertTrue(reader.hasPrev(), "Last block should be available");
 HoodieLogBlock prevBlock = reader.prev();
-HoodieAvroDataBlock dataBlockRead = (HoodieAvroDataBlock) prevBlock;
+HoodieDataBlock dataBlockRead = (HoodieDataBlock) prevBlock;

 assertEquals(copyOfRecords3.size(), dataBlockRead.getRecords().size(),
 "Third records size should be equal to the written records size");
@@ -1266,7 +1275,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {

 assertTrue(reader.hasPrev(), "Second block should be available");
 prevBlock = reader.prev();
-dataBlockRead = (HoodieAvroDataBlock) prevBlock;
+dataBlockRead = (HoodieDataBlock) prevBlock;
 assertEquals(copyOfRecords2.size(), dataBlockRead.getRecords().size(),
 "Read records size should be equal to the written records size");
 assertEquals(copyOfRecords2, dataBlockRead.getRecords(),
@@ -1274,7 +1283,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {

 assertTrue(reader.hasPrev(), "First block should be available");
 prevBlock = reader.prev();
-dataBlockRead = (HoodieAvroDataBlock) prevBlock;
+dataBlockRead = (HoodieDataBlock) prevBlock;
 assertEquals(copyOfRecords1.size(), dataBlockRead.getRecords().size(),
 "Read records size should be equal to the written records size");
 assertEquals(copyOfRecords1, dataBlockRead.getRecords(),
@@ -1296,7 +1305,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
-HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header);
+HoodieDataBlock dataBlock = getDataBlock(records, header);
 writer = writer.appendBlock(dataBlock);
 writer.close();

@@ -1322,7 +1331,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION)
 .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
 records = SchemaTestUtil.generateTestRecords(0, 100);
-dataBlock = new HoodieAvroDataBlock(records, header);
+dataBlock = getDataBlock(records, header);
 writer = writer.appendBlock(dataBlock);
 writer.close();

@@ -1332,7 +1341,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {

 assertTrue(reader.hasPrev(), "Last block should be available");
 HoodieLogBlock block = reader.prev();
-assertTrue(block instanceof HoodieAvroDataBlock, "Last block should be datablock");
+assertTrue(block instanceof HoodieDataBlock, "Last block should be datablock");

 assertTrue(reader.hasPrev(), "Last block should be available");
 assertThrows(CorruptedLogFileException.class, () -> {
@@ -1355,7 +1364,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
 header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
 header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
-HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header);
+HoodieDataBlock dataBlock = getDataBlock(records1, header);
 writer = writer.appendBlock(dataBlock);
 writer.close();

@@ -1363,7 +1372,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION)
 .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
 List<IndexedRecord> records2 = SchemaTestUtil.generateTestRecords(0, 100);
-dataBlock = new HoodieAvroDataBlock(records2, header);
+dataBlock = getDataBlock(records2, header);
 writer = writer.appendBlock(dataBlock);
 writer.close();

@@ -1372,7 +1381,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION)
 .withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
 List<IndexedRecord> records3 = SchemaTestUtil.generateTestRecords(0, 100);
-dataBlock = new HoodieAvroDataBlock(records3, header);
+dataBlock = getDataBlock(records3, header);
 writer = writer.appendBlock(dataBlock);
 writer.close();

@@ -1388,7 +1397,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 // After moving twice, this last reader.prev() should read the First block written
 assertTrue(reader.hasPrev(), "First block should be available");
 HoodieLogBlock prevBlock = reader.prev();
-HoodieAvroDataBlock dataBlockRead = (HoodieAvroDataBlock) prevBlock;
+HoodieDataBlock dataBlockRead = (HoodieDataBlock) prevBlock;
 assertEquals(copyOfRecords1.size(), dataBlockRead.getRecords().size(),
 "Read records size should be equal to the written records size");
 assertEquals(copyOfRecords1, dataBlockRead.getRecords(),
@@ -1429,4 +1438,13 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
 assertEquals(recordsCopy.get(i), readRecords.get(i));
 }
 }
+
+private HoodieDataBlock getDataBlock(List<IndexedRecord> records, Map<HeaderMetadataType, String> header) {
+switch (dataBlockType) {
+case AVRO_DATA_BLOCK:
+return new HoodieAvroDataBlock(records, header);
+default:
+throw new RuntimeException("Unknown data block type " + dataBlockType);
+}
+}
 }
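With TestHoodieLogFormat now abstract, every test builds its blocks through the getDataBlock switch above, and a concrete suite only has to name the block type it exercises. A minimal sketch of such a binding, assuming a hypothetical Avro-flavoured subclass name (the concrete subclasses are not shown in this diff):

public class TestHoodieAvroLogFormat extends TestHoodieLogFormat {
  public TestHoodieAvroLogFormat() {
    // bind the shared log-format tests to Avro data blocks
    super(HoodieLogBlockType.AVRO_DATA_BLOCK);
  }
}

A Parquet-backed data block would plug in the same way: add its enum constant as a new case in getDataBlock and pass it to the constructor.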
@@ -28,6 +28,7 @@ import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.model.FileSlice;
 import org.apache.hudi.common.model.HoodieAvroPayload;
 import org.apache.hudi.common.model.HoodieCleaningPolicy;
+import org.apache.hudi.common.model.HoodieFileFormat;
 import org.apache.hudi.common.model.HoodieLogFile;
 import org.apache.hudi.common.model.HoodieRecord;
 import org.apache.hudi.common.model.HoodieRecordLocation;
@@ -120,21 +121,41 @@ public class HoodieTestUtils {
 return init(getDefaultHadoopConf(), basePath, tableType);
 }

+public static HoodieTableMetaClient init(String basePath, HoodieFileFormat baseFileFormat) throws IOException {
+return init(getDefaultHadoopConf(), basePath, HoodieTableType.COPY_ON_WRITE, baseFileFormat);
+}
+
 public static HoodieTableMetaClient init(Configuration hadoopConf, String basePath) throws IOException {
 return init(hadoopConf, basePath, HoodieTableType.COPY_ON_WRITE);
 }

 public static HoodieTableMetaClient init(Configuration hadoopConf, String basePath, HoodieTableType tableType)
 throws IOException {
-return init(hadoopConf, basePath, tableType, RAW_TRIPS_TEST_NAME);
+return init(hadoopConf, basePath, tableType, new Properties());
 }

 public static HoodieTableMetaClient init(Configuration hadoopConf, String basePath, HoodieTableType tableType,
-String tableName) throws IOException {
+String tableName)
+throws IOException {
 Properties properties = new Properties();
 properties.setProperty(HoodieTableConfig.HOODIE_TABLE_NAME_PROP_NAME, tableName);
-properties.setProperty(HoodieTableConfig.HOODIE_TABLE_TYPE_PROP_NAME, tableType.name());
-properties.setProperty(HoodieTableConfig.HOODIE_PAYLOAD_CLASS_PROP_NAME, HoodieAvroPayload.class.getName());
+return init(hadoopConf, basePath, tableType, properties);
+}
+
+public static HoodieTableMetaClient init(Configuration hadoopConf, String basePath, HoodieTableType tableType,
+HoodieFileFormat baseFileFormat)
+throws IOException {
+Properties properties = new Properties();
+properties.setProperty(HoodieTableConfig.HOODIE_BASE_FILE_FORMAT_PROP_NAME, baseFileFormat.toString());
+return init(hadoopConf, basePath, tableType, properties);
+}
+
+public static HoodieTableMetaClient init(Configuration hadoopConf, String basePath, HoodieTableType tableType,
+Properties properties)
+throws IOException {
+properties.putIfAbsent(HoodieTableConfig.HOODIE_TABLE_NAME_PROP_NAME, RAW_TRIPS_TEST_NAME);
+properties.putIfAbsent(HoodieTableConfig.HOODIE_TABLE_TYPE_PROP_NAME, tableType.name());
+properties.putIfAbsent(HoodieTableConfig.HOODIE_PAYLOAD_CLASS_PROP_NAME, HoodieAvroPayload.class.getName());
 return HoodieTableMetaClient.initTableAndGetMetaClient(hadoopConf, basePath, properties);
 }

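All of the init overloads above now funnel into the Properties-based variant, which back-fills the table name, table type, and payload class only when the caller has not supplied them. A short usage sketch, assuming basePath points at a scratch directory and that getDefaultHadoopConf() is reachable from the caller:

// Test table whose base files use an explicit format (the new overload from this hunk)
HoodieTableMetaClient parquetTable = HoodieTestUtils.init(basePath, HoodieFileFormat.PARQUET);

// Customised table: anything missing from the properties falls back to the defaults above
Properties props = new Properties();
props.setProperty(HoodieTableConfig.HOODIE_TABLE_NAME_PROP_NAME, "hypothetical_test_table");
HoodieTableMetaClient customTable =
    HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath, HoodieTableType.MERGE_ON_READ, props);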
@@ -442,12 +463,31 @@ public class HoodieTestUtils {
 });
 }

+// TODO: should be removed
 public static FileStatus[] listAllDataFilesInPath(FileSystem fs, String basePath) throws IOException {
+return listAllDataFilesInPath(fs, basePath, ".parquet");
+}
+
+public static FileStatus[] listAllDataFilesInPath(FileSystem fs, String basePath, String datafileExtension)
+throws IOException {
 RemoteIterator<LocatedFileStatus> itr = fs.listFiles(new Path(basePath), true);
 List<FileStatus> returns = new ArrayList<>();
 while (itr.hasNext()) {
 LocatedFileStatus status = itr.next();
-if (status.getPath().getName().contains(".parquet")) {
+if (status.getPath().getName().contains(datafileExtension)) {
+returns.add(status);
+}
+}
+return returns.toArray(new FileStatus[returns.size()]);
+}
+
+public static FileStatus[] listAllLogFilesInPath(FileSystem fs, String basePath, String logfileExtension)
+throws IOException {
+RemoteIterator<LocatedFileStatus> itr = fs.listFiles(new Path(basePath), true);
+List<FileStatus> returns = new ArrayList<>();
+while (itr.hasNext()) {
+LocatedFileStatus status = itr.next();
+if (status.getPath().getName().contains(logfileExtension)) {
 returns.add(status);
 }
 }
|
|||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package org.apache.hudi.client.utils;
|
package org.apache.hudi.common.util;
|
||||||
|
|
||||||
import org.apache.hudi.exception.HoodieIOException;
|
import org.apache.hudi.exception.HoodieIOException;
|
||||||
|
|
||||||
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.io.storage;
+
+import org.apache.avro.generic.IndexedRecord;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.io.TempDir;
+
+import java.io.IOException;
+
+import static org.junit.jupiter.api.Assertions.assertThrows;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Tests for {@link HoodieFileReaderFactory}.
+ */
+public class TestHoodieFileReaderFactory {
+@TempDir
+public java.nio.file.Path tempDir;
+
+@Test
+public void testGetFileReader() throws IOException {
+// parquet file format.
+final Configuration hadoopConf = new Configuration();
+final Path parquetPath = new Path("/partition/path/f1_1-0-1_000.parquet");
+HoodieFileReader<IndexedRecord> parquetReader = HoodieFileReaderFactory.getFileReader(hadoopConf, parquetPath);
+assertTrue(parquetReader instanceof HoodieParquetReader);
+
+// other file format exception.
+final Path logPath = new Path("/partition/path/f.b51192a8-574b-4a85-b246-bcfec03ac8bf_100.log.2_1-0-1");
+final Throwable thrown = assertThrows(UnsupportedOperationException.class, () -> {
+HoodieFileReader<IndexedRecord> logWriter = HoodieFileReaderFactory.getFileReader(hadoopConf, logPath);
+}, "should fail since log storage reader is not supported yet.");
+assertTrue(thrown.getMessage().contains("format not supported yet."));
+}
+}
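The new test pins down the reader factory's contract: a path carrying the Parquet extension yields a HoodieParquetReader, and any other extension is rejected with an UnsupportedOperationException whose message contains "format not supported yet.". A caller-side sketch of that contract, reusing the fixture path from the test above (the path and format are illustrative, not new API):

Configuration hadoopConf = new Configuration();
// The extension on the path is what drives the factory's choice of reader
Path baseFilePath = new Path("/partition/path/f1_1-0-1_000" + HoodieFileFormat.PARQUET.getFileExtension());
HoodieFileReader<IndexedRecord> reader = HoodieFileReaderFactory.getFileReader(hadoopConf, baseFilePath);
// A .log (or any other) extension currently throws UnsupportedOperationException instead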
@@ -18,6 +18,7 @@

 package org.apache.hudi.hadoop;

+import org.apache.hudi.common.model.HoodieFileFormat;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.common.table.timeline.HoodieDefaultTimeline;
 import org.apache.hudi.common.table.timeline.HoodieInstant;
@@ -101,7 +102,8 @@ public class HoodieParquetInputFormat extends MapredParquetInputFormat implement
 setInputPaths(job, snapshotPaths.toArray(new Path[snapshotPaths.size()]));
 FileStatus[] fileStatuses = super.listStatus(job);
 Map<HoodieTableMetaClient, List<FileStatus>> groupedFileStatus =
-HoodieInputFormatUtils.groupFileStatusForSnapshotPaths(fileStatuses, tableMetaClientMap.values());
+HoodieInputFormatUtils.groupFileStatusForSnapshotPaths(fileStatuses,
+HoodieFileFormat.PARQUET.getFileExtension(), tableMetaClientMap.values());
 LOG.info("Found a total of " + groupedFileStatus.size() + " groups");
 for (Map.Entry<HoodieTableMetaClient, List<FileStatus>> entry : groupedFileStatus.entrySet()) {
 List<FileStatus> result = HoodieInputFormatUtils.filterFileStatusForSnapshotMode(job, entry.getKey(), entry.getValue());
@@ -24,7 +24,7 @@ import org.apache.hudi.exception.HoodieException;
 import org.apache.hudi.hadoop.HoodieParquetInputFormat;
 import org.apache.hudi.hadoop.realtime.HoodieCombineRealtimeRecordReader;
 import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat;
+import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
@@ -952,7 +952,11 @@ public class HoodieCombineHiveInputFormat<K extends WritableComparable, V extend
       ValidationUtils.checkArgument(split instanceof HoodieCombineRealtimeFileSplit, "Only "
           + HoodieCombineRealtimeFileSplit.class.getName() + " allowed, found " + split.getClass().getName());
       for (InputSplit inputSplit : ((HoodieCombineRealtimeFileSplit) split).getRealtimeFileSplits()) {
-        recordReaders.add(new HoodieParquetRealtimeInputFormat().getRecordReader(inputSplit, job, reporter));
+        if (split.getPaths().length == 0) {
+          continue;
+        }
+        FileInputFormat inputFormat = HoodieInputFormatUtils.getInputFormat(split.getPath(0).toString(), true, job);
+        recordReaders.add(inputFormat.getRecordReader(inputSplit, job, reporter));
       }
       return new HoodieCombineRealtimeRecordReader(job, split, recordReaders);
     }
@@ -32,8 +32,6 @@ import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
-import org.apache.parquet.avro.AvroSchemaConverter;
-import org.apache.parquet.schema.MessageType;

 import java.io.IOException;
 import java.util.ArrayList;
@@ -50,7 +48,6 @@ public abstract class AbstractRealtimeRecordReader {

   protected final HoodieRealtimeFileSplit split;
   protected final JobConf jobConf;
-  private final MessageType baseFileSchema;
   protected final boolean usesCustomPayload;
   // Schema handles
   private Schema readerSchema;
@@ -66,7 +63,6 @@ public abstract class AbstractRealtimeRecordReader {
     try {
       this.usesCustomPayload = usesCustomPayload();
       LOG.info("usesCustomPayload ==> " + this.usesCustomPayload);
-      baseFileSchema = HoodieRealtimeRecordReaderUtils.readSchema(jobConf, split.getPath());
       init();
     } catch (IOException e) {
       throw new HoodieIOException("Could not create HoodieRealtimeRecordReader on path " + this.split.getPath(), e);
@@ -88,7 +84,7 @@ public abstract class AbstractRealtimeRecordReader {
     Schema schemaFromLogFile =
         LogReaderUtils.readLatestSchemaFromLogFiles(split.getBasePath(), split.getDeltaLogPaths(), jobConf);
     if (schemaFromLogFile == null) {
-      writerSchema = new AvroSchemaConverter().convert(baseFileSchema);
+      writerSchema = HoodieRealtimeRecordReaderUtils.readSchema(jobConf, split.getPath());
       LOG.debug("Writer Schema From Parquet => " + writerSchema.getFields());
     } else {
       writerSchema = schemaFromLogFile;
@@ -18,8 +18,10 @@

 package org.apache.hudi.hadoop.utils;

+import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.model.HoodieBaseFile;
 import org.apache.hudi.common.model.HoodieCommitMetadata;
+import org.apache.hudi.common.model.HoodieFileFormat;
 import org.apache.hudi.common.model.HoodiePartitionMetadata;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.common.table.timeline.HoodieDefaultTimeline;
@@ -30,11 +32,15 @@ import org.apache.hudi.common.table.view.TableFileSystemView;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.common.util.StringUtils;
 import org.apache.hudi.exception.HoodieIOException;
+import org.apache.hudi.hadoop.HoodieParquetInputFormat;
+import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat;
+import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
+import org.apache.hadoop.mapred.FileInputFormat;
 import org.apache.hadoop.mapred.JobConf;
 import org.apache.hadoop.mapreduce.Job;
 import org.apache.log4j.LogManager;
@@ -61,6 +67,54 @@ public class HoodieInputFormatUtils {

   private static final Logger LOG = LogManager.getLogger(HoodieInputFormatUtils.class);

+  public static FileInputFormat getInputFormat(HoodieFileFormat baseFileFormat, boolean realtime, Configuration conf) {
+    switch (baseFileFormat) {
+      case PARQUET:
+        if (realtime) {
+          HoodieParquetRealtimeInputFormat inputFormat = new HoodieParquetRealtimeInputFormat();
+          inputFormat.setConf(conf);
+          return inputFormat;
+        } else {
+          HoodieParquetInputFormat inputFormat = new HoodieParquetInputFormat();
+          inputFormat.setConf(conf);
+          return inputFormat;
+        }
+      default:
+        throw new HoodieIOException("Hoodie InputFormat not implemented for base file format " + baseFileFormat);
+    }
+  }
+
+  public static String getInputFormatClassName(HoodieFileFormat baseFileFormat, boolean realtime, Configuration conf) {
+    FileInputFormat inputFormat = getInputFormat(baseFileFormat, realtime, conf);
+    return inputFormat.getClass().getName();
+  }
+
+  public static String getOutputFormatClassName(HoodieFileFormat baseFileFormat) {
+    switch (baseFileFormat) {
+      case PARQUET:
+        return MapredParquetOutputFormat.class.getName();
+      default:
+        throw new HoodieIOException("No OutputFormat for base file format " + baseFileFormat);
+    }
+  }
+
+  public static String getSerDeClassName(HoodieFileFormat baseFileFormat) {
+    switch (baseFileFormat) {
+      case PARQUET:
+        return ParquetHiveSerDe.class.getName();
+      default:
+        throw new HoodieIOException("No SerDe for base file format " + baseFileFormat);
+    }
+  }
+
+  public static FileInputFormat getInputFormat(String path, boolean realtime, Configuration conf) {
+    final String extension = FSUtils.getFileExtension(path.toString());
+    if (extension.equals(HoodieFileFormat.PARQUET.getFileExtension())) {
+      return getInputFormat(HoodieFileFormat.PARQUET, realtime, conf);
+    }
+    throw new HoodieIOException("Hoodie InputFormat not implemented for base file of type " + extension);
+  }
+
   /**
    * Filter any specific instants that we do not want to process.
    * example timeline:
@@ -255,19 +309,20 @@ public class HoodieInputFormatUtils {
    * Takes in a list of filesStatus and a list of table metadatas. Groups the files status list
    * based on given table metadata.
    * @param fileStatuses
+   * @param fileExtension
    * @param metaClientList
    * @return
    * @throws IOException
    */
   public static Map<HoodieTableMetaClient, List<FileStatus>> groupFileStatusForSnapshotPaths(
-      FileStatus[] fileStatuses, Collection<HoodieTableMetaClient> metaClientList) {
+      FileStatus[] fileStatuses, String fileExtension, Collection<HoodieTableMetaClient> metaClientList) {
     // This assumes the paths for different tables are grouped together
     Map<HoodieTableMetaClient, List<FileStatus>> grouped = new HashMap<>();
     HoodieTableMetaClient metadata = null;
     for (FileStatus status : fileStatuses) {
       Path inputPath = status.getPath();
-      if (!inputPath.getName().endsWith(".parquet")) {
-        //FIXME(vc): skip non parquet files for now. This wont be needed once log file name start
+      if (!inputPath.getName().endsWith(fileExtension)) {
+        //FIXME(vc): skip non data files for now. This wont be needed once log file name start
         // with "."
         continue;
       }
@@ -22,7 +22,8 @@ import org.apache.hudi.avro.HoodieAvroUtils;
 import org.apache.hudi.common.util.collection.Pair;
 import org.apache.hudi.exception.HoodieException;
 import org.apache.hudi.exception.HoodieIOException;
+import org.apache.hudi.io.storage.HoodieFileReader;
+import org.apache.hudi.io.storage.HoodieFileReaderFactory;
 import org.apache.avro.LogicalTypes;
 import org.apache.avro.Schema;
 import org.apache.avro.generic.GenericArray;
@@ -40,8 +41,6 @@ import org.apache.hadoop.io.IntWritable;
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.io.Writable;
-import org.apache.parquet.hadoop.ParquetFileReader;
-import org.apache.parquet.schema.MessageType;

 import java.io.IOException;
 import java.nio.ByteBuffer;
@@ -57,14 +56,14 @@ import java.util.stream.Collectors;
 public class HoodieRealtimeRecordReaderUtils {

   /**
-   * Reads the schema from the parquet file. This is different from ParquetUtils as it uses the twitter parquet to
-   * support hive 1.1.0
+   * Reads the schema from the base file.
    */
-  public static MessageType readSchema(Configuration conf, Path parquetFilePath) {
+  public static Schema readSchema(Configuration conf, Path filePath) {
     try {
-      return ParquetFileReader.readFooter(conf, parquetFilePath).getFileMetaData().getSchema();
+      HoodieFileReader storageReader = HoodieFileReaderFactory.getFileReader(conf, filePath);
+      return storageReader.getSchema();
     } catch (IOException e) {
-      throw new HoodieIOException("Failed to read footer for parquet " + parquetFilePath, e);
+      throw new HoodieIOException("Failed to read schema from " + filePath, e);
     }
   }

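For illustration only (not part of this commit): with the reader factory in place, callers get an Avro Schema directly instead of a Parquet MessageType. A small sketch; the base file path is hypothetical and must point at an existing Hudi base file.

import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.hadoop.utils.HoodieRealtimeRecordReaderUtils;

public class ReadBaseFileSchemaExample {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Hypothetical base file; HoodieFileReaderFactory picks the concrete reader from the file extension.
    Path baseFilePath = new Path("/tmp/hudi/2016/05/01/fileId1_1-0-1_100.parquet");
    Schema writerSchema = HoodieRealtimeRecordReaderUtils.readSchema(conf, baseFilePath);
    System.out.println(writerSchema.toString(true));
  }
}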
@@ -21,6 +21,7 @@ package org.apache.hudi.hadoop;
 import org.apache.hudi.avro.model.HoodieCompactionPlan;
 import org.apache.hudi.common.fs.FSUtils;
 import org.apache.hudi.common.model.HoodieCommitMetadata;
+import org.apache.hudi.common.model.HoodieFileFormat;
 import org.apache.hudi.common.model.HoodieWriteStat;
 import org.apache.hudi.common.table.HoodieTableMetaClient;
 import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
@@ -59,6 +60,8 @@ public class TestHoodieParquetInputFormat {

   private HoodieParquetInputFormat inputFormat;
   private JobConf jobConf;
+  private final HoodieFileFormat baseFileFormat = HoodieFileFormat.PARQUET;
+  private final String baseFileExtension = baseFileFormat.getFileExtension();

   public static void ensureFilesInCommit(String msg, FileStatus[] files, String commit, int expected) {
     int count = 0;
@@ -145,7 +148,7 @@ public class TestHoodieParquetInputFormat {
   @Test
   public void testInputFormatLoad() throws IOException {
     // initial commit
-    File partitionDir = InputFormatTestUtil.prepareTable(basePath, 10, "100");
+    File partitionDir = InputFormatTestUtil.prepareTable(basePath, baseFileFormat, 10, "100");
     InputFormatTestUtil.commit(basePath, "100");

     // Add the paths
@@ -161,7 +164,7 @@ public class TestHoodieParquetInputFormat {
   @Test
   public void testInputFormatUpdates() throws IOException {
     // initial commit
-    File partitionDir = InputFormatTestUtil.prepareTable(basePath, 10, "100");
+    File partitionDir = InputFormatTestUtil.prepareTable(basePath, baseFileFormat, 10, "100");
     InputFormatTestUtil.commit(basePath, "100");

     // Add the paths
@@ -171,7 +174,7 @@ public class TestHoodieParquetInputFormat {
     assertEquals(10, files.length);

     // update files
-    InputFormatTestUtil.simulateUpdates(partitionDir, "100", 5, "200", true);
+    InputFormatTestUtil.simulateUpdates(partitionDir, baseFileExtension, "100", 5, "200", true);
     // Before the commit
     files = inputFormat.listStatus(jobConf);
     assertEquals(10, files.length);
@@ -188,7 +191,7 @@ public class TestHoodieParquetInputFormat {
   @Test
   public void testInputFormatWithCompaction() throws IOException {
     // initial commit
-    File partitionDir = InputFormatTestUtil.prepareTable(basePath, 10, "100");
+    File partitionDir = InputFormatTestUtil.prepareTable(basePath, baseFileFormat, 10, "100");
     InputFormatTestUtil.commit(basePath, "100");

     // Add the paths
@@ -204,7 +207,7 @@ public class TestHoodieParquetInputFormat {
     createCompactionFile(basePath, "125");

     // add inserts after compaction timestamp
-    InputFormatTestUtil.simulateInserts(partitionDir, "fileId2", 5, "200");
+    InputFormatTestUtil.simulateInserts(partitionDir, baseFileExtension, "fileId2", 5, "200");
     InputFormatTestUtil.commit(basePath, "200");

     // verify snapshot reads show all new inserts even though there is pending compaction
@@ -221,7 +224,7 @@ public class TestHoodieParquetInputFormat {
   @Test
   public void testIncrementalSimple() throws IOException {
     // initial commit
-    File partitionDir = InputFormatTestUtil.prepareTable(basePath, 10, "100");
+    File partitionDir = InputFormatTestUtil.prepareTable(basePath, baseFileFormat, 10, "100");
     createCommitFile(basePath, "100", "2016/05/01");

     // Add the paths
@@ -266,25 +269,25 @@ public class TestHoodieParquetInputFormat {
   @Test
   public void testIncrementalWithMultipleCommits() throws IOException {
     // initial commit
-    File partitionDir = InputFormatTestUtil.prepareTable(basePath, 10, "100");
+    File partitionDir = InputFormatTestUtil.prepareTable(basePath, baseFileFormat, 10, "100");
     createCommitFile(basePath, "100", "2016/05/01");

     // Add the paths
     FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());
     // update files
-    InputFormatTestUtil.simulateUpdates(partitionDir, "100", 5, "200", false);
+    InputFormatTestUtil.simulateUpdates(partitionDir, baseFileExtension, "100", 5, "200", false);
     createCommitFile(basePath, "200", "2016/05/01");

-    InputFormatTestUtil.simulateUpdates(partitionDir, "100", 4, "300", false);
+    InputFormatTestUtil.simulateUpdates(partitionDir, baseFileExtension, "100", 4, "300", false);
     createCommitFile(basePath, "300", "2016/05/01");

-    InputFormatTestUtil.simulateUpdates(partitionDir, "100", 3, "400", false);
+    InputFormatTestUtil.simulateUpdates(partitionDir, baseFileExtension, "100", 3, "400", false);
     createCommitFile(basePath, "400", "2016/05/01");

-    InputFormatTestUtil.simulateUpdates(partitionDir, "100", 2, "500", false);
+    InputFormatTestUtil.simulateUpdates(partitionDir, baseFileExtension, "100", 2, "500", false);
     createCommitFile(basePath, "500", "2016/05/01");

-    InputFormatTestUtil.simulateUpdates(partitionDir, "100", 1, "600", false);
+    InputFormatTestUtil.simulateUpdates(partitionDir, baseFileExtension, "100", 1, "600", false);
     createCommitFile(basePath, "600", "2016/05/01");

     InputFormatTestUtil.setupIncremental(jobConf, "100", 1);
@@ -364,14 +367,14 @@ public class TestHoodieParquetInputFormat {
   @Test
   public void testIncrementalWithPendingCompaction() throws IOException {
     // initial commit
-    File partitionDir = InputFormatTestUtil.prepareTable(basePath, 10, "100");
+    File partitionDir = InputFormatTestUtil.prepareTable(basePath, baseFileFormat, 10, "100");
     createCommitFile(basePath, "100", "2016/05/01");

     // simulate compaction requested at 300
     File compactionFile = createCompactionFile(basePath, "300");

     // write inserts into new bucket
-    InputFormatTestUtil.simulateInserts(partitionDir, "fileId2", 10, "400");
+    InputFormatTestUtil.simulateInserts(partitionDir, baseFileExtension, "fileId2", 10, "400");
     createCommitFile(basePath, "400", "2016/05/01");

     // Add the paths
@@ -20,8 +20,10 @@ package org.apache.hudi.hadoop.testutils;

 import org.apache.hudi.avro.HoodieAvroUtils;
 import org.apache.hudi.common.fs.FSUtils;
+import org.apache.hudi.common.model.HoodieFileFormat;
 import org.apache.hudi.common.model.HoodieLogFile;
 import org.apache.hudi.common.model.HoodieRecord;
+import org.apache.hudi.common.model.HoodieTableType;
 import org.apache.hudi.common.table.log.HoodieLogFormat;
 import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock;
 import org.apache.hudi.common.table.log.block.HoodieCommandBlock;
@@ -59,25 +61,29 @@ public class InputFormatTestUtil {

   private static String TEST_WRITE_TOKEN = "1-0-1";

-  public static File prepareTable(java.nio.file.Path basePath, int numberOfFiles, String commitNumber)
+  public static File prepareTable(java.nio.file.Path basePath, HoodieFileFormat baseFileFormat, int numberOfFiles,
+      String commitNumber)
       throws IOException {
-    HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString());
+    HoodieTestUtils.init(HoodieTestUtils.getDefaultHadoopConf(), basePath.toString(), HoodieTableType.COPY_ON_WRITE,
+        baseFileFormat);
     java.nio.file.Path partitionPath = basePath.resolve(Paths.get("2016", "05", "01"));
     Files.createDirectories(partitionPath);
-    return simulateInserts(partitionPath.toFile(), "fileId1", numberOfFiles, commitNumber);
+    return simulateInserts(partitionPath.toFile(), baseFileFormat.getFileExtension(), "fileId1", numberOfFiles,
+        commitNumber);
   }

-  public static File simulateInserts(File partitionPath, String fileId, int numberOfFiles, String commitNumber)
+  public static File simulateInserts(File partitionPath, String baseFileExtension, String fileId, int numberOfFiles,
+      String commitNumber)
       throws IOException {
     for (int i = 0; i < numberOfFiles; i++) {
       Files.createFile(partitionPath.toPath()
-          .resolve(FSUtils.makeDataFileName(commitNumber, TEST_WRITE_TOKEN, fileId + i)));
+          .resolve(FSUtils.makeDataFileName(commitNumber, TEST_WRITE_TOKEN, fileId + i, baseFileExtension)));
     }
     return partitionPath;
   }

-  public static void simulateUpdates(File directory, final String originalCommit, int numberOfFilesUpdated,
-      String newCommit, boolean randomize) throws IOException {
+  public static void simulateUpdates(File directory, String baseFileExtension, final String originalCommit,
+      int numberOfFilesUpdated, String newCommit, boolean randomize) throws IOException {
     List<File> dataFiles = Arrays.asList(Objects.requireNonNull(directory.listFiles((dir, name) -> {
       String commitTs = FSUtils.getCommitTime(name);
       return originalCommit.equals(commitTs);
@@ -88,7 +94,8 @@ public class InputFormatTestUtil {
     List<File> toUpdateList = dataFiles.subList(0, Math.min(numberOfFilesUpdated, dataFiles.size()));
     for (File file : toUpdateList) {
       String fileId = FSUtils.getFileId(file.getName());
-      Files.createFile(directory.toPath().resolve(FSUtils.makeDataFileName(newCommit, TEST_WRITE_TOKEN, fileId)));
+      Files.createFile(directory.toPath().resolve(FSUtils.makeDataFileName(newCommit, TEST_WRITE_TOKEN, fileId,
+          baseFileExtension)));
     }
   }

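For illustration only (not part of this commit): a sketch of how a test drives the new InputFormatTestUtil signatures, threading the base file format through table preparation and simulated writes.

import java.io.File;
import java.nio.file.Files;
import java.nio.file.Path;

import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.hadoop.testutils.InputFormatTestUtil;

public class InputFormatTestUtilExample {
  public static void main(String[] args) throws Exception {
    HoodieFileFormat baseFileFormat = HoodieFileFormat.PARQUET;
    String baseFileExtension = baseFileFormat.getFileExtension();

    // Lay out a COPY_ON_WRITE table with ten base files for commit "100".
    Path basePath = Files.createTempDirectory("hudi-input-format-test");
    File partitionDir = InputFormatTestUtil.prepareTable(basePath, baseFileFormat, 10, "100");
    InputFormatTestUtil.commit(basePath, "100");

    // Rewrite five of those file groups under commit "200", using the same base file extension.
    InputFormatTestUtil.simulateUpdates(partitionDir, baseFileExtension, "100", 5, "200", true);
    InputFormatTestUtil.commit(basePath, "200");
  }
}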
@@ -35,6 +35,9 @@ public class HiveSyncConfig implements Serializable {
   @Parameter(names = {"--table"}, description = "name of the target table in Hive", required = true)
   public String tableName;

+  @Parameter(names = {"--base-file-format"}, description = "Format of the base files (PARQUET (or) HFILE)")
+  public String baseFileFormat = "PARQUET";
+
   @Parameter(names = {"--user"}, description = "Hive username", required = true)
   public String hiveUser;

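For illustration only (not part of this commit): the programmatic equivalent of the new --base-file-format flag. The base path, table names and JDBC URL below are placeholders; the HiveSyncTool construction mirrors the DeltaSync hunk further down.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.hive.HiveSyncTool;

public class HiveSyncBaseFileFormatExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    HiveSyncConfig cfg = new HiveSyncConfig();
    cfg.basePath = "/tmp/hudi/demo_table";          // placeholder table base path
    cfg.databaseName = "default";
    cfg.tableName = "demo_table";                   // placeholder table name
    cfg.hiveUser = "hive";
    cfg.hivePass = "hive";
    cfg.jdbcUrl = "jdbc:hive2://hiveserver:10000";  // placeholder HiveServer2 endpoint
    cfg.baseFileFormat = "PARQUET";                 // new field introduced above; defaults to PARQUET

    new HiveSyncTool(cfg, new HiveConf(conf, HiveConf.class), fs).syncHoodieTable();
  }
}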
@@ -19,10 +19,10 @@
 package org.apache.hudi.hive;

 import org.apache.hudi.common.fs.FSUtils;
+import org.apache.hudi.common.model.HoodieFileFormat;
 import org.apache.hudi.common.util.Option;
 import org.apache.hudi.exception.InvalidTableException;
-import org.apache.hudi.hadoop.HoodieParquetInputFormat;
-import org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat;
+import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils;
 import org.apache.hudi.hive.HoodieHiveClient.PartitionEvent;
 import org.apache.hudi.hive.HoodieHiveClient.PartitionEvent.PartitionEventType;
 import org.apache.hudi.hive.util.HiveSchemaUtil;
@@ -32,8 +32,6 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.metastore.api.Partition;
-import org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat;
-import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
 import org.apache.log4j.LogManager;
 import org.apache.log4j.Logger;
 import org.apache.parquet.schema.MessageType;
@@ -146,21 +144,24 @@ public class HiveSyncTool {
     // Check and sync schema
     if (!tableExists) {
       LOG.info("Hive table " + tableName + " is not found. Creating it");
-      if (!useRealTimeInputFormat) {
-        String inputFormatClassName = cfg.usePreApacheInputFormat ? com.uber.hoodie.hadoop.HoodieInputFormat.class.getName()
-            : HoodieParquetInputFormat.class.getName();
-        hoodieHiveClient.createTable(tableName, schema, inputFormatClassName, MapredParquetOutputFormat.class.getName(),
-            ParquetHiveSerDe.class.getName());
-      } else {
-        // Custom serde will not work with ALTER TABLE REPLACE COLUMNS
-        // https://github.com/apache/hive/blob/release-1.1.0/ql/src/java/org/apache/hadoop/hive
-        // /ql/exec/DDLTask.java#L3488
-        String inputFormatClassName =
-            cfg.usePreApacheInputFormat ? com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat.class.getName()
-                : HoodieParquetRealtimeInputFormat.class.getName();
-        hoodieHiveClient.createTable(tableName, schema, inputFormatClassName, MapredParquetOutputFormat.class.getName(),
-            ParquetHiveSerDe.class.getName());
+      HoodieFileFormat baseFileFormat = HoodieFileFormat.valueOf(cfg.baseFileFormat.toUpperCase());
+      String inputFormatClassName = HoodieInputFormatUtils.getInputFormatClassName(baseFileFormat, useRealTimeInputFormat,
+          new Configuration());
+
+      if (baseFileFormat.equals(HoodieFileFormat.PARQUET) && cfg.usePreApacheInputFormat) {
+        // Parquet input format had an InputFormat class visible under the old naming scheme.
+        inputFormatClassName = useRealTimeInputFormat
+            ? com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat.class.getName()
+            : com.uber.hoodie.hadoop.HoodieInputFormat.class.getName();
       }
+
+      String outputFormatClassName = HoodieInputFormatUtils.getOutputFormatClassName(baseFileFormat);
+      String serDeFormatClassName = HoodieInputFormatUtils.getSerDeClassName(baseFileFormat);
+
+      // Custom serde will not work with ALTER TABLE REPLACE COLUMNS
+      // https://github.com/apache/hive/blob/release-1.1.0/ql/src/java/org/apache/hadoop/hive
+      // /ql/exec/DDLTask.java#L3488
+      hoodieHiveClient.createTable(tableName, schema, inputFormatClassName, outputFormatClassName, serDeFormatClassName);
     } else {
       // Check if the table schema has evolved
       Map<String, String> tableSchema = hoodieHiveClient.getTableSchema(tableName);
@@ -18,6 +18,7 @@

 package org.apache.hudi.integ;

+import org.apache.hudi.common.model.HoodieFileFormat;
 import org.apache.hudi.common.util.CollectionUtils;
 import org.apache.hudi.common.util.collection.Pair;

@@ -67,6 +68,8 @@ public class ITTestHoodieDemo extends ITTestBase {
   private static final String HIVE_INCREMENTAL_MOR_RO_COMMANDS = HOODIE_WS_ROOT + "/docker/demo/hive-incremental-mor-ro.commands";
   private static final String HIVE_INCREMENTAL_MOR_RT_COMMANDS = HOODIE_WS_ROOT + "/docker/demo/hive-incremental-mor-rt.commands";

+  private static HoodieFileFormat baseFileFormat;
+
   private static String HIVE_SYNC_CMD_FMT =
       " --enable-hive-sync --hoodie-conf hoodie.datasource.hive_sync.jdbcurl=jdbc:hive2://hiveserver:10000 "
       + " --hoodie-conf hoodie.datasource.hive_sync.username=hive "
@@ -76,7 +79,9 @@ public class ITTestHoodieDemo extends ITTestBase {
       + " --hoodie-conf hoodie.datasource.hive_sync.table=%s";

   @Test
-  public void testDemo() throws Exception {
+  public void testParquetDemo() throws Exception {
+    baseFileFormat = HoodieFileFormat.PARQUET;
+
     setupDemo();

     // batch 1
@@ -122,6 +127,7 @@ public class ITTestHoodieDemo extends ITTestBase {
     List<String> cmds = CollectionUtils.createImmutableList(
         "spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer " + HUDI_UTILITIES_BUNDLE
             + " --table-type COPY_ON_WRITE "
+            + " --base-file-format " + baseFileFormat.toString()
            + " --source-class org.apache.hudi.utilities.sources.JsonDFSSource --source-ordering-field ts "
             + " --target-base-path " + COW_BASE_PATH + " --target-table " + COW_TABLE_NAME
             + " --props /var/demo/config/dfs-source.properties"
@@ -130,12 +136,14 @@ public class ITTestHoodieDemo extends ITTestBase {
             + " --database default"
             + " --table " + COW_TABLE_NAME
             + " --base-path " + COW_BASE_PATH
+            + " --base-file-format " + baseFileFormat.toString()
             + " --user hive"
             + " --pass hive"
             + " --jdbc-url jdbc:hive2://hiveserver:10000"
             + " --partitioned-by dt",
         ("spark-submit --class org.apache.hudi.utilities.deltastreamer.HoodieDeltaStreamer " + HUDI_UTILITIES_BUNDLE
             + " --table-type MERGE_ON_READ "
+            + " --base-file-format " + baseFileFormat.toString()
             + " --source-class org.apache.hudi.utilities.sources.JsonDFSSource --source-ordering-field ts "
             + " --target-base-path " + MOR_BASE_PATH + " --target-table " + MOR_TABLE_NAME
             + " --props /var/demo/config/dfs-source.properties"
@@ -270,7 +270,7 @@ public class DataSourceUtils {
     return dropDuplicates(jssc, incomingHoodieRecords, writeConfig);
   }

-  public static HiveSyncConfig buildHiveSyncConfig(TypedProperties props, String basePath) {
+  public static HiveSyncConfig buildHiveSyncConfig(TypedProperties props, String basePath, String baseFileFormat) {
     checkRequiredProperties(props, Collections.singletonList(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY()));
     HiveSyncConfig hiveSyncConfig = new HiveSyncConfig();
     hiveSyncConfig.basePath = basePath;
@@ -280,6 +280,7 @@
     hiveSyncConfig.databaseName = props.getString(DataSourceWriteOptions.HIVE_DATABASE_OPT_KEY(),
         DataSourceWriteOptions.DEFAULT_HIVE_DATABASE_OPT_VAL());
     hiveSyncConfig.tableName = props.getString(DataSourceWriteOptions.HIVE_TABLE_OPT_KEY());
+    hiveSyncConfig.baseFileFormat = baseFileFormat;
     hiveSyncConfig.hiveUser =
         props.getString(DataSourceWriteOptions.HIVE_USER_OPT_KEY(), DataSourceWriteOptions.DEFAULT_HIVE_USER_OPT_VAL());
     hiveSyncConfig.hivePass =
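For illustration only (not part of this commit): callers of DataSourceUtils now supply the base file format as the third argument. Property values below are placeholders, and the TypedProperties import path may differ between Hudi versions.

import org.apache.hudi.DataSourceUtils;
import org.apache.hudi.common.config.TypedProperties;  // package location varies across Hudi versions
import org.apache.hudi.hive.HiveSyncConfig;

public class BuildHiveSyncConfigExample {
  public static void main(String[] args) {
    TypedProperties props = new TypedProperties();
    props.setProperty("hoodie.datasource.hive_sync.table", "demo_table");
    props.setProperty("hoodie.datasource.hive_sync.database", "default");

    // Third argument is new in this change: the base file format of the table being synced.
    HiveSyncConfig cfg = DataSourceUtils.buildHiveSyncConfig(props, "/tmp/hudi/demo_table", "PARQUET");
    System.out.println(cfg.tableName + " -> " + cfg.baseFileFormat);
  }
}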
@@ -257,6 +257,7 @@
   val HIVE_SYNC_ENABLED_OPT_KEY = "hoodie.datasource.hive_sync.enable"
   val HIVE_DATABASE_OPT_KEY = "hoodie.datasource.hive_sync.database"
   val HIVE_TABLE_OPT_KEY = "hoodie.datasource.hive_sync.table"
+  val HIVE_BASE_FILE_FORMAT_OPT_KEY = "hoodie.datasource.hive_sync.base_file_format"
   val HIVE_USER_OPT_KEY = "hoodie.datasource.hive_sync.username"
   val HIVE_PASS_OPT_KEY = "hoodie.datasource.hive_sync.password"
   val HIVE_URL_OPT_KEY = "hoodie.datasource.hive_sync.jdbcurl"
@@ -270,6 +271,7 @@
   val DEFAULT_HIVE_SYNC_ENABLED_OPT_VAL = "false"
   val DEFAULT_HIVE_DATABASE_OPT_VAL = "default"
   val DEFAULT_HIVE_TABLE_OPT_VAL = "unknown"
+  val DEFAULT_HIVE_BASE_FILE_FORMAT_OPT_VAL = "PARQUET"
   val DEFAULT_HIVE_USER_OPT_VAL = "hive"
   val DEFAULT_HIVE_PASS_OPT_VAL = "hive"
   val DEFAULT_HIVE_URL_OPT_VAL = "jdbc:hive2://localhost:10000"
@@ -212,6 +212,7 @@ private[hudi] object HoodieSparkSqlWriter {
     HIVE_SYNC_ENABLED_OPT_KEY -> DEFAULT_HIVE_SYNC_ENABLED_OPT_VAL,
     HIVE_DATABASE_OPT_KEY -> DEFAULT_HIVE_DATABASE_OPT_VAL,
     HIVE_TABLE_OPT_KEY -> DEFAULT_HIVE_TABLE_OPT_VAL,
+    HIVE_BASE_FILE_FORMAT_OPT_KEY -> DEFAULT_HIVE_BASE_FILE_FORMAT_OPT_VAL,
     HIVE_USER_OPT_KEY -> DEFAULT_HIVE_USER_OPT_VAL,
     HIVE_PASS_OPT_KEY -> DEFAULT_HIVE_PASS_OPT_VAL,
     HIVE_URL_OPT_KEY -> DEFAULT_HIVE_URL_OPT_VAL,
@@ -239,6 +240,7 @@ private[hudi] object HoodieSparkSqlWriter {
   private def buildSyncConfig(basePath: Path, parameters: Map[String, String]): HiveSyncConfig = {
     val hiveSyncConfig: HiveSyncConfig = new HiveSyncConfig()
     hiveSyncConfig.basePath = basePath.toString
+    hiveSyncConfig.baseFileFormat = parameters(HIVE_BASE_FILE_FORMAT_OPT_KEY);
     hiveSyncConfig.usePreApacheInputFormat =
       parameters.get(HIVE_USE_PRE_APACHE_INPUT_FORMAT_OPT_KEY).exists(r => r.toBoolean)
     hiveSyncConfig.databaseName = parameters(HIVE_DATABASE_OPT_KEY)
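For illustration only (not part of this commit): the new hoodie.datasource.hive_sync.base_file_format option as it could be passed through a Spark datasource write. Written in Java for consistency with the other examples; the table name, field names and paths are placeholders, and the surrounding Hudi write options shown are the usual ones rather than anything introduced here.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class HiveSyncFormatOptionExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("hudi-hive-sync-base-file-format")
        .master("local[2]")
        .getOrCreate();

    // Placeholder input; any DataFrame with key, ts and dt columns would do.
    Dataset<Row> inputDf = spark.read().json("/tmp/source.json");

    inputDf.write().format("hudi")
        .option("hoodie.table.name", "demo_table")
        .option("hoodie.datasource.write.recordkey.field", "key")
        .option("hoodie.datasource.write.precombine.field", "ts")
        .option("hoodie.datasource.write.partitionpath.field", "dt")
        .option("hoodie.datasource.hive_sync.enable", "true")
        .option("hoodie.datasource.hive_sync.table", "demo_table")
        // New option; when omitted it falls back to DEFAULT_HIVE_BASE_FILE_FORMAT_OPT_VAL ("PARQUET").
        .option("hoodie.datasource.hive_sync.base_file_format", "PARQUET")
        .mode(SaveMode.Overwrite)
        .save("/tmp/hudi/demo_table");

    spark.stop();
  }
}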
@@ -205,7 +205,7 @@ public class DeltaSync implements Serializable {
     } else {
       this.commitTimelineOpt = Option.empty();
       HoodieTableMetaClient.initTableType(new Configuration(jssc.hadoopConfiguration()), cfg.targetBasePath,
-          cfg.tableType, cfg.targetTableName, "archived", cfg.payloadClassName);
+          cfg.tableType, cfg.targetTableName, "archived", cfg.payloadClassName, cfg.baseFileFormat);
     }
   }

@@ -274,7 +274,7 @@ public class DeltaSync implements Serializable {
       }
     } else {
       HoodieTableMetaClient.initTableType(new Configuration(jssc.hadoopConfiguration()), cfg.targetBasePath,
-          cfg.tableType, cfg.targetTableName, "archived", cfg.payloadClassName);
+          cfg.tableType, cfg.targetTableName, "archived", cfg.payloadClassName, cfg.baseFileFormat);
     }

     if (!resumeCheckpointStr.isPresent() && cfg.checkpoint != null) {
@@ -474,7 +474,7 @@ public class DeltaSync implements Serializable {
    */
   private void syncHive() {
     if (cfg.enableHiveSync) {
-      HiveSyncConfig hiveSyncConfig = DataSourceUtils.buildHiveSyncConfig(props, cfg.targetBasePath);
+      HiveSyncConfig hiveSyncConfig = DataSourceUtils.buildHiveSyncConfig(props, cfg.targetBasePath, cfg.baseFileFormat);
       LOG.info("Syncing target hoodie table with hive table(" + hiveSyncConfig.tableName + "). Hive metastore URL :"
           + hiveSyncConfig.jdbcUrl + ", basePath :" + cfg.targetBasePath);
       new HiveSyncTool(hiveSyncConfig, new HiveConf(conf, HiveConf.class), fs).syncHoodieTable();
@@ -177,6 +177,9 @@ public class HoodieDeltaStreamer implements Serializable {
     @Parameter(names = {"--table-type"}, description = "Type of table. COPY_ON_WRITE (or) MERGE_ON_READ", required = true)
     public String tableType;

+    @Parameter(names = {"--base-file-format"}, description = "File format for the base files. PARQUET (or) HFILE", required = false)
+    public String baseFileFormat;
+
     @Parameter(names = {"--props"}, description = "path to properties file on localfs or dfs, with configurations for "
         + "hoodie client, schema provider, key generator and data source. For hoodie client props, sane defaults are "
         + "used, but recommend use to provide basic things like metrics endpoints, hive configs etc. For sources, refer"
@@ -379,8 +382,20 @@ public class HoodieDeltaStreamer implements Serializable {
         // This will guarantee there is no surprise with table type
         ValidationUtils.checkArgument(tableType.equals(HoodieTableType.valueOf(cfg.tableType)),
             "Hoodie table is of type " + tableType + " but passed in CLI argument is " + cfg.tableType);
+
+        // Load base file format
+        // This will guarantee there is no surprise with base file type
+        String baseFileFormat = meta.getTableConfig().getBaseFileFormat().toString();
+        ValidationUtils.checkArgument(baseFileFormat.equals(cfg.baseFileFormat) || cfg.baseFileFormat == null,
+            "Hoodie table's base file format is of type " + baseFileFormat + " but passed in CLI argument is "
+                + cfg.baseFileFormat);
+        cfg.baseFileFormat = meta.getTableConfig().getBaseFileFormat().toString();
+        this.cfg.baseFileFormat = cfg.baseFileFormat;
       } else {
         tableType = HoodieTableType.valueOf(cfg.tableType);
+        if (cfg.baseFileFormat == null) {
+          cfg.baseFileFormat = "PARQUET"; // default for backward compatibility
+        }
       }

       ValidationUtils.checkArgument(!cfg.filterDupes || cfg.operation != Operation.UPSERT,