[HUDI-431] Adding support for Parquet in MOR LogBlocks (#4333)
- Adding support for Parquet in MOR table log blocks. Co-authored-by: Sivabalan Narayanan <n.siva.b@gmail.com>
This commit is contained in:
@@ -108,7 +108,7 @@ public class TestHoodieLogFileCommand extends CLIFunctionalTestHarness {
|
||||
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, INSTANT_TIME);
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
|
||||
dataBlock = new HoodieAvroDataBlock(records, header);
|
||||
dataBlock = new HoodieAvroDataBlock(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD);
|
||||
writer.appendBlock(dataBlock);
|
||||
}
|
||||
}
|
||||
@@ -188,7 +188,7 @@ public class TestHoodieLogFileCommand extends CLIFunctionalTestHarness {
|
||||
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, INSTANT_TIME);
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
|
||||
HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header);
|
||||
HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header, HoodieRecord.RECORD_KEY_METADATA_FIELD);
|
||||
writer.appendBlock(dataBlock);
|
||||
} finally {
|
||||
if (writer != null) {
|
||||
|
||||
@@ -83,14 +83,17 @@ public class HoodieStorageConfig extends HoodieConfig {
|
||||
.withDocumentation("Lower values increase the size of metadata tracked within HFile, but can offer potentially "
|
||||
+ "faster lookup times.");
|
||||
|
||||
// used to size log files
|
||||
public static final ConfigProperty<String> LOGFILE_DATA_BLOCK_FORMAT = ConfigProperty
|
||||
.key("hoodie.logfile.data.block.format")
|
||||
.noDefaultValue()
|
||||
.withDocumentation("Format of the data block within delta logs. Following formats are currently supported \"avro\", \"hfile\", \"parquet\"");
|
||||
|
||||
public static final ConfigProperty<String> LOGFILE_MAX_SIZE = ConfigProperty
|
||||
.key("hoodie.logfile.max.size")
|
||||
.defaultValue(String.valueOf(1024 * 1024 * 1024)) // 1 GB
|
||||
.withDocumentation("LogFile max size. This is the maximum size allowed for a log file "
|
||||
+ "before it is rolled over to the next version.");
|
||||
|
||||
// used to size data blocks in log file
|
||||
public static final ConfigProperty<String> LOGFILE_DATA_BLOCK_MAX_SIZE = ConfigProperty
|
||||
.key("hoodie.logfile.data.block.max.size")
|
||||
.defaultValue(String.valueOf(256 * 1024 * 1024))
|
||||
|
||||
@@ -37,9 +37,11 @@ import org.apache.hudi.common.model.HoodieFileFormat;
|
||||
import org.apache.hudi.common.model.OverwriteWithLatestAvroPayload;
|
||||
import org.apache.hudi.common.model.WriteConcurrencyMode;
|
||||
import org.apache.hudi.common.table.HoodieTableConfig;
|
||||
import org.apache.hudi.common.table.log.block.HoodieLogBlock;
|
||||
import org.apache.hudi.common.table.marker.MarkerType;
|
||||
import org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion;
|
||||
import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.ReflectionUtils;
|
||||
import org.apache.hudi.common.util.ValidationUtils;
|
||||
import org.apache.hudi.config.metrics.HoodieMetricsConfig;
|
||||
@@ -1506,6 +1508,11 @@ public class HoodieWriteConfig extends HoodieConfig {
|
||||
return getString(HoodieStorageConfig.PARQUET_OUTPUT_TIMESTAMP_TYPE);
|
||||
}
|
||||
|
||||
public Option<HoodieLogBlock.HoodieLogBlockType> getLogDataBlockFormat() {
|
||||
return Option.ofNullable(getString(HoodieStorageConfig.LOGFILE_DATA_BLOCK_FORMAT))
|
||||
.map(HoodieLogBlock.HoodieLogBlockType::fromId);
|
||||
}
|
||||
|
||||
public long getLogFileMaxSize() {
|
||||
return getLong(HoodieStorageConfig.LOGFILE_MAX_SIZE);
|
||||
}
|
||||
|
||||
@@ -38,10 +38,12 @@ import org.apache.hudi.common.model.IOType;
|
||||
import org.apache.hudi.common.table.log.AppendResult;
|
||||
import org.apache.hudi.common.table.log.HoodieLogFormat;
|
||||
import org.apache.hudi.common.table.log.HoodieLogFormat.Writer;
|
||||
import org.apache.hudi.common.table.log.block.HoodieDataBlock;
|
||||
import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock;
|
||||
import org.apache.hudi.common.table.log.block.HoodieDeleteBlock;
|
||||
import org.apache.hudi.common.table.log.block.HoodieHFileDataBlock;
|
||||
import org.apache.hudi.common.table.log.block.HoodieLogBlock;
|
||||
import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType;
|
||||
import org.apache.hudi.common.table.log.block.HoodieParquetDataBlock;
|
||||
import org.apache.hudi.common.table.view.TableFileSystemView.SliceView;
|
||||
import org.apache.hudi.common.util.DefaultSizeEstimator;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
@@ -49,6 +51,7 @@ import org.apache.hudi.common.util.ReflectionUtils;
|
||||
import org.apache.hudi.common.util.SizeEstimator;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieAppendException;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.exception.HoodieUpsertException;
|
||||
import org.apache.hudi.table.HoodieTable;
|
||||
|
||||
@@ -360,13 +363,13 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload, I, K, O> extends
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, writeSchemaWithMetaFields.toString());
|
||||
List<HoodieLogBlock> blocks = new ArrayList<>(2);
|
||||
if (recordList.size() > 0) {
|
||||
if (config.populateMetaFields()) {
|
||||
blocks.add(HoodieDataBlock.getBlock(hoodieTable.getLogDataBlockFormat(), recordList, header));
|
||||
} else {
|
||||
final String keyField = hoodieTable.getMetaClient().getTableConfig().getRecordKeyFieldProp();
|
||||
blocks.add(HoodieDataBlock.getBlock(hoodieTable.getLogDataBlockFormat(), recordList, header, keyField));
|
||||
}
|
||||
String keyField = config.populateMetaFields()
|
||||
? HoodieRecord.RECORD_KEY_METADATA_FIELD
|
||||
: hoodieTable.getMetaClient().getTableConfig().getRecordKeyFieldProp();
|
||||
|
||||
blocks.add(getBlock(config, pickLogDataBlockFormat(), recordList, header, keyField));
|
||||
}
|
||||
|
||||
if (keysToDelete.size() > 0) {
|
||||
blocks.add(new HoodieDeleteBlock(keysToDelete.toArray(new HoodieKey[keysToDelete.size()]), header));
|
||||
}
|
||||
@@ -497,4 +500,40 @@ public class HoodieAppendHandle<T extends HoodieRecordPayload, I, K, O> extends
|
||||
numberOfRecords = 0;
|
||||
}
|
||||
}
|
||||
|
||||
private HoodieLogBlock.HoodieLogBlockType pickLogDataBlockFormat() {
|
||||
Option<HoodieLogBlock.HoodieLogBlockType> logBlockTypeOpt = config.getLogDataBlockFormat();
|
||||
if (logBlockTypeOpt.isPresent()) {
|
||||
return logBlockTypeOpt.get();
|
||||
}
|
||||
|
||||
// Fallback to deduce data-block type based on the base file format
|
||||
switch (hoodieTable.getBaseFileFormat()) {
|
||||
case PARQUET:
|
||||
case ORC:
|
||||
return HoodieLogBlock.HoodieLogBlockType.AVRO_DATA_BLOCK;
|
||||
case HFILE:
|
||||
return HoodieLogBlock.HoodieLogBlockType.HFILE_DATA_BLOCK;
|
||||
default:
|
||||
throw new HoodieException("Base file format " + hoodieTable.getBaseFileFormat()
|
||||
+ " does not have associated log block type");
|
||||
}
|
||||
}
|
||||
|
||||
private static HoodieLogBlock getBlock(HoodieWriteConfig writeConfig,
|
||||
HoodieLogBlock.HoodieLogBlockType logDataBlockFormat,
|
||||
List<IndexedRecord> recordList,
|
||||
Map<HeaderMetadataType, String> header,
|
||||
String keyField) {
|
||||
switch (logDataBlockFormat) {
|
||||
case AVRO_DATA_BLOCK:
|
||||
return new HoodieAvroDataBlock(recordList, header, keyField);
|
||||
case HFILE_DATA_BLOCK:
|
||||
return new HoodieHFileDataBlock(recordList, header, writeConfig.getHFileCompressionAlgorithm());
|
||||
case PARQUET_DATA_BLOCK:
|
||||
return new HoodieParquetDataBlock(recordList, header, keyField, writeConfig.getParquetCompressionCodec());
|
||||
default:
|
||||
throw new HoodieException("Data block format " + logDataBlockFormat + " not implemented");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -51,13 +51,23 @@ public class HoodieParquetWriter<T extends HoodieRecordPayload, R extends Indexe
|
||||
private final TaskContextSupplier taskContextSupplier;
|
||||
private final boolean populateMetaFields;
|
||||
|
||||
public HoodieParquetWriter(String instantTime, Path file, HoodieAvroParquetConfig parquetConfig,
|
||||
Schema schema, TaskContextSupplier taskContextSupplier, boolean populateMetaFields) throws IOException {
|
||||
public HoodieParquetWriter(String instantTime,
|
||||
Path file,
|
||||
HoodieAvroParquetConfig parquetConfig,
|
||||
Schema schema,
|
||||
TaskContextSupplier taskContextSupplier,
|
||||
boolean populateMetaFields) throws IOException {
|
||||
super(HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()),
|
||||
ParquetFileWriter.Mode.CREATE, parquetConfig.getWriteSupport(), parquetConfig.getCompressionCodecName(),
|
||||
parquetConfig.getBlockSize(), parquetConfig.getPageSize(), parquetConfig.getPageSize(),
|
||||
parquetConfig.dictionaryEnabled(), DEFAULT_IS_VALIDATING_ENABLED,
|
||||
DEFAULT_WRITER_VERSION, FSUtils.registerFileSystem(file, parquetConfig.getHadoopConf()));
|
||||
ParquetFileWriter.Mode.CREATE,
|
||||
parquetConfig.getWriteSupport(),
|
||||
parquetConfig.getCompressionCodecName(),
|
||||
parquetConfig.getBlockSize(),
|
||||
parquetConfig.getPageSize(),
|
||||
parquetConfig.getPageSize(),
|
||||
parquetConfig.dictionaryEnabled(),
|
||||
DEFAULT_IS_VALIDATING_ENABLED,
|
||||
DEFAULT_WRITER_VERSION,
|
||||
FSUtils.registerFileSystem(file, parquetConfig.getHadoopConf()));
|
||||
this.file = HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf());
|
||||
this.fs =
|
||||
(HoodieWrapperFileSystem) this.file.getFileSystem(FSUtils.registerFileSystem(file, parquetConfig.getHadoopConf()));
|
||||
|
||||
@@ -18,7 +18,11 @@
|
||||
|
||||
package org.apache.hudi.table;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.specific.SpecificRecordBase;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hudi.avro.HoodieAvroUtils;
|
||||
import org.apache.hudi.avro.model.HoodieCleanMetadata;
|
||||
import org.apache.hudi.avro.model.HoodieCleanerPlan;
|
||||
@@ -44,7 +48,6 @@ import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.model.HoodieWriteStat;
|
||||
import org.apache.hudi.common.table.HoodieTableMetaClient;
|
||||
import org.apache.hudi.common.table.TableSchemaResolver;
|
||||
import org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType;
|
||||
import org.apache.hudi.common.table.timeline.HoodieActiveTimeline;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
@@ -69,11 +72,6 @@ import org.apache.hudi.table.action.HoodieWriteMetadata;
|
||||
import org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata;
|
||||
import org.apache.hudi.table.marker.WriteMarkers;
|
||||
import org.apache.hudi.table.marker.WriteMarkersFactory;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hudi.table.storage.HoodieLayoutFactory;
|
||||
import org.apache.hudi.table.storage.HoodieStorageLayout;
|
||||
import org.apache.log4j.LogManager;
|
||||
@@ -719,19 +717,6 @@ public abstract class HoodieTable<T extends HoodieRecordPayload, I, K, O> implem
|
||||
return metaClient.getTableConfig().getLogFileFormat();
|
||||
}
|
||||
|
||||
public HoodieLogBlockType getLogDataBlockFormat() {
|
||||
switch (getBaseFileFormat()) {
|
||||
case PARQUET:
|
||||
case ORC:
|
||||
return HoodieLogBlockType.AVRO_DATA_BLOCK;
|
||||
case HFILE:
|
||||
return HoodieLogBlockType.HFILE_DATA_BLOCK;
|
||||
default:
|
||||
throw new HoodieException("Base file format " + getBaseFileFormat()
|
||||
+ " does not have associated log block format");
|
||||
}
|
||||
}
|
||||
|
||||
public String getBaseFileExtension() {
|
||||
return getBaseFileFormat().getFileExtension();
|
||||
}
|
||||
|
||||
@@ -182,7 +182,7 @@ public class HoodieWriteableTestTable extends HoodieMetadataTestTable {
|
||||
LOG.warn("Failed to convert record " + r.toString(), e);
|
||||
return null;
|
||||
}
|
||||
}).collect(Collectors.toList()), header));
|
||||
}).collect(Collectors.toList()), header, HoodieRecord.RECORD_KEY_METADATA_FIELD));
|
||||
return Pair.of(partitionPath, logWriter.getLogFile());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
|
||||
package org.apache.hudi.testutils;
|
||||
|
||||
import org.apache.avro.generic.IndexedRecord;
|
||||
import org.apache.hudi.avro.HoodieAvroUtils;
|
||||
import org.apache.hudi.common.bloom.BloomFilter;
|
||||
import org.apache.hudi.common.bloom.BloomFilterFactory;
|
||||
@@ -39,6 +40,7 @@ import org.apache.hadoop.fs.Path;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
@@ -132,12 +134,12 @@ public class HoodieFlinkWriteableTestTable extends HoodieWriteableTestTable {
|
||||
try {
|
||||
GenericRecord val = (GenericRecord) r.getData().getInsertValue(schema).get();
|
||||
HoodieAvroUtils.addHoodieKeyToRecord(val, r.getRecordKey(), r.getPartitionPath(), "");
|
||||
return (org.apache.avro.generic.IndexedRecord) val;
|
||||
} catch (java.io.IOException e) {
|
||||
return (IndexedRecord) val;
|
||||
} catch (IOException e) {
|
||||
LOG.warn("Failed to convert record " + r.toString(), e);
|
||||
return null;
|
||||
}
|
||||
}).collect(Collectors.toList()), header));
|
||||
}).collect(Collectors.toList()), header, HoodieRecord.RECORD_KEY_METADATA_FIELD));
|
||||
return Pair.of(partitionPath, logWriter.getLogFile());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,17 +18,6 @@
|
||||
|
||||
package org.apache.hudi.avro;
|
||||
|
||||
import org.apache.hudi.common.config.SerializableSchema;
|
||||
import org.apache.hudi.common.model.HoodieOperation;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
import org.apache.hudi.exception.SchemaCompatibilityException;
|
||||
|
||||
import org.apache.avro.Conversions.DecimalConversion;
|
||||
import org.apache.avro.JsonProperties;
|
||||
import org.apache.avro.LogicalTypes;
|
||||
@@ -50,15 +39,22 @@ import org.apache.avro.io.EncoderFactory;
|
||||
import org.apache.avro.io.JsonDecoder;
|
||||
import org.apache.avro.io.JsonEncoder;
|
||||
import org.apache.avro.specific.SpecificRecordBase;
|
||||
import org.apache.hudi.common.config.SerializableSchema;
|
||||
import org.apache.hudi.common.model.HoodieOperation;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieRecordPayload;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.common.util.collection.Pair;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
import org.apache.hudi.exception.SchemaCompatibilityException;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.math.BigDecimal;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.sql.Timestamp;
|
||||
import java.time.LocalDate;
|
||||
import java.util.ArrayList;
|
||||
@@ -67,8 +63,6 @@ import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.zip.DeflaterOutputStream;
|
||||
import java.util.zip.InflaterInputStream;
|
||||
|
||||
/**
|
||||
* Helper class to do common stuff across Avro.
|
||||
@@ -343,7 +337,7 @@ public class HoodieAvroUtils {
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a avro record with a given schema, rewrites it into the new schema while setting fields only from the new
|
||||
* Given an Avro record with a given schema, rewrites it into the new schema while setting fields only from the new
|
||||
* schema.
|
||||
* NOTE: Here, the assumption is that you cannot go from an evolved schema (schema with (N) fields)
|
||||
* to an older schema (schema with (N-1) fields). All fields present in the older record schema MUST be present in the
|
||||
@@ -377,6 +371,16 @@ public class HoodieAvroUtils {
|
||||
return newRecord;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts list of {@link GenericRecord} provided into the {@link GenericRecord} adhering to the
|
||||
* provided {@code newSchema}.
|
||||
*
|
||||
* To better understand conversion rules please check {@link #rewriteRecord(GenericRecord, Schema)}
|
||||
*/
|
||||
public static List<GenericRecord> rewriteRecords(List<GenericRecord> records, Schema newSchema) {
|
||||
return records.stream().map(r -> rewriteRecord(r, newSchema)).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
private static void copyOldValueOrSetDefault(GenericRecord oldRecord, GenericRecord newRecord, Schema.Field f) {
|
||||
// cache the result of oldRecord.get() to save CPU expensive hash lookup
|
||||
Schema oldSchema = oldRecord.getSchema();
|
||||
@@ -392,33 +396,6 @@ public class HoodieAvroUtils {
|
||||
}
|
||||
}
|
||||
|
||||
public static byte[] compress(String text) {
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
try {
|
||||
OutputStream out = new DeflaterOutputStream(baos);
|
||||
out.write(text.getBytes(StandardCharsets.UTF_8));
|
||||
out.close();
|
||||
} catch (IOException e) {
|
||||
throw new HoodieIOException("IOException while compressing text " + text, e);
|
||||
}
|
||||
return baos.toByteArray();
|
||||
}
|
||||
|
||||
public static String decompress(byte[] bytes) {
|
||||
InputStream in = new InflaterInputStream(new ByteArrayInputStream(bytes));
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
try {
|
||||
byte[] buffer = new byte[8192];
|
||||
int len;
|
||||
while ((len = in.read(buffer)) > 0) {
|
||||
baos.write(buffer, 0, len);
|
||||
}
|
||||
return new String(baos.toByteArray(), StandardCharsets.UTF_8);
|
||||
} catch (IOException e) {
|
||||
throw new HoodieIOException("IOException while decompressing text", e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate a reader schema off the provided writeSchema, to just project out the provided columns.
|
||||
*/
|
||||
|
||||
@@ -18,11 +18,10 @@
|
||||
|
||||
package org.apache.hudi.avro;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.hudi.common.bloom.BloomFilter;
|
||||
import org.apache.hudi.common.bloom.HoodieDynamicBoundedBloomFilter;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.parquet.avro.AvroWriteSupport;
|
||||
import org.apache.parquet.hadoop.api.WriteSupport;
|
||||
import org.apache.parquet.schema.MessageType;
|
||||
|
||||
@@ -135,6 +135,17 @@ public class FSUtils {
|
||||
return providedPath;
|
||||
}
|
||||
|
||||
/**
|
||||
* Makes path qualified w/ {@link FileSystem}'s URI
|
||||
*
|
||||
* @param fs instance of {@link FileSystem} path belongs to
|
||||
* @param path path to be qualified
|
||||
* @return qualified path, prefixed w/ the URI of the target FS object provided
|
||||
*/
|
||||
public static Path makeQualified(FileSystem fs, Path path) {
|
||||
return path.makeQualified(fs.getUri(), fs.getWorkingDirectory());
|
||||
}
|
||||
|
||||
/**
|
||||
* A write token uniquely identifies an attempt at one of the IOHandle operations (Merge/Create/Append).
|
||||
*/
|
||||
|
||||
@@ -136,7 +136,7 @@ public class HoodieWrapperFileSystem extends FileSystem {
|
||||
}
|
||||
}
|
||||
|
||||
private static Path convertPathWithScheme(Path oldPath, String newScheme) {
|
||||
public static Path convertPathWithScheme(Path oldPath, String newScheme) {
|
||||
URI oldURI = oldPath.toUri();
|
||||
URI newURI;
|
||||
try {
|
||||
|
||||
@@ -19,10 +19,11 @@
|
||||
package org.apache.hudi.common.fs.inline;
|
||||
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hudi.common.util.ValidationUtils;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
import static org.apache.hudi.common.util.ValidationUtils.checkArgument;
|
||||
|
||||
/**
|
||||
* Utils to parse InLineFileSystem paths.
|
||||
* Inline FS format:
|
||||
@@ -61,10 +62,10 @@ public class InLineFSUtils {
|
||||
|
||||
/**
|
||||
* InlineFS Path format:
|
||||
* "inlinefs://path/to/outer/file/outer_file_schema/?start_offset=start_offset>&length=<length>"
|
||||
* "inlinefs://path/to/outer/file/outer_file_scheme/?start_offset=<start_offset>&length=<length>"
|
||||
* <p>
|
||||
* Outer File Path format:
|
||||
* "outer_file_schema://path/to/outer/file"
|
||||
* "outer_file_scheme://path/to/outer/file"
|
||||
* <p>
|
||||
* Example
|
||||
* Input: "inlinefs://file1/s3a/?start_offset=20&length=40".
|
||||
@@ -74,40 +75,48 @@ public class InLineFSUtils {
|
||||
* @return Outer file Path from the InLineFS Path
|
||||
*/
|
||||
public static Path getOuterFilePathFromInlinePath(Path inlineFSPath) {
|
||||
final String scheme = inlineFSPath.getParent().getName();
|
||||
assertInlineFSPath(inlineFSPath);
|
||||
|
||||
final String outerFileScheme = inlineFSPath.getParent().getName();
|
||||
final Path basePath = inlineFSPath.getParent().getParent();
|
||||
ValidationUtils.checkArgument(basePath.toString().contains(SCHEME_SEPARATOR),
|
||||
"Invalid InLineFSPath: " + inlineFSPath);
|
||||
checkArgument(basePath.toString().contains(SCHEME_SEPARATOR),
|
||||
"Invalid InLineFS path: " + inlineFSPath);
|
||||
|
||||
final String pathExceptScheme = basePath.toString().substring(basePath.toString().indexOf(SCHEME_SEPARATOR) + 1);
|
||||
final String fullPath = scheme + SCHEME_SEPARATOR
|
||||
+ (scheme.equals(LOCAL_FILESYSTEM_SCHEME) ? PATH_SEPARATOR : "")
|
||||
final String fullPath = outerFileScheme + SCHEME_SEPARATOR
|
||||
+ (outerFileScheme.equals(LOCAL_FILESYSTEM_SCHEME) ? PATH_SEPARATOR : "")
|
||||
+ pathExceptScheme;
|
||||
return new Path(fullPath);
|
||||
}
|
||||
|
||||
/**
|
||||
* Eg input : "inlinefs://file1/s3a/?start_offset=20&length=40".
|
||||
* output: 20
|
||||
* Returns start offset w/in the base for the block identified by the given InlineFS path
|
||||
*
|
||||
* @param inlinePath
|
||||
* @return
|
||||
* input: "inlinefs://file1/s3a/?start_offset=20&length=40".
|
||||
* output: 20
|
||||
*/
|
||||
public static int startOffset(Path inlinePath) {
|
||||
String[] slices = inlinePath.toString().split("[?&=]");
|
||||
public static int startOffset(Path inlineFSPath) {
|
||||
assertInlineFSPath(inlineFSPath);
|
||||
|
||||
String[] slices = inlineFSPath.toString().split("[?&=]");
|
||||
return Integer.parseInt(slices[slices.length - 3]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Eg input : "inlinefs:/file1/s3a/?start_offset=20&length=40".
|
||||
* Output: 40
|
||||
* Returns length of the block (embedded w/in the base file) identified by the given InlineFS path
|
||||
*
|
||||
* @param inlinePath
|
||||
* @return
|
||||
* input: "inlinefs:/file1/s3a/?start_offset=20&length=40".
|
||||
* output: 40
|
||||
*/
|
||||
public static int length(Path inlinePath) {
|
||||
assertInlineFSPath(inlinePath);
|
||||
|
||||
String[] slices = inlinePath.toString().split("[?&=]");
|
||||
return Integer.parseInt(slices[slices.length - 1]);
|
||||
}
|
||||
|
||||
private static void assertInlineFSPath(Path inlinePath) {
|
||||
String scheme = inlinePath.toUri().getScheme();
|
||||
checkArgument(InLineFileSystem.SCHEME.equals(scheme));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -57,6 +57,7 @@ public class InLineFileSystem extends FileSystem {
|
||||
return URI.create(getScheme());
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getScheme() {
|
||||
return SCHEME;
|
||||
}
|
||||
@@ -129,5 +130,4 @@ public class InLineFileSystem extends FileSystem {
|
||||
public boolean mkdirs(Path path, FsPermission fsPermission) throws IOException {
|
||||
throw new UnsupportedOperationException("Can't set working directory");
|
||||
}
|
||||
|
||||
}
|
||||
@@ -18,11 +18,10 @@
|
||||
|
||||
package org.apache.hudi.common.model;
|
||||
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.Serializable;
|
||||
@@ -60,7 +59,7 @@ public class HoodieLogFile implements Serializable {
|
||||
public HoodieLogFile(Path logPath) {
|
||||
this.fileStatus = null;
|
||||
this.pathStr = logPath.toString();
|
||||
this.fileLen = 0;
|
||||
this.fileLen = -1;
|
||||
}
|
||||
|
||||
public HoodieLogFile(Path logPath, Long fileLen) {
|
||||
|
||||
@@ -18,6 +18,11 @@
|
||||
|
||||
package org.apache.hudi.common.table;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex;
|
||||
import org.apache.hudi.common.bootstrap.index.NoOpBootstrapIndex;
|
||||
import org.apache.hudi.common.config.ConfigClassProperty;
|
||||
@@ -36,12 +41,6 @@ import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.ValidationUtils;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
import org.apache.hudi.keygen.constant.KeyGeneratorOptions;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
|
||||
@@ -30,6 +30,7 @@ import org.apache.hudi.common.table.log.block.HoodieDataBlock;
|
||||
import org.apache.hudi.common.table.log.block.HoodieDeleteBlock;
|
||||
import org.apache.hudi.common.table.log.block.HoodieHFileDataBlock;
|
||||
import org.apache.hudi.common.table.log.block.HoodieLogBlock;
|
||||
import org.apache.hudi.common.table.log.block.HoodieParquetDataBlock;
|
||||
import org.apache.hudi.common.table.timeline.HoodieTimeline;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.SpillableMapUtils;
|
||||
@@ -230,6 +231,7 @@ public abstract class AbstractHoodieLogRecordReader {
|
||||
switch (logBlock.getBlockType()) {
|
||||
case HFILE_DATA_BLOCK:
|
||||
case AVRO_DATA_BLOCK:
|
||||
case PARQUET_DATA_BLOCK:
|
||||
LOG.info("Reading a data block from file " + logFile.getPath() + " at instant "
|
||||
+ logBlock.getLogBlockHeader().get(INSTANT_TIME));
|
||||
if (isNewInstantBlock(logBlock) && !readBlocksLazily) {
|
||||
@@ -426,6 +428,9 @@ public abstract class AbstractHoodieLogRecordReader {
|
||||
case HFILE_DATA_BLOCK:
|
||||
processDataBlock((HoodieHFileDataBlock) lastBlock, keys);
|
||||
break;
|
||||
case PARQUET_DATA_BLOCK:
|
||||
processDataBlock((HoodieParquetDataBlock) lastBlock, keys);
|
||||
break;
|
||||
case DELETE_BLOCK:
|
||||
Arrays.stream(((HoodieDeleteBlock) lastBlock).getKeysToDelete()).forEach(this::processNextDeletedKey);
|
||||
break;
|
||||
|
||||
@@ -18,6 +18,13 @@
|
||||
|
||||
package org.apache.hudi.common.table.log;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.BufferedFSInputStream;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FSInputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.fs.SchemeAwareFSDataInputStream;
|
||||
import org.apache.hudi.common.fs.TimedFSDataInputStream;
|
||||
@@ -31,21 +38,15 @@ import org.apache.hudi.common.table.log.block.HoodieHFileDataBlock;
|
||||
import org.apache.hudi.common.table.log.block.HoodieLogBlock;
|
||||
import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType;
|
||||
import org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType;
|
||||
import org.apache.hudi.common.table.log.block.HoodieParquetDataBlock;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.ValidationUtils;
|
||||
import org.apache.hudi.exception.CorruptedLogFileException;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
import org.apache.hudi.exception.HoodieNotSupportedException;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.hadoop.fs.BufferedFSInputStream;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FSInputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.io.EOFException;
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
@@ -53,6 +54,9 @@ import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
|
||||
import static org.apache.hudi.common.util.ValidationUtils.checkArgument;
|
||||
import static org.apache.hudi.common.util.ValidationUtils.checkState;
|
||||
|
||||
/**
|
||||
* Scans a log file and provides a block-level iterator over it. Loads the entire block contents in memory. Can emit
|
||||
* either a DataBlock, CommandBlock, DeleteBlock or CorruptBlock (if one is found).
|
||||
@@ -63,6 +67,7 @@ public class HoodieLogFileReader implements HoodieLogFormat.Reader {
|
||||
private static final int BLOCK_SCAN_READ_BUFFER_SIZE = 1024 * 1024; // 1 MB
|
||||
private static final Logger LOG = LogManager.getLogger(HoodieLogFileReader.class);
|
||||
|
||||
private final Configuration hadoopConf;
|
||||
private final FSDataInputStream inputStream;
|
||||
private final HoodieLogFile logFile;
|
||||
private final byte[] magicBuffer = new byte[6];
|
||||
@@ -72,7 +77,7 @@ public class HoodieLogFileReader implements HoodieLogFormat.Reader {
|
||||
private long reverseLogFilePosition;
|
||||
private long lastReverseLogFilePosition;
|
||||
private boolean reverseReader;
|
||||
private boolean enableInlineReading;
|
||||
private boolean enableRecordLookups;
|
||||
private boolean closed = false;
|
||||
private transient Thread shutdownThread = null;
|
||||
|
||||
@@ -88,76 +93,26 @@ public class HoodieLogFileReader implements HoodieLogFormat.Reader {
|
||||
}
|
||||
|
||||
public HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema, int bufferSize,
|
||||
boolean readBlockLazily, boolean reverseReader, boolean enableInlineReading,
|
||||
boolean readBlockLazily, boolean reverseReader, boolean enableRecordLookups,
|
||||
String keyField) throws IOException {
|
||||
FSDataInputStream fsDataInputStream = fs.open(logFile.getPath(), bufferSize);
|
||||
this.logFile = logFile;
|
||||
this.inputStream = getFSDataInputStream(fsDataInputStream, fs, bufferSize);
|
||||
this.hadoopConf = fs.getConf();
|
||||
// NOTE: We repackage {@code HoodieLogFile} here to make sure that the provided path
|
||||
// is prefixed with an appropriate scheme given that we're not propagating the FS
|
||||
// further
|
||||
this.logFile = new HoodieLogFile(FSUtils.makeQualified(fs, logFile.getPath()), logFile.getFileSize());
|
||||
this.inputStream = getFSDataInputStream(fs, this.logFile, bufferSize);
|
||||
this.readerSchema = readerSchema;
|
||||
this.readBlockLazily = readBlockLazily;
|
||||
this.reverseReader = reverseReader;
|
||||
this.enableInlineReading = enableInlineReading;
|
||||
this.enableRecordLookups = enableRecordLookups;
|
||||
this.keyField = keyField;
|
||||
if (this.reverseReader) {
|
||||
this.reverseLogFilePosition = this.lastReverseLogFilePosition = logFile.getFileSize();
|
||||
this.reverseLogFilePosition = this.lastReverseLogFilePosition = this.logFile.getFileSize();
|
||||
}
|
||||
|
||||
addShutDownHook();
|
||||
}
|
||||
|
||||
/**
 * Convenience constructor that delegates to the six-argument overload, passing
 * {@code DEFAULT_BUFFER_SIZE} as the buffer size and {@code false} for both boolean flags.
 *
 * @param fs           file system the log file is read from
 * @param logFile      log file to read
 * @param readerSchema schema passed through to the delegate constructor
 * @throws IOException propagated from the delegate constructor
 */
public HoodieLogFileReader(FileSystem fs, HoodieLogFile logFile, Schema readerSchema) throws IOException {
|
||||
this(fs, logFile, readerSchema, DEFAULT_BUFFER_SIZE, false, false);
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch the right {@link FSDataInputStream} to be used by wrapping with required input streams.
|
||||
* @param fsDataInputStream original instance of {@link FSDataInputStream}.
|
||||
* @param fs instance of {@link FileSystem} in use.
|
||||
* @param bufferSize buffer size to be used.
|
||||
* @return the right {@link FSDataInputStream} as required.
|
||||
*/
|
||||
private FSDataInputStream getFSDataInputStream(FSDataInputStream fsDataInputStream, FileSystem fs, int bufferSize) {
|
||||
if (FSUtils.isGCSFileSystem(fs)) {
|
||||
// in GCS FS, we might need to interceptor seek offsets as we might get EOF exception
|
||||
return new SchemeAwareFSDataInputStream(getFSDataInputStreamForGCS(fsDataInputStream, bufferSize), true);
|
||||
}
|
||||
|
||||
if (fsDataInputStream.getWrappedStream() instanceof FSInputStream) {
|
||||
return new TimedFSDataInputStream(logFile.getPath(), new FSDataInputStream(
|
||||
new BufferedFSInputStream((FSInputStream) fsDataInputStream.getWrappedStream(), bufferSize)));
|
||||
}
|
||||
|
||||
// fsDataInputStream.getWrappedStream() maybe a BufferedFSInputStream
|
||||
// need to wrap in another BufferedFSInputStream the make bufferSize work?
|
||||
return fsDataInputStream;
|
||||
}
|
||||
|
||||
/**
|
||||
* GCS FileSystem needs some special handling for seek and hence this method assists to fetch the right {@link FSDataInputStream} to be
|
||||
* used by wrapping with required input streams.
|
||||
* @param fsDataInputStream original instance of {@link FSDataInputStream}.
|
||||
* @param bufferSize buffer size to be used.
|
||||
* @return the right {@link FSDataInputStream} as required.
|
||||
*/
|
||||
private FSDataInputStream getFSDataInputStreamForGCS(FSDataInputStream fsDataInputStream, int bufferSize) {
|
||||
// incase of GCS FS, there are two flows.
|
||||
// a. fsDataInputStream.getWrappedStream() instanceof FSInputStream
|
||||
// b. fsDataInputStream.getWrappedStream() not an instanceof FSInputStream, but an instance of FSDataInputStream.
|
||||
// (a) is handled in the first if block and (b) is handled in the second if block. If not, we fallback to original fsDataInputStream
|
||||
if (fsDataInputStream.getWrappedStream() instanceof FSInputStream) {
|
||||
return new TimedFSDataInputStream(logFile.getPath(), new FSDataInputStream(
|
||||
new BufferedFSInputStream((FSInputStream) fsDataInputStream.getWrappedStream(), bufferSize)));
|
||||
}
|
||||
|
||||
if (fsDataInputStream.getWrappedStream() instanceof FSDataInputStream
|
||||
&& ((FSDataInputStream) fsDataInputStream.getWrappedStream()).getWrappedStream() instanceof FSInputStream) {
|
||||
FSInputStream inputStream = (FSInputStream)((FSDataInputStream) fsDataInputStream.getWrappedStream()).getWrappedStream();
|
||||
return new TimedFSDataInputStream(logFile.getPath(),
|
||||
new FSDataInputStream(new BufferedFSInputStream(inputStream, bufferSize)));
|
||||
}
|
||||
|
||||
return fsDataInputStream;
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieLogFile getLogFile() {
|
||||
return logFile;
|
||||
@@ -181,15 +136,10 @@ public class HoodieLogFileReader implements HoodieLogFormat.Reader {
|
||||
// TODO : convert content and block length to long by using ByteBuffer, raw byte [] allows
|
||||
// for max of Integer size
|
||||
private HoodieLogBlock readBlock() throws IOException {
|
||||
|
||||
int blocksize;
|
||||
int type;
|
||||
HoodieLogBlockType blockType = null;
|
||||
Map<HeaderMetadataType, String> header = null;
|
||||
|
||||
int blockSize;
|
||||
try {
|
||||
// 1 Read the total size of the block
|
||||
blocksize = (int) inputStream.readLong();
|
||||
blockSize = (int) inputStream.readLong();
|
||||
} catch (EOFException | CorruptedLogFileException e) {
|
||||
// An exception reading any of the above indicates a corrupt block
|
||||
// Create a corrupt block by finding the next MAGIC marker or EOF
|
||||
@@ -197,9 +147,9 @@ public class HoodieLogFileReader implements HoodieLogFormat.Reader {
|
||||
}
|
||||
|
||||
// We may have had a crash which could have written this block partially
|
||||
// Skip blocksize in the stream and we should either find a sync marker (start of the next
|
||||
// Skip blockSize in the stream and we should either find a sync marker (start of the next
|
||||
// block) or EOF. If we did not find either of it, then this block is a corrupted block.
|
||||
boolean isCorrupted = isBlockCorrupt(blocksize);
|
||||
boolean isCorrupted = isBlockCorrupted(blockSize);
|
||||
if (isCorrupted) {
|
||||
return createCorruptBlock();
|
||||
}
|
||||
@@ -208,71 +158,85 @@ public class HoodieLogFileReader implements HoodieLogFormat.Reader {
|
||||
HoodieLogFormat.LogFormatVersion nextBlockVersion = readVersion();
|
||||
|
||||
// 3. Read the block type for a log block
|
||||
if (nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION) {
|
||||
type = inputStream.readInt();
|
||||
|
||||
ValidationUtils.checkArgument(type < HoodieLogBlockType.values().length, "Invalid block byte type found " + type);
|
||||
blockType = HoodieLogBlockType.values()[type];
|
||||
}
|
||||
HoodieLogBlockType blockType = tryReadBlockType(nextBlockVersion);
|
||||
|
||||
// 4. Read the header for a log block, if present
|
||||
if (nextBlockVersion.hasHeader()) {
|
||||
header = HoodieLogBlock.getLogMetadata(inputStream);
|
||||
}
|
||||
|
||||
int contentLength = blocksize;
|
||||
Map<HeaderMetadataType, String> header =
|
||||
nextBlockVersion.hasHeader() ? HoodieLogBlock.getLogMetadata(inputStream) : null;
|
||||
|
||||
// 5. Read the content length for the content
|
||||
if (nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION) {
|
||||
contentLength = (int) inputStream.readLong();
|
||||
}
|
||||
// Fallback to full-block size if no content-length
|
||||
// TODO replace w/ hasContentLength
|
||||
int contentLength =
|
||||
nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION ? (int) inputStream.readLong() : blockSize;
|
||||
|
||||
// 6. Read the content or skip content based on IO vs Memory trade-off by client
|
||||
// TODO - have a max block size and reuse this buffer in the ByteBuffer
|
||||
// (hard to guess max block size for now)
|
||||
long contentPosition = inputStream.getPos();
|
||||
byte[] content = HoodieLogBlock.readOrSkipContent(inputStream, contentLength, readBlockLazily);
|
||||
boolean shouldReadLazily = readBlockLazily && nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION;
|
||||
Option<byte[]> content = HoodieLogBlock.tryReadContent(inputStream, contentLength, shouldReadLazily);
|
||||
|
||||
// 7. Read footer if any
|
||||
Map<HeaderMetadataType, String> footer = null;
|
||||
if (nextBlockVersion.hasFooter()) {
|
||||
footer = HoodieLogBlock.getLogMetadata(inputStream);
|
||||
}
|
||||
Map<HeaderMetadataType, String> footer =
|
||||
nextBlockVersion.hasFooter() ? HoodieLogBlock.getLogMetadata(inputStream) : null;
|
||||
|
||||
// 8. Read log block length, if present. This acts as a reverse pointer when traversing a
|
||||
// log file in reverse
|
||||
@SuppressWarnings("unused")
|
||||
long logBlockLength = 0;
|
||||
if (nextBlockVersion.hasLogBlockLength()) {
|
||||
logBlockLength = inputStream.readLong();
|
||||
inputStream.readLong();
|
||||
}
|
||||
|
||||
// 9. Read the log block end position in the log file
|
||||
long blockEndPos = inputStream.getPos();
|
||||
|
||||
HoodieLogBlock.HoodieLogBlockContentLocation logBlockContentLoc =
|
||||
new HoodieLogBlock.HoodieLogBlockContentLocation(hadoopConf, logFile, contentPosition, contentLength, blockEndPos);
|
||||
|
||||
switch (Objects.requireNonNull(blockType)) {
|
||||
// based on type read the block
|
||||
case AVRO_DATA_BLOCK:
|
||||
if (nextBlockVersion.getVersion() == HoodieLogFormatVersion.DEFAULT_VERSION) {
|
||||
return HoodieAvroDataBlock.getBlock(content, readerSchema);
|
||||
return HoodieAvroDataBlock.getBlock(content.get(), readerSchema);
|
||||
} else {
|
||||
return new HoodieAvroDataBlock(logFile, inputStream, Option.ofNullable(content), readBlockLazily,
|
||||
contentPosition, contentLength, blockEndPos, readerSchema, header, footer, keyField);
|
||||
return new HoodieAvroDataBlock(inputStream, content, readBlockLazily, logBlockContentLoc,
|
||||
Option.ofNullable(readerSchema), header, footer, keyField);
|
||||
}
|
||||
|
||||
case HFILE_DATA_BLOCK:
|
||||
return new HoodieHFileDataBlock(logFile, inputStream, Option.ofNullable(content), readBlockLazily,
|
||||
contentPosition, contentLength, blockEndPos, readerSchema,
|
||||
header, footer, enableInlineReading, keyField);
|
||||
checkState(nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION,
|
||||
String.format("HFile block could not be of version (%d)", HoodieLogFormatVersion.DEFAULT_VERSION));
|
||||
|
||||
return new HoodieHFileDataBlock(inputStream, content, readBlockLazily, logBlockContentLoc,
|
||||
Option.ofNullable(readerSchema), header, footer, enableRecordLookups);
|
||||
|
||||
case PARQUET_DATA_BLOCK:
|
||||
checkState(nextBlockVersion.getVersion() != HoodieLogFormatVersion.DEFAULT_VERSION,
|
||||
String.format("Parquet block could not be of version (%d)", HoodieLogFormatVersion.DEFAULT_VERSION));
|
||||
|
||||
return new HoodieParquetDataBlock(inputStream, content, readBlockLazily, logBlockContentLoc,
|
||||
Option.ofNullable(readerSchema), header, footer, keyField);
|
||||
|
||||
case DELETE_BLOCK:
|
||||
return HoodieDeleteBlock.getBlock(logFile, inputStream, Option.ofNullable(content), readBlockLazily,
|
||||
contentPosition, contentLength, blockEndPos, header, footer);
|
||||
return new HoodieDeleteBlock(content, inputStream, readBlockLazily, Option.of(logBlockContentLoc), header, footer);
|
||||
|
||||
case COMMAND_BLOCK:
|
||||
return HoodieCommandBlock.getBlock(logFile, inputStream, Option.ofNullable(content), readBlockLazily,
|
||||
contentPosition, contentLength, blockEndPos, header, footer);
|
||||
return new HoodieCommandBlock(content, inputStream, readBlockLazily, Option.of(logBlockContentLoc), header, footer);
|
||||
|
||||
default:
|
||||
throw new HoodieNotSupportedException("Unsupported Block " + blockType);
|
||||
}
|
||||
}
|
||||
|
||||
@Nullable
|
||||
private HoodieLogBlockType tryReadBlockType(HoodieLogFormat.LogFormatVersion blockVersion) throws IOException {
|
||||
if (blockVersion.getVersion() == HoodieLogFormatVersion.DEFAULT_VERSION) {
|
||||
return null;
|
||||
}
|
||||
|
||||
int type = inputStream.readInt();
|
||||
checkArgument(type < HoodieLogBlockType.values().length, "Invalid block byte type found " + type);
|
||||
return HoodieLogBlockType.values()[type];
|
||||
}
|
||||
|
||||
private HoodieLogBlock createCorruptBlock() throws IOException {
|
||||
LOG.info("Log " + logFile + " has a corrupted block at " + inputStream.getPos());
|
||||
long currentPos = inputStream.getPos();
|
||||
@@ -282,12 +246,13 @@ public class HoodieLogFileReader implements HoodieLogFormat.Reader {
|
||||
LOG.info("Next available block in " + logFile + " starts at " + nextBlockOffset);
|
||||
int corruptedBlockSize = (int) (nextBlockOffset - currentPos);
|
||||
long contentPosition = inputStream.getPos();
|
||||
byte[] corruptedBytes = HoodieLogBlock.readOrSkipContent(inputStream, corruptedBlockSize, readBlockLazily);
|
||||
return HoodieCorruptBlock.getBlock(logFile, inputStream, Option.ofNullable(corruptedBytes), readBlockLazily,
|
||||
contentPosition, corruptedBlockSize, nextBlockOffset, new HashMap<>(), new HashMap<>());
|
||||
Option<byte[]> corruptedBytes = HoodieLogBlock.tryReadContent(inputStream, corruptedBlockSize, readBlockLazily);
|
||||
HoodieLogBlock.HoodieLogBlockContentLocation logBlockContentLoc =
|
||||
new HoodieLogBlock.HoodieLogBlockContentLocation(hadoopConf, logFile, contentPosition, corruptedBlockSize, nextBlockOffset);
|
||||
return new HoodieCorruptBlock(corruptedBytes, inputStream, readBlockLazily, Option.of(logBlockContentLoc), new HashMap<>(), new HashMap<>());
|
||||
}
|
||||
|
||||
private boolean isBlockCorrupt(int blocksize) throws IOException {
|
||||
private boolean isBlockCorrupted(int blocksize) throws IOException {
|
||||
long currentPos = inputStream.getPos();
|
||||
try {
|
||||
inputStream.seek(currentPos + blocksize);
|
||||
@@ -481,4 +446,59 @@ public class HoodieLogFileReader implements HoodieLogFormat.Reader {
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException("Remove not supported for HoodieLogFileReader");
|
||||
}
|
||||
|
||||
/**
|
||||
* Fetch the right {@link FSDataInputStream} to be used by wrapping with required input streams.
|
||||
* @param fs instance of {@link FileSystem} in use.
|
||||
* @param bufferSize buffer size to be used.
|
||||
* @return the right {@link FSDataInputStream} as required.
|
||||
*/
|
||||
private static FSDataInputStream getFSDataInputStream(FileSystem fs,
|
||||
HoodieLogFile logFile,
|
||||
int bufferSize) throws IOException {
|
||||
FSDataInputStream fsDataInputStream = fs.open(logFile.getPath(), bufferSize);
|
||||
|
||||
if (FSUtils.isGCSFileSystem(fs)) {
|
||||
// in GCS FS, we might need to interceptor seek offsets as we might get EOF exception
|
||||
return new SchemeAwareFSDataInputStream(getFSDataInputStreamForGCS(fsDataInputStream, logFile, bufferSize), true);
|
||||
}
|
||||
|
||||
if (fsDataInputStream.getWrappedStream() instanceof FSInputStream) {
|
||||
return new TimedFSDataInputStream(logFile.getPath(), new FSDataInputStream(
|
||||
new BufferedFSInputStream((FSInputStream) fsDataInputStream.getWrappedStream(), bufferSize)));
|
||||
}
|
||||
|
||||
// fsDataInputStream.getWrappedStream() maybe a BufferedFSInputStream
|
||||
// need to wrap in another BufferedFSInputStream the make bufferSize work?
|
||||
return fsDataInputStream;
|
||||
}
|
||||
|
||||
/**
|
||||
* GCS FileSystem needs some special handling for seek and hence this method assists to fetch the right {@link FSDataInputStream} to be
|
||||
* used by wrapping with required input streams.
|
||||
* @param fsDataInputStream original instance of {@link FSDataInputStream}.
|
||||
* @param bufferSize buffer size to be used.
|
||||
* @return the right {@link FSDataInputStream} as required.
|
||||
*/
|
||||
private static FSDataInputStream getFSDataInputStreamForGCS(FSDataInputStream fsDataInputStream,
|
||||
HoodieLogFile logFile,
|
||||
int bufferSize) {
|
||||
// incase of GCS FS, there are two flows.
|
||||
// a. fsDataInputStream.getWrappedStream() instanceof FSInputStream
|
||||
// b. fsDataInputStream.getWrappedStream() not an instanceof FSInputStream, but an instance of FSDataInputStream.
|
||||
// (a) is handled in the first if block and (b) is handled in the second if block. If not, we fallback to original fsDataInputStream
|
||||
if (fsDataInputStream.getWrappedStream() instanceof FSInputStream) {
|
||||
return new TimedFSDataInputStream(logFile.getPath(), new FSDataInputStream(
|
||||
new BufferedFSInputStream((FSInputStream) fsDataInputStream.getWrappedStream(), bufferSize)));
|
||||
}
|
||||
|
||||
if (fsDataInputStream.getWrappedStream() instanceof FSDataInputStream
|
||||
&& ((FSDataInputStream) fsDataInputStream.getWrappedStream()).getWrappedStream() instanceof FSInputStream) {
|
||||
FSInputStream inputStream = (FSInputStream)((FSDataInputStream) fsDataInputStream.getWrappedStream()).getWrappedStream();
|
||||
return new TimedFSDataInputStream(logFile.getPath(),
|
||||
new FSDataInputStream(new BufferedFSInputStream(inputStream, bufferSize)));
|
||||
}
|
||||
|
||||
return fsDataInputStream;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -60,13 +60,6 @@ public class HoodieLogFormatWriter implements HoodieLogFormat.Writer {
|
||||
|
||||
private static final String APPEND_UNAVAILABLE_EXCEPTION_MESSAGE = "not sufficiently replicated yet";
|
||||
|
||||
/**
|
||||
* @param fs
|
||||
* @param logFile
|
||||
* @param bufferSize
|
||||
* @param replication
|
||||
* @param sizeThreshold
|
||||
*/
|
||||
HoodieLogFormatWriter(FileSystem fs, HoodieLogFile logFile, Integer bufferSize, Short replication, Long sizeThreshold, String rolloverLogWriteToken) {
|
||||
this.fs = fs;
|
||||
this.logFile = logFile;
|
||||
|
||||
@@ -18,13 +18,6 @@
|
||||
|
||||
package org.apache.hudi.common.table.log.block;
|
||||
|
||||
import org.apache.hudi.avro.HoodieAvroUtils;
|
||||
import org.apache.hudi.common.fs.SizeAwareDataInputStream;
|
||||
import org.apache.hudi.common.model.HoodieLogFile;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.GenericDatumReader;
|
||||
import org.apache.avro.generic.GenericDatumWriter;
|
||||
@@ -36,59 +29,64 @@ import org.apache.avro.io.DecoderFactory;
|
||||
import org.apache.avro.io.Encoder;
|
||||
import org.apache.avro.io.EncoderFactory;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hudi.common.fs.SizeAwareDataInputStream;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
|
||||
import javax.annotation.Nonnull;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.DataInputStream;
|
||||
import java.io.DataOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.zip.DeflaterOutputStream;
|
||||
import java.util.zip.InflaterInputStream;
|
||||
|
||||
import javax.annotation.Nonnull;
|
||||
import static org.apache.hudi.common.util.ValidationUtils.checkState;
|
||||
|
||||
/**
|
||||
* HoodieAvroDataBlock contains a list of records serialized using Avro. It is used with the Parquet base file format.
|
||||
*/
|
||||
public class HoodieAvroDataBlock extends HoodieDataBlock {
|
||||
|
||||
private ThreadLocal<BinaryEncoder> encoderCache = new ThreadLocal<>();
|
||||
private ThreadLocal<BinaryDecoder> decoderCache = new ThreadLocal<>();
|
||||
private final ThreadLocal<BinaryEncoder> encoderCache = new ThreadLocal<>();
|
||||
private final ThreadLocal<BinaryDecoder> decoderCache = new ThreadLocal<>();
|
||||
|
||||
public HoodieAvroDataBlock(@Nonnull Map<HeaderMetadataType, String> logBlockHeader,
|
||||
@Nonnull Map<HeaderMetadataType, String> logBlockFooter,
|
||||
@Nonnull Option<HoodieLogBlockContentLocation> blockContentLocation, @Nonnull Option<byte[]> content,
|
||||
FSDataInputStream inputStream, boolean readBlockLazily) {
|
||||
super(logBlockHeader, logBlockFooter, blockContentLocation, content, inputStream, readBlockLazily);
|
||||
public HoodieAvroDataBlock(FSDataInputStream inputStream,
|
||||
Option<byte[]> content,
|
||||
boolean readBlockLazily,
|
||||
HoodieLogBlockContentLocation logBlockContentLocation,
|
||||
Option<Schema> readerSchema,
|
||||
Map<HeaderMetadataType, String> header,
|
||||
Map<HeaderMetadataType, String> footer,
|
||||
String keyField) {
|
||||
super(content, inputStream, readBlockLazily, Option.of(logBlockContentLocation), readerSchema, header, footer, keyField, false);
|
||||
}
|
||||
|
||||
/**
 * Creates a block from positional information: {@code logFile}, {@code position}, {@code blockSize} and
 * {@code blockEndpos} are packed into a {@link HoodieLogBlockContentLocation}, and everything else is
 * forwarded unchanged to the superclass constructor.
 */
public HoodieAvroDataBlock(HoodieLogFile logFile, FSDataInputStream inputStream, Option<byte[]> content,
|
||||
boolean readBlockLazily, long position, long blockSize, long blockEndpos, Schema readerSchema,
|
||||
Map<HeaderMetadataType, String> header, Map<HeaderMetadataType, String> footer, String keyField) {
|
||||
super(content, inputStream, readBlockLazily,
|
||||
Option.of(new HoodieLogBlockContentLocation(logFile, position, blockSize, blockEndpos)), readerSchema, header,
|
||||
footer, keyField);
|
||||
}
|
||||
|
||||
public HoodieAvroDataBlock(@Nonnull List<IndexedRecord> records, @Nonnull Map<HeaderMetadataType,
|
||||
String> header, String keyField) {
|
||||
public HoodieAvroDataBlock(@Nonnull List<IndexedRecord> records,
|
||||
@Nonnull Map<HeaderMetadataType, String> header,
|
||||
@Nonnull String keyField
|
||||
) {
|
||||
super(records, header, new HashMap<>(), keyField);
|
||||
}
|
||||
|
||||
/**
 * Creates a block from in-memory records with an empty footer, using
 * {@code HoodieRecord.RECORD_KEY_METADATA_FIELD} as the key field.
 *
 * @param records records held by this block
 * @param header  block header metadata
 */
public HoodieAvroDataBlock(@Nonnull List<IndexedRecord> records, @Nonnull Map<HeaderMetadataType, String> header) {
|
||||
super(records, header, new HashMap<>(), HoodieRecord.RECORD_KEY_METADATA_FIELD);
|
||||
}
|
||||
|
||||
/**
 * @return {@code HoodieLogBlockType.AVRO_DATA_BLOCK}, the type tag of this block implementation
 */
@Override
|
||||
public HoodieLogBlockType getBlockType() {
|
||||
return HoodieLogBlockType.AVRO_DATA_BLOCK;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected byte[] serializeRecords() throws IOException {
|
||||
protected byte[] serializeRecords(List<IndexedRecord> records) throws IOException {
|
||||
Schema schema = new Schema.Parser().parse(super.getLogBlockHeader().get(HeaderMetadataType.SCHEMA));
|
||||
GenericDatumWriter<IndexedRecord> writer = new GenericDatumWriter<>(schema);
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
@@ -118,7 +116,6 @@ public class HoodieAvroDataBlock extends HoodieDataBlock {
|
||||
output.writeInt(size);
|
||||
// Write the content
|
||||
output.write(temp.toByteArray());
|
||||
itr.remove();
|
||||
} catch (IOException e) {
|
||||
throw new HoodieIOException("IOException converting HoodieAvroDataBlock to bytes", e);
|
||||
}
|
||||
@@ -130,9 +127,11 @@ public class HoodieAvroDataBlock extends HoodieDataBlock {
|
||||
// TODO (na) - Break down content into smaller chunks of byte [] to be GC as they are used
|
||||
// TODO (na) - Implement a recordItr instead of recordList
|
||||
@Override
|
||||
protected void deserializeRecords() throws IOException {
|
||||
protected List<IndexedRecord> deserializeRecords(byte[] content) throws IOException {
|
||||
checkState(readerSchema != null, "Reader's schema has to be non-null");
|
||||
|
||||
SizeAwareDataInputStream dis =
|
||||
new SizeAwareDataInputStream(new DataInputStream(new ByteArrayInputStream(getContent().get())));
|
||||
new SizeAwareDataInputStream(new DataInputStream(new ByteArrayInputStream(content)));
|
||||
|
||||
// 1. Read version for this data block
|
||||
int version = dis.readInt();
|
||||
@@ -141,12 +140,8 @@ public class HoodieAvroDataBlock extends HoodieDataBlock {
|
||||
// Get schema from the header
|
||||
Schema writerSchema = new Schema.Parser().parse(super.getLogBlockHeader().get(HeaderMetadataType.SCHEMA));
|
||||
|
||||
// If readerSchema was not present, use writerSchema
|
||||
if (schema == null) {
|
||||
schema = writerSchema;
|
||||
}
|
||||
GenericDatumReader<IndexedRecord> reader = new GenericDatumReader<>(writerSchema, readerSchema);
|
||||
|
||||
GenericDatumReader<IndexedRecord> reader = new GenericDatumReader<>(writerSchema, schema);
|
||||
// 2. Get the total records
|
||||
int totalRecords = 0;
|
||||
if (logBlockVersion.hasRecordCount()) {
|
||||
@@ -157,17 +152,17 @@ public class HoodieAvroDataBlock extends HoodieDataBlock {
|
||||
// 3. Read the content
|
||||
for (int i = 0; i < totalRecords; i++) {
|
||||
int recordLength = dis.readInt();
|
||||
BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(getContent().get(), dis.getNumberOfBytesRead(),
|
||||
BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(content, dis.getNumberOfBytesRead(),
|
||||
recordLength, decoderCache.get());
|
||||
decoderCache.set(decoder);
|
||||
IndexedRecord record = reader.read(null, decoder);
|
||||
records.add(record);
|
||||
dis.skipBytes(recordLength);
|
||||
}
|
||||
|
||||
dis.close();
|
||||
this.records = records;
|
||||
// Free up content to be GC'd, deflate
|
||||
deflate();
|
||||
|
||||
return records;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------------------
|
||||
@@ -183,9 +178,7 @@ public class HoodieAvroDataBlock extends HoodieDataBlock {
|
||||
*/
|
||||
@Deprecated
|
||||
public HoodieAvroDataBlock(List<IndexedRecord> records, Schema schema) {
|
||||
super(new HashMap<>(), new HashMap<>(), Option.empty(), Option.empty(), null, false);
|
||||
this.records = records;
|
||||
this.schema = schema;
|
||||
super(records, Collections.singletonMap(HeaderMetadataType.SCHEMA, schema.toString()), new HashMap<>(), HoodieRecord.RECORD_KEY_METADATA_FIELD);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -201,7 +194,7 @@ public class HoodieAvroDataBlock extends HoodieDataBlock {
|
||||
int schemaLength = dis.readInt();
|
||||
byte[] compressedSchema = new byte[schemaLength];
|
||||
dis.readFully(compressedSchema, 0, schemaLength);
|
||||
Schema writerSchema = new Schema.Parser().parse(HoodieAvroUtils.decompress(compressedSchema));
|
||||
Schema writerSchema = new Schema.Parser().parse(decompress(compressedSchema));
|
||||
|
||||
if (readerSchema == null) {
|
||||
readerSchema = writerSchema;
|
||||
@@ -224,6 +217,33 @@ public class HoodieAvroDataBlock extends HoodieDataBlock {
|
||||
return new HoodieAvroDataBlock(records, readerSchema);
|
||||
}
|
||||
|
||||
private static byte[] compress(String text) {
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
try {
|
||||
OutputStream out = new DeflaterOutputStream(baos);
|
||||
out.write(text.getBytes(StandardCharsets.UTF_8));
|
||||
out.close();
|
||||
} catch (IOException e) {
|
||||
throw new HoodieIOException("IOException while compressing text " + text, e);
|
||||
}
|
||||
return baos.toByteArray();
|
||||
}
|
||||
|
||||
private static String decompress(byte[] bytes) {
|
||||
InputStream in = new InflaterInputStream(new ByteArrayInputStream(bytes));
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
try {
|
||||
byte[] buffer = new byte[8192];
|
||||
int len;
|
||||
while ((len = in.read(buffer)) > 0) {
|
||||
baos.write(buffer, 0, len);
|
||||
}
|
||||
return new String(baos.toByteArray(), StandardCharsets.UTF_8);
|
||||
} catch (IOException e) {
|
||||
throw new HoodieIOException("IOException while decompressing text", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public byte[] getBytes(Schema schema) throws IOException {
|
||||
|
||||
@@ -232,10 +252,12 @@ public class HoodieAvroDataBlock extends HoodieDataBlock {
|
||||
DataOutputStream output = new DataOutputStream(baos);
|
||||
|
||||
// 2. Compress and Write schema out
|
||||
byte[] schemaContent = HoodieAvroUtils.compress(schema.toString());
|
||||
byte[] schemaContent = compress(schema.toString());
|
||||
output.writeInt(schemaContent.length);
|
||||
output.write(schemaContent);
|
||||
|
||||
List<IndexedRecord> records = getRecords();
|
||||
|
||||
// 3. Write total number of records
|
||||
output.writeInt(records.size());
|
||||
|
||||
|
||||
@@ -18,7 +18,6 @@
|
||||
|
||||
package org.apache.hudi.common.table.log.block;
|
||||
|
||||
import org.apache.hudi.common.model.HoodieLogFile;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
@@ -44,7 +43,7 @@ public class HoodieCommandBlock extends HoodieLogBlock {
|
||||
this(Option.empty(), null, false, Option.empty(), header, new HashMap<>());
|
||||
}
|
||||
|
||||
private HoodieCommandBlock(Option<byte[]> content, FSDataInputStream inputStream, boolean readBlockLazily,
|
||||
public HoodieCommandBlock(Option<byte[]> content, FSDataInputStream inputStream, boolean readBlockLazily,
|
||||
Option<HoodieLogBlockContentLocation> blockContentLocation, Map<HeaderMetadataType, String> header,
|
||||
Map<HeaderMetadataType, String> footer) {
|
||||
super(header, footer, blockContentLocation, content, inputStream, readBlockLazily);
|
||||
@@ -65,12 +64,4 @@ public class HoodieCommandBlock extends HoodieLogBlock {
|
||||
public byte[] getContentBytes() {
|
||||
return new byte[0];
|
||||
}
|
||||
|
||||
public static HoodieLogBlock getBlock(HoodieLogFile logFile, FSDataInputStream inputStream, Option<byte[]> content,
|
||||
boolean readBlockLazily, long position, long blockSize, long blockEndPos, Map<HeaderMetadataType, String> header,
|
||||
Map<HeaderMetadataType, String> footer) {
|
||||
|
||||
return new HoodieCommandBlock(content, inputStream, readBlockLazily,
|
||||
Option.of(new HoodieLogBlockContentLocation(logFile, position, blockSize, blockEndPos)), header, footer);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,7 +18,6 @@
|
||||
|
||||
package org.apache.hudi.common.table.log.block;
|
||||
|
||||
import org.apache.hudi.common.model.HoodieLogFile;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
@@ -32,7 +31,7 @@ import java.util.Map;
|
||||
*/
|
||||
public class HoodieCorruptBlock extends HoodieLogBlock {
|
||||
|
||||
private HoodieCorruptBlock(Option<byte[]> corruptedBytes, FSDataInputStream inputStream, boolean readBlockLazily,
|
||||
public HoodieCorruptBlock(Option<byte[]> corruptedBytes, FSDataInputStream inputStream, boolean readBlockLazily,
|
||||
Option<HoodieLogBlockContentLocation> blockContentLocation, Map<HeaderMetadataType, String> header,
|
||||
Map<HeaderMetadataType, String> footer) {
|
||||
super(header, footer, blockContentLocation, corruptedBytes, inputStream, readBlockLazily);
|
||||
@@ -40,7 +39,6 @@ public class HoodieCorruptBlock extends HoodieLogBlock {
|
||||
|
||||
@Override
|
||||
public byte[] getContentBytes() throws IOException {
|
||||
|
||||
if (!getContent().isPresent() && readBlockLazily) {
|
||||
// read content from disk
|
||||
inflate();
|
||||
@@ -53,11 +51,4 @@ public class HoodieCorruptBlock extends HoodieLogBlock {
|
||||
return HoodieLogBlockType.CORRUPT_BLOCK;
|
||||
}
|
||||
|
||||
public static HoodieLogBlock getBlock(HoodieLogFile logFile, FSDataInputStream inputStream,
|
||||
Option<byte[]> corruptedBytes, boolean readBlockLazily, long position, long blockSize, long blockEndPos,
|
||||
Map<HeaderMetadataType, String> header, Map<HeaderMetadataType, String> footer) {
|
||||
|
||||
return new HoodieCorruptBlock(corruptedBytes, inputStream, readBlockLazily,
|
||||
Option.of(new HoodieLogBlockContentLocation(logFile, position, blockSize, blockEndPos)), header, footer);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,25 +18,24 @@
|
||||
|
||||
package org.apache.hudi.common.table.log.block;
|
||||
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.IndexedRecord;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
|
||||
import javax.annotation.Nonnull;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.apache.hudi.common.util.ValidationUtils.checkState;
|
||||
|
||||
/**
|
||||
* DataBlock contains a list of records serialized using formats compatible with the base file format.
|
||||
* For each base file format there is a corresponding DataBlock format.
|
||||
*
|
||||
* <p>
|
||||
* The Datablock contains:
|
||||
* 1. Data Block version
|
||||
* 2. Total number of records in the block
|
||||
@@ -44,125 +43,151 @@ import java.util.Map;
|
||||
*/
|
||||
public abstract class HoodieDataBlock extends HoodieLogBlock {
|
||||
|
||||
protected List<IndexedRecord> records;
|
||||
protected Schema schema;
|
||||
protected String keyField;
|
||||
// TODO rebase records/content to leverage Either to warrant
|
||||
// that they are mutex (used by read/write flows respectively)
|
||||
private Option<List<IndexedRecord>> records;
|
||||
|
||||
public HoodieDataBlock(@Nonnull Map<HeaderMetadataType, String> logBlockHeader,
|
||||
@Nonnull Map<HeaderMetadataType, String> logBlockFooter,
|
||||
@Nonnull Option<HoodieLogBlockContentLocation> blockContentLocation, @Nonnull Option<byte[]> content,
|
||||
FSDataInputStream inputStream, boolean readBlockLazily) {
|
||||
super(logBlockHeader, logBlockFooter, blockContentLocation, content, inputStream, readBlockLazily);
|
||||
this.keyField = HoodieRecord.RECORD_KEY_METADATA_FIELD;
|
||||
}
|
||||
/**
|
||||
* Key field's name w/in the record's schema
|
||||
*/
|
||||
private final String keyFieldName;
|
||||
|
||||
public HoodieDataBlock(@Nonnull List<IndexedRecord> records, @Nonnull Map<HeaderMetadataType, String> header,
|
||||
@Nonnull Map<HeaderMetadataType, String> footer, String keyField) {
|
||||
this(header, footer, Option.empty(), Option.empty(), null, false);
|
||||
this.records = records;
|
||||
this.schema = new Schema.Parser().parse(super.getLogBlockHeader().get(HeaderMetadataType.SCHEMA));
|
||||
this.keyField = keyField;
|
||||
}
|
||||
private final boolean enablePointLookups;
|
||||
|
||||
protected HoodieDataBlock(Option<byte[]> content, @Nonnull FSDataInputStream inputStream, boolean readBlockLazily,
|
||||
Option<HoodieLogBlockContentLocation> blockContentLocation, Schema readerSchema,
|
||||
@Nonnull Map<HeaderMetadataType, String> headers, @Nonnull Map<HeaderMetadataType,
|
||||
String> footer, String keyField) {
|
||||
this(headers, footer, blockContentLocation, content, inputStream, readBlockLazily);
|
||||
this.schema = readerSchema;
|
||||
this.keyField = keyField;
|
||||
protected final Schema readerSchema;
|
||||
|
||||
/**
|
||||
* NOTE: This ctor is used on the write-path (ie when records ought to be written into the log)
|
||||
*/
|
||||
public HoodieDataBlock(List<IndexedRecord> records,
|
||||
Map<HeaderMetadataType, String> header,
|
||||
Map<HeaderMetadataType, String> footer,
|
||||
String keyFieldName) {
|
||||
super(header, footer, Option.empty(), Option.empty(), null, false);
|
||||
this.records = Option.of(records);
|
||||
this.keyFieldName = keyFieldName;
|
||||
// If no reader-schema has been provided assume writer-schema as one
|
||||
this.readerSchema = getWriterSchema(super.getLogBlockHeader());
|
||||
this.enablePointLookups = false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Util method to get a data block for the requested type.
|
||||
*
|
||||
* @param logDataBlockFormat - Data block type
|
||||
* @param recordList - List of records that goes in the data block
|
||||
* @param header - data block header
|
||||
* @return Data block of the requested type.
|
||||
* NOTE: This ctor is used on the write-path (ie when records ought to be written into the log)
|
||||
*/
|
||||
public static HoodieLogBlock getBlock(HoodieLogBlockType logDataBlockFormat, List<IndexedRecord> recordList,
|
||||
Map<HeaderMetadataType, String> header) {
|
||||
return getBlock(logDataBlockFormat, recordList, header, HoodieRecord.RECORD_KEY_METADATA_FIELD);
|
||||
}
|
||||
|
||||
/**
|
||||
* Util method to get a data block for the requested type.
|
||||
*
|
||||
* @param logDataBlockFormat - Data block type
|
||||
* @param recordList - List of records that goes in the data block
|
||||
* @param header - data block header
|
||||
* @param keyField - FieldId to get the key from the records
|
||||
* @return Data block of the requested type.
|
||||
*/
|
||||
public static HoodieLogBlock getBlock(HoodieLogBlockType logDataBlockFormat, List<IndexedRecord> recordList,
|
||||
Map<HeaderMetadataType, String> header, String keyField) {
|
||||
switch (logDataBlockFormat) {
|
||||
case AVRO_DATA_BLOCK:
|
||||
return new HoodieAvroDataBlock(recordList, header, keyField);
|
||||
case HFILE_DATA_BLOCK:
|
||||
return new HoodieHFileDataBlock(recordList, header, keyField);
|
||||
default:
|
||||
throw new HoodieException("Data block format " + logDataBlockFormat + " not implemented");
|
||||
}
|
||||
protected HoodieDataBlock(Option<byte[]> content,
|
||||
FSDataInputStream inputStream,
|
||||
boolean readBlockLazily,
|
||||
Option<HoodieLogBlockContentLocation> blockContentLocation,
|
||||
Option<Schema> readerSchema,
|
||||
Map<HeaderMetadataType, String> headers,
|
||||
Map<HeaderMetadataType, String> footer,
|
||||
String keyFieldName,
|
||||
boolean enablePointLookups) {
|
||||
super(headers, footer, blockContentLocation, content, inputStream, readBlockLazily);
|
||||
this.records = Option.empty();
|
||||
this.keyFieldName = keyFieldName;
|
||||
// If no reader-schema has been provided assume writer-schema as one
|
||||
this.readerSchema = readerSchema.orElseGet(() -> getWriterSchema(super.getLogBlockHeader()));
|
||||
this.enablePointLookups = enablePointLookups;
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte[] getContentBytes() throws IOException {
|
||||
// In case this method is called before realizing records from content
|
||||
if (getContent().isPresent()) {
|
||||
return getContent().get();
|
||||
} else if (readBlockLazily && !getContent().isPresent() && records == null) {
|
||||
// read block lazily
|
||||
createRecordsFromContentBytes();
|
||||
Option<byte[]> content = getContent();
|
||||
|
||||
checkState(content.isPresent() || records.isPresent(), "Block is in invalid state");
|
||||
|
||||
if (content.isPresent()) {
|
||||
return content.get();
|
||||
}
|
||||
|
||||
return serializeRecords();
|
||||
return serializeRecords(records.get());
|
||||
}
|
||||
|
||||
public abstract HoodieLogBlockType getBlockType();
|
||||
protected static Schema getWriterSchema(Map<HeaderMetadataType, String> logBlockHeader) {
|
||||
return new Schema.Parser().parse(logBlockHeader.get(HeaderMetadataType.SCHEMA));
|
||||
}
|
||||
|
||||
public List<IndexedRecord> getRecords() {
|
||||
if (records == null) {
|
||||
/**
|
||||
* Returns all the records contained w/in this block
|
||||
*/
|
||||
public final List<IndexedRecord> getRecords() {
|
||||
if (!records.isPresent()) {
|
||||
try {
|
||||
// in case records are absent, read content lazily and then convert to IndexedRecords
|
||||
createRecordsFromContentBytes();
|
||||
records = Option.of(readRecordsFromBlockPayload());
|
||||
} catch (IOException io) {
|
||||
throw new HoodieIOException("Unable to convert content bytes to records", io);
|
||||
}
|
||||
}
|
||||
return records;
|
||||
return records.get();
|
||||
}
|
||||
|
||||
public Schema getSchema() {
|
||||
return readerSchema;
|
||||
}
|
||||
|
||||
/**
|
||||
* Batch get of keys of interest. Implementation can choose to either do full scan and return matched entries or
|
||||
* do a seek based parsing and return matched entries.
|
||||
*
|
||||
* @param keys keys of interest.
|
||||
* @return List of IndexedRecords for the keys of interest.
|
||||
* @throws IOException
|
||||
* @throws IOException in case of failures encountered when reading/parsing records
|
||||
*/
|
||||
public List<IndexedRecord> getRecords(List<String> keys) throws IOException {
|
||||
throw new UnsupportedOperationException("On demand batch get based on interested keys not supported");
|
||||
public final List<IndexedRecord> getRecords(List<String> keys) throws IOException {
|
||||
boolean fullScan = keys.isEmpty();
|
||||
if (enablePointLookups && !fullScan) {
|
||||
return lookupRecords(keys);
|
||||
}
|
||||
|
||||
public Schema getSchema() {
|
||||
// if getSchema was invoked before converting byte [] to records
|
||||
if (records == null) {
|
||||
getRecords();
|
||||
}
|
||||
return schema;
|
||||
// Otherwise, we fetch all the records and filter out all the records, but the
|
||||
// ones requested
|
||||
List<IndexedRecord> allRecords = getRecords();
|
||||
if (fullScan) {
|
||||
return allRecords;
|
||||
}
|
||||
|
||||
protected void createRecordsFromContentBytes() throws IOException {
|
||||
HashSet<String> keySet = new HashSet<>(keys);
|
||||
return allRecords.stream()
|
||||
.filter(record -> keySet.contains(getRecordKey(record).orElse(null)))
|
||||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
protected List<IndexedRecord> readRecordsFromBlockPayload() throws IOException {
|
||||
if (readBlockLazily && !getContent().isPresent()) {
|
||||
// read log block contents from disk
|
||||
inflate();
|
||||
}
|
||||
|
||||
deserializeRecords();
|
||||
try {
|
||||
return deserializeRecords(getContent().get());
|
||||
} finally {
|
||||
// Free up content to be GC'd by deflating the block
|
||||
deflate();
|
||||
}
|
||||
}
|
||||
|
||||
protected abstract byte[] serializeRecords() throws IOException;
|
||||
|
||||
protected abstract void deserializeRecords() throws IOException;
|
||||
protected List<IndexedRecord> lookupRecords(List<String> keys) throws IOException {
|
||||
throw new UnsupportedOperationException(
|
||||
String.format("Point lookups are not supported by this Data block type (%s)", getBlockType())
|
||||
);
|
||||
}
|
||||
|
||||
protected abstract byte[] serializeRecords(List<IndexedRecord> records) throws IOException;
|
||||
|
||||
protected abstract List<IndexedRecord> deserializeRecords(byte[] content) throws IOException;
|
||||
|
||||
public abstract HoodieLogBlockType getBlockType();
|
||||
|
||||
protected Option<Schema.Field> getKeyField(Schema schema) {
|
||||
return Option.ofNullable(schema.getField(keyFieldName));
|
||||
}
|
||||
|
||||
protected Option<String> getRecordKey(IndexedRecord record) {
|
||||
return getKeyField(record.getSchema())
|
||||
.map(keyField -> record.get(keyField.pos()))
|
||||
.map(Object::toString);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,7 +20,6 @@ package org.apache.hudi.common.table.log.block;
|
||||
|
||||
import org.apache.hudi.common.fs.SizeAwareDataInputStream;
|
||||
import org.apache.hudi.common.model.HoodieKey;
|
||||
import org.apache.hudi.common.model.HoodieLogFile;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.SerializationUtils;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
@@ -47,7 +46,7 @@ public class HoodieDeleteBlock extends HoodieLogBlock {
|
||||
this.keysToDelete = keysToDelete;
|
||||
}
|
||||
|
||||
private HoodieDeleteBlock(Option<byte[]> content, FSDataInputStream inputStream, boolean readBlockLazily,
|
||||
public HoodieDeleteBlock(Option<byte[]> content, FSDataInputStream inputStream, boolean readBlockLazily,
|
||||
Option<HoodieLogBlockContentLocation> blockContentLocation, Map<HeaderMetadataType, String> header,
|
||||
Map<HeaderMetadataType, String> footer) {
|
||||
super(header, footer, blockContentLocation, content, inputStream, readBlockLazily);
|
||||
@@ -55,11 +54,12 @@ public class HoodieDeleteBlock extends HoodieLogBlock {
|
||||
|
||||
@Override
|
||||
public byte[] getContentBytes() throws IOException {
|
||||
Option<byte[]> content = getContent();
|
||||
|
||||
// In case this method is called before realizing keys from content
|
||||
if (getContent().isPresent()) {
|
||||
return getContent().get();
|
||||
} else if (readBlockLazily && !getContent().isPresent() && keysToDelete == null) {
|
||||
if (content.isPresent()) {
|
||||
return content.get();
|
||||
} else if (readBlockLazily && keysToDelete == null) {
|
||||
// read block lazily
|
||||
getKeysToDelete();
|
||||
}
|
||||
@@ -100,11 +100,4 @@ public class HoodieDeleteBlock extends HoodieLogBlock {
|
||||
return HoodieLogBlockType.DELETE_BLOCK;
|
||||
}
|
||||
|
||||
public static HoodieLogBlock getBlock(HoodieLogFile logFile, FSDataInputStream inputStream, Option<byte[]> content,
|
||||
boolean readBlockLazily, long position, long blockSize, long blockEndPos, Map<HeaderMetadataType, String> header,
|
||||
Map<HeaderMetadataType, String> footer) throws IOException {
|
||||
|
||||
return new HoodieDeleteBlock(content, inputStream, readBlockLazily,
|
||||
Option.of(new HoodieLogBlockContentLocation(logFile, position, blockSize, blockEndPos)), header, footer);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,19 +18,7 @@
|
||||
|
||||
package org.apache.hudi.common.table.log.block;
|
||||
|
||||
import org.apache.hudi.avro.HoodieAvroUtils;
|
||||
import org.apache.hudi.common.fs.inline.InLineFSUtils;
|
||||
import org.apache.hudi.common.fs.inline.InLineFileSystem;
|
||||
import org.apache.hudi.common.model.HoodieLogFile;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.common.util.ValidationUtils;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
import org.apache.hudi.io.storage.HoodieHBaseKVComparator;
|
||||
import org.apache.hudi.io.storage.HoodieHFileReader;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.Schema.Field;
|
||||
import org.apache.avro.generic.IndexedRecord;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
@@ -43,11 +31,18 @@ import org.apache.hadoop.hbase.io.hfile.HFile;
|
||||
import org.apache.hadoop.hbase.io.hfile.HFileContext;
|
||||
import org.apache.hadoop.hbase.io.hfile.HFileContextBuilder;
|
||||
import org.apache.hadoop.hbase.util.Pair;
|
||||
import org.apache.hudi.avro.HoodieAvroUtils;
|
||||
import org.apache.hudi.common.fs.inline.InLineFSUtils;
|
||||
import org.apache.hudi.common.fs.inline.InLineFileSystem;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.StringUtils;
|
||||
import org.apache.hudi.common.util.ValidationUtils;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
import org.apache.hudi.io.storage.HoodieHBaseKVComparator;
|
||||
import org.apache.hudi.io.storage.HoodieHFileReader;
|
||||
import org.apache.log4j.LogManager;
|
||||
import org.apache.log4j.Logger;
|
||||
|
||||
import javax.annotation.Nonnull;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
@@ -58,29 +53,36 @@ import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.apache.hudi.common.util.ValidationUtils.checkState;
|
||||
|
||||
/**
|
||||
* HoodieHFileDataBlock contains a list of records stored inside an HFile format. It is used with the HFile
|
||||
* base file format.
|
||||
*/
|
||||
public class HoodieHFileDataBlock extends HoodieDataBlock {
|
||||
private static final Logger LOG = LogManager.getLogger(HoodieHFileDataBlock.class);
|
||||
private static Compression.Algorithm compressionAlgorithm = Compression.Algorithm.GZ;
|
||||
private static int blockSize = 1 * 1024 * 1024;
|
||||
private boolean enableInlineReading = false;
|
||||
|
||||
public HoodieHFileDataBlock(HoodieLogFile logFile, FSDataInputStream inputStream, Option<byte[]> content,
|
||||
boolean readBlockLazily, long position, long blockSize, long blockEndpos,
|
||||
Schema readerSchema, Map<HeaderMetadataType, String> header,
|
||||
Map<HeaderMetadataType, String> footer, boolean enableInlineReading, String keyField) {
|
||||
super(content, inputStream, readBlockLazily,
|
||||
Option.of(new HoodieLogBlockContentLocation(logFile, position, blockSize, blockEndpos)),
|
||||
readerSchema, header, footer, keyField);
|
||||
this.enableInlineReading = enableInlineReading;
|
||||
private static final int DEFAULT_BLOCK_SIZE = 1024 * 1024;
|
||||
|
||||
private final Option<Compression.Algorithm> compressionAlgorithm;
|
||||
|
||||
public HoodieHFileDataBlock(FSDataInputStream inputStream,
|
||||
Option<byte[]> content,
|
||||
boolean readBlockLazily,
|
||||
HoodieLogBlockContentLocation logBlockContentLocation,
|
||||
Option<Schema> readerSchema,
|
||||
Map<HeaderMetadataType, String> header,
|
||||
Map<HeaderMetadataType, String> footer,
|
||||
boolean enablePointLookups) {
|
||||
super(content, inputStream, readBlockLazily, Option.of(logBlockContentLocation), readerSchema, header, footer, HoodieHFileReader.KEY_FIELD_NAME, enablePointLookups);
|
||||
this.compressionAlgorithm = Option.empty();
|
||||
}
|
||||
|
||||
public HoodieHFileDataBlock(@Nonnull List<IndexedRecord> records, @Nonnull Map<HeaderMetadataType, String> header,
|
||||
String keyField) {
|
||||
super(records, header, new HashMap<>(), keyField);
|
||||
public HoodieHFileDataBlock(List<IndexedRecord> records,
|
||||
Map<HeaderMetadataType, String> header,
|
||||
Compression.Algorithm compressionAlgorithm) {
|
||||
super(records, header, new HashMap<>(), HoodieHFileReader.KEY_FIELD_NAME);
|
||||
this.compressionAlgorithm = Option.of(compressionAlgorithm);
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -89,43 +91,45 @@ public class HoodieHFileDataBlock extends HoodieDataBlock {
|
||||
}
|
||||
|
||||
@Override
|
||||
protected byte[] serializeRecords() throws IOException {
|
||||
HFileContext context = new HFileContextBuilder().withBlockSize(blockSize).withCompression(compressionAlgorithm)
|
||||
protected byte[] serializeRecords(List<IndexedRecord> records) throws IOException {
|
||||
HFileContext context = new HFileContextBuilder()
|
||||
.withBlockSize(DEFAULT_BLOCK_SIZE)
|
||||
.withCompression(compressionAlgorithm.get())
|
||||
.build();
|
||||
|
||||
Configuration conf = new Configuration();
|
||||
CacheConfig cacheConfig = new CacheConfig(conf);
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
FSDataOutputStream ostream = new FSDataOutputStream(baos, null);
|
||||
|
||||
HFile.Writer writer = HFile.getWriterFactory(conf, cacheConfig)
|
||||
.withOutputStream(ostream).withFileContext(context).withComparator(new HoodieHBaseKVComparator()).create();
|
||||
// Use simple incrementing counter as a key
|
||||
boolean useIntegerKey = !getRecordKey(records.get(0)).isPresent();
|
||||
// This is set here to avoid re-computing this in the loop
|
||||
int keyWidth = useIntegerKey ? (int) Math.ceil(Math.log(records.size())) + 1 : -1;
|
||||
|
||||
// Serialize records into bytes
|
||||
Map<String, byte[]> sortedRecordsMap = new TreeMap<>();
|
||||
Iterator<IndexedRecord> itr = records.iterator();
|
||||
boolean useIntegerKey = false;
|
||||
int key = 0;
|
||||
int keySize = 0;
|
||||
final Field keyFieldSchema = records.get(0).getSchema().getField(HoodieHFileReader.KEY_FIELD_NAME);
|
||||
if (keyFieldSchema == null) {
|
||||
// Missing key metadata field so we should use an integer sequence key
|
||||
useIntegerKey = true;
|
||||
keySize = (int) Math.ceil(Math.log(records.size())) + 1;
|
||||
}
|
||||
|
||||
int id = 0;
|
||||
while (itr.hasNext()) {
|
||||
IndexedRecord record = itr.next();
|
||||
String recordKey;
|
||||
if (useIntegerKey) {
|
||||
recordKey = String.format("%" + keySize + "s", key++);
|
||||
recordKey = String.format("%" + keyWidth + "s", id++);
|
||||
} else {
|
||||
recordKey = record.get(keyFieldSchema.pos()).toString();
|
||||
recordKey = getRecordKey(record).get();
|
||||
}
|
||||
final byte[] recordBytes = serializeRecord(record, Option.ofNullable(keyFieldSchema));
|
||||
|
||||
final byte[] recordBytes = serializeRecord(record);
|
||||
ValidationUtils.checkState(!sortedRecordsMap.containsKey(recordKey),
|
||||
"Writing multiple records with same key not supported for " + this.getClass().getName());
|
||||
sortedRecordsMap.put(recordKey, recordBytes);
|
||||
}
|
||||
|
||||
HFile.Writer writer = HFile.getWriterFactory(conf, cacheConfig)
|
||||
.withOutputStream(ostream).withFileContext(context).withComparator(new HoodieHBaseKVComparator()).create();
|
||||
|
||||
// Write the records
|
||||
sortedRecordsMap.forEach((recordKey, recordBytes) -> {
|
||||
try {
|
||||
@@ -144,79 +148,52 @@ public class HoodieHFileDataBlock extends HoodieDataBlock {
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void createRecordsFromContentBytes() throws IOException {
|
||||
if (enableInlineReading) {
|
||||
getRecords(Collections.emptyList());
|
||||
} else {
|
||||
super.createRecordsFromContentBytes();
|
||||
}
|
||||
protected List<IndexedRecord> deserializeRecords(byte[] content) throws IOException {
|
||||
checkState(readerSchema != null, "Reader's schema has to be non-null");
|
||||
|
||||
// Get schema from the header
|
||||
Schema writerSchema = new Schema.Parser().parse(super.getLogBlockHeader().get(HeaderMetadataType.SCHEMA));
|
||||
|
||||
// Read the content
|
||||
HoodieHFileReader<IndexedRecord> reader = new HoodieHFileReader<>(content);
|
||||
List<Pair<String, IndexedRecord>> records = reader.readAllRecords(writerSchema, readerSchema);
|
||||
|
||||
return records.stream().map(Pair::getSecond).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
// TODO abstract this w/in HoodieDataBlock
|
||||
@Override
|
||||
public List<IndexedRecord> getRecords(List<String> keys) throws IOException {
|
||||
readWithInlineFS(keys);
|
||||
return records;
|
||||
protected List<IndexedRecord> lookupRecords(List<String> keys) throws IOException {
|
||||
HoodieLogBlockContentLocation blockContentLoc = getBlockContentLocation().get();
|
||||
|
||||
// NOTE: It's important to extend Hadoop configuration here to make sure configuration
|
||||
// is appropriately carried over
|
||||
Configuration inlineConf = new Configuration(blockContentLoc.getHadoopConf());
|
||||
inlineConf.set("fs." + InLineFileSystem.SCHEME + ".impl", InLineFileSystem.class.getName());
|
||||
|
||||
Path inlinePath = InLineFSUtils.getInlineFilePath(
|
||||
blockContentLoc.getLogFile().getPath(),
|
||||
blockContentLoc.getLogFile().getPath().getFileSystem(inlineConf).getScheme(),
|
||||
blockContentLoc.getContentPositionInLogFile(),
|
||||
blockContentLoc.getBlockSize());
|
||||
|
||||
// HFile read will be efficient if keys are sorted, since on storage, records are sorted by key. This will avoid unnecessary seeks.
|
||||
Collections.sort(keys);
|
||||
|
||||
try (HoodieHFileReader<IndexedRecord> reader =
|
||||
new HoodieHFileReader<>(inlineConf, inlinePath, new CacheConfig(inlineConf), inlinePath.getFileSystem(inlineConf))) {
|
||||
// Get writer's schema from the header
|
||||
List<Pair<String, IndexedRecord>> logRecords = reader.readRecords(keys, readerSchema);
|
||||
return logRecords.stream().map(Pair::getSecond).collect(Collectors.toList());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Serialize the record to byte buffer.
|
||||
*
|
||||
* @param record - Record to serialize
|
||||
* @param keyField - Key field in the schema
|
||||
* @return Serialized byte buffer for the record
|
||||
*/
|
||||
private byte[] serializeRecord(final IndexedRecord record, final Option<Field> keyField) {
|
||||
private byte[] serializeRecord(IndexedRecord record) {
|
||||
Option<Schema.Field> keyField = getKeyField(record.getSchema());
|
||||
// Reset key value w/in the record to avoid duplicating the key w/in payload
|
||||
if (keyField.isPresent()) {
|
||||
record.put(keyField.get().pos(), StringUtils.EMPTY_STRING);
|
||||
}
|
||||
return HoodieAvroUtils.indexedRecordToBytes(record);
|
||||
}
|
||||
|
||||
private void readWithInlineFS(List<String> keys) throws IOException {
|
||||
boolean enableFullScan = keys.isEmpty();
|
||||
// Get schema from the header
|
||||
Schema writerSchema = new Schema.Parser().parse(super.getLogBlockHeader().get(HeaderMetadataType.SCHEMA));
|
||||
// If readerSchema was not present, use writerSchema
|
||||
if (schema == null) {
|
||||
schema = writerSchema;
|
||||
}
|
||||
Configuration conf = new Configuration();
|
||||
CacheConfig cacheConf = new CacheConfig(conf);
|
||||
Configuration inlineConf = new Configuration();
|
||||
inlineConf.set("fs." + InLineFileSystem.SCHEME + ".impl", InLineFileSystem.class.getName());
|
||||
|
||||
Path inlinePath = InLineFSUtils.getInlineFilePath(
|
||||
getBlockContentLocation().get().getLogFile().getPath(),
|
||||
getBlockContentLocation().get().getLogFile().getPath().getFileSystem(conf).getScheme(),
|
||||
getBlockContentLocation().get().getContentPositionInLogFile(),
|
||||
getBlockContentLocation().get().getBlockSize());
|
||||
if (!enableFullScan) {
|
||||
// HFile read will be efficient if keys are sorted, since on storage, records are sorted by key. This will avoid unnecessary seeks.
|
||||
Collections.sort(keys);
|
||||
}
|
||||
HoodieHFileReader reader = new HoodieHFileReader(inlineConf, inlinePath, cacheConf, inlinePath.getFileSystem(inlineConf));
|
||||
List<org.apache.hadoop.hbase.util.Pair<String, IndexedRecord>> logRecords = enableFullScan ? reader.readAllRecords(writerSchema, schema) :
|
||||
reader.readRecords(keys, schema);
|
||||
reader.close();
|
||||
this.records = logRecords.stream().map(t -> t.getSecond()).collect(Collectors.toList());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void deserializeRecords() throws IOException {
|
||||
// Get schema from the header
|
||||
Schema writerSchema = new Schema.Parser().parse(super.getLogBlockHeader().get(HeaderMetadataType.SCHEMA));
|
||||
|
||||
// If readerSchema was not present, use writerSchema
|
||||
if (schema == null) {
|
||||
schema = writerSchema;
|
||||
}
|
||||
|
||||
// Read the content
|
||||
HoodieHFileReader reader = new HoodieHFileReader<>(getContent().get());
|
||||
List<Pair<String, IndexedRecord>> records = reader.readAllRecords(writerSchema, schema);
|
||||
this.records = records.stream().map(t -> t.getSecond()).collect(Collectors.toList());
|
||||
|
||||
// Free up content to be GC'd, deflate
|
||||
deflate();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,15 +18,18 @@
|
||||
|
||||
package org.apache.hudi.common.table.log.block;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hudi.common.model.HoodieLogFile;
|
||||
import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.TypeUtils;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
|
||||
import javax.annotation.Nonnull;
|
||||
import javax.annotation.Nullable;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.DataInputStream;
|
||||
@@ -36,6 +39,8 @@ import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.apache.hudi.common.util.ValidationUtils.checkState;
|
||||
|
||||
/**
|
||||
* Abstract class defining a block in HoodieLogFile.
|
||||
*/
|
||||
@@ -58,14 +63,17 @@ public abstract class HoodieLogBlock {
|
||||
// TODO : change this to just InputStream so this works for any FileSystem
|
||||
// create handlers to return specific type of inputstream based on FS
|
||||
// input stream corresponding to the log file where this logBlock belongs
|
||||
protected FSDataInputStream inputStream;
|
||||
private final FSDataInputStream inputStream;
|
||||
// Toggle flag, whether to read blocks lazily (I/O intensive) or not (Memory intensive)
|
||||
protected boolean readBlockLazily;
|
||||
|
||||
public HoodieLogBlock(@Nonnull Map<HeaderMetadataType, String> logBlockHeader,
|
||||
public HoodieLogBlock(
|
||||
@Nonnull Map<HeaderMetadataType, String> logBlockHeader,
|
||||
@Nonnull Map<HeaderMetadataType, String> logBlockFooter,
|
||||
@Nonnull Option<HoodieLogBlockContentLocation> blockContentLocation, @Nonnull Option<byte[]> content,
|
||||
FSDataInputStream inputStream, boolean readBlockLazily) {
|
||||
@Nonnull Option<HoodieLogBlockContentLocation> blockContentLocation,
|
||||
@Nonnull Option<byte[]> content,
|
||||
@Nullable FSDataInputStream inputStream,
|
||||
boolean readBlockLazily) {
|
||||
this.logBlockHeader = logBlockHeader;
|
||||
this.logBlockFooter = logBlockFooter;
|
||||
this.blockContentLocation = blockContentLocation;
|
||||
@@ -109,7 +117,25 @@ public abstract class HoodieLogBlock {
|
||||
* Type of the log block WARNING: This enum is serialized as the ordinal. Only add new enums at the end.
|
||||
*/
|
||||
public enum HoodieLogBlockType {
|
||||
COMMAND_BLOCK, DELETE_BLOCK, CORRUPT_BLOCK, AVRO_DATA_BLOCK, HFILE_DATA_BLOCK
|
||||
COMMAND_BLOCK(":command"),
|
||||
DELETE_BLOCK(":delete"),
|
||||
CORRUPT_BLOCK(":corrupted"),
|
||||
AVRO_DATA_BLOCK("avro"),
|
||||
HFILE_DATA_BLOCK("hfile"),
|
||||
PARQUET_DATA_BLOCK("parquet");
|
||||
|
||||
private static final Map<String, HoodieLogBlockType> ID_TO_ENUM_MAP =
|
||||
TypeUtils.getValueToEnumMap(HoodieLogBlockType.class, e -> e.id);
|
||||
|
||||
private final String id;
|
||||
|
||||
HoodieLogBlockType(String id) {
|
||||
this.id = id;
|
||||
}
|
||||
|
||||
public static HoodieLogBlockType fromId(String id) {
|
||||
return ID_TO_ENUM_MAP.get(id);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -132,7 +158,8 @@ public abstract class HoodieLogBlock {
|
||||
* intensive CompactedScanner, the location helps to lazily read contents from the log file
|
||||
*/
|
||||
public static final class HoodieLogBlockContentLocation {
|
||||
|
||||
// Hadoop Config required to access the file
|
||||
private final Configuration hadoopConf;
|
||||
// The logFile that contains this block
|
||||
private final HoodieLogFile logFile;
|
||||
// The filePosition in the logFile for the contents of this block
|
||||
@@ -142,14 +169,22 @@ public abstract class HoodieLogBlock {
|
||||
// The final position where the complete block ends
|
||||
private final long blockEndPos;
|
||||
|
||||
HoodieLogBlockContentLocation(HoodieLogFile logFile, long contentPositionInLogFile, long blockSize,
|
||||
public HoodieLogBlockContentLocation(Configuration hadoopConf,
|
||||
HoodieLogFile logFile,
|
||||
long contentPositionInLogFile,
|
||||
long blockSize,
|
||||
long blockEndPos) {
|
||||
this.hadoopConf = hadoopConf;
|
||||
this.logFile = logFile;
|
||||
this.contentPositionInLogFile = contentPositionInLogFile;
|
||||
this.blockSize = blockSize;
|
||||
this.blockEndPos = blockEndPos;
|
||||
}
|
||||
|
||||
public Configuration getHadoopConf() {
|
||||
return hadoopConf;
|
||||
}
|
||||
|
||||
public HoodieLogFile getLogFile() {
|
||||
return logFile;
|
||||
}
|
||||
@@ -210,24 +245,27 @@ public abstract class HoodieLogBlock {
|
||||
* Read or Skip block content of a log block in the log file. Depends on lazy reading enabled in
|
||||
* {@link HoodieMergedLogRecordScanner}
|
||||
*/
|
||||
public static byte[] readOrSkipContent(FSDataInputStream inputStream, Integer contentLength, boolean readBlockLazily)
|
||||
public static Option<byte[]> tryReadContent(FSDataInputStream inputStream, Integer contentLength, boolean readLazily)
|
||||
throws IOException {
|
||||
byte[] content = null;
|
||||
if (!readBlockLazily) {
|
||||
// Read the contents in memory
|
||||
content = new byte[contentLength];
|
||||
inputStream.readFully(content, 0, contentLength);
|
||||
} else {
|
||||
if (readLazily) {
|
||||
// Seek to the end of the content block
|
||||
inputStream.seek(inputStream.getPos() + contentLength);
|
||||
return Option.empty();
|
||||
}
|
||||
return content;
|
||||
|
||||
// TODO re-use buffer if stream is backed by buffer
|
||||
// Read the contents in memory
|
||||
byte[] content = new byte[contentLength];
|
||||
inputStream.readFully(content, 0, contentLength);
|
||||
return Option.of(content);
|
||||
}
|
||||
|
||||
/**
|
||||
* When lazyReading of blocks is turned on, inflate the content of a log block from disk.
|
||||
*/
|
||||
protected void inflate() throws HoodieIOException {
|
||||
checkState(!content.isPresent(), "Block has already been inflated");
|
||||
checkState(inputStream != null, "Block should have input-stream provided");
|
||||
|
||||
try {
|
||||
content = Option.of(new byte[(int) this.getBlockContentLocation().get().getBlockSize()]);
|
||||
|
||||
@@ -0,0 +1,171 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.common.table.log.block;
|
||||
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.IndexedRecord;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hudi.avro.HoodieAvroWriteSupport;
|
||||
import org.apache.hudi.common.fs.inline.InLineFSUtils;
|
||||
import org.apache.hudi.common.fs.inline.InLineFileSystem;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.ParquetReaderIterator;
|
||||
import org.apache.hudi.io.storage.HoodieAvroParquetConfig;
|
||||
import org.apache.hudi.io.storage.HoodieParquetStreamWriter;
|
||||
import org.apache.parquet.avro.AvroParquetReader;
|
||||
import org.apache.parquet.avro.AvroReadSupport;
|
||||
import org.apache.parquet.avro.AvroSchemaConverter;
|
||||
import org.apache.parquet.hadoop.ParquetReader;
|
||||
import org.apache.parquet.hadoop.ParquetWriter;
|
||||
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
|
||||
import org.apache.parquet.hadoop.util.HadoopInputFile;
|
||||
import org.apache.parquet.io.InputFile;
|
||||
|
||||
import javax.annotation.Nonnull;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* HoodieParquetDataBlock contains a list of records serialized using Parquet.
|
||||
*/
|
||||
public class HoodieParquetDataBlock extends HoodieDataBlock {
|
||||
|
||||
private final Option<CompressionCodecName> compressionCodecName;
|
||||
|
||||
public HoodieParquetDataBlock(FSDataInputStream inputStream,
|
||||
Option<byte[]> content,
|
||||
boolean readBlockLazily,
|
||||
HoodieLogBlockContentLocation logBlockContentLocation,
|
||||
Option<Schema> readerSchema,
|
||||
Map<HeaderMetadataType, String> header,
|
||||
Map<HeaderMetadataType, String> footer,
|
||||
String keyField) {
|
||||
super(content, inputStream, readBlockLazily, Option.of(logBlockContentLocation), readerSchema, header, footer, keyField, false);
|
||||
|
||||
this.compressionCodecName = Option.empty();
|
||||
}
|
||||
|
||||
public HoodieParquetDataBlock(
|
||||
@Nonnull List<IndexedRecord> records,
|
||||
@Nonnull Map<HeaderMetadataType, String> header,
|
||||
@Nonnull String keyField,
|
||||
@Nonnull CompressionCodecName compressionCodecName
|
||||
) {
|
||||
super(records, header, new HashMap<>(), keyField);
|
||||
|
||||
this.compressionCodecName = Option.of(compressionCodecName);
|
||||
}
|
||||
|
||||
@Override
|
||||
public HoodieLogBlockType getBlockType() {
|
||||
return HoodieLogBlockType.PARQUET_DATA_BLOCK;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected byte[] serializeRecords(List<IndexedRecord> records) throws IOException {
|
||||
if (records.size() == 0) {
|
||||
return new byte[0];
|
||||
}
|
||||
|
||||
Schema writerSchema = new Schema.Parser().parse(super.getLogBlockHeader().get(HeaderMetadataType.SCHEMA));
|
||||
|
||||
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(
|
||||
new AvroSchemaConverter().convert(writerSchema), writerSchema, Option.empty());
|
||||
|
||||
HoodieAvroParquetConfig avroParquetConfig =
|
||||
new HoodieAvroParquetConfig(
|
||||
writeSupport,
|
||||
compressionCodecName.get(),
|
||||
ParquetWriter.DEFAULT_BLOCK_SIZE,
|
||||
ParquetWriter.DEFAULT_PAGE_SIZE,
|
||||
1024 * 1024 * 1024,
|
||||
new Configuration(),
|
||||
Double.parseDouble(String.valueOf(0.1)));//HoodieStorageConfig.PARQUET_COMPRESSION_RATIO.defaultValue()));
|
||||
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
|
||||
try (FSDataOutputStream outputStream = new FSDataOutputStream(baos)) {
|
||||
try (HoodieParquetStreamWriter<IndexedRecord> parquetWriter = new HoodieParquetStreamWriter<>(outputStream, avroParquetConfig)) {
|
||||
for (IndexedRecord record : records) {
|
||||
String recordKey = getRecordKey(record).orElse(null);
|
||||
parquetWriter.writeAvro(recordKey, record);
|
||||
}
|
||||
outputStream.flush();
|
||||
}
|
||||
}
|
||||
|
||||
return baos.toByteArray();
|
||||
}
|
||||
|
||||
public static Iterator<IndexedRecord> getProjectedParquetRecordsIterator(Configuration conf,
|
||||
Schema readerSchema,
|
||||
InputFile inputFile) throws IOException {
|
||||
AvroReadSupport.setAvroReadSchema(conf, readerSchema);
|
||||
AvroReadSupport.setRequestedProjection(conf, readerSchema);
|
||||
|
||||
ParquetReader<IndexedRecord> reader =
|
||||
AvroParquetReader.<IndexedRecord>builder(inputFile).withConf(conf).build();
|
||||
return new ParquetReaderIterator<>(reader);
|
||||
}
|
||||
|
||||
/**
|
||||
* NOTE: We're overriding the whole reading sequence to make sure we properly respect
|
||||
* the requested Reader's schema and only fetch the columns that have been explicitly
|
||||
* requested by the caller (providing projected Reader's schema)
|
||||
*/
|
||||
@Override
|
||||
protected List<IndexedRecord> readRecordsFromBlockPayload() throws IOException {
|
||||
HoodieLogBlockContentLocation blockContentLoc = getBlockContentLocation().get();
|
||||
|
||||
// NOTE: It's important to extend Hadoop configuration here to make sure configuration
|
||||
// is appropriately carried over
|
||||
Configuration inlineConf = new Configuration(blockContentLoc.getHadoopConf());
|
||||
inlineConf.set("fs." + InLineFileSystem.SCHEME + ".impl", InLineFileSystem.class.getName());
|
||||
|
||||
Path inlineLogFilePath = InLineFSUtils.getInlineFilePath(
|
||||
blockContentLoc.getLogFile().getPath(),
|
||||
blockContentLoc.getLogFile().getPath().getFileSystem(inlineConf).getScheme(),
|
||||
blockContentLoc.getContentPositionInLogFile(),
|
||||
blockContentLoc.getBlockSize());
|
||||
|
||||
ArrayList<IndexedRecord> records = new ArrayList<>();
|
||||
|
||||
getProjectedParquetRecordsIterator(
|
||||
inlineConf,
|
||||
readerSchema,
|
||||
HadoopInputFile.fromPath(inlineLogFilePath, inlineConf)
|
||||
)
|
||||
.forEachRemaining(records::add);
|
||||
|
||||
return records;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<IndexedRecord> deserializeRecords(byte[] content) throws IOException {
|
||||
throw new UnsupportedOperationException("Should not be invoked");
|
||||
}
|
||||
}
|
||||
@@ -108,14 +108,31 @@ public final class Option<T> implements Serializable {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns this {@link Option} if not empty, otherwise evaluates the provided supplier
|
||||
* and returns the alternative
|
||||
*/
|
||||
public Option<T> or(Supplier<? extends Option<T>> other) {
|
||||
return val != null ? this : other.get();
|
||||
}
|
||||
|
||||
/**
|
||||
* Identical to {@code Optional.orElse}
|
||||
*/
|
||||
public T orElse(T other) {
|
||||
return val != null ? val : other;
|
||||
}
|
||||
|
||||
/**
|
||||
* Identical to {@code Optional.orElseGet}
|
||||
*/
|
||||
public T orElseGet(Supplier<? extends T> other) {
|
||||
return val != null ? val : other.get();
|
||||
}
|
||||
|
||||
/**
|
||||
* Identical to {@code Optional.orElseThrow}
|
||||
*/
|
||||
public <X extends Throwable> T orElseThrow(Supplier<? extends X> exceptionSupplier) throws X {
|
||||
if (val != null) {
|
||||
return val;
|
||||
|
||||
@@ -0,0 +1,138 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.common.util.io;
|
||||
|
||||
import javax.annotation.Nonnull;
|
||||
import java.io.InputStream;
|
||||
import java.nio.ByteBuffer;
|
||||
|
||||
/**
|
||||
* Instance of {@link InputStream} backed by {@link ByteBuffer}, implementing following
|
||||
* functionality (on top of what's required by {@link InputStream})
|
||||
*
|
||||
* <ol>
|
||||
* <li>Seeking: enables random access by allowing to seek to an arbitrary position w/in the stream</li>
|
||||
* <li>(Thread-safe) Copying: enables to copy from the underlying buffer not modifying the state of the stream</li>
|
||||
* </ol>
|
||||
*
|
||||
* NOTE: Generally methods of this class are NOT thread-safe, unless specified otherwise
|
||||
*/
|
||||
public class ByteBufferBackedInputStream extends InputStream {
|
||||
|
||||
private final ByteBuffer buffer;
|
||||
private final int bufferOffset;
|
||||
|
||||
public ByteBufferBackedInputStream(ByteBuffer buf) {
|
||||
this.buffer = buf.duplicate();
|
||||
// We're marking current buffer position, so that we will be able
|
||||
// to reset it later on appropriately (to support seek operations)
|
||||
this.buffer.mark();
|
||||
this.bufferOffset = buffer.position();
|
||||
}
|
||||
|
||||
public ByteBufferBackedInputStream(byte[] array) {
|
||||
this(array, 0, array.length);
|
||||
}
|
||||
|
||||
public ByteBufferBackedInputStream(byte[] array, int offset, int length) {
|
||||
this(ByteBuffer.wrap(array, offset, length));
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read() {
|
||||
if (!buffer.hasRemaining()) {
|
||||
throw new IllegalArgumentException("Reading past backed buffer boundary");
|
||||
}
|
||||
return buffer.get() & 0xFF;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int read(@Nonnull byte[] bytes, int offset, int length) {
|
||||
if (!buffer.hasRemaining()) {
|
||||
throw new IllegalArgumentException("Reading past backed buffer boundary");
|
||||
}
|
||||
// Determine total number of bytes available to read
|
||||
int available = Math.min(length, buffer.remaining());
|
||||
// Copy bytes into the target buffer
|
||||
buffer.get(bytes, offset, available);
|
||||
return available;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns current position of the stream
|
||||
*/
|
||||
public int getPosition() {
|
||||
return buffer.position() - bufferOffset;
|
||||
}
|
||||
|
||||
/**
|
||||
* Seeks to a position w/in the stream
|
||||
*
|
||||
* NOTE: Position is relative to the start of the stream (ie its absolute w/in this stream),
|
||||
* with following invariant being assumed:
|
||||
* <p>0 <= pos <= length (of the stream)</p>
|
||||
*
|
||||
* This method is NOT thread-safe
|
||||
*
|
||||
* @param pos target position to seek to w/in the holding buffer
|
||||
*/
|
||||
public void seek(long pos) {
|
||||
buffer.reset(); // to mark
|
||||
int offset = buffer.position();
|
||||
// NOTE: That the new pos is still relative to buffer's offset
|
||||
int newPos = offset + (int) pos;
|
||||
if (newPos > buffer.limit() || newPos < offset) {
|
||||
throw new IllegalArgumentException(
|
||||
String.format("Can't seek past the backing buffer (limit %d, offset %d, new %d)", buffer.limit(), offset, newPos)
|
||||
);
|
||||
}
|
||||
|
||||
buffer.position(newPos);
|
||||
}
|
||||
|
||||
/**
|
||||
* Copies at most {@code length} bytes starting from position {@code pos} into the target
|
||||
* buffer with provided {@code offset}. Returns number of bytes copied from the backing buffer
|
||||
*
|
||||
* NOTE: This does not change the current position of the stream and is thread-safe
|
||||
*
|
||||
* @param pos absolute position w/in stream to read from
|
||||
* @param targetBuffer target buffer to copy into
|
||||
* @param offset target buffer offset to copy at
|
||||
* @param length length of the sequence to copy
|
||||
* @return number of bytes copied
|
||||
*/
|
||||
public int copyFrom(long pos, byte[] targetBuffer, int offset, int length) {
|
||||
int bufferPos = bufferOffset + (int) pos;
|
||||
if (bufferPos > buffer.limit()) {
|
||||
throw new IllegalArgumentException(
|
||||
String.format("Can't read past the backing buffer boundary (offset %d, length %d)", pos, buffer.limit() - bufferOffset)
|
||||
);
|
||||
} else if (length > targetBuffer.length) {
|
||||
throw new IllegalArgumentException(
|
||||
String.format("Target buffer is too small (length %d, buffer size %d)", length, targetBuffer.length)
|
||||
);
|
||||
}
|
||||
// Determine total number of bytes available to read
|
||||
int available = Math.min(length, buffer.limit() - bufferPos);
|
||||
// Get current buffer position in the backing array
|
||||
System.arraycopy(buffer.array(), bufferPos, targetBuffer, offset, available);
|
||||
return available;
|
||||
}
|
||||
}
|
||||
@@ -27,7 +27,7 @@ import org.apache.avro.generic.IndexedRecord;
|
||||
import org.apache.hudi.common.bloom.BloomFilter;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
|
||||
public interface HoodieFileReader<R extends IndexedRecord> {
|
||||
public interface HoodieFileReader<R extends IndexedRecord> extends AutoCloseable {
|
||||
|
||||
public String[] readMinMaxRecordKeys();
|
||||
|
||||
|
||||
@@ -18,7 +18,6 @@
|
||||
|
||||
package org.apache.hudi.io.storage;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.ArrayList;
|
||||
@@ -51,6 +50,7 @@ import org.apache.hudi.common.bloom.BloomFilterFactory;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.ValidationUtils;
|
||||
import org.apache.hudi.common.util.io.ByteBufferBackedInputStream;
|
||||
import org.apache.hudi.exception.HoodieException;
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
|
||||
@@ -77,11 +77,11 @@ public class HoodieHFileReader<R extends IndexedRecord> implements HoodieFileRea
|
||||
this.reader = HFile.createReader(FSUtils.getFs(path.toString(), configuration), path, cacheConfig, conf);
|
||||
}
|
||||
|
||||
public HoodieHFileReader(Configuration configuration, Path path, CacheConfig cacheConfig, FileSystem inlineFs) throws IOException {
|
||||
public HoodieHFileReader(Configuration configuration, Path path, CacheConfig cacheConfig, FileSystem fs) throws IOException {
|
||||
this.conf = configuration;
|
||||
this.path = path;
|
||||
this.fsDataInputStream = inlineFs.open(path);
|
||||
this.reader = HFile.createReader(inlineFs, path, cacheConfig, configuration);
|
||||
this.fsDataInputStream = fs.open(path);
|
||||
this.reader = HFile.createReader(fs, path, cacheConfig, configuration);
|
||||
}
|
||||
|
||||
public HoodieHFileReader(byte[] content) throws IOException {
|
||||
@@ -332,28 +332,14 @@ public class HoodieHFileReader<R extends IndexedRecord> implements HoodieFileRea
|
||||
}
|
||||
}
|
||||
|
||||
static class SeekableByteArrayInputStream extends ByteArrayInputStream implements Seekable, PositionedReadable {
|
||||
static class SeekableByteArrayInputStream extends ByteBufferBackedInputStream implements Seekable, PositionedReadable {
|
||||
public SeekableByteArrayInputStream(byte[] buf) {
|
||||
super(buf);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getPos() throws IOException {
|
||||
return pos;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void seek(long pos) throws IOException {
|
||||
if (mark != 0) {
|
||||
throw new IllegalStateException();
|
||||
}
|
||||
|
||||
reset();
|
||||
long skipped = skip(pos);
|
||||
|
||||
if (skipped != pos) {
|
||||
throw new IOException();
|
||||
}
|
||||
return getPosition();
|
||||
}
|
||||
|
||||
@Override
|
||||
@@ -363,19 +349,7 @@ public class HoodieHFileReader<R extends IndexedRecord> implements HoodieFileRea
|
||||
|
||||
@Override
|
||||
public int read(long position, byte[] buffer, int offset, int length) throws IOException {
|
||||
|
||||
if (position >= buf.length) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
if (position + length > buf.length) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
if (length > buffer.length) {
|
||||
throw new IllegalArgumentException();
|
||||
}
|
||||
|
||||
System.arraycopy(buf, (int) position, buffer, offset, length);
|
||||
return length;
|
||||
return copyFrom(position, buffer, offset, length);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
||||
@@ -34,9 +34,9 @@ import org.apache.parquet.avro.AvroParquetReader;
|
||||
import org.apache.parquet.avro.AvroReadSupport;
|
||||
import org.apache.parquet.hadoop.ParquetReader;
|
||||
|
||||
public class HoodieParquetReader<R extends IndexedRecord> implements HoodieFileReader {
|
||||
private Path path;
|
||||
private Configuration conf;
|
||||
public class HoodieParquetReader<R extends IndexedRecord> implements HoodieFileReader<R> {
|
||||
private final Path path;
|
||||
private final Configuration conf;
|
||||
private final BaseFileUtils parquetUtils;
|
||||
|
||||
public HoodieParquetReader(Configuration configuration, Path path) {
|
||||
@@ -45,6 +45,7 @@ public class HoodieParquetReader<R extends IndexedRecord> implements HoodieFileR
|
||||
this.parquetUtils = BaseFileUtils.getInstance(HoodieFileFormat.PARQUET);
|
||||
}
|
||||
|
||||
/**
 * Reads the min/max record keys of the file at {@code path} by delegating to {@code parquetUtils}.
 */
@Override
|
||||
public String[] readMinMaxRecordKeys() {
|
||||
return parquetUtils.readMinMaxRecordKeys(conf, path);
|
||||
}
|
||||
@@ -55,15 +56,15 @@ public class HoodieParquetReader<R extends IndexedRecord> implements HoodieFileR
|
||||
}
|
||||
|
||||
@Override
|
||||
public Set<String> filterRowKeys(Set candidateRowKeys) {
|
||||
public Set<String> filterRowKeys(Set<String> candidateRowKeys) {
|
||||
return parquetUtils.filterRowKeys(conf, path, candidateRowKeys);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Iterator<R> getRecordIterator(Schema schema) throws IOException {
|
||||
AvroReadSupport.setAvroReadSchema(conf, schema);
|
||||
ParquetReader<IndexedRecord> reader = AvroParquetReader.<IndexedRecord>builder(path).withConf(conf).build();
|
||||
return new ParquetReaderIterator(reader);
|
||||
ParquetReader<R> reader = AvroParquetReader.<R>builder(path).withConf(conf).build();
|
||||
return new ParquetReaderIterator<>(reader);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
||||
@@ -0,0 +1,88 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.io.storage;
|
||||
|
||||
import org.apache.avro.generic.IndexedRecord;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hudi.avro.HoodieAvroWriteSupport;
|
||||
import org.apache.hudi.parquet.io.OutputStreamBackedOutputFile;
|
||||
import org.apache.parquet.hadoop.ParquetFileWriter;
|
||||
import org.apache.parquet.hadoop.ParquetWriter;
|
||||
import org.apache.parquet.hadoop.api.WriteSupport;
|
||||
import org.apache.parquet.io.OutputFile;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
// TODO(HUDI-3035) unify w/ HoodieParquetWriter
|
||||
public class HoodieParquetStreamWriter<R extends IndexedRecord> implements AutoCloseable {
|
||||
|
||||
private final ParquetWriter<R> writer;
|
||||
private final HoodieAvroWriteSupport writeSupport;
|
||||
|
||||
public HoodieParquetStreamWriter(FSDataOutputStream outputStream,
|
||||
HoodieAvroParquetConfig parquetConfig) throws IOException {
|
||||
this.writeSupport = parquetConfig.getWriteSupport();
|
||||
this.writer = new Builder<R>(new OutputStreamBackedOutputFile(outputStream), writeSupport)
|
||||
.withWriteMode(ParquetFileWriter.Mode.CREATE)
|
||||
.withCompressionCodec(parquetConfig.getCompressionCodecName())
|
||||
.withRowGroupSize(parquetConfig.getBlockSize())
|
||||
.withPageSize(parquetConfig.getPageSize())
|
||||
.withDictionaryPageSize(parquetConfig.getPageSize())
|
||||
.withDictionaryEncoding(parquetConfig.dictionaryEnabled())
|
||||
.withWriterVersion(ParquetWriter.DEFAULT_WRITER_VERSION)
|
||||
.withConf(parquetConfig.getHadoopConf())
|
||||
.build();
|
||||
}
|
||||
|
||||
public void writeAvro(String key, R object) throws IOException {
|
||||
writer.write(object);
|
||||
writeSupport.add(key);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() throws IOException {
|
||||
writer.close();
|
||||
}
|
||||
|
||||
private static class Builder<T> extends ParquetWriter.Builder<T, Builder<T>> {
|
||||
private final WriteSupport<T> writeSupport;
|
||||
|
||||
private Builder(Path file, WriteSupport<T> writeSupport) {
|
||||
super(file);
|
||||
this.writeSupport = writeSupport;
|
||||
}
|
||||
|
||||
private Builder(OutputFile file, WriteSupport<T> writeSupport) {
|
||||
super(file);
|
||||
this.writeSupport = writeSupport;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Builder<T> self() {
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected WriteSupport<T> getWriteSupport(Configuration conf) {
|
||||
return writeSupport;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,63 @@
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.parquet.io;
|
||||
|
||||
import org.apache.hudi.common.util.io.ByteBufferBackedInputStream;
|
||||
import org.apache.parquet.io.DelegatingSeekableInputStream;
|
||||
import org.apache.parquet.io.InputFile;
|
||||
import org.apache.parquet.io.SeekableInputStream;
|
||||
|
||||
/**
|
||||
* Implementation of {@link InputFile} backed by {@code byte[]} buffer
|
||||
*/
|
||||
public class ByteBufferBackedInputFile implements InputFile {
|
||||
private final byte[] buffer;
|
||||
private final int offset;
|
||||
private final int length;
|
||||
|
||||
public ByteBufferBackedInputFile(byte[] buffer, int offset, int length) {
|
||||
this.buffer = buffer;
|
||||
this.offset = offset;
|
||||
this.length = length;
|
||||
}
|
||||
|
||||
public ByteBufferBackedInputFile(byte[] buffer) {
|
||||
this(buffer, 0, buffer.length);
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getLength() {
|
||||
return length;
|
||||
}
|
||||
|
||||
@Override
|
||||
public SeekableInputStream newStream() {
|
||||
return new DelegatingSeekableInputStream(new ByteBufferBackedInputStream(buffer, offset, length)) {
|
||||
@Override
|
||||
public long getPos() {
|
||||
return ((ByteBufferBackedInputStream) getStream()).getPosition();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void seek(long newPos) {
|
||||
((ByteBufferBackedInputStream) getStream()).seek(newPos);
|
||||
}
|
||||
};
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,94 @@
|
||||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.parquet.io;
|
||||
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.parquet.io.OutputFile;
|
||||
import org.apache.parquet.io.PositionOutputStream;
|
||||
|
||||
import javax.annotation.Nonnull;
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
* Implementation of the {@link OutputFile} backed by {@link java.io.OutputStream}
|
||||
*/
|
||||
public class OutputStreamBackedOutputFile implements OutputFile {
|
||||
|
||||
private static final long DEFAULT_BLOCK_SIZE = 1024L * 1024L;
|
||||
|
||||
private final FSDataOutputStream outputStream;
|
||||
|
||||
public OutputStreamBackedOutputFile(FSDataOutputStream outputStream) {
|
||||
this.outputStream = outputStream;
|
||||
}
|
||||
|
||||
@Override
|
||||
public PositionOutputStream create(long blockSizeHint) {
|
||||
return new PositionOutputStreamAdapter(outputStream);
|
||||
}
|
||||
|
||||
@Override
|
||||
public PositionOutputStream createOrOverwrite(long blockSizeHint) {
|
||||
return create(blockSizeHint);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean supportsBlockSize() {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long defaultBlockSize() {
|
||||
return DEFAULT_BLOCK_SIZE;
|
||||
}
|
||||
|
||||
private static class PositionOutputStreamAdapter extends PositionOutputStream {
|
||||
private final FSDataOutputStream delegate;
|
||||
|
||||
PositionOutputStreamAdapter(FSDataOutputStream delegate) {
|
||||
this.delegate = delegate;
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getPos() throws IOException {
|
||||
return delegate.getPos();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(int b) throws IOException {
|
||||
delegate.write(b);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(@Nonnull byte[] buffer, int off, int len) throws IOException {
|
||||
delegate.write(buffer, off, len);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void flush() throws IOException {
|
||||
delegate.flush();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
// We're deliberately not closing the delegate stream here to allow caller
|
||||
// to explicitly manage its lifecycle
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -21,10 +21,12 @@ package org.apache.hudi.common.functional;
|
||||
import org.apache.avro.Schema;
|
||||
import org.apache.avro.generic.GenericRecord;
|
||||
import org.apache.avro.generic.IndexedRecord;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hbase.io.compress.Compression;
|
||||
import org.apache.hudi.avro.HoodieAvroUtils;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.HoodieArchivedLogFile;
|
||||
@@ -46,7 +48,9 @@ import org.apache.hudi.common.table.log.block.HoodieHFileDataBlock;
|
||||
import org.apache.hudi.common.table.log.block.HoodieLogBlock;
|
||||
import org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType;
|
||||
import org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType;
|
||||
import org.apache.hudi.common.table.log.block.HoodieParquetDataBlock;
|
||||
import org.apache.hudi.common.testutils.FileCreateUtils;
|
||||
import org.apache.hudi.common.testutils.HadoopMapRedUtils;
|
||||
import org.apache.hudi.common.testutils.HoodieCommonTestHarness;
|
||||
import org.apache.hudi.common.testutils.HoodieTestUtils;
|
||||
import org.apache.hudi.common.testutils.SchemaTestUtil;
|
||||
@@ -54,8 +58,9 @@ import org.apache.hudi.common.testutils.minicluster.MiniClusterUtil;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.collection.ExternalSpillableMap;
|
||||
import org.apache.hudi.exception.CorruptedLogFileException;
|
||||
|
||||
import org.apache.hudi.exception.HoodieIOException;
|
||||
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
|
||||
import org.apache.parquet.hadoop.util.counters.BenchmarkCounter;
|
||||
import org.junit.jupiter.api.AfterAll;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
@@ -96,11 +101,12 @@ import static org.junit.jupiter.params.provider.Arguments.arguments;
|
||||
@SuppressWarnings("Duplicates")
|
||||
public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
|
||||
private static final HoodieLogBlockType DEFAULT_DATA_BLOCK_TYPE = HoodieLogBlockType.AVRO_DATA_BLOCK;
|
||||
|
||||
private static String BASE_OUTPUT_PATH = "/tmp/";
|
||||
private FileSystem fs;
|
||||
private Path partitionPath;
|
||||
private int bufferSize = 4096;
|
||||
private HoodieLogBlockType dataBlockType = HoodieLogBlockType.AVRO_DATA_BLOCK;
|
||||
|
||||
@BeforeAll
|
||||
public static void setUpClass() throws IOException, InterruptedException {
|
||||
@@ -139,7 +145,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@EnumSource(names = { "AVRO_DATA_BLOCK", "HFILE_DATA_BLOCK" })
|
||||
@EnumSource(names = {"AVRO_DATA_BLOCK", "HFILE_DATA_BLOCK", "PARQUET_DATA_BLOCK"})
|
||||
public void testBasicAppend(HoodieLogBlockType dataBlockType) throws IOException, InterruptedException, URISyntaxException {
|
||||
Writer writer =
|
||||
HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION)
|
||||
@@ -171,7 +177,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
|
||||
HoodieDataBlock dataBlock = getDataBlock(records, header);
|
||||
HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header);
|
||||
// Write out a block
|
||||
AppendResult firstAppend = writer.appendBlock(dataBlock);
|
||||
// Get the size of the block
|
||||
@@ -186,7 +192,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION)
|
||||
.withFileId("test-fileid1").overBaseCommit("100").withFs(fs).withSizeThreshold(size - 1).build();
|
||||
records = SchemaTestUtil.generateTestRecords(0, 100);
|
||||
dataBlock = getDataBlock(records, header);
|
||||
dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header);
|
||||
AppendResult secondAppend = writer.appendBlock(dataBlock);
|
||||
|
||||
assertEquals(firstAppend.logFile(), secondAppend.logFile());
|
||||
@@ -198,7 +204,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
|
||||
// Write one more block, which should now roll over to a new log file.
|
||||
records = SchemaTestUtil.generateTestRecords(0, 100);
|
||||
dataBlock = getDataBlock(records, header);
|
||||
dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header);
|
||||
AppendResult rolloverAppend = writer.appendBlock(dataBlock);
|
||||
|
||||
assertNotEquals(secondAppend.logFile(), rolloverAppend.logFile());
|
||||
@@ -245,7 +251,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
|
||||
HoodieDataBlock dataBlock = getDataBlock(records, header);
|
||||
HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
Writer writer2 = builder2.build();
|
||||
writer2.appendBlock(dataBlock);
|
||||
@@ -257,8 +263,9 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
assertEquals(logFile1.getLogVersion(), logFile2.getLogVersion() - 1, "Log Files must have different versions");
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMultipleAppend() throws IOException, URISyntaxException, InterruptedException {
|
||||
@ParameterizedTest
|
||||
@EnumSource(names = {"AVRO_DATA_BLOCK", "HFILE_DATA_BLOCK", "PARQUET_DATA_BLOCK"})
|
||||
public void testMultipleAppend(HoodieLogBlockType dataBlockType) throws IOException, URISyntaxException, InterruptedException {
|
||||
Writer writer =
|
||||
HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION)
|
||||
.withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
|
||||
@@ -266,7 +273,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
|
||||
HoodieDataBlock dataBlock = getDataBlock(records, header);
|
||||
HoodieDataBlock dataBlock = getDataBlock(dataBlockType, records, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
long size1 = writer.getCurrentSize();
|
||||
writer.close();
|
||||
@@ -276,7 +283,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
.withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
|
||||
records = SchemaTestUtil.generateTestRecords(0, 100);
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
|
||||
dataBlock = getDataBlock(records, header);
|
||||
dataBlock = getDataBlock(dataBlockType, records, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
long size2 = writer.getCurrentSize();
|
||||
assertTrue(size2 > size1, "We just wrote a new block - size2 should be > size1");
|
||||
@@ -290,7 +297,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
.withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
|
||||
records = SchemaTestUtil.generateTestRecords(0, 100);
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
|
||||
dataBlock = getDataBlock(records, header);
|
||||
dataBlock = getDataBlock(dataBlockType, records, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
long size3 = writer.getCurrentSize();
|
||||
assertTrue(size3 > size2, "We just wrote a new block - size3 should be > size2");
|
||||
@@ -309,6 +316,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
* This is actually a test on concurrent append and not recovery lease. Commenting this out.
|
||||
* https://issues.apache.org/jira/browse/HUDI-117
|
||||
*/
|
||||
|
||||
/**
|
||||
* @Test public void testLeaseRecovery() throws IOException, URISyntaxException, InterruptedException { Writer writer
|
||||
* = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath)
|
||||
@@ -320,7 +328,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
* dataBlock = new HoodieAvroDataBlock(records, header); writer = writer.appendBlock(dataBlock); long size1 =
|
||||
* writer.getCurrentSize(); // do not close this writer - this simulates a data node appending to a log dying
|
||||
* without closing the file // writer.close();
|
||||
*
|
||||
* <p>
|
||||
* writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath)
|
||||
* .withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100")
|
||||
* .withFs(fs).build(); records = SchemaTestUtil.generateTestRecords(0, 100);
|
||||
@@ -344,7 +352,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
|
||||
HoodieDataBlock dataBlock = getDataBlock(records, header);
|
||||
HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header);
|
||||
|
||||
for (int i = 0; i < 2; i++) {
|
||||
Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(testPath)
|
||||
@@ -371,14 +379,14 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
|
||||
HoodieDataBlock dataBlock = getDataBlock(records, header);
|
||||
HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
writer.close();
|
||||
|
||||
Reader reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), SchemaTestUtil.getSimpleSchema());
|
||||
assertTrue(reader.hasNext(), "We wrote a block, we should be able to read it");
|
||||
HoodieLogBlock nextBlock = reader.next();
|
||||
assertEquals(dataBlockType, nextBlock.getBlockType(), "The next block should be a data block");
|
||||
assertEquals(DEFAULT_DATA_BLOCK_TYPE, nextBlock.getBlockType(), "The next block should be a data block");
|
||||
HoodieDataBlock dataBlockRead = (HoodieDataBlock) nextBlock;
|
||||
assertEquals(copyOfRecords.size(), dataBlockRead.getRecords().size(),
|
||||
"Read records size should be equal to the written records size");
|
||||
@@ -400,10 +408,10 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
|
||||
byte[] dataBlockContentBytes = getDataBlock(records, header).getContentBytes();
|
||||
HoodieDataBlock reusableDataBlock = new HoodieAvroDataBlock(null, null,
|
||||
Option.ofNullable(dataBlockContentBytes), false, 0, dataBlockContentBytes.length,
|
||||
0, getSimpleSchema(), header, new HashMap<>(), HoodieRecord.RECORD_KEY_METADATA_FIELD);
|
||||
byte[] dataBlockContentBytes = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header).getContentBytes();
|
||||
HoodieLogBlock.HoodieLogBlockContentLocation logBlockContentLoc = new HoodieLogBlock.HoodieLogBlockContentLocation(new Configuration(), null, 0, dataBlockContentBytes.length, 0);
|
||||
HoodieDataBlock reusableDataBlock = new HoodieAvroDataBlock(null, Option.ofNullable(dataBlockContentBytes), false,
|
||||
logBlockContentLoc, Option.ofNullable(getSimpleSchema()), header, new HashMap<>(), HoodieRecord.RECORD_KEY_METADATA_FIELD);
|
||||
long writtenSize = 0;
|
||||
int logBlockWrittenNum = 0;
|
||||
while (writtenSize < Integer.MAX_VALUE) {
|
||||
@@ -418,7 +426,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
true, true);
|
||||
assertTrue(reader.hasNext(), "We wrote a block, we should be able to read it");
|
||||
HoodieLogBlock nextBlock = reader.next();
|
||||
assertEquals(dataBlockType, nextBlock.getBlockType(), "The next block should be a data block");
|
||||
assertEquals(DEFAULT_DATA_BLOCK_TYPE, nextBlock.getBlockType(), "The next block should be a data block");
|
||||
HoodieDataBlock dataBlockRead = (HoodieDataBlock) nextBlock;
|
||||
assertEquals(copyOfRecords.size(), dataBlockRead.getRecords().size(),
|
||||
"Read records size should be equal to the written records size");
|
||||
@@ -447,11 +455,16 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
oversizeWriter.close();
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBasicAppendAndRead() throws IOException, URISyntaxException, InterruptedException {
|
||||
Writer writer =
|
||||
HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION)
|
||||
.withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
|
||||
@ParameterizedTest
|
||||
@EnumSource(names = {"AVRO_DATA_BLOCK", "HFILE_DATA_BLOCK", "PARQUET_DATA_BLOCK"})
|
||||
public void testBasicAppendAndRead(HoodieLogBlockType dataBlockType) throws IOException, URISyntaxException, InterruptedException {
|
||||
Writer writer = HoodieLogFormat.newWriterBuilder()
|
||||
.onParentPath(partitionPath)
|
||||
.withFileExtension(HoodieLogFile.DELTA_EXTENSION)
|
||||
.withFileId("test-fileid1")
|
||||
.overBaseCommit("100")
|
||||
.withFs(fs)
|
||||
.build();
|
||||
List<IndexedRecord> records1 = SchemaTestUtil.generateTestRecords(0, 100);
|
||||
Schema schema = getSimpleSchema();
|
||||
List<IndexedRecord> copyOfRecords1 = records1.stream()
|
||||
@@ -459,30 +472,39 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
|
||||
HoodieDataBlock dataBlock = getDataBlock(records1, header);
|
||||
HoodieDataBlock dataBlock = getDataBlock(dataBlockType, records1, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
writer.close();
|
||||
|
||||
writer =
|
||||
HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION)
|
||||
.withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
|
||||
writer = HoodieLogFormat.newWriterBuilder()
|
||||
.onParentPath(partitionPath)
|
||||
.withFileExtension(HoodieLogFile.DELTA_EXTENSION)
|
||||
.withFileId("test-fileid1")
|
||||
.overBaseCommit("100")
|
||||
.withFs(fs)
|
||||
.build();
|
||||
List<IndexedRecord> records2 = SchemaTestUtil.generateTestRecords(0, 100);
|
||||
List<IndexedRecord> copyOfRecords2 = records2.stream()
|
||||
.map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
|
||||
dataBlock = getDataBlock(records2, header);
|
||||
dataBlock = getDataBlock(dataBlockType, records2, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
writer.close();
|
||||
|
||||
// Close and Open again and append 100 more records
|
||||
writer =
|
||||
HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION)
|
||||
.withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
|
||||
writer = HoodieLogFormat.newWriterBuilder()
|
||||
.onParentPath(partitionPath)
|
||||
.withFileExtension(HoodieLogFile.DELTA_EXTENSION)
|
||||
.withFileId("test-fileid1")
|
||||
.overBaseCommit("100")
|
||||
.withFs(fs)
|
||||
.build();
|
||||
|
||||
List<IndexedRecord> records3 = SchemaTestUtil.generateTestRecords(0, 100);
|
||||
List<IndexedRecord> copyOfRecords3 = records3.stream()
|
||||
.map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
|
||||
dataBlock = getDataBlock(records3, header);
|
||||
dataBlock = getDataBlock(dataBlockType, records3, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
writer.close();
|
||||
|
||||
@@ -538,7 +560,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
.map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
|
||||
allRecords.add(copyOfRecords1);
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
|
||||
HoodieDataBlock dataBlock = getDataBlock(records1, header);
|
||||
HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
}
|
||||
writer.close();
|
||||
@@ -580,7 +602,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
|
||||
HoodieDataBlock dataBlock = getDataBlock(records, header);
|
||||
HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
writer.close();
|
||||
|
||||
@@ -606,7 +628,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
.withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
|
||||
records = SchemaTestUtil.generateTestRecords(0, 10);
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
|
||||
dataBlock = getDataBlock(records, header);
|
||||
dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
writer.close();
|
||||
|
||||
@@ -644,7 +666,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
.withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
|
||||
records = SchemaTestUtil.generateTestRecords(0, 100);
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
|
||||
dataBlock = getDataBlock(records, header);
|
||||
dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
writer.close();
|
||||
|
||||
@@ -674,7 +696,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
|
||||
HoodieDataBlock dataBlock = getDataBlock(records, header);
|
||||
HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
writer.close();
|
||||
|
||||
@@ -702,7 +724,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
.withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
|
||||
records = SchemaTestUtil.generateTestRecords(0, 10);
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
|
||||
dataBlock = getDataBlock(records, header);
|
||||
dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
writer.close();
|
||||
|
||||
@@ -741,7 +763,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
|
||||
HoodieDataBlock dataBlock = getDataBlock(records1, header);
|
||||
HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
|
||||
// Write 2
|
||||
@@ -749,7 +771,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
List<IndexedRecord> copyOfRecords2 = records2.stream()
|
||||
.map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
|
||||
dataBlock = getDataBlock(records2, header);
|
||||
dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records2, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
writer.close();
|
||||
|
||||
@@ -804,14 +826,14 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
|
||||
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
|
||||
HoodieDataBlock dataBlock = getDataBlock(records1, header);
|
||||
HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
|
||||
// Write 2
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "101");
|
||||
List<IndexedRecord> records2 = SchemaTestUtil.generateHoodieTestRecords(0, 100);
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
|
||||
dataBlock = getDataBlock(records2, header);
|
||||
dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records2, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
|
||||
// Rollback the last write
|
||||
@@ -827,7 +849,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
List<IndexedRecord> copyOfRecords3 = records3.stream()
|
||||
.map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
|
||||
dataBlock = getDataBlock(records3, header);
|
||||
dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records3, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
writer.close();
|
||||
|
||||
@@ -880,7 +902,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
|
||||
HoodieDataBlock dataBlock = getDataBlock(records1, header);
|
||||
HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
writer.close();
|
||||
|
||||
@@ -914,7 +936,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
.map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
|
||||
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
|
||||
dataBlock = getDataBlock(records3, header);
|
||||
dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records3, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
writer.close();
|
||||
|
||||
@@ -968,7 +990,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
|
||||
HoodieDataBlock dataBlock = getDataBlock(records1, header);
|
||||
HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
|
||||
// Write 2
|
||||
@@ -976,7 +998,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
List<IndexedRecord> records2 = SchemaTestUtil.generateHoodieTestRecords(0, 100);
|
||||
List<IndexedRecord> copyOfRecords2 = records2.stream()
|
||||
.map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
|
||||
dataBlock = getDataBlock(records2, header);
|
||||
dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records2, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
|
||||
copyOfRecords1.addAll(copyOfRecords2);
|
||||
@@ -1089,13 +1111,13 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, "100");
|
||||
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
|
||||
HoodieDataBlock dataBlock = getDataBlock(records1, header);
|
||||
HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
|
||||
// Write 2
|
||||
List<IndexedRecord> records2 = SchemaTestUtil.generateHoodieTestRecords(0, 100);
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
|
||||
dataBlock = getDataBlock(records2, header);
|
||||
dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records2, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
|
||||
// Delete 50 keys
|
||||
@@ -1173,7 +1195,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, "100");
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
|
||||
HoodieDataBlock dataBlock = getDataBlock(records1, header);
|
||||
HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
|
||||
// Delete 50 keys
|
||||
@@ -1232,7 +1254,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
|
||||
HoodieDataBlock dataBlock = getDataBlock(records1, header);
|
||||
HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
|
||||
FileCreateUtils.createDeltaCommit(basePath, "100", fs);
|
||||
@@ -1290,7 +1312,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME, "100");
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
|
||||
HoodieDataBlock dataBlock = getDataBlock(records1, header);
|
||||
HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
writer.appendBlock(dataBlock);
|
||||
writer.appendBlock(dataBlock);
|
||||
@@ -1354,7 +1376,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
|
||||
HoodieDataBlock dataBlock = getDataBlock(records1, header);
|
||||
HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
writer.appendBlock(dataBlock);
|
||||
writer.appendBlock(dataBlock);
|
||||
@@ -1473,7 +1495,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
|
||||
HoodieDataBlock dataBlock = getDataBlock(records.subList(0, numRecordsInLog1), header);
|
||||
HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records.subList(0, numRecordsInLog1), header);
|
||||
writer.appendBlock(dataBlock);
|
||||
// Get the size of the block
|
||||
long size = writer.getCurrentSize();
|
||||
@@ -1487,7 +1509,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
Map<HoodieLogBlock.HeaderMetadataType, String> header2 = new HashMap<>();
|
||||
header2.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
|
||||
header2.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
|
||||
HoodieDataBlock dataBlock2 = getDataBlock(records2.subList(0, numRecordsInLog2), header2);
|
||||
HoodieDataBlock dataBlock2 = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records2.subList(0, numRecordsInLog2), header2);
|
||||
writer2.appendBlock(dataBlock2);
|
||||
// Get the size of the block
|
||||
writer2.close();
|
||||
@@ -1574,7 +1596,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
|
||||
HoodieDataBlock dataBlock = getDataBlock(records1, header);
|
||||
HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
writer.close();
|
||||
|
||||
@@ -1584,7 +1606,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
List<IndexedRecord> records2 = SchemaTestUtil.generateTestRecords(0, 100);
|
||||
List<IndexedRecord> copyOfRecords2 = records2.stream()
|
||||
.map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
|
||||
dataBlock = getDataBlock(records2, header);
|
||||
dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records2, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
writer.close();
|
||||
|
||||
@@ -1595,7 +1617,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
List<IndexedRecord> records3 = SchemaTestUtil.generateTestRecords(0, 100);
|
||||
List<IndexedRecord> copyOfRecords3 = records3.stream()
|
||||
.map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
|
||||
dataBlock = getDataBlock(records3, header);
|
||||
dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records3, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
writer.close();
|
||||
|
||||
@@ -1646,7 +1668,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
|
||||
HoodieDataBlock dataBlock = getDataBlock(records, header);
|
||||
HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
writer.close();
|
||||
|
||||
@@ -1674,7 +1696,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION)
|
||||
.withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
|
||||
records = SchemaTestUtil.generateTestRecords(0, 100);
|
||||
dataBlock = getDataBlock(records, header);
|
||||
dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
writer.close();
|
||||
|
||||
@@ -1708,7 +1730,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
|
||||
HoodieDataBlock dataBlock = getDataBlock(records1, header);
|
||||
HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
writer.close();
|
||||
|
||||
@@ -1716,7 +1738,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION)
|
||||
.withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
|
||||
List<IndexedRecord> records2 = SchemaTestUtil.generateTestRecords(0, 100);
|
||||
dataBlock = getDataBlock(records2, header);
|
||||
dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records2, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
writer.close();
|
||||
|
||||
@@ -1725,7 +1747,7 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION)
|
||||
.withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
|
||||
List<IndexedRecord> records3 = SchemaTestUtil.generateTestRecords(0, 100);
|
||||
dataBlock = getDataBlock(records3, header);
|
||||
dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records3, header);
|
||||
writer.appendBlock(dataBlock);
|
||||
writer.close();
|
||||
|
||||
@@ -1786,8 +1808,66 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
}
|
||||
}
|
||||
|
||||
private HoodieDataBlock getDataBlock(List<IndexedRecord> records, Map<HeaderMetadataType, String> header) {
|
||||
return getDataBlock(dataBlockType, records, header);
|
||||
@ParameterizedTest
|
||||
@EnumSource(names = {"AVRO_DATA_BLOCK", "HFILE_DATA_BLOCK", "PARQUET_DATA_BLOCK"})
|
||||
public void testDataBlockFormatAppendAndReadWithProjectedSchema(
|
||||
HoodieLogBlockType dataBlockType
|
||||
) throws IOException, URISyntaxException, InterruptedException {
|
||||
Writer writer = HoodieLogFormat.newWriterBuilder()
|
||||
.onParentPath(partitionPath)
|
||||
.withFileExtension(HoodieLogFile.DELTA_EXTENSION)
|
||||
.withFileId("test-fileid1")
|
||||
.overBaseCommit("100")
|
||||
.withFs(fs)
|
||||
.build();
|
||||
|
||||
List<GenericRecord> records = SchemaTestUtil.generateTestGenericRecords(0, 1000);
|
||||
|
||||
Schema schema = getSimpleSchema();
|
||||
|
||||
Map<HoodieLogBlock.HeaderMetadataType, String> header =
|
||||
new HashMap<HoodieLogBlock.HeaderMetadataType, String>() {{
|
||||
put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
|
||||
put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
|
||||
}};
|
||||
|
||||
// Init Benchmark to report number of bytes actually read from the Block
|
||||
BenchmarkCounter.initCounterFromReporter(HadoopMapRedUtils.createTestReporter(), fs.getConf());
|
||||
|
||||
// NOTE: Have to use this ugly hack since List generic is not covariant in its type param
|
||||
HoodieDataBlock dataBlock = getDataBlock(dataBlockType, (List<IndexedRecord>)(List) records, header);
|
||||
|
||||
writer.appendBlock(dataBlock);
|
||||
writer.close();
|
||||
|
||||
Schema projectedSchema = HoodieAvroUtils.generateProjectionSchema(schema, Collections.singletonList("name"));
|
||||
|
||||
List<GenericRecord> projectedRecords = HoodieAvroUtils.rewriteRecords(records, projectedSchema);
|
||||
|
||||
try (Reader reader = HoodieLogFormat.newReader(fs, writer.getLogFile(), projectedSchema, true, false)) {
|
||||
assertTrue(reader.hasNext(), "First block should be available");
|
||||
|
||||
HoodieLogBlock nextBlock = reader.next();
|
||||
|
||||
HoodieDataBlock dataBlockRead = (HoodieDataBlock) nextBlock;
|
||||
|
||||
Map<HoodieLogBlockType, Integer> expectedReadBytes =
|
||||
new HashMap<HoodieLogBlockType, Integer>() {{
|
||||
put(HoodieLogBlockType.AVRO_DATA_BLOCK, 0); // not supported
|
||||
put(HoodieLogBlockType.HFILE_DATA_BLOCK, 0); // not supported
|
||||
put(HoodieLogBlockType.PARQUET_DATA_BLOCK, 2605);
|
||||
}};
|
||||
|
||||
assertEquals(projectedRecords.size(), dataBlockRead.getRecords().size(),
|
||||
"Read records size should be equal to the written records size");
|
||||
assertEquals(projectedRecords, dataBlockRead.getRecords(),
|
||||
"Both records lists should be the same. (ordering guaranteed)");
|
||||
assertEquals(dataBlockRead.getSchema(), projectedSchema);
|
||||
|
||||
int bytesRead = (int) BenchmarkCounter.getBytesRead();
|
||||
|
||||
assertEquals(expectedReadBytes.get(dataBlockType), bytesRead, "Read bytes have to match");
|
||||
}
|
||||
}
|
||||
|
||||
private HoodieDataBlock getDataBlock(HoodieLogBlockType dataBlockType, List<IndexedRecord> records,
|
||||
@@ -1796,7 +1876,9 @@ public class TestHoodieLogFormat extends HoodieCommonTestHarness {
|
||||
case AVRO_DATA_BLOCK:
|
||||
return new HoodieAvroDataBlock(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD);
|
||||
case HFILE_DATA_BLOCK:
|
||||
return new HoodieHFileDataBlock(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD);
|
||||
return new HoodieHFileDataBlock(records, header, Compression.Algorithm.GZ);
|
||||
case PARQUET_DATA_BLOCK:
|
||||
return new HoodieParquetDataBlock(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD, CompressionCodecName.GZIP);
|
||||
default:
|
||||
throw new RuntimeException("Unknown data block type " + dataBlockType);
|
||||
}
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
package org.apache.hudi.common.functional;
|
||||
|
||||
import org.apache.hudi.common.model.HoodieArchivedLogFile;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.table.log.HoodieLogFormat;
|
||||
import org.apache.hudi.common.table.log.HoodieLogFormat.Writer;
|
||||
import org.apache.hudi.common.table.log.block.HoodieAvroDataBlock;
|
||||
@@ -104,7 +105,7 @@ public class TestHoodieLogFormatAppendFailure {
|
||||
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>(2);
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, getSimpleSchema().toString());
|
||||
HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header);
|
||||
HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD);
|
||||
|
||||
Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(testPath)
|
||||
.withFileExtension(HoodieArchivedLogFile.ARCHIVE_EXTENSION).withFileId("commits.archive")
|
||||
|
||||
@@ -0,0 +1,87 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.common.testutils;
|
||||
|
||||
import org.apache.hadoop.mapred.Counters;
|
||||
import org.apache.hadoop.mapred.InputSplit;
|
||||
import org.apache.hadoop.mapred.Reporter;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
public class HadoopMapRedUtils {
|
||||
|
||||
/**
|
||||
* Creates instance of {@link Reporter} to collect reported counters
|
||||
*/
|
||||
public static Reporter createTestReporter() {
|
||||
class TestReporter implements Reporter {
|
||||
private final ConcurrentHashMap<String, Counters.Counter> counters =
|
||||
new ConcurrentHashMap<>();
|
||||
|
||||
@Override
|
||||
public void setStatus(String status) {
|
||||
// not-supported
|
||||
}
|
||||
|
||||
@Override
|
||||
public Counters.Counter getCounter(Enum<?> name) {
|
||||
return counters.computeIfAbsent(name.name(), (ignored) -> new Counters.Counter());
|
||||
}
|
||||
|
||||
@Override
|
||||
public Counters.Counter getCounter(String group, String name) {
|
||||
return counters.computeIfAbsent(getKey(group, name), (ignored) -> new Counters.Counter());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void incrCounter(Enum<?> key, long amount) {
|
||||
Option.ofNullable(counters.get(key))
|
||||
.ifPresent(c -> c.increment(amount));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void incrCounter(String group, String counter, long amount) {
|
||||
Option.ofNullable(counters.get(getKey(group, counter)))
|
||||
.ifPresent(c -> c.increment(amount));
|
||||
}
|
||||
|
||||
@Override
|
||||
public InputSplit getInputSplit() throws UnsupportedOperationException {
|
||||
throw new UnsupportedOperationException("not supported");
|
||||
}
|
||||
|
||||
@Override
|
||||
public float getProgress() {
|
||||
return -1;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void progress() {
|
||||
// not-supported
|
||||
}
|
||||
|
||||
private String getKey(String group, String name) {
|
||||
return String.format("%s:%s", group, name);
|
||||
}
|
||||
}
|
||||
|
||||
return new TestReporter();
|
||||
}
|
||||
}
|
||||
@@ -71,6 +71,10 @@ public final class SchemaTestUtil {
|
||||
return toRecords(getSimpleSchema(), getSimpleSchema(), from, limit);
|
||||
}
|
||||
|
||||
public static List<GenericRecord> generateTestGenericRecords(int from, int limit) throws IOException, URISyntaxException {
|
||||
return toRecords(getSimpleSchema(), getSimpleSchema(), from, limit);
|
||||
}
|
||||
|
||||
public static List<String> generateTestJsonRecords(int from, int limit) throws IOException, URISyntaxException {
|
||||
Path dataPath = initializeSampleDataPath();
|
||||
|
||||
@@ -81,9 +85,9 @@ public final class SchemaTestUtil {
|
||||
}
|
||||
}
|
||||
|
||||
private static List<IndexedRecord> toRecords(Schema writerSchema, Schema readerSchema, int from, int limit)
|
||||
private static <T extends IndexedRecord> List<T> toRecords(Schema writerSchema, Schema readerSchema, int from, int limit)
|
||||
throws IOException, URISyntaxException {
|
||||
GenericDatumReader<IndexedRecord> reader = new GenericDatumReader<>(writerSchema, readerSchema);
|
||||
GenericDatumReader<T> reader = new GenericDatumReader<>(writerSchema, readerSchema);
|
||||
Path dataPath = initializeSampleDataPath();
|
||||
|
||||
try (Stream<String> stream = Files.lines(dataPath)) {
|
||||
|
||||
@@ -0,0 +1,113 @@
|
||||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package org.apache.hudi.common.util.io;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.nio.ByteBuffer;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertThrows;
|
||||
|
||||
public class TestByteBufferBackedInputStream {
|
||||
|
||||
@Test
|
||||
public void testConstructor() {
|
||||
byte[] bytes = { 0xD, 0xE, 0xA, 0xD, 0xD, 0xE, 0xE, 0xD };
|
||||
ByteBuffer byteBuf = ByteBuffer.wrap(bytes, 0, 1);
|
||||
ByteBuffer byteBufClone = byteBuf.duplicate();
|
||||
|
||||
// ByteBuffer ctor
|
||||
ByteBufferBackedInputStream first = new ByteBufferBackedInputStream(byteBuf);
|
||||
|
||||
assertEquals(first.read(), 0xD);
|
||||
assertThrows(IllegalArgumentException.class, first::read);
|
||||
// Make sure that the original buffer stays intact
|
||||
assertEquals(byteBufClone, byteBuf);
|
||||
|
||||
// byte[] ctor
|
||||
ByteBufferBackedInputStream second = new ByteBufferBackedInputStream(bytes);
|
||||
|
||||
assertEquals(second.read(), 0xD);
|
||||
|
||||
// byte[] ctor (w/ offset)
|
||||
ByteBufferBackedInputStream third = new ByteBufferBackedInputStream(bytes, 1, 1);
|
||||
|
||||
assertEquals(third.read(), 0xE);
|
||||
assertThrows(IllegalArgumentException.class, third::read);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRead() {
|
||||
byte[] sourceBytes = { 0xD, 0xE, 0xA, 0xD, 0xD, 0xE, 0xE, 0xD };
|
||||
|
||||
ByteBufferBackedInputStream stream = new ByteBufferBackedInputStream(sourceBytes);
|
||||
|
||||
int firstByte = stream.read();
|
||||
assertEquals(firstByte, 0xD);
|
||||
|
||||
byte[] readBytes = new byte[4];
|
||||
int read = stream.read(readBytes, 1, 3);
|
||||
|
||||
assertEquals(3, read);
|
||||
assertArrayEquals(new byte[]{0, 0xE, 0xA, 0xD}, readBytes);
|
||||
assertEquals(4, stream.getPosition());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSeek() {
|
||||
byte[] sourceBytes = { 0xD, 0xE, 0xA, 0xD, 0xD, 0xA, 0xE, 0xD };
|
||||
|
||||
ByteBufferBackedInputStream stream = new ByteBufferBackedInputStream(sourceBytes, 1, 7);
|
||||
|
||||
// Seek to 2 byte in the stream (3 in the original buffer)
|
||||
stream.seek(1);
|
||||
int firstRead = stream.read();
|
||||
assertEquals(0xA, firstRead);
|
||||
|
||||
// Seek to 5 byte in the stream (6 in the original buffer)
|
||||
stream.seek(5);
|
||||
int secondRead = stream.read();
|
||||
assertEquals(0xE, secondRead);
|
||||
|
||||
// Try to seek past the stream boundary
|
||||
assertThrows(IllegalArgumentException.class, () -> stream.seek(8));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testCopyFrom() {
|
||||
byte[] sourceBytes = { 0xD, 0xE, 0xA, 0xD, 0xD, 0xA, 0xE, 0xD };
|
||||
|
||||
ByteBufferBackedInputStream stream = new ByteBufferBackedInputStream(sourceBytes);
|
||||
|
||||
int firstByte = stream.read();
|
||||
assertEquals(firstByte, 0xD);
|
||||
|
||||
// Copy 5 byes from the stream (while keeping stream's state intact)
|
||||
byte[] targetBytes = new byte[5];
|
||||
stream.copyFrom(2, targetBytes, 0, targetBytes.length);
|
||||
|
||||
assertArrayEquals(new byte[] { 0xA, 0xD, 0xD, 0xA, 0xE }, targetBytes);
|
||||
|
||||
// Continue reading the stream from where we left of (before copying)
|
||||
int secondByte = stream.read();
|
||||
assertEquals(secondByte, 0xE);
|
||||
}
|
||||
}
|
||||
@@ -163,6 +163,12 @@ public class TestHoodieRealtimeRecordReader {
|
||||
HoodieLogBlock.HoodieLogBlockType.HFILE_DATA_BLOCK);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testParquetInlineReader() throws Exception {
|
||||
testReaderInternal(ExternalSpillableMap.DiskMapType.BITCASK, false, false,
|
||||
HoodieLogBlock.HoodieLogBlockType.PARQUET_DATA_BLOCK);
|
||||
}
|
||||
|
||||
private void testReaderInternal(ExternalSpillableMap.DiskMapType diskMapType,
|
||||
boolean isCompressionEnabled,
|
||||
boolean partitioned) throws Exception {
|
||||
|
||||
@@ -20,6 +20,7 @@ package org.apache.hudi.hadoop.testutils;
|
||||
|
||||
import org.apache.hadoop.fs.LocalFileSystem;
|
||||
import org.apache.hadoop.fs.RawLocalFileSystem;
|
||||
import org.apache.hadoop.hbase.io.compress.Compression;
|
||||
import org.apache.hudi.avro.HoodieAvroUtils;
|
||||
import org.apache.hudi.common.fs.FSUtils;
|
||||
import org.apache.hudi.common.model.HoodieFileFormat;
|
||||
@@ -33,6 +34,7 @@ import org.apache.hudi.common.table.log.block.HoodieCommandBlock;
|
||||
import org.apache.hudi.common.table.log.block.HoodieDataBlock;
|
||||
import org.apache.hudi.common.table.log.block.HoodieHFileDataBlock;
|
||||
import org.apache.hudi.common.table.log.block.HoodieLogBlock;
|
||||
import org.apache.hudi.common.table.log.block.HoodieParquetDataBlock;
|
||||
import org.apache.hudi.common.testutils.HoodieTestUtils;
|
||||
import org.apache.hudi.common.testutils.SchemaTestUtil;
|
||||
import org.apache.hudi.hadoop.utils.HoodieHiveUtils;
|
||||
@@ -47,6 +49,7 @@ import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants;
|
||||
import org.apache.hadoop.hive.serde2.ColumnProjectionUtils;
|
||||
import org.apache.hadoop.mapred.JobConf;
|
||||
import org.apache.parquet.avro.AvroParquetWriter;
|
||||
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
@@ -363,9 +366,14 @@ public class InputFormatTestUtil {
|
||||
Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, newCommit);
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, writeSchema.toString());
|
||||
HoodieDataBlock dataBlock = (logBlockType == HoodieLogBlock.HoodieLogBlockType.HFILE_DATA_BLOCK)
|
||||
? new HoodieHFileDataBlock(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD) :
|
||||
new HoodieAvroDataBlock(records, header);
|
||||
HoodieDataBlock dataBlock = null;
|
||||
if (logBlockType == HoodieLogBlock.HoodieLogBlockType.HFILE_DATA_BLOCK) {
|
||||
dataBlock = new HoodieHFileDataBlock(records, header, Compression.Algorithm.GZ);
|
||||
} else if (logBlockType == HoodieLogBlock.HoodieLogBlockType.PARQUET_DATA_BLOCK) {
|
||||
dataBlock = new HoodieParquetDataBlock(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD, CompressionCodecName.GZIP);
|
||||
} else {
|
||||
dataBlock = new HoodieAvroDataBlock(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD);
|
||||
}
|
||||
writer.appendBlock(dataBlock);
|
||||
return writer;
|
||||
}
|
||||
|
||||
@@ -28,6 +28,7 @@ import org.apache.hudi.common.model.HoodieBaseFile;
|
||||
import org.apache.hudi.common.model.HoodieCommitMetadata;
|
||||
import org.apache.hudi.common.model.HoodieDeltaWriteStat;
|
||||
import org.apache.hudi.common.model.HoodieLogFile;
|
||||
import org.apache.hudi.common.model.HoodieRecord;
|
||||
import org.apache.hudi.common.model.HoodieReplaceCommitMetadata;
|
||||
import org.apache.hudi.common.model.HoodieTableType;
|
||||
import org.apache.hudi.common.model.HoodieWriteStat;
|
||||
@@ -428,7 +429,7 @@ public class HiveTestUtil {
|
||||
Map<HeaderMetadataType, String> header = new HashMap<>(2);
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, dataFile.getCommitTime());
|
||||
header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
|
||||
HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header);
|
||||
HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records, header, HoodieRecord.RECORD_KEY_METADATA_FIELD);
|
||||
logWriter.appendBlock(dataBlock);
|
||||
logWriter.close();
|
||||
return logWriter.getLogFile();
|
||||
|
||||
Reference in New Issue
Block a user