[HUDI-3083] Support component data types for flink bulk_insert (#4470)
* [HUDI-3083] Support component data types for flink bulk_insert * add nested row type test
This commit is contained in:
@@ -18,19 +18,22 @@
|
|||||||
|
|
||||||
package org.apache.hudi.io.storage.row.parquet;
|
package org.apache.hudi.io.storage.row.parquet;
|
||||||
|
|
||||||
|
import org.apache.flink.table.data.ArrayData;
|
||||||
import org.apache.flink.table.data.DecimalDataUtils;
|
import org.apache.flink.table.data.DecimalDataUtils;
|
||||||
|
import org.apache.flink.table.data.MapData;
|
||||||
import org.apache.flink.table.data.RowData;
|
import org.apache.flink.table.data.RowData;
|
||||||
import org.apache.flink.table.data.TimestampData;
|
import org.apache.flink.table.data.TimestampData;
|
||||||
|
import org.apache.flink.table.types.logical.ArrayType;
|
||||||
import org.apache.flink.table.types.logical.DecimalType;
|
import org.apache.flink.table.types.logical.DecimalType;
|
||||||
import org.apache.flink.table.types.logical.LocalZonedTimestampType;
|
import org.apache.flink.table.types.logical.LocalZonedTimestampType;
|
||||||
import org.apache.flink.table.types.logical.LogicalType;
|
import org.apache.flink.table.types.logical.LogicalType;
|
||||||
|
import org.apache.flink.table.types.logical.MapType;
|
||||||
import org.apache.flink.table.types.logical.RowType;
|
import org.apache.flink.table.types.logical.RowType;
|
||||||
import org.apache.flink.table.types.logical.TimestampType;
|
import org.apache.flink.table.types.logical.TimestampType;
|
||||||
import org.apache.flink.util.Preconditions;
|
import org.apache.flink.util.Preconditions;
|
||||||
import org.apache.parquet.io.api.Binary;
|
import org.apache.parquet.io.api.Binary;
|
||||||
import org.apache.parquet.io.api.RecordConsumer;
|
import org.apache.parquet.io.api.RecordConsumer;
|
||||||
import org.apache.parquet.schema.GroupType;
|
import org.apache.parquet.schema.GroupType;
|
||||||
import org.apache.parquet.schema.Type;
|
|
||||||
|
|
||||||
import java.nio.ByteBuffer;
|
import java.nio.ByteBuffer;
|
||||||
import java.nio.ByteOrder;
|
import java.nio.ByteOrder;
|
||||||
@@ -46,7 +49,8 @@ import static org.apache.flink.formats.parquet.vector.reader.TimestampColumnRead
|
|||||||
/**
|
/**
|
||||||
* Writes a record to the Parquet API with the expected schema in order to be written to a file.
|
* Writes a record to the Parquet API with the expected schema in order to be written to a file.
|
||||||
*
|
*
|
||||||
* <p>Reference org.apache.flink.formats.parquet.row.ParquetRowDataWriter to support timestamp of INT64 8 bytes.
|
* <p>Reference {@code org.apache.flink.formats.parquet.row.ParquetRowDataWriter}
|
||||||
|
* to support timestamp of INT64 8 bytes and complex data types.
|
||||||
*/
|
*/
|
||||||
public class ParquetRowDataWriter {
|
public class ParquetRowDataWriter {
|
||||||
|
|
||||||
@@ -67,7 +71,7 @@ public class ParquetRowDataWriter {
|
|||||||
this.filedWriters = new FieldWriter[rowType.getFieldCount()];
|
this.filedWriters = new FieldWriter[rowType.getFieldCount()];
|
||||||
this.fieldNames = rowType.getFieldNames().toArray(new String[0]);
|
this.fieldNames = rowType.getFieldNames().toArray(new String[0]);
|
||||||
for (int i = 0; i < rowType.getFieldCount(); i++) {
|
for (int i = 0; i < rowType.getFieldCount(); i++) {
|
||||||
this.filedWriters[i] = createWriter(rowType.getTypeAt(i), schema.getType(i));
|
this.filedWriters[i] = createWriter(rowType.getTypeAt(i));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -91,8 +95,7 @@ public class ParquetRowDataWriter {
|
|||||||
recordConsumer.endMessage();
|
recordConsumer.endMessage();
|
||||||
}
|
}
|
||||||
|
|
||||||
private FieldWriter createWriter(LogicalType t, Type type) {
|
private FieldWriter createWriter(LogicalType t) {
|
||||||
if (type.isPrimitive()) {
|
|
||||||
switch (t.getTypeRoot()) {
|
switch (t.getTypeRoot()) {
|
||||||
case CHAR:
|
case CHAR:
|
||||||
case VARCHAR:
|
case VARCHAR:
|
||||||
@@ -133,17 +136,34 @@ public class ParquetRowDataWriter {
|
|||||||
} else {
|
} else {
|
||||||
return new Timestamp96Writer(localZonedTimestampType.getPrecision());
|
return new Timestamp96Writer(localZonedTimestampType.getPrecision());
|
||||||
}
|
}
|
||||||
|
case ARRAY:
|
||||||
|
ArrayType arrayType = (ArrayType) t;
|
||||||
|
LogicalType elementType = arrayType.getElementType();
|
||||||
|
FieldWriter elementWriter = createWriter(elementType);
|
||||||
|
return new ArrayWriter(elementWriter);
|
||||||
|
case MAP:
|
||||||
|
MapType mapType = (MapType) t;
|
||||||
|
LogicalType keyType = mapType.getKeyType();
|
||||||
|
LogicalType valueType = mapType.getValueType();
|
||||||
|
FieldWriter keyWriter = createWriter(keyType);
|
||||||
|
FieldWriter valueWriter = createWriter(valueType);
|
||||||
|
return new MapWriter(keyWriter, valueWriter);
|
||||||
|
case ROW:
|
||||||
|
RowType rowType = (RowType) t;
|
||||||
|
FieldWriter[] fieldWriters = rowType.getFields().stream()
|
||||||
|
.map(RowType.RowField::getType).map(this::createWriter).toArray(FieldWriter[]::new);
|
||||||
|
String[] fieldNames = rowType.getFields().stream()
|
||||||
|
.map(RowType.RowField::getName).toArray(String[]::new);
|
||||||
|
return new RowWriter(fieldNames, fieldWriters);
|
||||||
default:
|
default:
|
||||||
throw new UnsupportedOperationException("Unsupported type: " + type);
|
throw new UnsupportedOperationException("Unsupported type: " + t);
|
||||||
}
|
|
||||||
} else {
|
|
||||||
throw new IllegalArgumentException("Unsupported data type: " + t);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private interface FieldWriter {
|
private interface FieldWriter {
|
||||||
|
|
||||||
void write(RowData row, int ordinal);
|
void write(RowData row, int ordinal);
|
||||||
|
|
||||||
|
void write(ArrayData array, int ordinal);
|
||||||
}
|
}
|
||||||
|
|
||||||
private class BooleanWriter implements FieldWriter {
|
private class BooleanWriter implements FieldWriter {
|
||||||
@@ -152,6 +172,11 @@ public class ParquetRowDataWriter {
|
|||||||
public void write(RowData row, int ordinal) {
|
public void write(RowData row, int ordinal) {
|
||||||
recordConsumer.addBoolean(row.getBoolean(ordinal));
|
recordConsumer.addBoolean(row.getBoolean(ordinal));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void write(ArrayData array, int ordinal) {
|
||||||
|
recordConsumer.addBoolean(array.getBoolean(ordinal));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private class ByteWriter implements FieldWriter {
|
private class ByteWriter implements FieldWriter {
|
||||||
@@ -160,6 +185,11 @@ public class ParquetRowDataWriter {
|
|||||||
public void write(RowData row, int ordinal) {
|
public void write(RowData row, int ordinal) {
|
||||||
recordConsumer.addInteger(row.getByte(ordinal));
|
recordConsumer.addInteger(row.getByte(ordinal));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void write(ArrayData array, int ordinal) {
|
||||||
|
recordConsumer.addInteger(array.getByte(ordinal));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private class ShortWriter implements FieldWriter {
|
private class ShortWriter implements FieldWriter {
|
||||||
@@ -168,6 +198,11 @@ public class ParquetRowDataWriter {
|
|||||||
public void write(RowData row, int ordinal) {
|
public void write(RowData row, int ordinal) {
|
||||||
recordConsumer.addInteger(row.getShort(ordinal));
|
recordConsumer.addInteger(row.getShort(ordinal));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void write(ArrayData array, int ordinal) {
|
||||||
|
recordConsumer.addInteger(array.getShort(ordinal));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private class LongWriter implements FieldWriter {
|
private class LongWriter implements FieldWriter {
|
||||||
@@ -176,6 +211,11 @@ public class ParquetRowDataWriter {
|
|||||||
public void write(RowData row, int ordinal) {
|
public void write(RowData row, int ordinal) {
|
||||||
recordConsumer.addLong(row.getLong(ordinal));
|
recordConsumer.addLong(row.getLong(ordinal));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void write(ArrayData array, int ordinal) {
|
||||||
|
recordConsumer.addLong(array.getLong(ordinal));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private class FloatWriter implements FieldWriter {
|
private class FloatWriter implements FieldWriter {
|
||||||
@@ -184,6 +224,11 @@ public class ParquetRowDataWriter {
|
|||||||
public void write(RowData row, int ordinal) {
|
public void write(RowData row, int ordinal) {
|
||||||
recordConsumer.addFloat(row.getFloat(ordinal));
|
recordConsumer.addFloat(row.getFloat(ordinal));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void write(ArrayData array, int ordinal) {
|
||||||
|
recordConsumer.addFloat(array.getFloat(ordinal));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private class DoubleWriter implements FieldWriter {
|
private class DoubleWriter implements FieldWriter {
|
||||||
@@ -192,6 +237,11 @@ public class ParquetRowDataWriter {
|
|||||||
public void write(RowData row, int ordinal) {
|
public void write(RowData row, int ordinal) {
|
||||||
recordConsumer.addDouble(row.getDouble(ordinal));
|
recordConsumer.addDouble(row.getDouble(ordinal));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void write(ArrayData array, int ordinal) {
|
||||||
|
recordConsumer.addDouble(array.getDouble(ordinal));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private class StringWriter implements FieldWriter {
|
private class StringWriter implements FieldWriter {
|
||||||
@@ -200,6 +250,11 @@ public class ParquetRowDataWriter {
|
|||||||
public void write(RowData row, int ordinal) {
|
public void write(RowData row, int ordinal) {
|
||||||
recordConsumer.addBinary(Binary.fromReusedByteArray(row.getString(ordinal).toBytes()));
|
recordConsumer.addBinary(Binary.fromReusedByteArray(row.getString(ordinal).toBytes()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void write(ArrayData array, int ordinal) {
|
||||||
|
recordConsumer.addBinary(Binary.fromReusedByteArray(array.getString(ordinal).toBytes()));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private class BinaryWriter implements FieldWriter {
|
private class BinaryWriter implements FieldWriter {
|
||||||
@@ -208,6 +263,11 @@ public class ParquetRowDataWriter {
|
|||||||
public void write(RowData row, int ordinal) {
|
public void write(RowData row, int ordinal) {
|
||||||
recordConsumer.addBinary(Binary.fromReusedByteArray(row.getBinary(ordinal)));
|
recordConsumer.addBinary(Binary.fromReusedByteArray(row.getBinary(ordinal)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void write(ArrayData array, int ordinal) {
|
||||||
|
recordConsumer.addBinary(Binary.fromReusedByteArray(array.getBinary(ordinal)));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private class IntWriter implements FieldWriter {
|
private class IntWriter implements FieldWriter {
|
||||||
@@ -216,6 +276,11 @@ public class ParquetRowDataWriter {
|
|||||||
public void write(RowData row, int ordinal) {
|
public void write(RowData row, int ordinal) {
|
||||||
recordConsumer.addInteger(row.getInt(ordinal));
|
recordConsumer.addInteger(row.getInt(ordinal));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void write(ArrayData array, int ordinal) {
|
||||||
|
recordConsumer.addInteger(array.getInt(ordinal));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -231,6 +296,11 @@ public class ParquetRowDataWriter {
|
|||||||
public void write(RowData row, int ordinal) {
|
public void write(RowData row, int ordinal) {
|
||||||
recordConsumer.addLong(timestampToInt64(row.getTimestamp(ordinal, 3)));
|
recordConsumer.addLong(timestampToInt64(row.getTimestamp(ordinal, 3)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void write(ArrayData array, int ordinal) {
|
||||||
|
recordConsumer.addLong(timestampToInt64(array.getTimestamp(ordinal, 3)));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private long timestampToInt64(TimestampData timestampData) {
|
private long timestampToInt64(TimestampData timestampData) {
|
||||||
@@ -254,6 +324,11 @@ public class ParquetRowDataWriter {
|
|||||||
public void write(RowData row, int ordinal) {
|
public void write(RowData row, int ordinal) {
|
||||||
recordConsumer.addBinary(timestampToInt96(row.getTimestamp(ordinal, precision)));
|
recordConsumer.addBinary(timestampToInt96(row.getTimestamp(ordinal, precision)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void write(ArrayData array, int ordinal) {
|
||||||
|
recordConsumer.addBinary(timestampToInt96(array.getTimestamp(ordinal, precision)));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private Binary timestampToInt96(TimestampData timestampData) {
|
private Binary timestampToInt96(TimestampData timestampData) {
|
||||||
@@ -304,10 +379,20 @@ public class ParquetRowDataWriter {
|
|||||||
@Override
|
@Override
|
||||||
public void write(RowData row, int ordinal) {
|
public void write(RowData row, int ordinal) {
|
||||||
long unscaledLong = row.getDecimal(ordinal, precision, scale).toUnscaledLong();
|
long unscaledLong = row.getDecimal(ordinal, precision, scale).toUnscaledLong();
|
||||||
|
doWrite(unscaledLong);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void write(ArrayData array, int ordinal) {
|
||||||
|
long unscaledLong = array.getDecimal(ordinal, precision, scale).toUnscaledLong();
|
||||||
|
doWrite(unscaledLong);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void doWrite(long unscaled) {
|
||||||
int i = 0;
|
int i = 0;
|
||||||
int shift = initShift;
|
int shift = initShift;
|
||||||
while (i < numBytes) {
|
while (i < numBytes) {
|
||||||
decimalBuffer[i] = (byte) (unscaledLong >> shift);
|
decimalBuffer[i] = (byte) (unscaled >> shift);
|
||||||
i += 1;
|
i += 1;
|
||||||
shift -= 8;
|
shift -= 8;
|
||||||
}
|
}
|
||||||
@@ -328,6 +413,16 @@ public class ParquetRowDataWriter {
|
|||||||
@Override
|
@Override
|
||||||
public void write(RowData row, int ordinal) {
|
public void write(RowData row, int ordinal) {
|
||||||
byte[] bytes = row.getDecimal(ordinal, precision, scale).toUnscaledBytes();
|
byte[] bytes = row.getDecimal(ordinal, precision, scale).toUnscaledBytes();
|
||||||
|
doWrite(bytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void write(ArrayData array, int ordinal) {
|
||||||
|
byte[] bytes = array.getDecimal(ordinal, precision, scale).toUnscaledBytes();
|
||||||
|
doWrite(bytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void doWrite(byte[] bytes) {
|
||||||
byte[] writtenBytes;
|
byte[] writtenBytes;
|
||||||
if (bytes.length == numBytes) {
|
if (bytes.length == numBytes) {
|
||||||
// Avoid copy.
|
// Avoid copy.
|
||||||
@@ -353,5 +448,132 @@ public class ParquetRowDataWriter {
|
|||||||
// 19 <= precision <= 38, writes as FIXED_LEN_BYTE_ARRAY
|
// 19 <= precision <= 38, writes as FIXED_LEN_BYTE_ARRAY
|
||||||
return new UnscaledBytesWriter();
|
return new UnscaledBytesWriter();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private class ArrayWriter implements FieldWriter {
|
||||||
|
private final FieldWriter elementWriter;
|
||||||
|
|
||||||
|
private ArrayWriter(FieldWriter elementWriter) {
|
||||||
|
this.elementWriter = elementWriter;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void write(RowData row, int ordinal) {
|
||||||
|
ArrayData arrayData = row.getArray(ordinal);
|
||||||
|
doWrite(arrayData);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void write(ArrayData array, int ordinal) {
|
||||||
|
ArrayData arrayData = array.getArray(ordinal);
|
||||||
|
doWrite(arrayData);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void doWrite(ArrayData arrayData) {
|
||||||
|
recordConsumer.startGroup();
|
||||||
|
if (arrayData.size() > 0) {
|
||||||
|
final String repeatedGroup = "list";
|
||||||
|
final String elementField = "element";
|
||||||
|
recordConsumer.startField(repeatedGroup, 0);
|
||||||
|
for (int i = 0; i < arrayData.size(); i++) {
|
||||||
|
recordConsumer.startGroup();
|
||||||
|
if (!arrayData.isNullAt(i)) {
|
||||||
|
// Only creates the element field if the current array element is not null.
|
||||||
|
recordConsumer.startField(elementField, 0);
|
||||||
|
elementWriter.write(arrayData, i);
|
||||||
|
recordConsumer.endField(elementField, 0);
|
||||||
|
}
|
||||||
|
recordConsumer.endGroup();
|
||||||
|
}
|
||||||
|
recordConsumer.endField(repeatedGroup, 0);
|
||||||
|
}
|
||||||
|
recordConsumer.endGroup();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private class MapWriter implements FieldWriter {
|
||||||
|
private final FieldWriter keyWriter;
|
||||||
|
private final FieldWriter valueWriter;
|
||||||
|
|
||||||
|
private MapWriter(FieldWriter keyWriter, FieldWriter valueWriter) {
|
||||||
|
this.keyWriter = keyWriter;
|
||||||
|
this.valueWriter = valueWriter;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void write(RowData row, int ordinal) {
|
||||||
|
MapData map = row.getMap(ordinal);
|
||||||
|
doWrite(map);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void write(ArrayData array, int ordinal) {
|
||||||
|
MapData map = array.getMap(ordinal);
|
||||||
|
doWrite(map);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void doWrite(MapData mapData) {
|
||||||
|
ArrayData keyArray = mapData.keyArray();
|
||||||
|
ArrayData valueArray = mapData.valueArray();
|
||||||
|
recordConsumer.startGroup();
|
||||||
|
if (mapData.size() > 0) {
|
||||||
|
final String repeatedGroup = "key_value";
|
||||||
|
final String kField = "key";
|
||||||
|
final String vField = "value";
|
||||||
|
recordConsumer.startField(repeatedGroup, 0);
|
||||||
|
for (int i = 0; i < mapData.size(); i++) {
|
||||||
|
recordConsumer.startGroup();
|
||||||
|
// key
|
||||||
|
recordConsumer.startField(kField, 0);
|
||||||
|
this.keyWriter.write(keyArray, i);
|
||||||
|
recordConsumer.endField(kField, 0);
|
||||||
|
// value
|
||||||
|
if (!valueArray.isNullAt(i)) {
|
||||||
|
// Only creates the "value" field if the value if non-empty
|
||||||
|
recordConsumer.startField(vField, 1);
|
||||||
|
this.valueWriter.write(valueArray, i);
|
||||||
|
recordConsumer.endField(vField, 1);
|
||||||
|
}
|
||||||
|
recordConsumer.endGroup();
|
||||||
|
}
|
||||||
|
recordConsumer.endField(repeatedGroup, 0);
|
||||||
|
}
|
||||||
|
recordConsumer.endGroup();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private class RowWriter implements FieldWriter {
|
||||||
|
private final String[] fieldNames;
|
||||||
|
private final FieldWriter[] fieldWriters;
|
||||||
|
|
||||||
|
private RowWriter(String[] fieldNames, FieldWriter[] fieldWriters) {
|
||||||
|
this.fieldNames = fieldNames;
|
||||||
|
this.fieldWriters = fieldWriters;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void write(RowData row, int ordinal) {
|
||||||
|
RowData nested = row.getRow(ordinal, fieldWriters.length);
|
||||||
|
doWrite(nested);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void write(ArrayData array, int ordinal) {
|
||||||
|
RowData nested = array.getRow(ordinal, fieldWriters.length);
|
||||||
|
doWrite(nested);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void doWrite(RowData row) {
|
||||||
|
recordConsumer.startGroup();
|
||||||
|
for (int i = 0; i < row.getArity(); i++) {
|
||||||
|
if (!row.isNullAt(i)) {
|
||||||
|
String fieldName = fieldNames[i];
|
||||||
|
recordConsumer.startField(fieldName, i);
|
||||||
|
fieldWriters[i].write(row, i);
|
||||||
|
recordConsumer.endField(fieldName, i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
recordConsumer.endGroup();
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -25,9 +25,11 @@ import org.apache.flink.api.common.typeinfo.TypeInformation;
|
|||||||
import org.apache.flink.api.java.typeutils.MapTypeInfo;
|
import org.apache.flink.api.java.typeutils.MapTypeInfo;
|
||||||
import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo;
|
import org.apache.flink.api.java.typeutils.ObjectArrayTypeInfo;
|
||||||
import org.apache.flink.api.java.typeutils.RowTypeInfo;
|
import org.apache.flink.api.java.typeutils.RowTypeInfo;
|
||||||
|
import org.apache.flink.table.types.logical.ArrayType;
|
||||||
import org.apache.flink.table.types.logical.DecimalType;
|
import org.apache.flink.table.types.logical.DecimalType;
|
||||||
import org.apache.flink.table.types.logical.LocalZonedTimestampType;
|
import org.apache.flink.table.types.logical.LocalZonedTimestampType;
|
||||||
import org.apache.flink.table.types.logical.LogicalType;
|
import org.apache.flink.table.types.logical.LogicalType;
|
||||||
|
import org.apache.flink.table.types.logical.MapType;
|
||||||
import org.apache.flink.table.types.logical.RowType;
|
import org.apache.flink.table.types.logical.RowType;
|
||||||
|
|
||||||
import org.apache.flink.table.types.logical.TimestampType;
|
import org.apache.flink.table.types.logical.TimestampType;
|
||||||
@@ -616,6 +618,45 @@ public class ParquetSchemaConverter {
|
|||||||
return Types.primitive(PrimitiveType.PrimitiveTypeName.INT96, repetition)
|
return Types.primitive(PrimitiveType.PrimitiveTypeName.INT96, repetition)
|
||||||
.named(name);
|
.named(name);
|
||||||
}
|
}
|
||||||
|
case ARRAY:
|
||||||
|
// <list-repetition> group <name> (LIST) {
|
||||||
|
// repeated group list {
|
||||||
|
// <element-repetition> <element-type> element;
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
ArrayType arrayType = (ArrayType) type;
|
||||||
|
LogicalType elementType = arrayType.getElementType();
|
||||||
|
return Types
|
||||||
|
.buildGroup(repetition).as(OriginalType.LIST)
|
||||||
|
.addField(
|
||||||
|
Types.repeatedGroup()
|
||||||
|
.addField(convertToParquetType("element", elementType, repetition))
|
||||||
|
.named("list"))
|
||||||
|
.named(name);
|
||||||
|
case MAP:
|
||||||
|
// <map-repetition> group <name> (MAP) {
|
||||||
|
// repeated group key_value {
|
||||||
|
// required <key-type> key;
|
||||||
|
// <value-repetition> <value-type> value;
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
MapType mapType = (MapType) type;
|
||||||
|
LogicalType keyType = mapType.getKeyType();
|
||||||
|
LogicalType valueType = mapType.getValueType();
|
||||||
|
return Types
|
||||||
|
.buildGroup(repetition).as(OriginalType.MAP)
|
||||||
|
.addField(
|
||||||
|
Types
|
||||||
|
.repeatedGroup()
|
||||||
|
.addField(convertToParquetType("key", keyType, repetition))
|
||||||
|
.addField(convertToParquetType("value", valueType, repetition))
|
||||||
|
.named("key_value"))
|
||||||
|
.named(name);
|
||||||
|
case ROW:
|
||||||
|
RowType rowType = (RowType) type;
|
||||||
|
Types.GroupBuilder<GroupType> builder = Types.buildGroup(repetition);
|
||||||
|
rowType.getFields().forEach(field -> builder.addField(convertToParquetType(field.getName(), field.getType(), repetition)));
|
||||||
|
return builder.named(name);
|
||||||
default:
|
default:
|
||||||
throw new UnsupportedOperationException("Unsupported type: " + type);
|
throw new UnsupportedOperationException("Unsupported type: " + type);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,74 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hudi.io.storage.row.parquet;
|
||||||
|
|
||||||
|
import org.apache.flink.table.api.DataTypes;
|
||||||
|
import org.apache.flink.table.types.DataType;
|
||||||
|
import org.apache.flink.table.types.logical.RowType;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import static org.hamcrest.CoreMatchers.is;
|
||||||
|
import static org.hamcrest.MatcherAssert.assertThat;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Test cases for {@link ParquetSchemaConverter}.
|
||||||
|
*/
|
||||||
|
public class TestParquetSchemaConverter {
|
||||||
|
@Test
|
||||||
|
void testConvertComplexTypes() {
|
||||||
|
DataType dataType = DataTypes.ROW(
|
||||||
|
DataTypes.FIELD("f_array",
|
||||||
|
DataTypes.ARRAY(DataTypes.CHAR(10))),
|
||||||
|
DataTypes.FIELD("f_map",
|
||||||
|
DataTypes.MAP(DataTypes.INT(), DataTypes.VARCHAR(20))),
|
||||||
|
DataTypes.FIELD("f_row",
|
||||||
|
DataTypes.ROW(
|
||||||
|
DataTypes.FIELD("f_row_f0", DataTypes.INT()),
|
||||||
|
DataTypes.FIELD("f_row_f1", DataTypes.VARCHAR(10)),
|
||||||
|
DataTypes.FIELD("f_row_f2",
|
||||||
|
DataTypes.ROW(
|
||||||
|
DataTypes.FIELD("f_row_f2_f0", DataTypes.INT()),
|
||||||
|
DataTypes.FIELD("f_row_f2_f1", DataTypes.VARCHAR(10)))))));
|
||||||
|
org.apache.parquet.schema.MessageType messageType =
|
||||||
|
ParquetSchemaConverter.convertToParquetMessageType("converted", (RowType) dataType.getLogicalType());
|
||||||
|
assertThat(messageType.getColumns().size(), is(7));
|
||||||
|
final String expected = "message converted {\n"
|
||||||
|
+ " optional group f_array (LIST) {\n"
|
||||||
|
+ " repeated group list {\n"
|
||||||
|
+ " optional binary element (UTF8);\n"
|
||||||
|
+ " }\n"
|
||||||
|
+ " }\n"
|
||||||
|
+ " optional group f_map (MAP) {\n"
|
||||||
|
+ " repeated group key_value {\n"
|
||||||
|
+ " optional int32 key;\n"
|
||||||
|
+ " optional binary value (UTF8);\n"
|
||||||
|
+ " }\n"
|
||||||
|
+ " }\n"
|
||||||
|
+ " optional group f_row {\n"
|
||||||
|
+ " optional int32 f_row_f0;\n"
|
||||||
|
+ " optional binary f_row_f1 (UTF8);\n"
|
||||||
|
+ " optional group f_row_f2 {\n"
|
||||||
|
+ " optional int32 f_row_f2_f0;\n"
|
||||||
|
+ " optional binary f_row_f2_f1 (UTF8);\n"
|
||||||
|
+ " }\n"
|
||||||
|
+ " }\n"
|
||||||
|
+ "}\n";
|
||||||
|
assertThat(messageType.toString(), is(expected));
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -30,7 +30,10 @@ import org.apache.flink.configuration.Configuration;
|
|||||||
import org.apache.flink.table.data.RowData;
|
import org.apache.flink.table.data.RowData;
|
||||||
import org.apache.flink.table.types.logical.RowType;
|
import org.apache.flink.table.types.logical.RowType;
|
||||||
import org.apache.flink.util.Collector;
|
import org.apache.flink.util.Collector;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -43,6 +46,7 @@ import java.util.List;
|
|||||||
* @see StreamWriteOperatorCoordinator
|
* @see StreamWriteOperatorCoordinator
|
||||||
*/
|
*/
|
||||||
public class AppendWriteFunction<I> extends AbstractStreamWriteFunction<I> {
|
public class AppendWriteFunction<I> extends AbstractStreamWriteFunction<I> {
|
||||||
|
private static final Logger LOG = LoggerFactory.getLogger(AppendWriteFunction.class);
|
||||||
|
|
||||||
private static final long serialVersionUID = 1L;
|
private static final long serialVersionUID = 1L;
|
||||||
|
|
||||||
@@ -113,14 +117,19 @@ public class AppendWriteFunction<I> extends AbstractStreamWriteFunction<I> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private void flushData(boolean endInput) {
|
private void flushData(boolean endInput) {
|
||||||
if (this.writerHelper == null) {
|
final List<WriteStatus> writeStatus;
|
||||||
// does not process any inputs, returns early.
|
final String instant;
|
||||||
return;
|
if (this.writerHelper != null) {
|
||||||
|
writeStatus = this.writerHelper.getWriteStatuses(this.taskID);
|
||||||
|
instant = this.writerHelper.getInstantTime();
|
||||||
|
} else {
|
||||||
|
LOG.info("No data to write in subtask [{}] for instant [{}]", taskID, currentInstant);
|
||||||
|
writeStatus = Collections.emptyList();
|
||||||
|
instant = instantToWrite(false);
|
||||||
}
|
}
|
||||||
final List<WriteStatus> writeStatus = this.writerHelper.getWriteStatuses(this.taskID);
|
|
||||||
final WriteMetadataEvent event = WriteMetadataEvent.builder()
|
final WriteMetadataEvent event = WriteMetadataEvent.builder()
|
||||||
.taskID(taskID)
|
.taskID(taskID)
|
||||||
.instantTime(this.writerHelper.getInstantTime())
|
.instantTime(instant)
|
||||||
.writeStatus(writeStatus)
|
.writeStatus(writeStatus)
|
||||||
.lastBatch(true)
|
.lastBatch(true)
|
||||||
.endInput(endInput)
|
.endInput(endInput)
|
||||||
|
|||||||
@@ -18,10 +18,11 @@
|
|||||||
|
|
||||||
package org.apache.hudi.table.format.cow;
|
package org.apache.hudi.table.format.cow;
|
||||||
|
|
||||||
|
import org.apache.hudi.table.format.cow.data.ColumnarRowData;
|
||||||
|
import org.apache.hudi.table.format.cow.vector.VectorizedColumnBatch;
|
||||||
|
|
||||||
import org.apache.flink.formats.parquet.vector.reader.ColumnReader;
|
import org.apache.flink.formats.parquet.vector.reader.ColumnReader;
|
||||||
import org.apache.flink.table.data.ColumnarRowData;
|
|
||||||
import org.apache.flink.table.data.vector.ColumnVector;
|
import org.apache.flink.table.data.vector.ColumnVector;
|
||||||
import org.apache.flink.table.data.vector.VectorizedColumnBatch;
|
|
||||||
import org.apache.flink.table.data.vector.writable.WritableColumnVector;
|
import org.apache.flink.table.data.vector.writable.WritableColumnVector;
|
||||||
import org.apache.flink.table.types.logical.LogicalType;
|
import org.apache.flink.table.types.logical.LogicalType;
|
||||||
import org.apache.flink.table.types.logical.LogicalTypeRoot;
|
import org.apache.flink.table.types.logical.LogicalTypeRoot;
|
||||||
@@ -208,11 +209,14 @@ public class ParquetColumnarRowSplitReader implements Closeable {
|
|||||||
|
|
||||||
private WritableColumnVector[] createWritableVectors() {
|
private WritableColumnVector[] createWritableVectors() {
|
||||||
WritableColumnVector[] columns = new WritableColumnVector[requestedTypes.length];
|
WritableColumnVector[] columns = new WritableColumnVector[requestedTypes.length];
|
||||||
|
List<Type> types = requestedSchema.getFields();
|
||||||
|
List<ColumnDescriptor> descriptors = requestedSchema.getColumns();
|
||||||
for (int i = 0; i < requestedTypes.length; i++) {
|
for (int i = 0; i < requestedTypes.length; i++) {
|
||||||
columns[i] = createWritableColumnVector(
|
columns[i] = createWritableColumnVector(
|
||||||
batchSize,
|
batchSize,
|
||||||
requestedTypes[i],
|
requestedTypes[i],
|
||||||
requestedSchema.getColumns().get(i).getPrimitiveType());
|
types.get(i),
|
||||||
|
descriptors);
|
||||||
}
|
}
|
||||||
return columns;
|
return columns;
|
||||||
}
|
}
|
||||||
@@ -236,11 +240,6 @@ public class ParquetColumnarRowSplitReader implements Closeable {
|
|||||||
* Check that the requested schema is supported.
|
* Check that the requested schema is supported.
|
||||||
*/
|
*/
|
||||||
for (int i = 0; i < requestedSchema.getFieldCount(); ++i) {
|
for (int i = 0; i < requestedSchema.getFieldCount(); ++i) {
|
||||||
Type t = requestedSchema.getFields().get(i);
|
|
||||||
if (!t.isPrimitive() || t.isRepetition(Type.Repetition.REPEATED)) {
|
|
||||||
throw new UnsupportedOperationException("Complex types not supported.");
|
|
||||||
}
|
|
||||||
|
|
||||||
String[] colPath = requestedSchema.getPaths().get(i);
|
String[] colPath = requestedSchema.getPaths().get(i);
|
||||||
if (fileSchema.containsPath(colPath)) {
|
if (fileSchema.containsPath(colPath)) {
|
||||||
ColumnDescriptor fd = fileSchema.getColumnDescription(colPath);
|
ColumnDescriptor fd = fileSchema.getColumnDescription(colPath);
|
||||||
@@ -322,14 +321,16 @@ public class ParquetColumnarRowSplitReader implements Closeable {
|
|||||||
throw new IOException("expecting more rows but reached last block. Read "
|
throw new IOException("expecting more rows but reached last block. Read "
|
||||||
+ rowsReturned + " out of " + totalRowCount);
|
+ rowsReturned + " out of " + totalRowCount);
|
||||||
}
|
}
|
||||||
|
List<Type> types = requestedSchema.getFields();
|
||||||
List<ColumnDescriptor> columns = requestedSchema.getColumns();
|
List<ColumnDescriptor> columns = requestedSchema.getColumns();
|
||||||
columnReaders = new ColumnReader[columns.size()];
|
columnReaders = new ColumnReader[types.size()];
|
||||||
for (int i = 0; i < columns.size(); ++i) {
|
for (int i = 0; i < types.size(); ++i) {
|
||||||
columnReaders[i] = createColumnReader(
|
columnReaders[i] = createColumnReader(
|
||||||
utcTimestamp,
|
utcTimestamp,
|
||||||
requestedTypes[i],
|
requestedTypes[i],
|
||||||
columns.get(i),
|
types.get(i),
|
||||||
pages.getPageReader(columns.get(i)));
|
columns,
|
||||||
|
pages);
|
||||||
}
|
}
|
||||||
totalCountLoadedSoFar += pages.getRowCount();
|
totalCountLoadedSoFar += pages.getRowCount();
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -32,9 +32,9 @@ import org.apache.flink.table.data.vector.DecimalColumnVector;
|
|||||||
*/
|
*/
|
||||||
public class ParquetDecimalVector implements DecimalColumnVector {
|
public class ParquetDecimalVector implements DecimalColumnVector {
|
||||||
|
|
||||||
private final ColumnVector vector;
|
public final ColumnVector vector;
|
||||||
|
|
||||||
ParquetDecimalVector(ColumnVector vector) {
|
public ParquetDecimalVector(ColumnVector vector) {
|
||||||
this.vector = vector;
|
this.vector = vector;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -18,6 +18,15 @@
|
|||||||
|
|
||||||
package org.apache.hudi.table.format.cow;
|
package org.apache.hudi.table.format.cow;
|
||||||
|
|
||||||
|
import org.apache.hudi.common.util.ValidationUtils;
|
||||||
|
import org.apache.hudi.table.format.cow.vector.HeapArrayVector;
|
||||||
|
import org.apache.hudi.table.format.cow.vector.HeapMapColumnVector;
|
||||||
|
import org.apache.hudi.table.format.cow.vector.HeapRowColumnVector;
|
||||||
|
import org.apache.hudi.table.format.cow.vector.VectorizedColumnBatch;
|
||||||
|
import org.apache.hudi.table.format.cow.vector.reader.ArrayColumnReader;
|
||||||
|
import org.apache.hudi.table.format.cow.vector.reader.MapColumnReader;
|
||||||
|
import org.apache.hudi.table.format.cow.vector.reader.RowColumnReader;
|
||||||
|
|
||||||
import org.apache.flink.core.fs.Path;
|
import org.apache.flink.core.fs.Path;
|
||||||
import org.apache.flink.formats.parquet.vector.reader.BooleanColumnReader;
|
import org.apache.flink.formats.parquet.vector.reader.BooleanColumnReader;
|
||||||
import org.apache.flink.formats.parquet.vector.reader.ByteColumnReader;
|
import org.apache.flink.formats.parquet.vector.reader.ByteColumnReader;
|
||||||
@@ -32,7 +41,6 @@ import org.apache.flink.formats.parquet.vector.reader.TimestampColumnReader;
|
|||||||
import org.apache.flink.table.data.DecimalData;
|
import org.apache.flink.table.data.DecimalData;
|
||||||
import org.apache.flink.table.data.TimestampData;
|
import org.apache.flink.table.data.TimestampData;
|
||||||
import org.apache.flink.table.data.vector.ColumnVector;
|
import org.apache.flink.table.data.vector.ColumnVector;
|
||||||
import org.apache.flink.table.data.vector.VectorizedColumnBatch;
|
|
||||||
import org.apache.flink.table.data.vector.heap.HeapBooleanVector;
|
import org.apache.flink.table.data.vector.heap.HeapBooleanVector;
|
||||||
import org.apache.flink.table.data.vector.heap.HeapByteVector;
|
import org.apache.flink.table.data.vector.heap.HeapByteVector;
|
||||||
import org.apache.flink.table.data.vector.heap.HeapBytesVector;
|
import org.apache.flink.table.data.vector.heap.HeapBytesVector;
|
||||||
@@ -44,16 +52,24 @@ import org.apache.flink.table.data.vector.heap.HeapShortVector;
|
|||||||
import org.apache.flink.table.data.vector.heap.HeapTimestampVector;
|
import org.apache.flink.table.data.vector.heap.HeapTimestampVector;
|
||||||
import org.apache.flink.table.data.vector.writable.WritableColumnVector;
|
import org.apache.flink.table.data.vector.writable.WritableColumnVector;
|
||||||
import org.apache.flink.table.types.DataType;
|
import org.apache.flink.table.types.DataType;
|
||||||
|
import org.apache.flink.table.types.logical.ArrayType;
|
||||||
import org.apache.flink.table.types.logical.DecimalType;
|
import org.apache.flink.table.types.logical.DecimalType;
|
||||||
import org.apache.flink.table.types.logical.IntType;
|
import org.apache.flink.table.types.logical.IntType;
|
||||||
import org.apache.flink.table.types.logical.LogicalType;
|
import org.apache.flink.table.types.logical.LogicalType;
|
||||||
|
import org.apache.flink.table.types.logical.MapType;
|
||||||
|
import org.apache.flink.table.types.logical.RowType;
|
||||||
import org.apache.flink.table.types.logical.VarBinaryType;
|
import org.apache.flink.table.types.logical.VarBinaryType;
|
||||||
import org.apache.flink.util.Preconditions;
|
import org.apache.flink.util.Preconditions;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.parquet.ParquetRuntimeException;
|
||||||
import org.apache.parquet.column.ColumnDescriptor;
|
import org.apache.parquet.column.ColumnDescriptor;
|
||||||
|
import org.apache.parquet.column.page.PageReadStore;
|
||||||
import org.apache.parquet.column.page.PageReader;
|
import org.apache.parquet.column.page.PageReader;
|
||||||
|
import org.apache.parquet.schema.GroupType;
|
||||||
|
import org.apache.parquet.schema.InvalidSchemaException;
|
||||||
import org.apache.parquet.schema.OriginalType;
|
import org.apache.parquet.schema.OriginalType;
|
||||||
import org.apache.parquet.schema.PrimitiveType;
|
import org.apache.parquet.schema.PrimitiveType;
|
||||||
|
import org.apache.parquet.schema.Type;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.math.BigDecimal;
|
import java.math.BigDecimal;
|
||||||
@@ -61,6 +77,7 @@ import java.nio.charset.StandardCharsets;
|
|||||||
import java.sql.Date;
|
import java.sql.Date;
|
||||||
import java.time.LocalDate;
|
import java.time.LocalDate;
|
||||||
import java.time.LocalDateTime;
|
import java.time.LocalDateTime;
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
@@ -252,11 +269,40 @@ public class ParquetSplitReaderUtil {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static List<ColumnDescriptor> filterDescriptors(int depth, Type type, List<ColumnDescriptor> columns) throws ParquetRuntimeException {
|
||||||
|
List<ColumnDescriptor> filtered = new ArrayList<>();
|
||||||
|
for (ColumnDescriptor descriptor : columns) {
|
||||||
|
if (depth >= descriptor.getPath().length) {
|
||||||
|
throw new InvalidSchemaException("Expect depth " + depth + " for schema: " + descriptor);
|
||||||
|
}
|
||||||
|
if (type.getName().equals(descriptor.getPath()[depth])) {
|
||||||
|
filtered.add(descriptor);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ValidationUtils.checkState(filtered.size() > 0, "Corrupted Parquet schema");
|
||||||
|
return filtered;
|
||||||
|
}
|
||||||
|
|
||||||
public static ColumnReader createColumnReader(
|
public static ColumnReader createColumnReader(
|
||||||
boolean utcTimestamp,
|
boolean utcTimestamp,
|
||||||
LogicalType fieldType,
|
LogicalType fieldType,
|
||||||
ColumnDescriptor descriptor,
|
Type physicalType,
|
||||||
PageReader pageReader) throws IOException {
|
List<ColumnDescriptor> descriptors,
|
||||||
|
PageReadStore pages) throws IOException {
|
||||||
|
return createColumnReader(utcTimestamp, fieldType, physicalType, descriptors,
|
||||||
|
pages, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static ColumnReader createColumnReader(
|
||||||
|
boolean utcTimestamp,
|
||||||
|
LogicalType fieldType,
|
||||||
|
Type physicalType,
|
||||||
|
List<ColumnDescriptor> columns,
|
||||||
|
PageReadStore pages,
|
||||||
|
int depth) throws IOException {
|
||||||
|
List<ColumnDescriptor> descriptors = filterDescriptors(depth, physicalType, columns);
|
||||||
|
ColumnDescriptor descriptor = descriptors.get(0);
|
||||||
|
PageReader pageReader = pages.getPageReader(descriptor);
|
||||||
switch (fieldType.getTypeRoot()) {
|
switch (fieldType.getTypeRoot()) {
|
||||||
case BOOLEAN:
|
case BOOLEAN:
|
||||||
return new BooleanColumnReader(descriptor, pageReader);
|
return new BooleanColumnReader(descriptor, pageReader);
|
||||||
@@ -303,6 +349,45 @@ public class ParquetSplitReaderUtil {
|
|||||||
default:
|
default:
|
||||||
throw new AssertionError();
|
throw new AssertionError();
|
||||||
}
|
}
|
||||||
|
case ARRAY:
|
||||||
|
return new ArrayColumnReader(
|
||||||
|
descriptor,
|
||||||
|
pageReader,
|
||||||
|
utcTimestamp,
|
||||||
|
descriptor.getPrimitiveType(),
|
||||||
|
fieldType);
|
||||||
|
case MAP:
|
||||||
|
MapType mapType = (MapType) fieldType;
|
||||||
|
ArrayColumnReader keyReader =
|
||||||
|
new ArrayColumnReader(
|
||||||
|
descriptor,
|
||||||
|
pageReader,
|
||||||
|
utcTimestamp,
|
||||||
|
descriptor.getPrimitiveType(),
|
||||||
|
new ArrayType(mapType.getKeyType()));
|
||||||
|
ArrayColumnReader valueReader =
|
||||||
|
new ArrayColumnReader(
|
||||||
|
descriptors.get(1),
|
||||||
|
pages.getPageReader(descriptors.get(1)),
|
||||||
|
utcTimestamp,
|
||||||
|
descriptors.get(1).getPrimitiveType(),
|
||||||
|
new ArrayType(mapType.getValueType()));
|
||||||
|
return new MapColumnReader(keyReader, valueReader, fieldType);
|
||||||
|
case ROW:
|
||||||
|
RowType rowType = (RowType) fieldType;
|
||||||
|
GroupType groupType = physicalType.asGroupType();
|
||||||
|
List<ColumnReader> fieldReaders = new ArrayList<>();
|
||||||
|
for (int i = 0; i < rowType.getFieldCount(); i++) {
|
||||||
|
fieldReaders.add(
|
||||||
|
createColumnReader(
|
||||||
|
utcTimestamp,
|
||||||
|
rowType.getTypeAt(i),
|
||||||
|
groupType.getType(i),
|
||||||
|
descriptors,
|
||||||
|
pages,
|
||||||
|
depth + 1));
|
||||||
|
}
|
||||||
|
return new RowColumnReader(fieldReaders);
|
||||||
default:
|
default:
|
||||||
throw new UnsupportedOperationException(fieldType + " is not supported now.");
|
throw new UnsupportedOperationException(fieldType + " is not supported now.");
|
||||||
}
|
}
|
||||||
@@ -311,7 +396,19 @@ public class ParquetSplitReaderUtil {
|
|||||||
public static WritableColumnVector createWritableColumnVector(
|
public static WritableColumnVector createWritableColumnVector(
|
||||||
int batchSize,
|
int batchSize,
|
||||||
LogicalType fieldType,
|
LogicalType fieldType,
|
||||||
PrimitiveType primitiveType) {
|
Type physicalType,
|
||||||
|
List<ColumnDescriptor> descriptors) {
|
||||||
|
return createWritableColumnVector(batchSize, fieldType, physicalType, descriptors, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static WritableColumnVector createWritableColumnVector(
|
||||||
|
int batchSize,
|
||||||
|
LogicalType fieldType,
|
||||||
|
Type physicalType,
|
||||||
|
List<ColumnDescriptor> columns,
|
||||||
|
int depth) {
|
||||||
|
List<ColumnDescriptor> descriptors = filterDescriptors(depth, physicalType, columns);
|
||||||
|
PrimitiveType primitiveType = descriptors.get(0).getPrimitiveType();
|
||||||
PrimitiveType.PrimitiveTypeName typeName = primitiveType.getPrimitiveTypeName();
|
PrimitiveType.PrimitiveTypeName typeName = primitiveType.getPrimitiveTypeName();
|
||||||
switch (fieldType.getTypeRoot()) {
|
switch (fieldType.getTypeRoot()) {
|
||||||
case BOOLEAN:
|
case BOOLEAN:
|
||||||
@@ -371,6 +468,49 @@ public class ParquetSplitReaderUtil {
|
|||||||
&& primitiveType.getOriginalType() == OriginalType.DECIMAL,
|
&& primitiveType.getOriginalType() == OriginalType.DECIMAL,
|
||||||
"Unexpected type: %s", typeName);
|
"Unexpected type: %s", typeName);
|
||||||
return new HeapBytesVector(batchSize);
|
return new HeapBytesVector(batchSize);
|
||||||
|
case ARRAY:
|
||||||
|
ArrayType arrayType = (ArrayType) fieldType;
|
||||||
|
return new HeapArrayVector(
|
||||||
|
batchSize,
|
||||||
|
createWritableColumnVector(
|
||||||
|
batchSize,
|
||||||
|
arrayType.getElementType(),
|
||||||
|
physicalType,
|
||||||
|
descriptors,
|
||||||
|
depth));
|
||||||
|
case MAP:
|
||||||
|
MapType mapType = (MapType) fieldType;
|
||||||
|
GroupType repeatedType = physicalType.asGroupType().getType(0).asGroupType();
|
||||||
|
// the map column has three level paths.
|
||||||
|
return new HeapMapColumnVector(
|
||||||
|
batchSize,
|
||||||
|
createWritableColumnVector(
|
||||||
|
batchSize,
|
||||||
|
mapType.getKeyType(),
|
||||||
|
repeatedType.getType(0),
|
||||||
|
descriptors,
|
||||||
|
depth + 2),
|
||||||
|
createWritableColumnVector(
|
||||||
|
batchSize,
|
||||||
|
mapType.getValueType(),
|
||||||
|
repeatedType.getType(1),
|
||||||
|
descriptors,
|
||||||
|
depth + 2));
|
||||||
|
case ROW:
|
||||||
|
RowType rowType = (RowType) fieldType;
|
||||||
|
GroupType groupType = physicalType.asGroupType();
|
||||||
|
WritableColumnVector[] columnVectors =
|
||||||
|
new WritableColumnVector[rowType.getFieldCount()];
|
||||||
|
for (int i = 0; i < columnVectors.length; i++) {
|
||||||
|
columnVectors[i] =
|
||||||
|
createWritableColumnVector(
|
||||||
|
batchSize,
|
||||||
|
rowType.getTypeAt(i),
|
||||||
|
groupType.getType(i),
|
||||||
|
descriptors,
|
||||||
|
depth + 1);
|
||||||
|
}
|
||||||
|
return new HeapRowColumnVector(batchSize, columnVectors);
|
||||||
default:
|
default:
|
||||||
throw new UnsupportedOperationException(fieldType + " is not supported now.");
|
throw new UnsupportedOperationException(fieldType + " is not supported now.");
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -0,0 +1,272 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hudi.table.format.cow.data;
|
||||||
|
|
||||||
|
import org.apache.hudi.table.format.cow.vector.MapColumnVector;
|
||||||
|
|
||||||
|
import org.apache.flink.table.data.ArrayData;
|
||||||
|
import org.apache.flink.table.data.DecimalData;
|
||||||
|
import org.apache.flink.table.data.MapData;
|
||||||
|
import org.apache.flink.table.data.RawValueData;
|
||||||
|
import org.apache.flink.table.data.RowData;
|
||||||
|
import org.apache.flink.table.data.StringData;
|
||||||
|
import org.apache.flink.table.data.TimestampData;
|
||||||
|
import org.apache.flink.table.data.binary.TypedSetters;
|
||||||
|
import org.apache.flink.table.data.vector.ArrayColumnVector;
|
||||||
|
import org.apache.flink.table.data.vector.BooleanColumnVector;
|
||||||
|
import org.apache.flink.table.data.vector.ByteColumnVector;
|
||||||
|
import org.apache.flink.table.data.vector.BytesColumnVector;
|
||||||
|
import org.apache.flink.table.data.vector.ColumnVector;
|
||||||
|
import org.apache.flink.table.data.vector.DecimalColumnVector;
|
||||||
|
import org.apache.flink.table.data.vector.DoubleColumnVector;
|
||||||
|
import org.apache.flink.table.data.vector.FloatColumnVector;
|
||||||
|
import org.apache.flink.table.data.vector.IntColumnVector;
|
||||||
|
import org.apache.flink.table.data.vector.LongColumnVector;
|
||||||
|
import org.apache.flink.table.data.vector.RowColumnVector;
|
||||||
|
import org.apache.flink.table.data.vector.ShortColumnVector;
|
||||||
|
import org.apache.flink.table.data.vector.TimestampColumnVector;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Columnar array to support access to vector column data.
|
||||||
|
*
|
||||||
|
* <p>References {@code org.apache.flink.table.data.ColumnarArrayData} to include FLINK-15390.
|
||||||
|
*/
|
||||||
|
public final class ColumnarArrayData implements ArrayData, TypedSetters {
|
||||||
|
|
||||||
|
private final ColumnVector data;
|
||||||
|
private final int offset;
|
||||||
|
private final int numElements;
|
||||||
|
|
||||||
|
public ColumnarArrayData(ColumnVector data, int offset, int numElements) {
|
||||||
|
this.data = data;
|
||||||
|
this.offset = offset;
|
||||||
|
this.numElements = numElements;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int size() {
|
||||||
|
return numElements;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean isNullAt(int pos) {
|
||||||
|
return data.isNullAt(offset + pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setNullAt(int pos) {
|
||||||
|
throw new UnsupportedOperationException("Not support the operation!");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean getBoolean(int pos) {
|
||||||
|
return ((BooleanColumnVector) data).getBoolean(offset + pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public byte getByte(int pos) {
|
||||||
|
return ((ByteColumnVector) data).getByte(offset + pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public short getShort(int pos) {
|
||||||
|
return ((ShortColumnVector) data).getShort(offset + pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int getInt(int pos) {
|
||||||
|
return ((IntColumnVector) data).getInt(offset + pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long getLong(int pos) {
|
||||||
|
return ((LongColumnVector) data).getLong(offset + pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public float getFloat(int pos) {
|
||||||
|
return ((FloatColumnVector) data).getFloat(offset + pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double getDouble(int pos) {
|
||||||
|
return ((DoubleColumnVector) data).getDouble(offset + pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public StringData getString(int pos) {
|
||||||
|
BytesColumnVector.Bytes byteArray = getByteArray(pos);
|
||||||
|
return StringData.fromBytes(byteArray.data, byteArray.offset, byteArray.len);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public DecimalData getDecimal(int pos, int precision, int scale) {
|
||||||
|
return ((DecimalColumnVector) data).getDecimal(offset + pos, precision, scale);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TimestampData getTimestamp(int pos, int precision) {
|
||||||
|
return ((TimestampColumnVector) data).getTimestamp(offset + pos, precision);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public <T> RawValueData<T> getRawValue(int pos) {
|
||||||
|
throw new UnsupportedOperationException("RawValueData is not supported.");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public byte[] getBinary(int pos) {
|
||||||
|
BytesColumnVector.Bytes byteArray = getByteArray(pos);
|
||||||
|
if (byteArray.len == byteArray.data.length) {
|
||||||
|
return byteArray.data;
|
||||||
|
} else {
|
||||||
|
return Arrays.copyOfRange(byteArray.data, byteArray.offset, byteArray.len);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ArrayData getArray(int pos) {
|
||||||
|
return ((ArrayColumnVector) data).getArray(offset + pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public MapData getMap(int pos) {
|
||||||
|
return ((MapColumnVector) data).getMap(offset + pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public RowData getRow(int pos, int numFields) {
|
||||||
|
return ((RowColumnVector) data).getRow(offset + pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setBoolean(int pos, boolean value) {
|
||||||
|
throw new UnsupportedOperationException("Not support the operation!");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setByte(int pos, byte value) {
|
||||||
|
throw new UnsupportedOperationException("Not support the operation!");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setShort(int pos, short value) {
|
||||||
|
throw new UnsupportedOperationException("Not support the operation!");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setInt(int pos, int value) {
|
||||||
|
throw new UnsupportedOperationException("Not support the operation!");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setLong(int pos, long value) {
|
||||||
|
throw new UnsupportedOperationException("Not support the operation!");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setFloat(int pos, float value) {
|
||||||
|
throw new UnsupportedOperationException("Not support the operation!");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setDouble(int pos, double value) {
|
||||||
|
throw new UnsupportedOperationException("Not support the operation!");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setDecimal(int pos, DecimalData value, int precision) {
|
||||||
|
throw new UnsupportedOperationException("Not support the operation!");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setTimestamp(int pos, TimestampData value, int precision) {
|
||||||
|
throw new UnsupportedOperationException("Not support the operation!");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean[] toBooleanArray() {
|
||||||
|
boolean[] res = new boolean[numElements];
|
||||||
|
for (int i = 0; i < numElements; i++) {
|
||||||
|
res[i] = getBoolean(i);
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public byte[] toByteArray() {
|
||||||
|
byte[] res = new byte[numElements];
|
||||||
|
for (int i = 0; i < numElements; i++) {
|
||||||
|
res[i] = getByte(i);
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public short[] toShortArray() {
|
||||||
|
short[] res = new short[numElements];
|
||||||
|
for (int i = 0; i < numElements; i++) {
|
||||||
|
res[i] = getShort(i);
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int[] toIntArray() {
|
||||||
|
int[] res = new int[numElements];
|
||||||
|
for (int i = 0; i < numElements; i++) {
|
||||||
|
res[i] = getInt(i);
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long[] toLongArray() {
|
||||||
|
long[] res = new long[numElements];
|
||||||
|
for (int i = 0; i < numElements; i++) {
|
||||||
|
res[i] = getLong(i);
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public float[] toFloatArray() {
|
||||||
|
float[] res = new float[numElements];
|
||||||
|
for (int i = 0; i < numElements; i++) {
|
||||||
|
res[i] = getFloat(i);
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double[] toDoubleArray() {
|
||||||
|
double[] res = new double[numElements];
|
||||||
|
for (int i = 0; i < numElements; i++) {
|
||||||
|
res[i] = getDouble(i);
|
||||||
|
}
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
private BytesColumnVector.Bytes getByteArray(int pos) {
|
||||||
|
return ((BytesColumnVector) data).getBytes(offset + pos);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@@ -0,0 +1,75 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hudi.table.format.cow.data;
|
||||||
|
|
||||||
|
import org.apache.flink.table.data.ArrayData;
|
||||||
|
import org.apache.flink.table.data.ColumnarArrayData;
|
||||||
|
import org.apache.flink.table.data.MapData;
|
||||||
|
import org.apache.flink.table.data.vector.ColumnVector;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Columnar map to support access to vector column data.
|
||||||
|
*
|
||||||
|
* <p>Referenced from flink 1.14.0 {@code org.apache.flink.table.data.ColumnarMapData}.
|
||||||
|
*/
|
||||||
|
public final class ColumnarMapData implements MapData {
|
||||||
|
|
||||||
|
private final ColumnVector keyColumnVector;
|
||||||
|
private final ColumnVector valueColumnVector;
|
||||||
|
private final int offset;
|
||||||
|
private final int size;
|
||||||
|
|
||||||
|
public ColumnarMapData(
|
||||||
|
ColumnVector keyColumnVector,
|
||||||
|
ColumnVector valueColumnVector,
|
||||||
|
int offset,
|
||||||
|
int size) {
|
||||||
|
this.keyColumnVector = keyColumnVector;
|
||||||
|
this.valueColumnVector = valueColumnVector;
|
||||||
|
this.offset = offset;
|
||||||
|
this.size = size;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int size() {
|
||||||
|
return size;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ArrayData keyArray() {
|
||||||
|
return new ColumnarArrayData(keyColumnVector, offset, size);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ArrayData valueArray() {
|
||||||
|
return new ColumnarArrayData(valueColumnVector, offset, size);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object o) {
|
||||||
|
throw new UnsupportedOperationException(
|
||||||
|
"ColumnarMapData do not support equals, please compare fields one by one!");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
throw new UnsupportedOperationException(
|
||||||
|
"ColumnarMapData do not support hashCode, please hash fields one by one!");
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,232 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hudi.table.format.cow.data;
|
||||||
|
|
||||||
|
import org.apache.hudi.table.format.cow.vector.VectorizedColumnBatch;
|
||||||
|
|
||||||
|
import org.apache.flink.table.data.ArrayData;
|
||||||
|
import org.apache.flink.table.data.DecimalData;
|
||||||
|
import org.apache.flink.table.data.MapData;
|
||||||
|
import org.apache.flink.table.data.RawValueData;
|
||||||
|
import org.apache.flink.table.data.RowData;
|
||||||
|
import org.apache.flink.table.data.StringData;
|
||||||
|
import org.apache.flink.table.data.TimestampData;
|
||||||
|
import org.apache.flink.table.data.binary.TypedSetters;
|
||||||
|
import org.apache.flink.table.data.vector.BytesColumnVector;
|
||||||
|
import org.apache.flink.types.RowKind;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Columnar row to support access to vector column data.
|
||||||
|
* It is a row view in {@link VectorizedColumnBatch}.
|
||||||
|
*
|
||||||
|
* <p>References {@code org.apache.flink.table.data.ColumnarRowData} to include FLINK-15390.
|
||||||
|
*/
|
||||||
|
public final class ColumnarRowData implements RowData, TypedSetters {
|
||||||
|
|
||||||
|
private RowKind rowKind = RowKind.INSERT;
|
||||||
|
private VectorizedColumnBatch vectorizedColumnBatch;
|
||||||
|
private int rowId;
|
||||||
|
|
||||||
|
public ColumnarRowData() {
|
||||||
|
}
|
||||||
|
|
||||||
|
public ColumnarRowData(VectorizedColumnBatch vectorizedColumnBatch) {
|
||||||
|
this(vectorizedColumnBatch, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
public ColumnarRowData(VectorizedColumnBatch vectorizedColumnBatch, int rowId) {
|
||||||
|
this.vectorizedColumnBatch = vectorizedColumnBatch;
|
||||||
|
this.rowId = rowId;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setVectorizedColumnBatch(VectorizedColumnBatch vectorizedColumnBatch) {
|
||||||
|
this.vectorizedColumnBatch = vectorizedColumnBatch;
|
||||||
|
this.rowId = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setRowId(int rowId) {
|
||||||
|
this.rowId = rowId;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public RowKind getRowKind() {
|
||||||
|
return rowKind;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setRowKind(RowKind kind) {
|
||||||
|
this.rowKind = kind;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int getArity() {
|
||||||
|
return vectorizedColumnBatch.getArity();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean isNullAt(int pos) {
|
||||||
|
return vectorizedColumnBatch.isNullAt(rowId, pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean getBoolean(int pos) {
|
||||||
|
return vectorizedColumnBatch.getBoolean(rowId, pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public byte getByte(int pos) {
|
||||||
|
return vectorizedColumnBatch.getByte(rowId, pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public short getShort(int pos) {
|
||||||
|
return vectorizedColumnBatch.getShort(rowId, pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int getInt(int pos) {
|
||||||
|
return vectorizedColumnBatch.getInt(rowId, pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long getLong(int pos) {
|
||||||
|
return vectorizedColumnBatch.getLong(rowId, pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public float getFloat(int pos) {
|
||||||
|
return vectorizedColumnBatch.getFloat(rowId, pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double getDouble(int pos) {
|
||||||
|
return vectorizedColumnBatch.getDouble(rowId, pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public StringData getString(int pos) {
|
||||||
|
BytesColumnVector.Bytes byteArray = vectorizedColumnBatch.getByteArray(rowId, pos);
|
||||||
|
return StringData.fromBytes(byteArray.data, byteArray.offset, byteArray.len);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public DecimalData getDecimal(int pos, int precision, int scale) {
|
||||||
|
return vectorizedColumnBatch.getDecimal(rowId, pos, precision, scale);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TimestampData getTimestamp(int pos, int precision) {
|
||||||
|
return vectorizedColumnBatch.getTimestamp(rowId, pos, precision);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public <T> RawValueData<T> getRawValue(int pos) {
|
||||||
|
throw new UnsupportedOperationException("RawValueData is not supported.");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public byte[] getBinary(int pos) {
|
||||||
|
BytesColumnVector.Bytes byteArray = vectorizedColumnBatch.getByteArray(rowId, pos);
|
||||||
|
if (byteArray.len == byteArray.data.length) {
|
||||||
|
return byteArray.data;
|
||||||
|
} else {
|
||||||
|
byte[] ret = new byte[byteArray.len];
|
||||||
|
System.arraycopy(byteArray.data, byteArray.offset, ret, 0, byteArray.len);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public RowData getRow(int pos, int numFields) {
|
||||||
|
return vectorizedColumnBatch.getRow(rowId, pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ArrayData getArray(int pos) {
|
||||||
|
return vectorizedColumnBatch.getArray(rowId, pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public MapData getMap(int pos) {
|
||||||
|
return vectorizedColumnBatch.getMap(rowId, pos);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setNullAt(int pos) {
|
||||||
|
throw new UnsupportedOperationException("Not support the operation!");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setBoolean(int pos, boolean value) {
|
||||||
|
throw new UnsupportedOperationException("Not support the operation!");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setByte(int pos, byte value) {
|
||||||
|
throw new UnsupportedOperationException("Not support the operation!");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setShort(int pos, short value) {
|
||||||
|
throw new UnsupportedOperationException("Not support the operation!");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setInt(int pos, int value) {
|
||||||
|
throw new UnsupportedOperationException("Not support the operation!");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setLong(int pos, long value) {
|
||||||
|
throw new UnsupportedOperationException("Not support the operation!");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setFloat(int pos, float value) {
|
||||||
|
throw new UnsupportedOperationException("Not support the operation!");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setDouble(int pos, double value) {
|
||||||
|
throw new UnsupportedOperationException("Not support the operation!");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setDecimal(int pos, DecimalData value, int precision) {
|
||||||
|
throw new UnsupportedOperationException("Not support the operation!");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void setTimestamp(int pos, TimestampData value, int precision) {
|
||||||
|
throw new UnsupportedOperationException("Not support the operation!");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object o) {
|
||||||
|
throw new UnsupportedOperationException(
|
||||||
|
"ColumnarRowData do not support equals, please compare fields one by one!");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
throw new UnsupportedOperationException(
|
||||||
|
"ColumnarRowData do not support hashCode, please hash fields one by one!");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@@ -0,0 +1,71 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hudi.table.format.cow.vector;
|
||||||
|
|
||||||
|
import org.apache.hudi.table.format.cow.data.ColumnarArrayData;
|
||||||
|
|
||||||
|
import org.apache.flink.table.data.ArrayData;
|
||||||
|
import org.apache.flink.table.data.vector.ArrayColumnVector;
|
||||||
|
import org.apache.flink.table.data.vector.ColumnVector;
|
||||||
|
import org.apache.flink.table.data.vector.heap.AbstractHeapVector;
|
||||||
|
import org.apache.flink.table.data.vector.writable.WritableColumnVector;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class represents a nullable heap array column vector.
|
||||||
|
*/
|
||||||
|
public class HeapArrayVector extends AbstractHeapVector
|
||||||
|
implements WritableColumnVector, ArrayColumnVector {
|
||||||
|
|
||||||
|
public long[] offsets;
|
||||||
|
public long[] lengths;
|
||||||
|
public ColumnVector child;
|
||||||
|
private int size;
|
||||||
|
|
||||||
|
public HeapArrayVector(int len) {
|
||||||
|
super(len);
|
||||||
|
offsets = new long[len];
|
||||||
|
lengths = new long[len];
|
||||||
|
}
|
||||||
|
|
||||||
|
public HeapArrayVector(int len, ColumnVector vector) {
|
||||||
|
super(len);
|
||||||
|
offsets = new long[len];
|
||||||
|
lengths = new long[len];
|
||||||
|
this.child = vector;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getSize() {
|
||||||
|
return size;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setSize(int size) {
|
||||||
|
this.size = size;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getLen() {
|
||||||
|
return this.isNull.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ArrayData getArray(int i) {
|
||||||
|
long offset = offsets[i];
|
||||||
|
long length = lengths[i];
|
||||||
|
return new ColumnarArrayData(child, (int) offset, (int) length);
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,79 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hudi.table.format.cow.vector;
|
||||||
|
|
||||||
|
import org.apache.hudi.table.format.cow.data.ColumnarMapData;
|
||||||
|
|
||||||
|
import org.apache.flink.table.data.MapData;
|
||||||
|
import org.apache.flink.table.data.vector.ColumnVector;
|
||||||
|
import org.apache.flink.table.data.vector.heap.AbstractHeapVector;
|
||||||
|
import org.apache.flink.table.data.vector.writable.WritableColumnVector;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class represents a nullable heap map column vector.
|
||||||
|
*/
|
||||||
|
public class HeapMapColumnVector extends AbstractHeapVector
|
||||||
|
implements WritableColumnVector, MapColumnVector {
|
||||||
|
|
||||||
|
private long[] offsets;
|
||||||
|
private long[] lengths;
|
||||||
|
private int size;
|
||||||
|
private ColumnVector keys;
|
||||||
|
private ColumnVector values;
|
||||||
|
|
||||||
|
public HeapMapColumnVector(int len, ColumnVector keys, ColumnVector values) {
|
||||||
|
super(len);
|
||||||
|
size = 0;
|
||||||
|
offsets = new long[len];
|
||||||
|
lengths = new long[len];
|
||||||
|
this.keys = keys;
|
||||||
|
this.values = values;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setOffsets(long[] offsets) {
|
||||||
|
this.offsets = offsets;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setLengths(long[] lengths) {
|
||||||
|
this.lengths = lengths;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setKeys(ColumnVector keys) {
|
||||||
|
this.keys = keys;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setValues(ColumnVector values) {
|
||||||
|
this.values = values;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getSize() {
|
||||||
|
return size;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setSize(int size) {
|
||||||
|
this.size = size;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public MapData getMap(int i) {
|
||||||
|
long offset = offsets[i];
|
||||||
|
long length = lengths[i];
|
||||||
|
return new ColumnarMapData(keys, values, (int) offset, (int) length);
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,45 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hudi.table.format.cow.vector;
|
||||||
|
|
||||||
|
import org.apache.hudi.table.format.cow.data.ColumnarRowData;
|
||||||
|
|
||||||
|
import org.apache.flink.table.data.vector.heap.AbstractHeapVector;
|
||||||
|
import org.apache.flink.table.data.vector.writable.WritableColumnVector;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This class represents a nullable heap row column vector.
|
||||||
|
*/
|
||||||
|
public class HeapRowColumnVector extends AbstractHeapVector
|
||||||
|
implements WritableColumnVector, RowColumnVector {
|
||||||
|
|
||||||
|
public WritableColumnVector[] vectors;
|
||||||
|
|
||||||
|
public HeapRowColumnVector(int len, WritableColumnVector... vectors) {
|
||||||
|
super(len);
|
||||||
|
this.vectors = vectors;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public ColumnarRowData getRow(int i) {
|
||||||
|
ColumnarRowData columnarRowData = new ColumnarRowData(new VectorizedColumnBatch(vectors));
|
||||||
|
columnarRowData.setRowId(i);
|
||||||
|
return columnarRowData;
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,29 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hudi.table.format.cow.vector;
|
||||||
|
|
||||||
|
import org.apache.flink.table.data.MapData;
|
||||||
|
import org.apache.flink.table.data.vector.ColumnVector;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Map column vector.
|
||||||
|
*/
|
||||||
|
public interface MapColumnVector extends ColumnVector {
|
||||||
|
MapData getMap(int i);
|
||||||
|
}
|
||||||
@@ -0,0 +1,30 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hudi.table.format.cow.vector;
|
||||||
|
|
||||||
|
import org.apache.hudi.table.format.cow.data.ColumnarRowData;
|
||||||
|
|
||||||
|
import org.apache.flink.table.data.vector.ColumnVector;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Row column vector.
|
||||||
|
*/
|
||||||
|
public interface RowColumnVector extends ColumnVector {
|
||||||
|
ColumnarRowData getRow(int i);
|
||||||
|
}
|
||||||
@@ -0,0 +1,148 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hudi.table.format.cow.vector;
|
||||||
|
|
||||||
|
import org.apache.flink.table.data.ArrayData;
|
||||||
|
import org.apache.flink.table.data.DecimalData;
|
||||||
|
import org.apache.flink.table.data.MapData;
|
||||||
|
import org.apache.flink.table.data.RowData;
|
||||||
|
import org.apache.flink.table.data.TimestampData;
|
||||||
|
import org.apache.flink.table.data.vector.ArrayColumnVector;
|
||||||
|
import org.apache.flink.table.data.vector.BooleanColumnVector;
|
||||||
|
import org.apache.flink.table.data.vector.ByteColumnVector;
|
||||||
|
import org.apache.flink.table.data.vector.BytesColumnVector;
|
||||||
|
import org.apache.flink.table.data.vector.ColumnVector;
|
||||||
|
import org.apache.flink.table.data.vector.DecimalColumnVector;
|
||||||
|
import org.apache.flink.table.data.vector.DoubleColumnVector;
|
||||||
|
import org.apache.flink.table.data.vector.FloatColumnVector;
|
||||||
|
import org.apache.flink.table.data.vector.IntColumnVector;
|
||||||
|
import org.apache.flink.table.data.vector.LongColumnVector;
|
||||||
|
import org.apache.flink.table.data.vector.ShortColumnVector;
|
||||||
|
import org.apache.flink.table.data.vector.TimestampColumnVector;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* A VectorizedColumnBatch is a set of rows, organized with each column as a vector. It is the unit
|
||||||
|
* of query execution, organized to minimize the cost per row.
|
||||||
|
*
|
||||||
|
* <p>{@code VectorizedColumnBatch}s are influenced by Apache Hive VectorizedRowBatch.
|
||||||
|
*
|
||||||
|
* <p>References {@code org.apache.flink.table.data.vector.VectorizedColumnBatch} to include FLINK-15390.
|
||||||
|
*/
|
||||||
|
public class VectorizedColumnBatch implements Serializable {
|
||||||
|
private static final long serialVersionUID = 1L;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This number is carefully chosen to minimize overhead and typically allows one
|
||||||
|
* VectorizedColumnBatch to fit in cache.
|
||||||
|
*/
|
||||||
|
public static final int DEFAULT_SIZE = 2048;
|
||||||
|
|
||||||
|
private int numRows;
|
||||||
|
public final ColumnVector[] columns;
|
||||||
|
|
||||||
|
public VectorizedColumnBatch(ColumnVector[] vectors) {
|
||||||
|
this.columns = vectors;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setNumRows(int numRows) {
|
||||||
|
this.numRows = numRows;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getNumRows() {
|
||||||
|
return numRows;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getArity() {
|
||||||
|
return columns.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isNullAt(int rowId, int colId) {
|
||||||
|
return columns[colId].isNullAt(rowId);
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean getBoolean(int rowId, int colId) {
|
||||||
|
return ((BooleanColumnVector) columns[colId]).getBoolean(rowId);
|
||||||
|
}
|
||||||
|
|
||||||
|
public byte getByte(int rowId, int colId) {
|
||||||
|
return ((ByteColumnVector) columns[colId]).getByte(rowId);
|
||||||
|
}
|
||||||
|
|
||||||
|
public short getShort(int rowId, int colId) {
|
||||||
|
return ((ShortColumnVector) columns[colId]).getShort(rowId);
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getInt(int rowId, int colId) {
|
||||||
|
return ((IntColumnVector) columns[colId]).getInt(rowId);
|
||||||
|
}
|
||||||
|
|
||||||
|
public long getLong(int rowId, int colId) {
|
||||||
|
return ((LongColumnVector) columns[colId]).getLong(rowId);
|
||||||
|
}
|
||||||
|
|
||||||
|
public float getFloat(int rowId, int colId) {
|
||||||
|
return ((FloatColumnVector) columns[colId]).getFloat(rowId);
|
||||||
|
}
|
||||||
|
|
||||||
|
public double getDouble(int rowId, int colId) {
|
||||||
|
return ((DoubleColumnVector) columns[colId]).getDouble(rowId);
|
||||||
|
}
|
||||||
|
|
||||||
|
public BytesColumnVector.Bytes getByteArray(int rowId, int colId) {
|
||||||
|
return ((BytesColumnVector) columns[colId]).getBytes(rowId);
|
||||||
|
}
|
||||||
|
|
||||||
|
private byte[] getBytes(int rowId, int colId) {
|
||||||
|
BytesColumnVector.Bytes byteArray = getByteArray(rowId, colId);
|
||||||
|
if (byteArray.len == byteArray.data.length) {
|
||||||
|
return byteArray.data;
|
||||||
|
} else {
|
||||||
|
return byteArray.getBytes();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getString(int rowId, int colId) {
|
||||||
|
BytesColumnVector.Bytes byteArray = getByteArray(rowId, colId);
|
||||||
|
return new String(byteArray.data, byteArray.offset, byteArray.len, StandardCharsets.UTF_8);
|
||||||
|
}
|
||||||
|
|
||||||
|
public DecimalData getDecimal(int rowId, int colId, int precision, int scale) {
|
||||||
|
return ((DecimalColumnVector) (columns[colId])).getDecimal(rowId, precision, scale);
|
||||||
|
}
|
||||||
|
|
||||||
|
public TimestampData getTimestamp(int rowId, int colId, int precision) {
|
||||||
|
return ((TimestampColumnVector) (columns[colId])).getTimestamp(rowId, precision);
|
||||||
|
}
|
||||||
|
|
||||||
|
public ArrayData getArray(int rowId, int colId) {
|
||||||
|
return ((ArrayColumnVector) columns[colId]).getArray(rowId);
|
||||||
|
}
|
||||||
|
|
||||||
|
public RowData getRow(int rowId, int colId) {
|
||||||
|
return ((RowColumnVector) columns[colId]).getRow(rowId);
|
||||||
|
}
|
||||||
|
|
||||||
|
public MapData getMap(int rowId, int colId) {
|
||||||
|
return ((MapColumnVector) columns[colId]).getMap(rowId);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@@ -0,0 +1,473 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hudi.table.format.cow.vector.reader;
|
||||||
|
|
||||||
|
import org.apache.hudi.table.format.cow.ParquetDecimalVector;
|
||||||
|
import org.apache.hudi.table.format.cow.vector.HeapArrayVector;
|
||||||
|
|
||||||
|
import org.apache.flink.formats.parquet.vector.reader.ColumnReader;
|
||||||
|
import org.apache.flink.table.data.TimestampData;
|
||||||
|
import org.apache.flink.table.data.vector.VectorizedColumnBatch;
|
||||||
|
import org.apache.flink.table.data.vector.heap.HeapBooleanVector;
|
||||||
|
import org.apache.flink.table.data.vector.heap.HeapByteVector;
|
||||||
|
import org.apache.flink.table.data.vector.heap.HeapBytesVector;
|
||||||
|
import org.apache.flink.table.data.vector.heap.HeapDoubleVector;
|
||||||
|
import org.apache.flink.table.data.vector.heap.HeapFloatVector;
|
||||||
|
import org.apache.flink.table.data.vector.heap.HeapIntVector;
|
||||||
|
import org.apache.flink.table.data.vector.heap.HeapLongVector;
|
||||||
|
import org.apache.flink.table.data.vector.heap.HeapShortVector;
|
||||||
|
import org.apache.flink.table.data.vector.heap.HeapTimestampVector;
|
||||||
|
import org.apache.flink.table.data.vector.writable.WritableColumnVector;
|
||||||
|
import org.apache.flink.table.types.logical.ArrayType;
|
||||||
|
import org.apache.flink.table.types.logical.LogicalType;
|
||||||
|
import org.apache.parquet.column.ColumnDescriptor;
|
||||||
|
import org.apache.parquet.column.page.PageReader;
|
||||||
|
import org.apache.parquet.schema.PrimitiveType;
|
||||||
|
import org.apache.parquet.schema.Type;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Array {@link ColumnReader}.
|
||||||
|
*/
|
||||||
|
public class ArrayColumnReader extends BaseVectorizedColumnReader {
|
||||||
|
|
||||||
|
// The value read in last time
|
||||||
|
private Object lastValue;
|
||||||
|
|
||||||
|
// flag to indicate if there is no data in parquet data page
|
||||||
|
private boolean eof = false;
|
||||||
|
|
||||||
|
// flag to indicate if it's the first time to read parquet data page with this instance
|
||||||
|
boolean isFirstRow = true;
|
||||||
|
|
||||||
|
public ArrayColumnReader(
|
||||||
|
ColumnDescriptor descriptor,
|
||||||
|
PageReader pageReader,
|
||||||
|
boolean isUtcTimestamp,
|
||||||
|
Type type,
|
||||||
|
LogicalType logicalType)
|
||||||
|
throws IOException {
|
||||||
|
super(descriptor, pageReader, isUtcTimestamp, type, logicalType);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void readToVector(int readNumber, WritableColumnVector vector) throws IOException {
|
||||||
|
HeapArrayVector lcv = (HeapArrayVector) vector;
|
||||||
|
// before readBatch, initial the size of offsets & lengths as the default value,
|
||||||
|
// the actual size will be assigned in setChildrenInfo() after reading complete.
|
||||||
|
lcv.offsets = new long[VectorizedColumnBatch.DEFAULT_SIZE];
|
||||||
|
lcv.lengths = new long[VectorizedColumnBatch.DEFAULT_SIZE];
|
||||||
|
// Because the length of ListColumnVector.child can't be known now,
|
||||||
|
// the valueList will save all data for ListColumnVector temporary.
|
||||||
|
List<Object> valueList = new ArrayList<>();
|
||||||
|
|
||||||
|
LogicalType category = ((ArrayType) logicalType).getElementType();
|
||||||
|
|
||||||
|
// read the first row in parquet data page, this will be only happened once for this
|
||||||
|
// instance
|
||||||
|
if (isFirstRow) {
|
||||||
|
if (!fetchNextValue(category)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
isFirstRow = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
int index = collectDataFromParquetPage(readNumber, lcv, valueList, category);
|
||||||
|
|
||||||
|
// Convert valueList to array for the ListColumnVector.child
|
||||||
|
fillColumnVector(category, lcv, valueList, index);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Reads a single value from parquet page, puts it into lastValue. Returns a boolean indicating
|
||||||
|
* if there is more values to read (true).
|
||||||
|
*
|
||||||
|
* @param category
|
||||||
|
* @return boolean
|
||||||
|
* @throws IOException
|
||||||
|
*/
|
||||||
|
private boolean fetchNextValue(LogicalType category) throws IOException {
|
||||||
|
int left = readPageIfNeed();
|
||||||
|
if (left > 0) {
|
||||||
|
// get the values of repetition and definitionLevel
|
||||||
|
readRepetitionAndDefinitionLevels();
|
||||||
|
// read the data if it isn't null
|
||||||
|
if (definitionLevel == maxDefLevel) {
|
||||||
|
if (isCurrentPageDictionaryEncoded) {
|
||||||
|
lastValue = dataColumn.readValueDictionaryId();
|
||||||
|
} else {
|
||||||
|
lastValue = readPrimitiveTypedRow(category);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
lastValue = null;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
eof = true;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private int readPageIfNeed() throws IOException {
|
||||||
|
// Compute the number of values we want to read in this page.
|
||||||
|
int leftInPage = (int) (endOfPageValueCount - valuesRead);
|
||||||
|
if (leftInPage == 0) {
|
||||||
|
// no data left in current page, load data from new page
|
||||||
|
readPage();
|
||||||
|
leftInPage = (int) (endOfPageValueCount - valuesRead);
|
||||||
|
}
|
||||||
|
return leftInPage;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Need to be in consistent with that VectorizedPrimitiveColumnReader#readBatchHelper
|
||||||
|
// TODO Reduce the duplicated code
|
||||||
|
private Object readPrimitiveTypedRow(LogicalType category) {
|
||||||
|
switch (category.getTypeRoot()) {
|
||||||
|
case CHAR:
|
||||||
|
case VARCHAR:
|
||||||
|
case BINARY:
|
||||||
|
case VARBINARY:
|
||||||
|
return dataColumn.readString();
|
||||||
|
case BOOLEAN:
|
||||||
|
return dataColumn.readBoolean();
|
||||||
|
case TIME_WITHOUT_TIME_ZONE:
|
||||||
|
case DATE:
|
||||||
|
case INTEGER:
|
||||||
|
return dataColumn.readInteger();
|
||||||
|
case TINYINT:
|
||||||
|
return dataColumn.readTinyInt();
|
||||||
|
case SMALLINT:
|
||||||
|
return dataColumn.readSmallInt();
|
||||||
|
case BIGINT:
|
||||||
|
return dataColumn.readLong();
|
||||||
|
case FLOAT:
|
||||||
|
return dataColumn.readFloat();
|
||||||
|
case DOUBLE:
|
||||||
|
return dataColumn.readDouble();
|
||||||
|
case DECIMAL:
|
||||||
|
switch (descriptor.getPrimitiveType().getPrimitiveTypeName()) {
|
||||||
|
case INT32:
|
||||||
|
return dataColumn.readInteger();
|
||||||
|
case INT64:
|
||||||
|
return dataColumn.readLong();
|
||||||
|
case BINARY:
|
||||||
|
case FIXED_LEN_BYTE_ARRAY:
|
||||||
|
return dataColumn.readString();
|
||||||
|
default:
|
||||||
|
throw new AssertionError();
|
||||||
|
}
|
||||||
|
case TIMESTAMP_WITHOUT_TIME_ZONE:
|
||||||
|
case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
|
||||||
|
return dataColumn.readTimestamp();
|
||||||
|
default:
|
||||||
|
throw new RuntimeException("Unsupported type in the list: " + type);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private Object dictionaryDecodeValue(LogicalType category, Integer dictionaryValue) {
|
||||||
|
if (dictionaryValue == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
switch (category.getTypeRoot()) {
|
||||||
|
case CHAR:
|
||||||
|
case VARCHAR:
|
||||||
|
case BINARY:
|
||||||
|
case VARBINARY:
|
||||||
|
return dictionary.readString(dictionaryValue);
|
||||||
|
case DATE:
|
||||||
|
case TIME_WITHOUT_TIME_ZONE:
|
||||||
|
case INTEGER:
|
||||||
|
return dictionary.readInteger(dictionaryValue);
|
||||||
|
case BOOLEAN:
|
||||||
|
return dictionary.readBoolean(dictionaryValue) ? 1 : 0;
|
||||||
|
case DOUBLE:
|
||||||
|
return dictionary.readDouble(dictionaryValue);
|
||||||
|
case FLOAT:
|
||||||
|
return dictionary.readFloat(dictionaryValue);
|
||||||
|
case TINYINT:
|
||||||
|
return dictionary.readTinyInt(dictionaryValue);
|
||||||
|
case SMALLINT:
|
||||||
|
return dictionary.readSmallInt(dictionaryValue);
|
||||||
|
case BIGINT:
|
||||||
|
return dictionary.readLong(dictionaryValue);
|
||||||
|
case DECIMAL:
|
||||||
|
switch (descriptor.getPrimitiveType().getPrimitiveTypeName()) {
|
||||||
|
case INT32:
|
||||||
|
return dictionary.readInteger(dictionaryValue);
|
||||||
|
case INT64:
|
||||||
|
return dictionary.readLong(dictionaryValue);
|
||||||
|
case FIXED_LEN_BYTE_ARRAY:
|
||||||
|
case BINARY:
|
||||||
|
return dictionary.readString(dictionaryValue);
|
||||||
|
default:
|
||||||
|
throw new AssertionError();
|
||||||
|
}
|
||||||
|
case TIMESTAMP_WITHOUT_TIME_ZONE:
|
||||||
|
case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
|
||||||
|
return dictionary.readTimestamp(dictionaryValue);
|
||||||
|
default:
|
||||||
|
throw new RuntimeException("Unsupported type in the list: " + type);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Collects data from a parquet page and returns the final row index where it stopped. The
|
||||||
|
* returned index can be equal to or less than total.
|
||||||
|
*
|
||||||
|
* @param total maximum number of rows to collect
|
||||||
|
* @param lcv column vector to do initial setup in data collection time
|
||||||
|
* @param valueList collection of values that will be fed into the vector later
|
||||||
|
* @param category
|
||||||
|
* @return int
|
||||||
|
* @throws IOException
|
||||||
|
*/
|
||||||
|
private int collectDataFromParquetPage(
|
||||||
|
int total, HeapArrayVector lcv, List<Object> valueList, LogicalType category)
|
||||||
|
throws IOException {
|
||||||
|
int index = 0;
|
||||||
|
/*
|
||||||
|
* Here is a nested loop for collecting all values from a parquet page.
|
||||||
|
* A column of array type can be considered as a list of lists, so the two loops are as below:
|
||||||
|
* 1. The outer loop iterates on rows (index is a row index, so points to a row in the batch), e.g.:
|
||||||
|
* [0, 2, 3] <- index: 0
|
||||||
|
* [NULL, 3, 4] <- index: 1
|
||||||
|
*
|
||||||
|
* 2. The inner loop iterates on values within a row (sets all data from parquet data page
|
||||||
|
* for an element in ListColumnVector), so fetchNextValue returns values one-by-one:
|
||||||
|
* 0, 2, 3, NULL, 3, 4
|
||||||
|
*
|
||||||
|
* As described below, the repetition level (repetitionLevel != 0)
|
||||||
|
* can be used to decide when we'll start to read values for the next list.
|
||||||
|
*/
|
||||||
|
while (!eof && index < total) {
|
||||||
|
// add element to ListColumnVector one by one
|
||||||
|
lcv.offsets[index] = valueList.size();
|
||||||
|
/*
|
||||||
|
* Let's collect all values for a single list.
|
||||||
|
* Repetition level = 0 means that a new list started there in the parquet page,
|
||||||
|
* in that case, let's exit from the loop, and start to collect value for a new list.
|
||||||
|
*/
|
||||||
|
do {
|
||||||
|
/*
|
||||||
|
* Definition level = 0 when a NULL value was returned instead of a list
|
||||||
|
* (this is not the same as a NULL value in of a list).
|
||||||
|
*/
|
||||||
|
if (definitionLevel == 0) {
|
||||||
|
lcv.setNullAt(index);
|
||||||
|
}
|
||||||
|
valueList.add(
|
||||||
|
isCurrentPageDictionaryEncoded
|
||||||
|
? dictionaryDecodeValue(category, (Integer) lastValue)
|
||||||
|
: lastValue);
|
||||||
|
} while (fetchNextValue(category) && (repetitionLevel != 0));
|
||||||
|
|
||||||
|
lcv.lengths[index] = valueList.size() - lcv.offsets[index];
|
||||||
|
index++;
|
||||||
|
}
|
||||||
|
return index;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The lengths & offsets will be initialized as default size (1024), it should be set to the
|
||||||
|
* actual size according to the element number.
|
||||||
|
*/
|
||||||
|
private void setChildrenInfo(HeapArrayVector lcv, int itemNum, int elementNum) {
|
||||||
|
lcv.setSize(itemNum);
|
||||||
|
long[] lcvLength = new long[elementNum];
|
||||||
|
long[] lcvOffset = new long[elementNum];
|
||||||
|
System.arraycopy(lcv.lengths, 0, lcvLength, 0, elementNum);
|
||||||
|
System.arraycopy(lcv.offsets, 0, lcvOffset, 0, elementNum);
|
||||||
|
lcv.lengths = lcvLength;
|
||||||
|
lcv.offsets = lcvOffset;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void fillColumnVector(
|
||||||
|
LogicalType category, HeapArrayVector lcv, List valueList, int elementNum) {
|
||||||
|
int total = valueList.size();
|
||||||
|
setChildrenInfo(lcv, total, elementNum);
|
||||||
|
switch (category.getTypeRoot()) {
|
||||||
|
case CHAR:
|
||||||
|
case VARCHAR:
|
||||||
|
case BINARY:
|
||||||
|
case VARBINARY:
|
||||||
|
lcv.child = new HeapBytesVector(total);
|
||||||
|
((HeapBytesVector) lcv.child).reset();
|
||||||
|
for (int i = 0; i < valueList.size(); i++) {
|
||||||
|
byte[] src = ((List<byte[]>) valueList).get(i);
|
||||||
|
if (src == null) {
|
||||||
|
((HeapBytesVector) lcv.child).setNullAt(i);
|
||||||
|
} else {
|
||||||
|
((HeapBytesVector) lcv.child).appendBytes(i, src, 0, src.length);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case BOOLEAN:
|
||||||
|
lcv.child = new HeapBooleanVector(total);
|
||||||
|
((HeapBooleanVector) lcv.child).reset();
|
||||||
|
for (int i = 0; i < valueList.size(); i++) {
|
||||||
|
if (valueList.get(i) == null) {
|
||||||
|
((HeapBooleanVector) lcv.child).setNullAt(i);
|
||||||
|
} else {
|
||||||
|
((HeapBooleanVector) lcv.child).vector[i] =
|
||||||
|
((List<Boolean>) valueList).get(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case TINYINT:
|
||||||
|
lcv.child = new HeapByteVector(total);
|
||||||
|
((HeapByteVector) lcv.child).reset();
|
||||||
|
for (int i = 0; i < valueList.size(); i++) {
|
||||||
|
if (valueList.get(i) == null) {
|
||||||
|
((HeapByteVector) lcv.child).setNullAt(i);
|
||||||
|
} else {
|
||||||
|
((HeapByteVector) lcv.child).vector[i] =
|
||||||
|
(byte) ((List<Integer>) valueList).get(i).intValue();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case SMALLINT:
|
||||||
|
lcv.child = new HeapShortVector(total);
|
||||||
|
((HeapShortVector) lcv.child).reset();
|
||||||
|
for (int i = 0; i < valueList.size(); i++) {
|
||||||
|
if (valueList.get(i) == null) {
|
||||||
|
((HeapShortVector) lcv.child).setNullAt(i);
|
||||||
|
} else {
|
||||||
|
((HeapShortVector) lcv.child).vector[i] =
|
||||||
|
(short) ((List<Integer>) valueList).get(i).intValue();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case INTEGER:
|
||||||
|
case DATE:
|
||||||
|
case TIME_WITHOUT_TIME_ZONE:
|
||||||
|
lcv.child = new HeapIntVector(total);
|
||||||
|
((HeapIntVector) lcv.child).reset();
|
||||||
|
for (int i = 0; i < valueList.size(); i++) {
|
||||||
|
if (valueList.get(i) == null) {
|
||||||
|
((HeapIntVector) lcv.child).setNullAt(i);
|
||||||
|
} else {
|
||||||
|
((HeapIntVector) lcv.child).vector[i] = ((List<Integer>) valueList).get(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case FLOAT:
|
||||||
|
lcv.child = new HeapFloatVector(total);
|
||||||
|
((HeapFloatVector) lcv.child).reset();
|
||||||
|
for (int i = 0; i < valueList.size(); i++) {
|
||||||
|
if (valueList.get(i) == null) {
|
||||||
|
((HeapFloatVector) lcv.child).setNullAt(i);
|
||||||
|
} else {
|
||||||
|
((HeapFloatVector) lcv.child).vector[i] = ((List<Float>) valueList).get(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case BIGINT:
|
||||||
|
lcv.child = new HeapLongVector(total);
|
||||||
|
((HeapLongVector) lcv.child).reset();
|
||||||
|
for (int i = 0; i < valueList.size(); i++) {
|
||||||
|
if (valueList.get(i) == null) {
|
||||||
|
((HeapLongVector) lcv.child).setNullAt(i);
|
||||||
|
} else {
|
||||||
|
((HeapLongVector) lcv.child).vector[i] = ((List<Long>) valueList).get(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case DOUBLE:
|
||||||
|
lcv.child = new HeapDoubleVector(total);
|
||||||
|
((HeapDoubleVector) lcv.child).reset();
|
||||||
|
for (int i = 0; i < valueList.size(); i++) {
|
||||||
|
if (valueList.get(i) == null) {
|
||||||
|
((HeapDoubleVector) lcv.child).setNullAt(i);
|
||||||
|
} else {
|
||||||
|
((HeapDoubleVector) lcv.child).vector[i] =
|
||||||
|
((List<Double>) valueList).get(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case TIMESTAMP_WITHOUT_TIME_ZONE:
|
||||||
|
case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
|
||||||
|
lcv.child = new HeapTimestampVector(total);
|
||||||
|
((HeapTimestampVector) lcv.child).reset();
|
||||||
|
for (int i = 0; i < valueList.size(); i++) {
|
||||||
|
if (valueList.get(i) == null) {
|
||||||
|
((HeapTimestampVector) lcv.child).setNullAt(i);
|
||||||
|
} else {
|
||||||
|
((HeapTimestampVector) lcv.child)
|
||||||
|
.setTimestamp(i, ((List<TimestampData>) valueList).get(i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case DECIMAL:
|
||||||
|
PrimitiveType.PrimitiveTypeName primitiveTypeName =
|
||||||
|
descriptor.getPrimitiveType().getPrimitiveTypeName();
|
||||||
|
switch (primitiveTypeName) {
|
||||||
|
case INT32:
|
||||||
|
lcv.child = new ParquetDecimalVector(new HeapIntVector(total));
|
||||||
|
((HeapIntVector) ((ParquetDecimalVector) lcv.child).vector).reset();
|
||||||
|
for (int i = 0; i < valueList.size(); i++) {
|
||||||
|
if (valueList.get(i) == null) {
|
||||||
|
((HeapIntVector) ((ParquetDecimalVector) lcv.child).vector)
|
||||||
|
.setNullAt(i);
|
||||||
|
} else {
|
||||||
|
((HeapIntVector) ((ParquetDecimalVector) lcv.child).vector)
|
||||||
|
.vector[i] =
|
||||||
|
((List<Integer>) valueList).get(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case INT64:
|
||||||
|
lcv.child = new ParquetDecimalVector(new HeapLongVector(total));
|
||||||
|
((HeapLongVector) ((ParquetDecimalVector) lcv.child).vector).reset();
|
||||||
|
for (int i = 0; i < valueList.size(); i++) {
|
||||||
|
if (valueList.get(i) == null) {
|
||||||
|
((HeapLongVector) ((ParquetDecimalVector) lcv.child).vector)
|
||||||
|
.setNullAt(i);
|
||||||
|
} else {
|
||||||
|
((HeapLongVector) ((ParquetDecimalVector) lcv.child).vector)
|
||||||
|
.vector[i] =
|
||||||
|
((List<Long>) valueList).get(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
lcv.child = new ParquetDecimalVector(new HeapBytesVector(total));
|
||||||
|
((HeapBytesVector) ((ParquetDecimalVector) lcv.child).vector).reset();
|
||||||
|
for (int i = 0; i < valueList.size(); i++) {
|
||||||
|
byte[] src = ((List<byte[]>) valueList).get(i);
|
||||||
|
if (valueList.get(i) == null) {
|
||||||
|
((HeapBytesVector) ((ParquetDecimalVector) lcv.child).vector)
|
||||||
|
.setNullAt(i);
|
||||||
|
} else {
|
||||||
|
((HeapBytesVector) ((ParquetDecimalVector) lcv.child).vector)
|
||||||
|
.appendBytes(i, src, 0, src.length);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
throw new RuntimeException("Unsupported type in the list: " + type);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@@ -0,0 +1,313 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hudi.table.format.cow.vector.reader;
|
||||||
|
|
||||||
|
import org.apache.flink.formats.parquet.vector.reader.ColumnReader;
|
||||||
|
import org.apache.flink.table.data.vector.writable.WritableColumnVector;
|
||||||
|
import org.apache.flink.table.types.logical.LogicalType;
|
||||||
|
import org.apache.parquet.bytes.ByteBufferInputStream;
|
||||||
|
import org.apache.parquet.bytes.BytesInput;
|
||||||
|
import org.apache.parquet.bytes.BytesUtils;
|
||||||
|
import org.apache.parquet.column.ColumnDescriptor;
|
||||||
|
import org.apache.parquet.column.Encoding;
|
||||||
|
import org.apache.parquet.column.page.DataPage;
|
||||||
|
import org.apache.parquet.column.page.DataPageV1;
|
||||||
|
import org.apache.parquet.column.page.DataPageV2;
|
||||||
|
import org.apache.parquet.column.page.DictionaryPage;
|
||||||
|
import org.apache.parquet.column.page.PageReader;
|
||||||
|
import org.apache.parquet.column.values.ValuesReader;
|
||||||
|
import org.apache.parquet.column.values.rle.RunLengthBitPackingHybridDecoder;
|
||||||
|
import org.apache.parquet.io.ParquetDecodingException;
|
||||||
|
import org.apache.parquet.schema.Type;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import static org.apache.parquet.column.ValuesType.DEFINITION_LEVEL;
|
||||||
|
import static org.apache.parquet.column.ValuesType.REPETITION_LEVEL;
|
||||||
|
import static org.apache.parquet.column.ValuesType.VALUES;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Abstract {@link ColumnReader}. part of the code is referred from Apache Hive and Apache Parquet.
|
||||||
|
*/
|
||||||
|
public abstract class BaseVectorizedColumnReader implements ColumnReader<WritableColumnVector> {
|
||||||
|
|
||||||
|
private static final Logger LOG = LoggerFactory.getLogger(BaseVectorizedColumnReader.class);
|
||||||
|
|
||||||
|
protected boolean isUtcTimestamp;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Total number of values read.
|
||||||
|
*/
|
||||||
|
protected long valuesRead;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* value that indicates the end of the current page. That is, if valuesRead ==
|
||||||
|
* endOfPageValueCount, we are at the end of the page.
|
||||||
|
*/
|
||||||
|
protected long endOfPageValueCount;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The dictionary, if this column has dictionary encoding.
|
||||||
|
*/
|
||||||
|
protected final ParquetDataColumnReader dictionary;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* If true, the current page is dictionary encoded.
|
||||||
|
*/
|
||||||
|
protected boolean isCurrentPageDictionaryEncoded;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Maximum definition level for this column.
|
||||||
|
*/
|
||||||
|
protected final int maxDefLevel;
|
||||||
|
|
||||||
|
protected int definitionLevel;
|
||||||
|
protected int repetitionLevel;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Repetition/Definition/Value readers.
|
||||||
|
*/
|
||||||
|
protected IntIterator repetitionLevelColumn;
|
||||||
|
|
||||||
|
protected IntIterator definitionLevelColumn;
|
||||||
|
protected ParquetDataColumnReader dataColumn;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Total values in the current page.
|
||||||
|
*/
|
||||||
|
protected int pageValueCount;
|
||||||
|
|
||||||
|
protected final PageReader pageReader;
|
||||||
|
protected final ColumnDescriptor descriptor;
|
||||||
|
protected final Type type;
|
||||||
|
protected final LogicalType logicalType;
|
||||||
|
|
||||||
|
public BaseVectorizedColumnReader(
|
||||||
|
ColumnDescriptor descriptor,
|
||||||
|
PageReader pageReader,
|
||||||
|
boolean isUtcTimestamp,
|
||||||
|
Type parquetType,
|
||||||
|
LogicalType logicalType)
|
||||||
|
throws IOException {
|
||||||
|
this.descriptor = descriptor;
|
||||||
|
this.type = parquetType;
|
||||||
|
this.pageReader = pageReader;
|
||||||
|
this.maxDefLevel = descriptor.getMaxDefinitionLevel();
|
||||||
|
this.isUtcTimestamp = isUtcTimestamp;
|
||||||
|
this.logicalType = logicalType;
|
||||||
|
|
||||||
|
DictionaryPage dictionaryPage = pageReader.readDictionaryPage();
|
||||||
|
if (dictionaryPage != null) {
|
||||||
|
try {
|
||||||
|
this.dictionary =
|
||||||
|
ParquetDataColumnReaderFactory.getDataColumnReaderByTypeOnDictionary(
|
||||||
|
parquetType.asPrimitiveType(),
|
||||||
|
dictionaryPage
|
||||||
|
.getEncoding()
|
||||||
|
.initDictionary(descriptor, dictionaryPage),
|
||||||
|
isUtcTimestamp);
|
||||||
|
this.isCurrentPageDictionaryEncoded = true;
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new IOException("could not decode the dictionary for " + descriptor, e);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
this.dictionary = null;
|
||||||
|
this.isCurrentPageDictionaryEncoded = false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void readRepetitionAndDefinitionLevels() {
|
||||||
|
repetitionLevel = repetitionLevelColumn.nextInt();
|
||||||
|
definitionLevel = definitionLevelColumn.nextInt();
|
||||||
|
valuesRead++;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void readPage() throws IOException {
|
||||||
|
DataPage page = pageReader.readPage();
|
||||||
|
|
||||||
|
if (page == null) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
page.accept(
|
||||||
|
new DataPage.Visitor<Void>() {
|
||||||
|
@Override
|
||||||
|
public Void visit(DataPageV1 dataPageV1) {
|
||||||
|
readPageV1(dataPageV1);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Void visit(DataPageV2 dataPageV2) {
|
||||||
|
readPageV2(dataPageV2);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
private void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, int valueCount)
|
||||||
|
throws IOException {
|
||||||
|
this.pageValueCount = valueCount;
|
||||||
|
this.endOfPageValueCount = valuesRead + pageValueCount;
|
||||||
|
if (dataEncoding.usesDictionary()) {
|
||||||
|
this.dataColumn = null;
|
||||||
|
if (dictionary == null) {
|
||||||
|
throw new IOException(
|
||||||
|
"could not read page in col "
|
||||||
|
+ descriptor
|
||||||
|
+ " as the dictionary was missing for encoding "
|
||||||
|
+ dataEncoding);
|
||||||
|
}
|
||||||
|
dataColumn =
|
||||||
|
ParquetDataColumnReaderFactory.getDataColumnReaderByType(
|
||||||
|
type.asPrimitiveType(),
|
||||||
|
dataEncoding.getDictionaryBasedValuesReader(
|
||||||
|
descriptor, VALUES, dictionary.getDictionary()),
|
||||||
|
isUtcTimestamp);
|
||||||
|
this.isCurrentPageDictionaryEncoded = true;
|
||||||
|
} else {
|
||||||
|
dataColumn =
|
||||||
|
ParquetDataColumnReaderFactory.getDataColumnReaderByType(
|
||||||
|
type.asPrimitiveType(),
|
||||||
|
dataEncoding.getValuesReader(descriptor, VALUES),
|
||||||
|
isUtcTimestamp);
|
||||||
|
this.isCurrentPageDictionaryEncoded = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
dataColumn.initFromPage(pageValueCount, in);
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new IOException("could not read page in col " + descriptor, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void readPageV1(DataPageV1 page) {
|
||||||
|
ValuesReader rlReader = page.getRlEncoding().getValuesReader(descriptor, REPETITION_LEVEL);
|
||||||
|
ValuesReader dlReader = page.getDlEncoding().getValuesReader(descriptor, DEFINITION_LEVEL);
|
||||||
|
this.repetitionLevelColumn = new ValuesReaderIntIterator(rlReader);
|
||||||
|
this.definitionLevelColumn = new ValuesReaderIntIterator(dlReader);
|
||||||
|
try {
|
||||||
|
BytesInput bytes = page.getBytes();
|
||||||
|
LOG.debug("page size " + bytes.size() + " bytes and " + pageValueCount + " records");
|
||||||
|
ByteBufferInputStream in = bytes.toInputStream();
|
||||||
|
LOG.debug("reading repetition levels at " + in.position());
|
||||||
|
rlReader.initFromPage(pageValueCount, in);
|
||||||
|
LOG.debug("reading definition levels at " + in.position());
|
||||||
|
dlReader.initFromPage(pageValueCount, in);
|
||||||
|
LOG.debug("reading data at " + in.position());
|
||||||
|
initDataReader(page.getValueEncoding(), in, page.getValueCount());
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new ParquetDecodingException(
|
||||||
|
"could not read page " + page + " in col " + descriptor, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private void readPageV2(DataPageV2 page) {
|
||||||
|
this.pageValueCount = page.getValueCount();
|
||||||
|
this.repetitionLevelColumn =
|
||||||
|
newRLEIterator(descriptor.getMaxRepetitionLevel(), page.getRepetitionLevels());
|
||||||
|
this.definitionLevelColumn =
|
||||||
|
newRLEIterator(descriptor.getMaxDefinitionLevel(), page.getDefinitionLevels());
|
||||||
|
try {
|
||||||
|
LOG.debug(
|
||||||
|
"page data size "
|
||||||
|
+ page.getData().size()
|
||||||
|
+ " bytes and "
|
||||||
|
+ pageValueCount
|
||||||
|
+ " records");
|
||||||
|
initDataReader(
|
||||||
|
page.getDataEncoding(), page.getData().toInputStream(), page.getValueCount());
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new ParquetDecodingException(
|
||||||
|
"could not read page " + page + " in col " + descriptor, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private IntIterator newRLEIterator(int maxLevel, BytesInput bytes) {
|
||||||
|
try {
|
||||||
|
if (maxLevel == 0) {
|
||||||
|
return new NullIntIterator();
|
||||||
|
}
|
||||||
|
return new RLEIntIterator(
|
||||||
|
new RunLengthBitPackingHybridDecoder(
|
||||||
|
BytesUtils.getWidthFromMaxInt(maxLevel),
|
||||||
|
new ByteArrayInputStream(bytes.toByteArray())));
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new ParquetDecodingException(
|
||||||
|
"could not read levels in page for col " + descriptor, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Utility classes to abstract over different way to read ints with different encodings.
|
||||||
|
*/
|
||||||
|
abstract static class IntIterator {
|
||||||
|
abstract int nextInt();
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* read ints from {@link ValuesReader}.
|
||||||
|
*/
|
||||||
|
protected static final class ValuesReaderIntIterator extends IntIterator {
|
||||||
|
ValuesReader delegate;
|
||||||
|
|
||||||
|
public ValuesReaderIntIterator(ValuesReader delegate) {
|
||||||
|
this.delegate = delegate;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
int nextInt() {
|
||||||
|
return delegate.readInteger();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* read ints from {@link RunLengthBitPackingHybridDecoder}.
|
||||||
|
*/
|
||||||
|
protected static final class RLEIntIterator extends IntIterator {
|
||||||
|
RunLengthBitPackingHybridDecoder delegate;
|
||||||
|
|
||||||
|
public RLEIntIterator(RunLengthBitPackingHybridDecoder delegate) {
|
||||||
|
this.delegate = delegate;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
int nextInt() {
|
||||||
|
try {
|
||||||
|
return delegate.readInt();
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new ParquetDecodingException(e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* return zero.
|
||||||
|
*/
|
||||||
|
protected static final class NullIntIterator extends IntIterator {
|
||||||
|
@Override
|
||||||
|
int nextInt() {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@@ -0,0 +1,76 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hudi.table.format.cow.vector.reader;
|
||||||
|
|
||||||
|
import org.apache.hudi.table.format.cow.vector.HeapArrayVector;
|
||||||
|
import org.apache.hudi.table.format.cow.vector.HeapMapColumnVector;
|
||||||
|
|
||||||
|
import org.apache.flink.formats.parquet.vector.reader.ColumnReader;
|
||||||
|
import org.apache.flink.table.data.vector.ColumnVector;
|
||||||
|
import org.apache.flink.table.data.vector.writable.WritableColumnVector;
|
||||||
|
import org.apache.flink.table.types.logical.LogicalType;
|
||||||
|
import org.apache.flink.table.types.logical.MapType;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Map {@link ColumnReader}.
|
||||||
|
*/
|
||||||
|
public class MapColumnReader implements ColumnReader<WritableColumnVector> {
|
||||||
|
|
||||||
|
private final LogicalType logicalType;
|
||||||
|
private final ArrayColumnReader keyReader;
|
||||||
|
private final ArrayColumnReader valueReader;
|
||||||
|
|
||||||
|
public MapColumnReader(
|
||||||
|
ArrayColumnReader keyReader, ArrayColumnReader valueReader, LogicalType logicalType) {
|
||||||
|
this.keyReader = keyReader;
|
||||||
|
this.valueReader = valueReader;
|
||||||
|
this.logicalType = logicalType;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void readBatch(int total, ColumnVector column) throws IOException {
|
||||||
|
HeapMapColumnVector mapColumnVector = (HeapMapColumnVector) column;
|
||||||
|
MapType mapType = (MapType) logicalType;
|
||||||
|
// initialize 2 ListColumnVector for keys and values
|
||||||
|
HeapArrayVector keyArrayColumnVector = new HeapArrayVector(total);
|
||||||
|
HeapArrayVector valueArrayColumnVector = new HeapArrayVector(total);
|
||||||
|
// read the keys and values
|
||||||
|
keyReader.readToVector(total, keyArrayColumnVector);
|
||||||
|
valueReader.readToVector(total, valueArrayColumnVector);
|
||||||
|
|
||||||
|
// set the related attributes according to the keys and values
|
||||||
|
mapColumnVector.setKeys(keyArrayColumnVector.child);
|
||||||
|
mapColumnVector.setValues(valueArrayColumnVector.child);
|
||||||
|
mapColumnVector.setOffsets(keyArrayColumnVector.offsets);
|
||||||
|
mapColumnVector.setLengths(keyArrayColumnVector.lengths);
|
||||||
|
mapColumnVector.setSize(keyArrayColumnVector.getSize());
|
||||||
|
for (int i = 0; i < keyArrayColumnVector.getLen(); i++) {
|
||||||
|
if (keyArrayColumnVector.isNullAt(i)) {
|
||||||
|
mapColumnVector.setNullAt(i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void readToVector(int readNumber, WritableColumnVector vector) throws IOException {
|
||||||
|
readBatch(readNumber, vector);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@@ -0,0 +1,199 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hudi.table.format.cow.vector.reader;
|
||||||
|
|
||||||
|
import org.apache.flink.table.data.TimestampData;
|
||||||
|
import org.apache.parquet.bytes.ByteBufferInputStream;
|
||||||
|
import org.apache.parquet.column.Dictionary;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The interface to wrap the underlying Parquet dictionary and non dictionary encoded page reader.
|
||||||
|
*/
|
||||||
|
public interface ParquetDataColumnReader {
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Initialize the reader by page data.
|
||||||
|
*
|
||||||
|
* @param valueCount value count
|
||||||
|
* @param in page data
|
||||||
|
* @throws IOException
|
||||||
|
*/
|
||||||
|
void initFromPage(int valueCount, ByteBufferInputStream in) throws IOException;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return the next Dictionary ID from the page
|
||||||
|
*/
|
||||||
|
int readValueDictionaryId();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return the next Long from the page
|
||||||
|
*/
|
||||||
|
long readLong();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return the next Integer from the page
|
||||||
|
*/
|
||||||
|
int readInteger();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return the next SmallInt from the page
|
||||||
|
*/
|
||||||
|
int readSmallInt();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return the next TinyInt from the page
|
||||||
|
*/
|
||||||
|
int readTinyInt();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return the next Float from the page
|
||||||
|
*/
|
||||||
|
float readFloat();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return the next Boolean from the page
|
||||||
|
*/
|
||||||
|
boolean readBoolean();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return the next String from the page
|
||||||
|
*/
|
||||||
|
byte[] readString();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return the next Varchar from the page
|
||||||
|
*/
|
||||||
|
byte[] readVarchar();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return the next Char from the page
|
||||||
|
*/
|
||||||
|
byte[] readChar();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return the next Bytes from the page
|
||||||
|
*/
|
||||||
|
byte[] readBytes();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return the next Decimal from the page
|
||||||
|
*/
|
||||||
|
byte[] readDecimal();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return the next Double from the page
|
||||||
|
*/
|
||||||
|
double readDouble();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return the next TimestampData from the page
|
||||||
|
*/
|
||||||
|
TimestampData readTimestamp();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return is data valid
|
||||||
|
*/
|
||||||
|
boolean isValid();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return the underlying dictionary if current reader is dictionary encoded
|
||||||
|
*/
|
||||||
|
Dictionary getDictionary();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param id in dictionary
|
||||||
|
* @return the Bytes from the dictionary by id
|
||||||
|
*/
|
||||||
|
byte[] readBytes(int id);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param id in dictionary
|
||||||
|
* @return the Float from the dictionary by id
|
||||||
|
*/
|
||||||
|
float readFloat(int id);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param id in dictionary
|
||||||
|
* @return the Double from the dictionary by id
|
||||||
|
*/
|
||||||
|
double readDouble(int id);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param id in dictionary
|
||||||
|
* @return the Integer from the dictionary by id
|
||||||
|
*/
|
||||||
|
int readInteger(int id);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param id in dictionary
|
||||||
|
* @return the Long from the dictionary by id
|
||||||
|
*/
|
||||||
|
long readLong(int id);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param id in dictionary
|
||||||
|
* @return the Small Int from the dictionary by id
|
||||||
|
*/
|
||||||
|
int readSmallInt(int id);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param id in dictionary
|
||||||
|
* @return the tiny int from the dictionary by id
|
||||||
|
*/
|
||||||
|
int readTinyInt(int id);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param id in dictionary
|
||||||
|
* @return the Boolean from the dictionary by id
|
||||||
|
*/
|
||||||
|
boolean readBoolean(int id);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param id in dictionary
|
||||||
|
* @return the Decimal from the dictionary by id
|
||||||
|
*/
|
||||||
|
byte[] readDecimal(int id);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param id in dictionary
|
||||||
|
* @return the TimestampData from the dictionary by id
|
||||||
|
*/
|
||||||
|
TimestampData readTimestamp(int id);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param id in dictionary
|
||||||
|
* @return the String from the dictionary by id
|
||||||
|
*/
|
||||||
|
byte[] readString(int id);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param id in dictionary
|
||||||
|
* @return the Varchar from the dictionary by id
|
||||||
|
*/
|
||||||
|
byte[] readVarchar(int id);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param id in dictionary
|
||||||
|
* @return the Char from the dictionary by id
|
||||||
|
*/
|
||||||
|
byte[] readChar(int id);
|
||||||
|
}
|
||||||
|
|
||||||
@@ -0,0 +1,304 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hudi.table.format.cow.vector.reader;
|
||||||
|
|
||||||
|
import org.apache.flink.table.data.TimestampData;
|
||||||
|
import org.apache.parquet.bytes.ByteBufferInputStream;
|
||||||
|
import org.apache.parquet.column.Dictionary;
|
||||||
|
import org.apache.parquet.column.values.ValuesReader;
|
||||||
|
import org.apache.parquet.io.api.Binary;
|
||||||
|
import org.apache.parquet.schema.PrimitiveType;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.ByteBuffer;
|
||||||
|
import java.nio.ByteOrder;
|
||||||
|
import java.sql.Timestamp;
|
||||||
|
|
||||||
|
import static org.apache.flink.formats.parquet.vector.reader.TimestampColumnReader.JULIAN_EPOCH_OFFSET_DAYS;
|
||||||
|
import static org.apache.flink.formats.parquet.vector.reader.TimestampColumnReader.MILLIS_IN_DAY;
|
||||||
|
import static org.apache.flink.formats.parquet.vector.reader.TimestampColumnReader.NANOS_PER_MILLISECOND;
|
||||||
|
import static org.apache.flink.formats.parquet.vector.reader.TimestampColumnReader.NANOS_PER_SECOND;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Parquet file has self-describing schema which may differ from the user required schema (e.g.
|
||||||
|
* schema evolution). This factory is used to retrieve user required typed data via corresponding
|
||||||
|
* reader which reads the underlying data.
|
||||||
|
*/
|
||||||
|
public final class ParquetDataColumnReaderFactory {
|
||||||
|
|
||||||
|
private ParquetDataColumnReaderFactory() {
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* default reader for {@link ParquetDataColumnReader}.
|
||||||
|
*/
|
||||||
|
public static class DefaultParquetDataColumnReader implements ParquetDataColumnReader {
|
||||||
|
protected ValuesReader valuesReader;
|
||||||
|
protected Dictionary dict;
|
||||||
|
|
||||||
|
// After the data is read in the parquet type, isValid will be set to true if the data can
|
||||||
|
// be returned in the type defined in HMS. Otherwise isValid is set to false.
|
||||||
|
boolean isValid = true;
|
||||||
|
|
||||||
|
public DefaultParquetDataColumnReader(ValuesReader valuesReader) {
|
||||||
|
this.valuesReader = valuesReader;
|
||||||
|
}
|
||||||
|
|
||||||
|
public DefaultParquetDataColumnReader(Dictionary dict) {
|
||||||
|
this.dict = dict;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void initFromPage(int i, ByteBufferInputStream in) throws IOException {
|
||||||
|
valuesReader.initFromPage(i, in);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean readBoolean() {
|
||||||
|
return valuesReader.readBoolean();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean readBoolean(int id) {
|
||||||
|
return dict.decodeToBoolean(id);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public byte[] readString(int id) {
|
||||||
|
return dict.decodeToBinary(id).getBytesUnsafe();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public byte[] readString() {
|
||||||
|
return valuesReader.readBytes().getBytesUnsafe();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public byte[] readVarchar() {
|
||||||
|
// we need to enforce the size here even the types are the same
|
||||||
|
return valuesReader.readBytes().getBytesUnsafe();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public byte[] readVarchar(int id) {
|
||||||
|
return dict.decodeToBinary(id).getBytesUnsafe();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public byte[] readChar() {
|
||||||
|
return valuesReader.readBytes().getBytesUnsafe();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public byte[] readChar(int id) {
|
||||||
|
return dict.decodeToBinary(id).getBytesUnsafe();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public byte[] readBytes() {
|
||||||
|
return valuesReader.readBytes().getBytesUnsafe();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public byte[] readBytes(int id) {
|
||||||
|
return dict.decodeToBinary(id).getBytesUnsafe();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public byte[] readDecimal() {
|
||||||
|
return valuesReader.readBytes().getBytesUnsafe();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public byte[] readDecimal(int id) {
|
||||||
|
return dict.decodeToBinary(id).getBytesUnsafe();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public float readFloat() {
|
||||||
|
return valuesReader.readFloat();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public float readFloat(int id) {
|
||||||
|
return dict.decodeToFloat(id);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double readDouble() {
|
||||||
|
return valuesReader.readDouble();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public double readDouble(int id) {
|
||||||
|
return dict.decodeToDouble(id);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TimestampData readTimestamp() {
|
||||||
|
throw new RuntimeException("Unsupported operation");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TimestampData readTimestamp(int id) {
|
||||||
|
throw new RuntimeException("Unsupported operation");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int readInteger() {
|
||||||
|
return valuesReader.readInteger();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int readInteger(int id) {
|
||||||
|
return dict.decodeToInt(id);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean isValid() {
|
||||||
|
return isValid;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long readLong(int id) {
|
||||||
|
return dict.decodeToLong(id);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public long readLong() {
|
||||||
|
return valuesReader.readLong();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int readSmallInt() {
|
||||||
|
return valuesReader.readInteger();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int readSmallInt(int id) {
|
||||||
|
return dict.decodeToInt(id);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int readTinyInt() {
|
||||||
|
return valuesReader.readInteger();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int readTinyInt(int id) {
|
||||||
|
return dict.decodeToInt(id);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int readValueDictionaryId() {
|
||||||
|
return valuesReader.readValueDictionaryId();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void skip() {
|
||||||
|
valuesReader.skip();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public Dictionary getDictionary() {
|
||||||
|
return dict;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* The reader who reads from the underlying Timestamp value value.
|
||||||
|
*/
|
||||||
|
public static class TypesFromInt96PageReader extends DefaultParquetDataColumnReader {
|
||||||
|
private final boolean isUtcTimestamp;
|
||||||
|
|
||||||
|
public TypesFromInt96PageReader(ValuesReader realReader, boolean isUtcTimestamp) {
|
||||||
|
super(realReader);
|
||||||
|
this.isUtcTimestamp = isUtcTimestamp;
|
||||||
|
}
|
||||||
|
|
||||||
|
public TypesFromInt96PageReader(Dictionary dict, boolean isUtcTimestamp) {
|
||||||
|
super(dict);
|
||||||
|
this.isUtcTimestamp = isUtcTimestamp;
|
||||||
|
}
|
||||||
|
|
||||||
|
private TimestampData convert(Binary binary) {
|
||||||
|
ByteBuffer buf = binary.toByteBuffer();
|
||||||
|
buf.order(ByteOrder.LITTLE_ENDIAN);
|
||||||
|
long timeOfDayNanos = buf.getLong();
|
||||||
|
int julianDay = buf.getInt();
|
||||||
|
return int96ToTimestamp(isUtcTimestamp, timeOfDayNanos, julianDay);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TimestampData readTimestamp(int id) {
|
||||||
|
return convert(dict.decodeToBinary(id));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public TimestampData readTimestamp() {
|
||||||
|
return convert(valuesReader.readBytes());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static ParquetDataColumnReader getDataColumnReaderByTypeHelper(
|
||||||
|
boolean isDictionary,
|
||||||
|
PrimitiveType parquetType,
|
||||||
|
Dictionary dictionary,
|
||||||
|
ValuesReader valuesReader,
|
||||||
|
boolean isUtcTimestamp) {
|
||||||
|
if (parquetType.getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.INT96) {
|
||||||
|
return isDictionary
|
||||||
|
? new TypesFromInt96PageReader(dictionary, isUtcTimestamp)
|
||||||
|
: new TypesFromInt96PageReader(valuesReader, isUtcTimestamp);
|
||||||
|
} else {
|
||||||
|
return isDictionary
|
||||||
|
? new DefaultParquetDataColumnReader(dictionary)
|
||||||
|
: new DefaultParquetDataColumnReader(valuesReader);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static ParquetDataColumnReader getDataColumnReaderByTypeOnDictionary(
|
||||||
|
PrimitiveType parquetType, Dictionary realReader, boolean isUtcTimestamp) {
|
||||||
|
return getDataColumnReaderByTypeHelper(true, parquetType, realReader, null, isUtcTimestamp);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static ParquetDataColumnReader getDataColumnReaderByType(
|
||||||
|
PrimitiveType parquetType, ValuesReader realReader, boolean isUtcTimestamp) {
|
||||||
|
return getDataColumnReaderByTypeHelper(
|
||||||
|
false, parquetType, null, realReader, isUtcTimestamp);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static TimestampData int96ToTimestamp(
|
||||||
|
boolean utcTimestamp, long nanosOfDay, int julianDay) {
|
||||||
|
long millisecond = julianDayToMillis(julianDay) + (nanosOfDay / NANOS_PER_MILLISECOND);
|
||||||
|
|
||||||
|
if (utcTimestamp) {
|
||||||
|
int nanoOfMillisecond = (int) (nanosOfDay % NANOS_PER_MILLISECOND);
|
||||||
|
return TimestampData.fromEpochMillis(millisecond, nanoOfMillisecond);
|
||||||
|
} else {
|
||||||
|
Timestamp timestamp = new Timestamp(millisecond);
|
||||||
|
timestamp.setNanos((int) (nanosOfDay % NANOS_PER_SECOND));
|
||||||
|
return TimestampData.fromTimestamp(timestamp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static long julianDayToMillis(int julianDay) {
|
||||||
|
return (julianDay - JULIAN_EPOCH_OFFSET_DAYS) * MILLIS_IN_DAY;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@@ -0,0 +1,57 @@
|
|||||||
|
/*
|
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
* or more contributor license agreements. See the NOTICE file
|
||||||
|
* distributed with this work for additional information
|
||||||
|
* regarding copyright ownership. The ASF licenses this file
|
||||||
|
* to you under the Apache License, Version 2.0 (the
|
||||||
|
* "License"); you may not use this file except in compliance
|
||||||
|
* with the License. You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.apache.hudi.table.format.cow.vector.reader;
|
||||||
|
|
||||||
|
import org.apache.hudi.table.format.cow.vector.HeapRowColumnVector;
|
||||||
|
|
||||||
|
import org.apache.flink.formats.parquet.vector.reader.ColumnReader;
|
||||||
|
import org.apache.flink.table.data.vector.writable.WritableColumnVector;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
/**
 * Row {@link ColumnReader}: reads a ROW-typed column by delegating to one child
 * {@link ColumnReader} per row field, then merges the children's null masks into
 * the parent {@link HeapRowColumnVector}.
 */
public class RowColumnReader implements ColumnReader<WritableColumnVector> {

  // One reader per field of the row, in field order; must match vector.vectors length.
  private final List<ColumnReader> fieldReaders;

  public RowColumnReader(List<ColumnReader> fieldReaders) {
    this.fieldReaders = fieldReaders;
  }

  /**
   * Fills {@code readNumber} rows into {@code vector}, which must be a
   * {@link HeapRowColumnVector}.
   *
   * @param readNumber number of rows to read
   * @param vector     target vector; cast to {@link HeapRowColumnVector}
   * @throws IOException if a child reader fails
   */
  @Override
  public void readToVector(int readNumber, WritableColumnVector vector) throws IOException {
    HeapRowColumnVector rowColumnVector = (HeapRowColumnVector) vector;
    WritableColumnVector[] vectors = rowColumnVector.vectors;
    for (int i = 0; i < vectors.length; i++) {
      // Read the i-th field's values into its child vector first.
      fieldReaders.get(i).readToVector(readNumber, vectors[i]);

      for (int j = 0; j < readNumber; j++) {
        // Running AND across fields: after field i the row is flagged null only if
        // fields 0..i were all null at position j.
        boolean isNull = (i == 0)
            ? vectors[i].isNullAt(j)
            : rowColumnVector.isNullAt(j) && vectors[i].isNullAt(j);
        if (isNull) {
          rowColumnVector.setNullAt(j);
        }
      }
    }
  }
}
|
||||||
@@ -1081,6 +1081,62 @@ public class HoodieDataSourceITCase extends AbstractTestBase {
|
|||||||
assertRowsEquals(result, expected);
|
assertRowsEquals(result, expected);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ParameterizedTest
@ValueSource(strings = {"insert", "upsert", "bulk_insert"})
void testParquetComplexTypes(String operation) {
  // Verifies component (complex) data types — array, map and row — round-trip through
  // parquet for each write operation, including the bulk_insert path.
  TableEnvironment tableEnv = batchTableEnv;

  String hoodieTableDDL = sql("t1")
      .field("f_int int")
      .field("f_array array<varchar(10)>")
      .field("f_map map<varchar(20), int>")
      .field("f_row row(f_row_f0 int, f_row_f1 varchar(10))")
      .pkField("f_int")
      .noPartition()
      .option(FlinkOptions.PATH, tempFile.getAbsolutePath())
      .option(FlinkOptions.OPERATION, operation)
      .end();
  tableEnv.executeSql(hoodieTableDDL);

  execInsertSql(tableEnv, TestSQL.COMPLEX_TYPE_INSERT_T1);

  // Read everything back and compare against the inserted rows.
  List<Row> result = CollectionUtil.iterableToList(
      () -> tableEnv.sqlQuery("select * from t1").execute().collect());
  final String expected = "["
      + "+I[1, [abc1, def1], {abc1=1, def1=3}, +I[1, abc1]], "
      + "+I[2, [abc2, def2], {def2=3, abc2=1}, +I[2, abc2]], "
      + "+I[3, [abc3, def3], {def3=3, abc3=1}, +I[3, abc3]]]";
  assertRowsEquals(result, expected);
}
|
||||||
|
|
||||||
|
@ParameterizedTest
@ValueSource(strings = {"insert", "upsert", "bulk_insert"})
void testParquetComplexNestedRowTypes(String operation) {
  // Same as testParquetComplexTypes but with a row type nesting an array and another
  // row, to exercise recursive handling of nested component types.
  TableEnvironment tableEnv = batchTableEnv;

  String hoodieTableDDL = sql("t1")
      .field("f_int int")
      .field("f_array array<varchar(10)>")
      .field("f_map map<varchar(20), int>")
      .field("f_row row(f_nested_array array<varchar(10)>, f_nested_row row(f_row_f0 int, f_row_f1 varchar(10)))")
      .pkField("f_int")
      .noPartition()
      .option(FlinkOptions.PATH, tempFile.getAbsolutePath())
      .option(FlinkOptions.OPERATION, operation)
      .end();
  tableEnv.executeSql(hoodieTableDDL);

  execInsertSql(tableEnv, TestSQL.COMPLEX_NESTED_ROW_TYPE_INSERT_T1);

  // Read everything back and compare against the inserted rows.
  List<Row> result = CollectionUtil.iterableToList(
      () -> tableEnv.sqlQuery("select * from t1").execute().collect());
  final String expected = "["
      + "+I[1, [abc1, def1], {abc1=1, def1=3}, +I[[abc1, def1], +I[1, abc1]]], "
      + "+I[2, [abc2, def2], {def2=3, abc2=1}, +I[[abc2, def2], +I[2, abc2]]], "
      + "+I[3, [abc3, def3], {def3=3, abc3=1}, +I[[abc3, def3], +I[3, abc3]]]]";
  assertRowsEquals(result, expected);
}
|
||||||
|
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
// Utilities
|
// Utilities
|
||||||
// -------------------------------------------------------------------------
|
// -------------------------------------------------------------------------
|
||||||
|
|||||||
@@ -51,4 +51,14 @@ public class TestSQL {
|
|||||||
+ "('id9','Jane',19,TIMESTAMP '1970-01-01 00:00:06','par3'),\n"
|
+ "('id9','Jane',19,TIMESTAMP '1970-01-01 00:00:06','par3'),\n"
|
||||||
+ "('id10','Ella',38,TIMESTAMP '1970-01-01 00:00:07','par4'),\n"
|
+ "('id10','Ella',38,TIMESTAMP '1970-01-01 00:00:07','par4'),\n"
|
||||||
+ "('id11','Phoebe',52,TIMESTAMP '1970-01-01 00:00:08','par4')";
|
+ "('id11','Phoebe',52,TIMESTAMP '1970-01-01 00:00:08','par4')";
|
||||||
|
|
||||||
|
// Insert statement covering component (complex) types: array, map and a flat row.
// Consumed by the complex-type round-trip tests against table t1.
public static final String COMPLEX_TYPE_INSERT_T1 = "insert into t1 values\n"
    + "(1, array['abc1', 'def1'], map['abc1', 1, 'def1', 3], row(1, 'abc1')),\n"
    + "(2, array['abc2', 'def2'], map['abc2', 1, 'def2', 3], row(2, 'abc2')),\n"
    + "(3, array['abc3', 'def3'], map['abc3', 1, 'def3', 3], row(3, 'abc3'))";

// Variant with a row type nesting an array and another row, for the nested-row tests.
public static final String COMPLEX_NESTED_ROW_TYPE_INSERT_T1 = "insert into t1 values\n"
    + "(1, array['abc1', 'def1'], map['abc1', 1, 'def1', 3], row(array['abc1', 'def1'], row(1, 'abc1'))),\n"
    + "(2, array['abc2', 'def2'], map['abc2', 1, 'def2', 3], row(array['abc2', 'def2'], row(2, 'abc2'))),\n"
    + "(3, array['abc3', 'def3'], map['abc3', 1, 'def3', 3], row(array['abc3', 'def3'], row(3, 'abc3')))";
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user