[HUDI-4319] Fixed Parquet's PLAIN_DICTIONARY encoding not being applied when bulk-inserting (#5966)
* Fixed Dictionary encoding config not being properly propagated to Parquet writer (making it unable to apply it, substantially bloating the storage footprint)
This commit is contained in:
@@ -23,12 +23,11 @@ import org.apache.hadoop.conf.Configuration
|
|||||||
import org.apache.hadoop.fs.{FileSystem, Path}
|
import org.apache.hadoop.fs.{FileSystem, Path}
|
||||||
import org.apache.hudi.avro.HoodieAvroWriteSupport
|
import org.apache.hudi.avro.HoodieAvroWriteSupport
|
||||||
import org.apache.hudi.client.SparkTaskContextSupplier
|
import org.apache.hudi.client.SparkTaskContextSupplier
|
||||||
import org.apache.hudi.common.HoodieJsonPayload
|
|
||||||
import org.apache.hudi.common.bloom.{BloomFilter, BloomFilterFactory}
|
import org.apache.hudi.common.bloom.{BloomFilter, BloomFilterFactory}
|
||||||
import org.apache.hudi.common.model.{HoodieFileFormat, HoodieRecord}
|
import org.apache.hudi.common.model.{HoodieFileFormat, HoodieRecord}
|
||||||
import org.apache.hudi.common.util.BaseFileUtils
|
import org.apache.hudi.common.util.BaseFileUtils
|
||||||
import org.apache.hudi.config.{HoodieIndexConfig, HoodieStorageConfig}
|
import org.apache.hudi.config.{HoodieIndexConfig, HoodieStorageConfig}
|
||||||
import org.apache.hudi.io.storage.{HoodieAvroParquetConfig, HoodieAvroParquetWriter}
|
import org.apache.hudi.io.storage.{HoodieAvroParquetWriter, HoodieParquetConfig}
|
||||||
import org.apache.parquet.avro.AvroSchemaConverter
|
import org.apache.parquet.avro.AvroSchemaConverter
|
||||||
import org.apache.parquet.hadoop.metadata.CompressionCodecName
|
import org.apache.parquet.hadoop.metadata.CompressionCodecName
|
||||||
import org.apache.spark.sql.{DataFrame, SQLContext}
|
import org.apache.spark.sql.{DataFrame, SQLContext}
|
||||||
@@ -45,7 +44,7 @@ object SparkHelpers {
|
|||||||
val filter: BloomFilter = BloomFilterFactory.createBloomFilter(HoodieIndexConfig.BLOOM_FILTER_NUM_ENTRIES_VALUE.defaultValue.toInt, HoodieIndexConfig.BLOOM_FILTER_FPP_VALUE.defaultValue.toDouble,
|
val filter: BloomFilter = BloomFilterFactory.createBloomFilter(HoodieIndexConfig.BLOOM_FILTER_NUM_ENTRIES_VALUE.defaultValue.toInt, HoodieIndexConfig.BLOOM_FILTER_FPP_VALUE.defaultValue.toDouble,
|
||||||
HoodieIndexConfig.BLOOM_INDEX_FILTER_DYNAMIC_MAX_ENTRIES.defaultValue.toInt, HoodieIndexConfig.BLOOM_FILTER_TYPE.defaultValue);
|
HoodieIndexConfig.BLOOM_INDEX_FILTER_DYNAMIC_MAX_ENTRIES.defaultValue.toInt, HoodieIndexConfig.BLOOM_FILTER_TYPE.defaultValue);
|
||||||
val writeSupport: HoodieAvroWriteSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter(fs.getConf).convert(schema), schema, org.apache.hudi.common.util.Option.of(filter))
|
val writeSupport: HoodieAvroWriteSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter(fs.getConf).convert(schema), schema, org.apache.hudi.common.util.Option.of(filter))
|
||||||
val parquetConfig: HoodieAvroParquetConfig = new HoodieAvroParquetConfig(writeSupport, CompressionCodecName.GZIP, HoodieStorageConfig.PARQUET_BLOCK_SIZE.defaultValue.toInt, HoodieStorageConfig.PARQUET_PAGE_SIZE.defaultValue.toInt, HoodieStorageConfig.PARQUET_MAX_FILE_SIZE.defaultValue.toInt, fs.getConf, HoodieStorageConfig.PARQUET_COMPRESSION_RATIO_FRACTION.defaultValue.toDouble)
|
val parquetConfig: HoodieParquetConfig[HoodieAvroWriteSupport] = new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP, HoodieStorageConfig.PARQUET_BLOCK_SIZE.defaultValue.toInt, HoodieStorageConfig.PARQUET_PAGE_SIZE.defaultValue.toInt, HoodieStorageConfig.PARQUET_MAX_FILE_SIZE.defaultValue.toInt, fs.getConf, HoodieStorageConfig.PARQUET_COMPRESSION_RATIO_FRACTION.defaultValue.toDouble)
|
||||||
|
|
||||||
// Add current classLoad for config, if not will throw classNotFound of 'HoodieWrapperFileSystem'.
|
// Add current classLoad for config, if not will throw classNotFound of 'HoodieWrapperFileSystem'.
|
||||||
parquetConfig.getHadoopConf().setClassLoader(Thread.currentThread.getContextClassLoader)
|
parquetConfig.getHadoopConf().setClassLoader(Thread.currentThread.getContextClassLoader)
|
||||||
|
|||||||
@@ -47,11 +47,11 @@ public class HoodieAvroParquetWriter<R extends IndexedRecord>
|
|||||||
|
|
||||||
@SuppressWarnings({"unchecked", "rawtypes"})
|
@SuppressWarnings({"unchecked", "rawtypes"})
|
||||||
public HoodieAvroParquetWriter(Path file,
|
public HoodieAvroParquetWriter(Path file,
|
||||||
HoodieAvroParquetConfig parquetConfig,
|
HoodieParquetConfig<HoodieAvroWriteSupport> parquetConfig,
|
||||||
String instantTime,
|
String instantTime,
|
||||||
TaskContextSupplier taskContextSupplier,
|
TaskContextSupplier taskContextSupplier,
|
||||||
boolean populateMetaFields) throws IOException {
|
boolean populateMetaFields) throws IOException {
|
||||||
super(file, (HoodieBaseParquetConfig) parquetConfig);
|
super(file, (HoodieParquetConfig) parquetConfig);
|
||||||
this.fileName = file.getName();
|
this.fileName = file.getName();
|
||||||
this.writeSupport = parquetConfig.getWriteSupport();
|
this.writeSupport = parquetConfig.getWriteSupport();
|
||||||
this.instantTime = instantTime;
|
this.instantTime = instantTime;
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ public abstract class HoodieBaseParquetWriter<R> extends ParquetWriter<R> {
|
|||||||
private long lastCachedDataSize = -1;
|
private long lastCachedDataSize = -1;
|
||||||
|
|
||||||
public HoodieBaseParquetWriter(Path file,
|
public HoodieBaseParquetWriter(Path file,
|
||||||
HoodieBaseParquetConfig<? extends WriteSupport<R>> parquetConfig) throws IOException {
|
HoodieParquetConfig<? extends WriteSupport<R>> parquetConfig) throws IOException {
|
||||||
super(HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()),
|
super(HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()),
|
||||||
ParquetFileWriter.Mode.CREATE,
|
ParquetFileWriter.Mode.CREATE,
|
||||||
parquetConfig.getWriteSupport(),
|
parquetConfig.getWriteSupport(),
|
||||||
|
|||||||
@@ -77,7 +77,7 @@ public class HoodieFileWriterFactory {
|
|||||||
Option<BloomFilter> filter = enableBloomFilter ? Option.of(createBloomFilter(config)) : Option.empty();
|
Option<BloomFilter> filter = enableBloomFilter ? Option.of(createBloomFilter(config)) : Option.empty();
|
||||||
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter(conf).convert(schema), schema, filter);
|
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(new AvroSchemaConverter(conf).convert(schema), schema, filter);
|
||||||
|
|
||||||
HoodieAvroParquetConfig parquetConfig = new HoodieAvroParquetConfig(writeSupport, config.getParquetCompressionCodec(),
|
HoodieParquetConfig<HoodieAvroWriteSupport> parquetConfig = new HoodieParquetConfig<>(writeSupport, config.getParquetCompressionCodec(),
|
||||||
config.getParquetBlockSize(), config.getParquetPageSize(), config.getParquetMaxFileSize(),
|
config.getParquetBlockSize(), config.getParquetPageSize(), config.getParquetMaxFileSize(),
|
||||||
conf, config.getParquetCompressionRatio(), config.parquetDictionaryEnabled());
|
conf, config.getParquetCompressionRatio(), config.parquetDictionaryEnabled());
|
||||||
|
|
||||||
|
|||||||
@@ -19,6 +19,12 @@
|
|||||||
|
|
||||||
package org.apache.hudi.testutils;
|
package org.apache.hudi.testutils;
|
||||||
|
|
||||||
|
import org.apache.avro.Schema;
|
||||||
|
import org.apache.avro.generic.GenericRecord;
|
||||||
|
import org.apache.avro.generic.IndexedRecord;
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
import org.apache.hudi.avro.HoodieAvroUtils;
|
import org.apache.hudi.avro.HoodieAvroUtils;
|
||||||
import org.apache.hudi.avro.HoodieAvroWriteSupport;
|
import org.apache.hudi.avro.HoodieAvroWriteSupport;
|
||||||
import org.apache.hudi.common.bloom.BloomFilter;
|
import org.apache.hudi.common.bloom.BloomFilter;
|
||||||
@@ -38,18 +44,11 @@ import org.apache.hudi.common.testutils.HoodieMetadataTestTable;
|
|||||||
import org.apache.hudi.common.util.Option;
|
import org.apache.hudi.common.util.Option;
|
||||||
import org.apache.hudi.common.util.collection.Pair;
|
import org.apache.hudi.common.util.collection.Pair;
|
||||||
import org.apache.hudi.config.HoodieStorageConfig;
|
import org.apache.hudi.config.HoodieStorageConfig;
|
||||||
import org.apache.hudi.io.storage.HoodieAvroParquetConfig;
|
import org.apache.hudi.io.storage.HoodieAvroParquetWriter;
|
||||||
|
import org.apache.hudi.io.storage.HoodieParquetConfig;
|
||||||
import org.apache.hudi.io.storage.HoodieOrcConfig;
|
import org.apache.hudi.io.storage.HoodieOrcConfig;
|
||||||
import org.apache.hudi.io.storage.HoodieOrcWriter;
|
import org.apache.hudi.io.storage.HoodieOrcWriter;
|
||||||
import org.apache.hudi.io.storage.HoodieAvroParquetWriter;
|
|
||||||
import org.apache.hudi.metadata.HoodieTableMetadataWriter;
|
import org.apache.hudi.metadata.HoodieTableMetadataWriter;
|
||||||
|
|
||||||
import org.apache.avro.Schema;
|
|
||||||
import org.apache.avro.generic.GenericRecord;
|
|
||||||
import org.apache.avro.generic.IndexedRecord;
|
|
||||||
import org.apache.hadoop.conf.Configuration;
|
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
|
||||||
import org.apache.hadoop.fs.Path;
|
|
||||||
import org.apache.log4j.LogManager;
|
import org.apache.log4j.LogManager;
|
||||||
import org.apache.log4j.Logger;
|
import org.apache.log4j.Logger;
|
||||||
import org.apache.orc.CompressionKind;
|
import org.apache.orc.CompressionKind;
|
||||||
@@ -110,7 +109,7 @@ public class HoodieWriteableTestTable extends HoodieMetadataTestTable {
|
|||||||
if (HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().equals(HoodieFileFormat.PARQUET)) {
|
if (HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().equals(HoodieFileFormat.PARQUET)) {
|
||||||
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(
|
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(
|
||||||
new AvroSchemaConverter().convert(schema), schema, Option.of(filter));
|
new AvroSchemaConverter().convert(schema), schema, Option.of(filter));
|
||||||
HoodieAvroParquetConfig config = new HoodieAvroParquetConfig(writeSupport, CompressionCodecName.GZIP,
|
HoodieParquetConfig<HoodieAvroWriteSupport> config = new HoodieParquetConfig<>(writeSupport, CompressionCodecName.GZIP,
|
||||||
ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024,
|
ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024,
|
||||||
new Configuration(), Double.parseDouble(HoodieStorageConfig.PARQUET_COMPRESSION_RATIO_FRACTION.defaultValue()));
|
new Configuration(), Double.parseDouble(HoodieStorageConfig.PARQUET_COMPRESSION_RATIO_FRACTION.defaultValue()));
|
||||||
try (HoodieAvroParquetWriter writer = new HoodieAvroParquetWriter<>(
|
try (HoodieAvroParquetWriter writer = new HoodieAvroParquetWriter<>(
|
||||||
|
|||||||
@@ -22,6 +22,7 @@ import org.apache.hudi.common.bloom.BloomFilter;
|
|||||||
import org.apache.hudi.common.bloom.BloomFilterFactory;
|
import org.apache.hudi.common.bloom.BloomFilterFactory;
|
||||||
import org.apache.hudi.common.fs.FSUtils;
|
import org.apache.hudi.common.fs.FSUtils;
|
||||||
import org.apache.hudi.config.HoodieWriteConfig;
|
import org.apache.hudi.config.HoodieWriteConfig;
|
||||||
|
import org.apache.hudi.io.storage.HoodieParquetConfig;
|
||||||
import org.apache.hudi.table.HoodieTable;
|
import org.apache.hudi.table.HoodieTable;
|
||||||
|
|
||||||
import org.apache.flink.table.types.logical.RowType;
|
import org.apache.flink.table.types.logical.RowType;
|
||||||
@@ -67,7 +68,7 @@ public class HoodieRowDataFileWriterFactory {
|
|||||||
HoodieRowDataParquetWriteSupport writeSupport =
|
HoodieRowDataParquetWriteSupport writeSupport =
|
||||||
new HoodieRowDataParquetWriteSupport(table.getHadoopConf(), rowType, filter);
|
new HoodieRowDataParquetWriteSupport(table.getHadoopConf(), rowType, filter);
|
||||||
return new HoodieRowDataParquetWriter(
|
return new HoodieRowDataParquetWriter(
|
||||||
path, new HoodieRowDataParquetConfig(
|
path, new HoodieParquetConfig<>(
|
||||||
writeSupport,
|
writeSupport,
|
||||||
writeConfig.getParquetCompressionCodec(),
|
writeConfig.getParquetCompressionCodec(),
|
||||||
writeConfig.getParquetBlockSize(),
|
writeConfig.getParquetBlockSize(),
|
||||||
|
|||||||
@@ -1,36 +0,0 @@
|
|||||||
/*
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one
|
|
||||||
* or more contributor license agreements. See the NOTICE file
|
|
||||||
* distributed with this work for additional information
|
|
||||||
* regarding copyright ownership. The ASF licenses this file
|
|
||||||
* to you under the Apache License, Version 2.0 (the
|
|
||||||
* "License"); you may not use this file except in compliance
|
|
||||||
* with the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.apache.hudi.io.storage.row;
|
|
||||||
|
|
||||||
import org.apache.hudi.io.storage.HoodieBaseParquetConfig;
|
|
||||||
|
|
||||||
import org.apache.hadoop.conf.Configuration;
|
|
||||||
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* ParquetConfig for datasource implementation with {@link org.apache.flink.table.data.RowData}.
|
|
||||||
*/
|
|
||||||
public class HoodieRowDataParquetConfig extends HoodieBaseParquetConfig<HoodieRowDataParquetWriteSupport> {
|
|
||||||
|
|
||||||
public HoodieRowDataParquetConfig(HoodieRowDataParquetWriteSupport writeSupport, CompressionCodecName compressionCodecName,
|
|
||||||
int blockSize, int pageSize, long maxFileSize, Configuration hadoopConf,
|
|
||||||
double compressionRatio) {
|
|
||||||
super(writeSupport, compressionCodecName, blockSize, pageSize, maxFileSize, hadoopConf, compressionRatio);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -23,6 +23,7 @@ import org.apache.hudi.common.fs.HoodieWrapperFileSystem;
|
|||||||
|
|
||||||
import org.apache.flink.table.data.RowData;
|
import org.apache.flink.table.data.RowData;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.apache.hudi.io.storage.HoodieParquetConfig;
|
||||||
import org.apache.parquet.hadoop.ParquetFileWriter;
|
import org.apache.parquet.hadoop.ParquetFileWriter;
|
||||||
import org.apache.parquet.hadoop.ParquetWriter;
|
import org.apache.parquet.hadoop.ParquetWriter;
|
||||||
|
|
||||||
@@ -39,7 +40,7 @@ public class HoodieRowDataParquetWriter extends ParquetWriter<RowData>
|
|||||||
private final long maxFileSize;
|
private final long maxFileSize;
|
||||||
private final HoodieRowDataParquetWriteSupport writeSupport;
|
private final HoodieRowDataParquetWriteSupport writeSupport;
|
||||||
|
|
||||||
public HoodieRowDataParquetWriter(Path file, HoodieRowDataParquetConfig parquetConfig)
|
public HoodieRowDataParquetWriter(Path file, HoodieParquetConfig<HoodieRowDataParquetWriteSupport> parquetConfig)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
super(HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()),
|
super(HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()),
|
||||||
ParquetFileWriter.Mode.CREATE, parquetConfig.getWriteSupport(), parquetConfig.getCompressionCodecName(),
|
ParquetFileWriter.Mode.CREATE, parquetConfig.getWriteSupport(), parquetConfig.getCompressionCodecName(),
|
||||||
|
|||||||
@@ -23,6 +23,7 @@ import org.apache.hudi.common.bloom.BloomFilterFactory;
|
|||||||
import org.apache.hudi.common.fs.FSUtils;
|
import org.apache.hudi.common.fs.FSUtils;
|
||||||
import org.apache.hudi.config.HoodieWriteConfig;
|
import org.apache.hudi.config.HoodieWriteConfig;
|
||||||
import org.apache.hudi.exception.HoodieIOException;
|
import org.apache.hudi.exception.HoodieIOException;
|
||||||
|
import org.apache.hudi.io.storage.HoodieParquetConfig;
|
||||||
import org.apache.hudi.table.HoodieTable;
|
import org.apache.hudi.table.HoodieTable;
|
||||||
|
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
@@ -68,14 +69,17 @@ public class HoodieInternalRowFileWriterFactory {
|
|||||||
HoodieRowParquetWriteSupport writeSupport =
|
HoodieRowParquetWriteSupport writeSupport =
|
||||||
new HoodieRowParquetWriteSupport(table.getHadoopConf(), structType, filter, writeConfig);
|
new HoodieRowParquetWriteSupport(table.getHadoopConf(), structType, filter, writeConfig);
|
||||||
return new HoodieInternalRowParquetWriter(
|
return new HoodieInternalRowParquetWriter(
|
||||||
path, new HoodieRowParquetConfig(
|
path,
|
||||||
|
new HoodieParquetConfig<>(
|
||||||
writeSupport,
|
writeSupport,
|
||||||
writeConfig.getParquetCompressionCodec(),
|
writeConfig.getParquetCompressionCodec(),
|
||||||
writeConfig.getParquetBlockSize(),
|
writeConfig.getParquetBlockSize(),
|
||||||
writeConfig.getParquetPageSize(),
|
writeConfig.getParquetPageSize(),
|
||||||
writeConfig.getParquetMaxFileSize(),
|
writeConfig.getParquetMaxFileSize(),
|
||||||
writeSupport.getHadoopConf(),
|
writeSupport.getHadoopConf(),
|
||||||
writeConfig.getParquetCompressionRatio()));
|
writeConfig.getParquetCompressionRatio(),
|
||||||
|
writeConfig.parquetDictionaryEnabled()
|
||||||
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
public static HoodieInternalRowFileWriter getInternalRowFileWriterWithoutMetaFields(
|
public static HoodieInternalRowFileWriter getInternalRowFileWriterWithoutMetaFields(
|
||||||
@@ -93,13 +97,15 @@ public class HoodieInternalRowFileWriterFactory {
|
|||||||
HoodieRowParquetWriteSupport writeSupport =
|
HoodieRowParquetWriteSupport writeSupport =
|
||||||
new HoodieRowParquetWriteSupport(table.getHadoopConf(), structType, null, writeConfig);
|
new HoodieRowParquetWriteSupport(table.getHadoopConf(), structType, null, writeConfig);
|
||||||
return new HoodieInternalRowParquetWriter(
|
return new HoodieInternalRowParquetWriter(
|
||||||
path, new HoodieRowParquetConfig(
|
path, new HoodieParquetConfig<>(
|
||||||
writeSupport,
|
writeSupport,
|
||||||
writeConfig.getParquetCompressionCodec(),
|
writeConfig.getParquetCompressionCodec(),
|
||||||
writeConfig.getParquetBlockSize(),
|
writeConfig.getParquetBlockSize(),
|
||||||
writeConfig.getParquetPageSize(),
|
writeConfig.getParquetPageSize(),
|
||||||
writeConfig.getParquetMaxFileSize(),
|
writeConfig.getParquetMaxFileSize(),
|
||||||
writeSupport.getHadoopConf(),
|
writeSupport.getHadoopConf(),
|
||||||
writeConfig.getParquetCompressionRatio()));
|
writeConfig.getParquetCompressionRatio(),
|
||||||
|
writeConfig.parquetDictionaryEnabled())
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -19,6 +19,7 @@
|
|||||||
package org.apache.hudi.io.storage.row;
|
package org.apache.hudi.io.storage.row;
|
||||||
|
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.apache.hudi.io.storage.HoodieParquetConfig;
|
||||||
import org.apache.hudi.io.storage.HoodieBaseParquetWriter;
|
import org.apache.hudi.io.storage.HoodieBaseParquetWriter;
|
||||||
import org.apache.spark.sql.catalyst.InternalRow;
|
import org.apache.spark.sql.catalyst.InternalRow;
|
||||||
|
|
||||||
@@ -32,7 +33,7 @@ public class HoodieInternalRowParquetWriter extends HoodieBaseParquetWriter<Inte
|
|||||||
|
|
||||||
private final HoodieRowParquetWriteSupport writeSupport;
|
private final HoodieRowParquetWriteSupport writeSupport;
|
||||||
|
|
||||||
public HoodieInternalRowParquetWriter(Path file, HoodieRowParquetConfig parquetConfig)
|
public HoodieInternalRowParquetWriter(Path file, HoodieParquetConfig<HoodieRowParquetWriteSupport> parquetConfig)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
super(file, parquetConfig);
|
super(file, parquetConfig);
|
||||||
|
|
||||||
|
|||||||
@@ -1,36 +0,0 @@
|
|||||||
/*
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one
|
|
||||||
* or more contributor license agreements. See the NOTICE file
|
|
||||||
* distributed with this work for additional information
|
|
||||||
* regarding copyright ownership. The ASF licenses this file
|
|
||||||
* to you under the Apache License, Version 2.0 (the
|
|
||||||
* "License"); you may not use this file except in compliance
|
|
||||||
* with the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.apache.hudi.io.storage.row;
|
|
||||||
|
|
||||||
import org.apache.hudi.io.storage.HoodieBaseParquetConfig;
|
|
||||||
|
|
||||||
import org.apache.hadoop.conf.Configuration;
|
|
||||||
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* ParquetConfig for datasource implementation with {@link org.apache.hudi.client.model.HoodieInternalRow}.
|
|
||||||
*/
|
|
||||||
public class HoodieRowParquetConfig extends HoodieBaseParquetConfig<HoodieRowParquetWriteSupport> {
|
|
||||||
|
|
||||||
public HoodieRowParquetConfig(HoodieRowParquetWriteSupport writeSupport, CompressionCodecName compressionCodecName,
|
|
||||||
int blockSize, int pageSize, long maxFileSize, Configuration hadoopConf,
|
|
||||||
double compressionRatio) {
|
|
||||||
super(writeSupport, compressionCodecName, blockSize, pageSize, maxFileSize, hadoopConf, compressionRatio);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -23,6 +23,7 @@ import org.apache.hudi.common.bloom.BloomFilterFactory;
|
|||||||
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
|
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
|
||||||
import org.apache.hudi.config.HoodieStorageConfig;
|
import org.apache.hudi.config.HoodieStorageConfig;
|
||||||
import org.apache.hudi.config.HoodieWriteConfig;
|
import org.apache.hudi.config.HoodieWriteConfig;
|
||||||
|
import org.apache.hudi.io.storage.HoodieParquetConfig;
|
||||||
import org.apache.hudi.testutils.HoodieClientTestHarness;
|
import org.apache.hudi.testutils.HoodieClientTestHarness;
|
||||||
import org.apache.hudi.testutils.SparkDatasetTestUtils;
|
import org.apache.hudi.testutils.SparkDatasetTestUtils;
|
||||||
|
|
||||||
@@ -73,9 +74,9 @@ public class TestHoodieInternalRowParquetWriter extends HoodieClientTestHarness
|
|||||||
// init write support and parquet config
|
// init write support and parquet config
|
||||||
HoodieRowParquetWriteSupport writeSupport = getWriteSupport(writeConfigBuilder, hadoopConf, parquetWriteLegacyFormatEnabled);
|
HoodieRowParquetWriteSupport writeSupport = getWriteSupport(writeConfigBuilder, hadoopConf, parquetWriteLegacyFormatEnabled);
|
||||||
HoodieWriteConfig cfg = writeConfigBuilder.build();
|
HoodieWriteConfig cfg = writeConfigBuilder.build();
|
||||||
HoodieRowParquetConfig parquetConfig = new HoodieRowParquetConfig(writeSupport,
|
HoodieParquetConfig<HoodieRowParquetWriteSupport> parquetConfig = new HoodieParquetConfig<>(writeSupport,
|
||||||
CompressionCodecName.SNAPPY, cfg.getParquetBlockSize(), cfg.getParquetPageSize(), cfg.getParquetMaxFileSize(),
|
CompressionCodecName.SNAPPY, cfg.getParquetBlockSize(), cfg.getParquetPageSize(), cfg.getParquetMaxFileSize(),
|
||||||
writeSupport.getHadoopConf(), cfg.getParquetCompressionRatio());
|
writeSupport.getHadoopConf(), cfg.getParquetCompressionRatio(), cfg.parquetDictionaryEnabled());
|
||||||
|
|
||||||
// prepare path
|
// prepare path
|
||||||
String fileId = UUID.randomUUID().toString();
|
String fileId = UUID.randomUUID().toString();
|
||||||
|
|||||||
@@ -18,21 +18,20 @@
|
|||||||
|
|
||||||
package org.apache.hudi.common.table.log.block;
|
package org.apache.hudi.common.table.log.block;
|
||||||
|
|
||||||
import org.apache.hudi.avro.HoodieAvroWriteSupport;
|
|
||||||
import org.apache.hudi.common.fs.inline.InLineFSUtils;
|
|
||||||
import org.apache.hudi.common.fs.inline.InLineFileSystem;
|
|
||||||
import org.apache.hudi.common.util.ClosableIterator;
|
|
||||||
import org.apache.hudi.common.util.Option;
|
|
||||||
import org.apache.hudi.common.util.ParquetReaderIterator;
|
|
||||||
import org.apache.hudi.io.storage.HoodieAvroParquetConfig;
|
|
||||||
import org.apache.hudi.io.storage.HoodieParquetStreamWriter;
|
|
||||||
|
|
||||||
import org.apache.avro.Schema;
|
import org.apache.avro.Schema;
|
||||||
import org.apache.avro.generic.IndexedRecord;
|
import org.apache.avro.generic.IndexedRecord;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.fs.FSDataInputStream;
|
import org.apache.hadoop.fs.FSDataInputStream;
|
||||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.apache.hudi.avro.HoodieAvroWriteSupport;
|
||||||
|
import org.apache.hudi.common.fs.inline.InLineFSUtils;
|
||||||
|
import org.apache.hudi.common.fs.inline.InLineFileSystem;
|
||||||
|
import org.apache.hudi.common.util.ClosableIterator;
|
||||||
|
import org.apache.hudi.common.util.Option;
|
||||||
|
import org.apache.hudi.common.util.ParquetReaderIterator;
|
||||||
|
import org.apache.hudi.io.storage.HoodieParquetConfig;
|
||||||
|
import org.apache.hudi.io.storage.HoodieParquetStreamWriter;
|
||||||
import org.apache.parquet.avro.AvroParquetReader;
|
import org.apache.parquet.avro.AvroParquetReader;
|
||||||
import org.apache.parquet.avro.AvroReadSupport;
|
import org.apache.parquet.avro.AvroReadSupport;
|
||||||
import org.apache.parquet.avro.AvroSchemaConverter;
|
import org.apache.parquet.avro.AvroSchemaConverter;
|
||||||
@@ -43,7 +42,6 @@ import org.apache.parquet.hadoop.util.HadoopInputFile;
|
|||||||
import org.apache.parquet.io.InputFile;
|
import org.apache.parquet.io.InputFile;
|
||||||
|
|
||||||
import javax.annotation.Nonnull;
|
import javax.annotation.Nonnull;
|
||||||
|
|
||||||
import java.io.ByteArrayOutputStream;
|
import java.io.ByteArrayOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
@@ -97,8 +95,8 @@ public class HoodieParquetDataBlock extends HoodieDataBlock {
|
|||||||
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(
|
HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(
|
||||||
new AvroSchemaConverter().convert(writerSchema), writerSchema, Option.empty());
|
new AvroSchemaConverter().convert(writerSchema), writerSchema, Option.empty());
|
||||||
|
|
||||||
HoodieAvroParquetConfig avroParquetConfig =
|
HoodieParquetConfig<HoodieAvroWriteSupport> avroParquetConfig =
|
||||||
new HoodieAvroParquetConfig(
|
new HoodieParquetConfig<>(
|
||||||
writeSupport,
|
writeSupport,
|
||||||
compressionCodecName.get(),
|
compressionCodecName.get(),
|
||||||
ParquetWriter.DEFAULT_BLOCK_SIZE,
|
ParquetWriter.DEFAULT_BLOCK_SIZE,
|
||||||
|
|||||||
@@ -1,42 +0,0 @@
|
|||||||
/*
|
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one
|
|
||||||
* or more contributor license agreements. See the NOTICE file
|
|
||||||
* distributed with this work for additional information
|
|
||||||
* regarding copyright ownership. The ASF licenses this file
|
|
||||||
* to you under the Apache License, Version 2.0 (the
|
|
||||||
* "License"); you may not use this file except in compliance
|
|
||||||
* with the License. You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
|
|
||||||
package org.apache.hudi.io.storage;
|
|
||||||
|
|
||||||
import org.apache.hudi.avro.HoodieAvroWriteSupport;
|
|
||||||
|
|
||||||
import org.apache.hadoop.conf.Configuration;
|
|
||||||
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* ParquetConfig for writing avro records in Parquet files.
|
|
||||||
*/
|
|
||||||
public class HoodieAvroParquetConfig extends HoodieBaseParquetConfig<HoodieAvroWriteSupport> {
|
|
||||||
|
|
||||||
public HoodieAvroParquetConfig(HoodieAvroWriteSupport writeSupport, CompressionCodecName compressionCodecName,
|
|
||||||
int blockSize, int pageSize, long maxFileSize, Configuration hadoopConf,
|
|
||||||
double compressionRatio) {
|
|
||||||
super(writeSupport, compressionCodecName, blockSize, pageSize, maxFileSize, hadoopConf, compressionRatio);
|
|
||||||
}
|
|
||||||
|
|
||||||
public HoodieAvroParquetConfig(HoodieAvroWriteSupport writeSupport, CompressionCodecName compressionCodecName,
|
|
||||||
int blockSize, int pageSize, long maxFileSize, Configuration hadoopConf,
|
|
||||||
double compressionRatio, boolean directoryEnabled) {
|
|
||||||
super(writeSupport, compressionCodecName, blockSize, pageSize, maxFileSize, hadoopConf, compressionRatio, directoryEnabled);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -25,7 +25,7 @@ import org.apache.parquet.hadoop.metadata.CompressionCodecName;
|
|||||||
* Base ParquetConfig to hold config params for writing to Parquet.
|
* Base ParquetConfig to hold config params for writing to Parquet.
|
||||||
* @param <T>
|
* @param <T>
|
||||||
*/
|
*/
|
||||||
public class HoodieBaseParquetConfig<T> {
|
public class HoodieParquetConfig<T> {
|
||||||
private final T writeSupport;
|
private final T writeSupport;
|
||||||
private final CompressionCodecName compressionCodecName;
|
private final CompressionCodecName compressionCodecName;
|
||||||
private final int blockSize;
|
private final int blockSize;
|
||||||
@@ -35,13 +35,13 @@ public class HoodieBaseParquetConfig<T> {
|
|||||||
private final double compressionRatio;
|
private final double compressionRatio;
|
||||||
private final boolean dictionaryEnabled;
|
private final boolean dictionaryEnabled;
|
||||||
|
|
||||||
public HoodieBaseParquetConfig(T writeSupport, CompressionCodecName compressionCodecName, int blockSize,
|
public HoodieParquetConfig(T writeSupport, CompressionCodecName compressionCodecName, int blockSize,
|
||||||
int pageSize, long maxFileSize, Configuration hadoopConf, double compressionRatio) {
|
int pageSize, long maxFileSize, Configuration hadoopConf, double compressionRatio) {
|
||||||
this(writeSupport, compressionCodecName, blockSize, pageSize, maxFileSize, hadoopConf, compressionRatio, false);
|
this(writeSupport, compressionCodecName, blockSize, pageSize, maxFileSize, hadoopConf, compressionRatio, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
public HoodieBaseParquetConfig(T writeSupport, CompressionCodecName compressionCodecName, int blockSize,
|
public HoodieParquetConfig(T writeSupport, CompressionCodecName compressionCodecName, int blockSize,
|
||||||
int pageSize, long maxFileSize, Configuration hadoopConf, double compressionRatio, boolean dictionaryEnabled) {
|
int pageSize, long maxFileSize, Configuration hadoopConf, double compressionRatio, boolean dictionaryEnabled) {
|
||||||
this.writeSupport = writeSupport;
|
this.writeSupport = writeSupport;
|
||||||
this.compressionCodecName = compressionCodecName;
|
this.compressionCodecName = compressionCodecName;
|
||||||
this.blockSize = blockSize;
|
this.blockSize = blockSize;
|
||||||
@@ -38,7 +38,7 @@ public class HoodieParquetStreamWriter<R extends IndexedRecord> implements AutoC
|
|||||||
private final HoodieAvroWriteSupport writeSupport;
|
private final HoodieAvroWriteSupport writeSupport;
|
||||||
|
|
||||||
public HoodieParquetStreamWriter(FSDataOutputStream outputStream,
|
public HoodieParquetStreamWriter(FSDataOutputStream outputStream,
|
||||||
HoodieAvroParquetConfig parquetConfig) throws IOException {
|
HoodieParquetConfig<HoodieAvroWriteSupport> parquetConfig) throws IOException {
|
||||||
this.writeSupport = parquetConfig.getWriteSupport();
|
this.writeSupport = parquetConfig.getWriteSupport();
|
||||||
this.writer = new Builder<R>(new OutputStreamBackedOutputFile(outputStream), writeSupport)
|
this.writer = new Builder<R>(new OutputStreamBackedOutputFile(outputStream), writeSupport)
|
||||||
.withWriteMode(ParquetFileWriter.Mode.CREATE)
|
.withWriteMode(ParquetFileWriter.Mode.CREATE)
|
||||||
|
|||||||
Reference in New Issue
Block a user