[HUDI-2526] Make spark.sql.parquet.writeLegacyFormat configurable (#3917)
This commit is contained in:
@@ -115,6 +115,18 @@ public class HoodieStorageConfig extends HoodieConfig {
|
|||||||
.defaultValue(true)
|
.defaultValue(true)
|
||||||
.withDocumentation("Whether to use dictionary encoding");
|
.withDocumentation("Whether to use dictionary encoding");
|
||||||
|
|
||||||
|
public static final ConfigProperty<String> PARQUET_WRITE_LEGACY_FORMAT_ENABLED = ConfigProperty
|
||||||
|
.key("hoodie.parquet.writeLegacyFormat.enabled")
|
||||||
|
.defaultValue("false")
|
||||||
|
.withDocumentation("Sets spark.sql.parquet.writeLegacyFormat. If true, data will be written in a way of Spark 1.4 and earlier. "
|
||||||
|
+ "For example, decimal values will be written in Parquet's fixed-length byte array format which other systems such as Apache Hive and Apache Impala use. "
|
||||||
|
+ "If false, the newer format in Parquet will be used. For example, decimals will be written in int-based format.");
|
||||||
|
|
||||||
|
public static final ConfigProperty<String> PARQUET_OUTPUT_TIMESTAMP_TYPE = ConfigProperty
|
||||||
|
.key("hoodie.parquet.outputTimestampType")
|
||||||
|
.defaultValue("TIMESTAMP_MILLIS")
|
||||||
|
.withDocumentation("Sets spark.sql.parquet.outputTimestampType. Parquet timestamp type to use when Spark writes data to Parquet files.");
|
||||||
|
|
||||||
public static final ConfigProperty<String> HFILE_COMPRESSION_ALGORITHM_NAME = ConfigProperty
|
public static final ConfigProperty<String> HFILE_COMPRESSION_ALGORITHM_NAME = ConfigProperty
|
||||||
.key("hoodie.hfile.compression.algorithm")
|
.key("hoodie.hfile.compression.algorithm")
|
||||||
.defaultValue("GZ")
|
.defaultValue("GZ")
|
||||||
@@ -312,6 +324,16 @@ public class HoodieStorageConfig extends HoodieConfig {
|
|||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public Builder parquetWriteLegacyFormat(String parquetWriteLegacyFormat) {
|
||||||
|
storageConfig.setValue(PARQUET_WRITE_LEGACY_FORMAT_ENABLED, parquetWriteLegacyFormat);
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Builder parquetOutputTimestampType(String parquetOutputTimestampType) {
|
||||||
|
storageConfig.setValue(PARQUET_OUTPUT_TIMESTAMP_TYPE, parquetOutputTimestampType);
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
public Builder hfileCompressionAlgorithm(String hfileCompressionAlgorithm) {
|
public Builder hfileCompressionAlgorithm(String hfileCompressionAlgorithm) {
|
||||||
storageConfig.setValue(HFILE_COMPRESSION_ALGORITHM_NAME, hfileCompressionAlgorithm);
|
storageConfig.setValue(HFILE_COMPRESSION_ALGORITHM_NAME, hfileCompressionAlgorithm);
|
||||||
return this;
|
return this;
|
||||||
@@ -347,5 +369,4 @@ public class HoodieStorageConfig extends HoodieConfig {
|
|||||||
return storageConfig;
|
return storageConfig;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1439,6 +1439,14 @@ public class HoodieWriteConfig extends HoodieConfig {
|
|||||||
return getBoolean(HoodieStorageConfig.PARQUET_DICTIONARY_ENABLED);
|
return getBoolean(HoodieStorageConfig.PARQUET_DICTIONARY_ENABLED);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String parquetWriteLegacyFormatEnabled() {
|
||||||
|
return getString(HoodieStorageConfig.PARQUET_WRITE_LEGACY_FORMAT_ENABLED);
|
||||||
|
}
|
||||||
|
|
||||||
|
public String parquetOutputTimestampType() {
|
||||||
|
return getString(HoodieStorageConfig.PARQUET_OUTPUT_TIMESTAMP_TYPE);
|
||||||
|
}
|
||||||
|
|
||||||
public long getLogFileMaxSize() {
|
public long getLogFileMaxSize() {
|
||||||
return getLong(HoodieStorageConfig.LOGFILE_MAX_SIZE);
|
return getLong(HoodieStorageConfig.LOGFILE_MAX_SIZE);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -66,7 +66,7 @@ public class HoodieInternalRowFileWriterFactory {
|
|||||||
writeConfig.getDynamicBloomFilterMaxNumEntries(),
|
writeConfig.getDynamicBloomFilterMaxNumEntries(),
|
||||||
writeConfig.getBloomFilterType());
|
writeConfig.getBloomFilterType());
|
||||||
HoodieRowParquetWriteSupport writeSupport =
|
HoodieRowParquetWriteSupport writeSupport =
|
||||||
new HoodieRowParquetWriteSupport(table.getHadoopConf(), structType, filter);
|
new HoodieRowParquetWriteSupport(table.getHadoopConf(), structType, filter, writeConfig);
|
||||||
return new HoodieInternalRowParquetWriter(
|
return new HoodieInternalRowParquetWriter(
|
||||||
path, new HoodieRowParquetConfig(
|
path, new HoodieRowParquetConfig(
|
||||||
writeSupport,
|
writeSupport,
|
||||||
@@ -91,7 +91,7 @@ public class HoodieInternalRowFileWriterFactory {
|
|||||||
Path path, HoodieWriteConfig writeConfig, StructType structType, HoodieTable table)
|
Path path, HoodieWriteConfig writeConfig, StructType structType, HoodieTable table)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
HoodieRowParquetWriteSupport writeSupport =
|
HoodieRowParquetWriteSupport writeSupport =
|
||||||
new HoodieRowParquetWriteSupport(table.getHadoopConf(), structType, null);
|
new HoodieRowParquetWriteSupport(table.getHadoopConf(), structType, null, writeConfig);
|
||||||
return new HoodieInternalRowParquetWriter(
|
return new HoodieInternalRowParquetWriter(
|
||||||
path, new HoodieRowParquetConfig(
|
path, new HoodieRowParquetConfig(
|
||||||
writeSupport,
|
writeSupport,
|
||||||
|
|||||||
@@ -21,6 +21,7 @@ package org.apache.hudi.io.storage.row;
|
|||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hudi.common.bloom.BloomFilter;
|
import org.apache.hudi.common.bloom.BloomFilter;
|
||||||
import org.apache.hudi.common.bloom.HoodieDynamicBoundedBloomFilter;
|
import org.apache.hudi.common.bloom.HoodieDynamicBoundedBloomFilter;
|
||||||
|
import org.apache.hudi.config.HoodieWriteConfig;
|
||||||
import org.apache.parquet.hadoop.api.WriteSupport;
|
import org.apache.parquet.hadoop.api.WriteSupport;
|
||||||
import org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport;
|
import org.apache.spark.sql.execution.datasources.parquet.ParquetWriteSupport;
|
||||||
import org.apache.spark.sql.types.StructType;
|
import org.apache.spark.sql.types.StructType;
|
||||||
@@ -42,11 +43,11 @@ public class HoodieRowParquetWriteSupport extends ParquetWriteSupport {
|
|||||||
private String minRecordKey;
|
private String minRecordKey;
|
||||||
private String maxRecordKey;
|
private String maxRecordKey;
|
||||||
|
|
||||||
public HoodieRowParquetWriteSupport(Configuration conf, StructType structType, BloomFilter bloomFilter) {
|
public HoodieRowParquetWriteSupport(Configuration conf, StructType structType, BloomFilter bloomFilter, HoodieWriteConfig writeConfig) {
|
||||||
super();
|
super();
|
||||||
Configuration hadoopConf = new Configuration(conf);
|
Configuration hadoopConf = new Configuration(conf);
|
||||||
hadoopConf.set("spark.sql.parquet.writeLegacyFormat", "false");
|
hadoopConf.set("spark.sql.parquet.writeLegacyFormat", writeConfig.parquetWriteLegacyFormatEnabled());
|
||||||
hadoopConf.set("spark.sql.parquet.outputTimestampType", "TIMESTAMP_MILLIS");
|
hadoopConf.set("spark.sql.parquet.outputTimestampType", writeConfig.parquetOutputTimestampType());
|
||||||
this.hadoopConf = hadoopConf;
|
this.hadoopConf = hadoopConf;
|
||||||
setSchema(structType, hadoopConf);
|
setSchema(structType, hadoopConf);
|
||||||
this.bloomFilter = bloomFilter;
|
this.bloomFilter = bloomFilter;
|
||||||
|
|||||||
@@ -18,14 +18,14 @@
|
|||||||
|
|
||||||
package org.apache.hudi.io.storage.row;
|
package org.apache.hudi.io.storage.row;
|
||||||
|
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
import org.apache.hudi.common.bloom.BloomFilter;
|
import org.apache.hudi.common.bloom.BloomFilter;
|
||||||
import org.apache.hudi.common.bloom.BloomFilterFactory;
|
import org.apache.hudi.common.bloom.BloomFilterFactory;
|
||||||
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
|
import org.apache.hudi.common.testutils.HoodieTestDataGenerator;
|
||||||
|
import org.apache.hudi.config.HoodieStorageConfig;
|
||||||
import org.apache.hudi.config.HoodieWriteConfig;
|
import org.apache.hudi.config.HoodieWriteConfig;
|
||||||
import org.apache.hudi.testutils.HoodieClientTestHarness;
|
import org.apache.hudi.testutils.HoodieClientTestHarness;
|
||||||
|
|
||||||
import org.apache.hadoop.conf.Configuration;
|
|
||||||
import org.apache.hadoop.fs.Path;
|
|
||||||
import org.apache.hudi.testutils.SparkDatasetTestUtils;
|
import org.apache.hudi.testutils.SparkDatasetTestUtils;
|
||||||
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
|
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
@@ -33,7 +33,8 @@ import org.apache.spark.sql.Row;
|
|||||||
import org.apache.spark.sql.catalyst.InternalRow;
|
import org.apache.spark.sql.catalyst.InternalRow;
|
||||||
import org.junit.jupiter.api.AfterEach;
|
import org.junit.jupiter.api.AfterEach;
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.params.ParameterizedTest;
|
||||||
|
import org.junit.jupiter.params.provider.ValueSource;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Random;
|
import java.util.Random;
|
||||||
@@ -62,12 +63,14 @@ public class TestHoodieInternalRowParquetWriter extends HoodieClientTestHarness
|
|||||||
cleanupResources();
|
cleanupResources();
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@ParameterizedTest
|
||||||
public void endToEndTest() throws Exception {
|
@ValueSource(booleans = {true, false})
|
||||||
HoodieWriteConfig cfg = SparkDatasetTestUtils.getConfigBuilder(basePath).build();
|
public void endToEndTest(boolean parquetWriteLegacyFormatEnabled) throws Exception {
|
||||||
|
HoodieWriteConfig.Builder writeConfigBuilder = SparkDatasetTestUtils.getConfigBuilder(basePath);
|
||||||
for (int i = 0; i < 5; i++) {
|
for (int i = 0; i < 5; i++) {
|
||||||
// init write support and parquet config
|
// init write support and parquet config
|
||||||
HoodieRowParquetWriteSupport writeSupport = getWriteSupport(cfg, hadoopConf);
|
HoodieRowParquetWriteSupport writeSupport = getWriteSupport(writeConfigBuilder, hadoopConf, parquetWriteLegacyFormatEnabled);
|
||||||
|
HoodieWriteConfig cfg = writeConfigBuilder.build();
|
||||||
HoodieRowParquetConfig parquetConfig = new HoodieRowParquetConfig(writeSupport,
|
HoodieRowParquetConfig parquetConfig = new HoodieRowParquetConfig(writeSupport,
|
||||||
CompressionCodecName.SNAPPY, cfg.getParquetBlockSize(), cfg.getParquetPageSize(), cfg.getParquetMaxFileSize(),
|
CompressionCodecName.SNAPPY, cfg.getParquetBlockSize(), cfg.getParquetPageSize(), cfg.getParquetMaxFileSize(),
|
||||||
writeSupport.getHadoopConf(), cfg.getParquetCompressionRatio());
|
writeSupport.getHadoopConf(), cfg.getParquetCompressionRatio());
|
||||||
@@ -101,12 +104,14 @@ public class TestHoodieInternalRowParquetWriter extends HoodieClientTestHarness
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private HoodieRowParquetWriteSupport getWriteSupport(HoodieWriteConfig writeConfig, Configuration hadoopConf) {
|
private HoodieRowParquetWriteSupport getWriteSupport(HoodieWriteConfig.Builder writeConfigBuilder, Configuration hadoopConf, boolean parquetWriteLegacyFormatEnabled) {
|
||||||
|
writeConfigBuilder.withStorageConfig(HoodieStorageConfig.newBuilder().parquetWriteLegacyFormat(String.valueOf(parquetWriteLegacyFormatEnabled)).build());
|
||||||
|
HoodieWriteConfig writeConfig = writeConfigBuilder.build();
|
||||||
BloomFilter filter = BloomFilterFactory.createBloomFilter(
|
BloomFilter filter = BloomFilterFactory.createBloomFilter(
|
||||||
writeConfig.getBloomFilterNumEntries(),
|
writeConfig.getBloomFilterNumEntries(),
|
||||||
writeConfig.getBloomFilterFPP(),
|
writeConfig.getBloomFilterFPP(),
|
||||||
writeConfig.getDynamicBloomFilterMaxNumEntries(),
|
writeConfig.getDynamicBloomFilterMaxNumEntries(),
|
||||||
writeConfig.getBloomFilterType());
|
writeConfig.getBloomFilterType());
|
||||||
return new HoodieRowParquetWriteSupport(hadoopConf, SparkDatasetTestUtils.STRUCT_TYPE, filter);
|
return new HoodieRowParquetWriteSupport(hadoopConf, SparkDatasetTestUtils.STRUCT_TYPE, filter, writeConfig);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user