[HUDI-4038] Avoid calling getDataSize after every record written (#5497)
- getDataSize has non-trivial overhead in the current ParquetWriter implementation: it traverses the column groups already composed in memory. Instead, we can sample calls to getDataSize to amortize their cost.

Co-authored-by: sivabalan <n.siva.b@gmail.com>
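For context, the sampling idea is to cache the last observed data size and refresh it only once every N records, rather than calling getDataSize after each write. The following is a minimal Scala sketch of that pattern, not the actual patch: the class name, `sampleInterval`, and `maxFileSize` are illustrative, and the real change (see the diff below and the linked PR) lives inside Hudi's writer hierarchy rather than in a standalone wrapper.

import org.apache.avro.generic.IndexedRecord
import org.apache.parquet.hadoop.ParquetWriter

// Illustrative wrapper: amortizes getDataSize by sampling it every
// `sampleInterval` records instead of on every canWrite check.
class SampledSizeParquetWriter[R <: IndexedRecord](
    writer: ParquetWriter[R],
    maxFileSize: Long,
    sampleInterval: Long = 1000L) {

  private var writtenRecords: Long = 0L
  private var lastCachedDataSize: Long = -1L

  def canWrite: Boolean = {
    // getDataSize traverses the already-composed column groups in memory,
    // so call it only on the first check and then once per interval.
    if (lastCachedDataSize < 0 || writtenRecords % sampleInterval == 0) {
      lastCachedDataSize = writer.getDataSize
    }
    lastCachedDataSize < maxFileSize
  }

  def write(record: R): Unit = {
    writer.write(record)
    writtenRecords += 1
  }
}

The trade-off is that the file can overshoot maxFileSize by up to the bytes written during one interval, which is acceptable here since size checks were only ever approximate to begin with.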
@@ -28,7 +28,7 @@ import org.apache.hudi.common.bloom.{BloomFilter, BloomFilterFactory}
 import org.apache.hudi.common.model.{HoodieFileFormat, HoodieRecord}
 import org.apache.hudi.common.util.BaseFileUtils
 import org.apache.hudi.config.{HoodieIndexConfig, HoodieStorageConfig}
-import org.apache.hudi.io.storage.{HoodieAvroParquetConfig, HoodieParquetWriter}
+import org.apache.hudi.io.storage.{HoodieAvroParquetConfig, HoodieAvroParquetWriter}
 import org.apache.parquet.avro.AvroSchemaConverter
 import org.apache.parquet.hadoop.metadata.CompressionCodecName
 import org.apache.spark.sql.{DataFrame, SQLContext}
@@ -50,8 +50,7 @@ object SparkHelpers {
     // Add current classLoad for config, if not will throw classNotFound of 'HoodieWrapperFileSystem'.
     parquetConfig.getHadoopConf().setClassLoader(Thread.currentThread.getContextClassLoader)
 
-    val writer = new HoodieParquetWriter[HoodieJsonPayload, IndexedRecord](instantTime, destinationFile, parquetConfig, schema, new SparkTaskContextSupplier(),
-      true)
+    val writer = new HoodieAvroParquetWriter[IndexedRecord](destinationFile, parquetConfig, instantTime, new SparkTaskContextSupplier(), true)
     for (rec <- sourceRecords) {
       val key: String = rec.get(HoodieRecord.RECORD_KEY_METADATA_FIELD).toString
       if (!keysToSkip.contains(key)) {