From 473be87aa5d71939c2e8a367851b0e3b96744bc0 Mon Sep 17 00:00:00 2001 From: Rahil Chertara Date: Wed, 20 Jul 2022 17:04:00 -0700 Subject: [PATCH] Disable EmrFS file metadata caching and EMR Spark's data prefetcher feature --- .../src/main/scala/org/apache/hudi/DefaultSource.scala | 3 +++ 1 file changed, 3 insertions(+) diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DefaultSource.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DefaultSource.scala index 484debbb8..af8fb1b61 100644 --- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DefaultSource.scala +++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/DefaultSource.scala @@ -56,6 +56,9 @@ class DefaultSource extends RelationProvider // Enable "passPartitionByAsOptions" to support "write.partitionBy(...)" spark.conf.set("spark.sql.legacy.sources.write.passPartitionByAsOptions", "true") } + // Revisit EMR Spark and EMRFS incompatibilities; for now, disable the data prefetcher and EMRFS file metadata caching + spark.conf.set("spark.sql.dataPrefetch.enabled", "false") + spark.sparkContext.hadoopConfiguration.set("fs.s3.metadata.cache.expiration.seconds", "0") } private val log = LogManager.getLogger(classOf[DefaultSource])