1
0

[HUDI-2610] pass the spark version when syncing the table created by spark (#4758)

* [HUDI-2610] pass the spark version when syncing the table created by spark

* [MINOR] sync spark version in DataSourceUtils#buildHiveSyncConfig
This commit is contained in:
Yann Byron
2022-02-10 23:35:28 +08:00
committed by GitHub
parent 1c778590d1
commit 2fe7a3a41f
4 changed files with 15 additions and 0 deletions

View File

@@ -129,6 +129,9 @@ public class HiveSyncConfig implements Serializable {
@Parameter(names = {"--conditional-sync"}, description = "If true, only sync on conditions like schema change or partition change.")
public Boolean isConditionalSync = false;
@Parameter(names = {"--spark-version"}, description = "The spark version", required = false)
public String sparkVersion;
// enhance the similar function in child class
public static HiveSyncConfig copy(HiveSyncConfig cfg) {
HiveSyncConfig newConfig = new HiveSyncConfig();
@@ -155,6 +158,7 @@ public class HiveSyncConfig implements Serializable {
newConfig.sparkSchemaLengthThreshold = cfg.sparkSchemaLengthThreshold;
newConfig.withOperationField = cfg.withOperationField;
newConfig.isConditionalSync = cfg.isConditionalSync;
newConfig.sparkVersion = cfg.sparkVersion;
return newConfig;
}

View File

@@ -27,6 +27,7 @@ import org.apache.hudi.common.fs.FSUtils;
import org.apache.hudi.common.model.HoodieFileFormat;
import org.apache.hudi.common.model.HoodieTableType;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.StringUtils;
import org.apache.hudi.exception.HoodieException;
import org.apache.hudi.exception.InvalidTableException;
import org.apache.hudi.hadoop.utils.HoodieInputFormatUtils;
@@ -302,6 +303,9 @@ public class HiveSyncTool extends AbstractSyncTool {
Map<String, String> sparkProperties = new HashMap<>();
sparkProperties.put("spark.sql.sources.provider", "hudi");
if (!StringUtils.isNullOrEmpty(cfg.sparkVersion)) {
sparkProperties.put("spark.sql.create.version", cfg.sparkVersion);
}
// Split the schema string to multi-parts according the schemaLengthThreshold size.
String schemaString = Parquet2SparkSchemaUtils.convertToSparkSchemaJson(reOrderedType);
int numSchemaPart = (schemaString.length() + schemaLengthThreshold - 1) / schemaLengthThreshold;