1
0

[HUDI-1771] Propagate CDC format for hoodie (#3285)

This commit is contained in:
swuferhong
2021-08-10 20:23:23 +08:00
committed by GitHub
parent b4441abcf7
commit 21db6d7a84
50 changed files with 1081 additions and 199 deletions

View File

@@ -71,7 +71,7 @@ public class HoodieDLAClient extends AbstractSyncHoodieClient {
public HoodieDLAClient(DLASyncConfig syncConfig, FileSystem fs) {
super(syncConfig.basePath, syncConfig.assumeDatePartitioning, syncConfig.useFileListingFromMetadata,
syncConfig.verifyMetadataFileListing, fs);
syncConfig.verifyMetadataFileListing, false, fs);
this.dlaConfig = syncConfig;
try {
this.partitionValueExtractor =

View File

@@ -120,6 +120,9 @@ public class HiveSyncConfig implements Serializable {
@Parameter(names = {"--spark-schema-length-threshold"}, description = "The maximum length allowed in a single cell when storing additional schema information in Hive's metastore.")
public int sparkSchemaLengthThreshold = 4000;
@Parameter(names = {"--with-operation-field"}, description = "Whether to include the '_hoodie_operation' field in the metadata fields")
public Boolean withOperationField = false;
// enhance the similar function in child class
public static HiveSyncConfig copy(HiveSyncConfig cfg) {
HiveSyncConfig newConfig = new HiveSyncConfig();
@@ -143,6 +146,7 @@ public class HiveSyncConfig implements Serializable {
newConfig.batchSyncNum = cfg.batchSyncNum;
newConfig.syncAsSparkDataSourceTable = cfg.syncAsSparkDataSourceTable;
newConfig.sparkSchemaLengthThreshold = cfg.sparkSchemaLengthThreshold;
newConfig.withOperationField = cfg.withOperationField;
return newConfig;
}
@@ -174,6 +178,7 @@ public class HiveSyncConfig implements Serializable {
+ ", createManagedTable=" + createManagedTable
+ ", syncAsSparkDataSourceTable=" + syncAsSparkDataSourceTable
+ ", sparkSchemaLengthThreshold=" + sparkSchemaLengthThreshold
+ ", withOperationField=" + withOperationField
+ '}';
}
}

View File

@@ -62,7 +62,7 @@ public class HoodieHiveClient extends AbstractSyncHoodieClient {
private final HiveSyncConfig syncConfig;
public HoodieHiveClient(HiveSyncConfig cfg, HiveConf configuration, FileSystem fs) {
super(cfg.basePath, cfg.assumeDatePartitioning, cfg.useFileListingFromMetadata, cfg.verifyMetadataFileListing, fs);
super(cfg.basePath, cfg.assumeDatePartitioning, cfg.useFileListingFromMetadata, cfg.verifyMetadataFileListing, cfg.withOperationField, fs);
this.syncConfig = cfg;
// Support JDBC, HiveQL and metastore based implementations for backwards compatibility. Future users should

View File

@@ -51,19 +51,21 @@ public abstract class AbstractSyncHoodieClient {
protected final HoodieTableMetaClient metaClient;
protected final HoodieTableType tableType;
protected final FileSystem fs;
private String basePath;
private boolean assumeDatePartitioning;
private boolean useFileListingFromMetadata;
private boolean verifyMetadataFileListing;
private final String basePath;
private final boolean assumeDatePartitioning;
private final boolean useFileListingFromMetadata;
private final boolean verifyMetadataFileListing;
private final boolean withOperationField;
public AbstractSyncHoodieClient(String basePath, boolean assumeDatePartitioning, boolean useFileListingFromMetadata,
boolean verifyMetadataFileListing, FileSystem fs) {
boolean verifyMetadataFileListing, boolean withOperationField, FileSystem fs) {
this.metaClient = HoodieTableMetaClient.builder().setConf(fs.getConf()).setBasePath(basePath).setLoadActiveTimelineOnLoad(true).build();
this.tableType = metaClient.getTableType();
this.basePath = basePath;
this.assumeDatePartitioning = assumeDatePartitioning;
this.useFileListingFromMetadata = useFileListingFromMetadata;
this.verifyMetadataFileListing = verifyMetadataFileListing;
this.withOperationField = withOperationField;
this.fs = fs;
}
@@ -139,7 +141,11 @@ public abstract class AbstractSyncHoodieClient {
*/
public MessageType getDataSchema() {
try {
return new TableSchemaResolver(metaClient).getTableParquetSchema();
if (withOperationField) {
return new TableSchemaResolver(metaClient, true).getTableParquetSchema();
} else {
return new TableSchemaResolver(metaClient).getTableParquetSchema();
}
} catch (Exception e) {
throw new HoodieSyncException("Failed to read data schema", e);
}