[HUDI-1932] Update Hive sync timestamp when change detected (#3053)
* Update Hive sync timestamp when change detected

  Only update the last commit timestamp on the Hive table when the table schema has changed or a partition is created/updated. When using AWS Glue Data Catalog as the metastore for Hive, this ensures that table versions are substantive (reflecting schema and/or partition changes). Prior to this change, when a Hive sync was performed without schema or partition changes, the table in the Glue Data Catalog would get a new version published whose only change was the timestamp property.

  https://issues.apache.org/jira/browse/HUDI-1932

* add conditional sync flag

* fix testSyncWithoutDiffs

* fix HiveSyncConfig

Co-authored-by: Raymond Xu <2701446+xushiyan@users.noreply.github.com>
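To illustrate the new behavior, here is a minimal sketch of driving the sync programmatically with the flag enabled. The HiveSyncTool(config, hiveConf, fs) constructor and syncHoodieTable() call are the ones exercised in TestHiveSyncTool below; the database, table, and path values are placeholders.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hudi.hive.HiveSyncConfig;
import org.apache.hudi.hive.HiveSyncTool;

public class ConditionalSyncExample {
  public static void main(String[] args) throws Exception {
    HiveSyncConfig cfg = new HiveSyncConfig();
    cfg.databaseName = "default";                 // placeholder
    cfg.tableName = "my_hudi_table";              // placeholder
    cfg.basePath = "hdfs:///tmp/my_hudi_table";   // placeholder
    // New in this commit: when true, the last-commit-time table property
    // (and hence a new Glue Data Catalog table version) is only written
    // if the schema or the partitions actually changed.
    cfg.isConditionalSync = true;

    FileSystem fs = FileSystem.get(new Configuration());
    new HiveSyncTool(cfg, new HiveConf(), fs).syncHoodieTable();
  }
}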
HiveSyncConfig.java
@@ -120,6 +120,9 @@ public class HiveSyncConfig implements Serializable {
   @Parameter(names = {"--with-operation-field"}, description = "Whether to include the '_hoodie_operation' field in the metadata fields")
   public Boolean withOperationField = false;
 
+  @Parameter(names = {"--conditional-sync"}, description = "If true, only sync on conditions like schema change or partition change.")
+  public Boolean isConditionalSync = false;
+
   // enhance the similar function in child class
   public static HiveSyncConfig copy(HiveSyncConfig cfg) {
     HiveSyncConfig newConfig = new HiveSyncConfig();
@@ -143,6 +146,7 @@ public class HiveSyncConfig implements Serializable {
     newConfig.syncAsSparkDataSourceTable = cfg.syncAsSparkDataSourceTable;
     newConfig.sparkSchemaLengthThreshold = cfg.sparkSchemaLengthThreshold;
     newConfig.withOperationField = cfg.withOperationField;
+    newConfig.isConditionalSync = cfg.isConditionalSync;
     return newConfig;
   }
 
@@ -174,6 +178,7 @@ public class HiveSyncConfig implements Serializable {
         + ", syncAsSparkDataSourceTable=" + syncAsSparkDataSourceTable
         + ", sparkSchemaLengthThreshold=" + sparkSchemaLengthThreshold
         + ", withOperationField=" + withOperationField
+        + ", isConditionalSync=" + isConditionalSync
         + '}';
   }
 }
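HiveSyncConfig is a JCommander-annotated bean, so the new --conditional-sync switch is picked up by ordinary argument parsing; since the field is a Boolean, JCommander treats it as an arity-0 switch. A sketch of the parsing, assuming the standard JCommander builder API and a plausible set of the config's other required arguments (user, pass, JDBC URL, etc., all shown with placeholder values):

import com.beust.jcommander.JCommander;
import org.apache.hudi.hive.HiveSyncConfig;

public class ParseConditionalSyncFlag {
  public static void main(String[] args) {
    HiveSyncConfig cfg = new HiveSyncConfig();
    JCommander.newBuilder().addObject(cfg).build().parse(
        "--database", "default",                    // placeholders for the
        "--table", "my_hudi_table",                 // config's required
        "--base-path", "hdfs:///tmp/my_hudi_table", // arguments
        "--user", "hive", "--pass", "hive",
        "--jdbc-url", "jdbc:hive2://localhost:10000",
        "--conditional-sync");                      // the new arity-0 switch
    System.out.println(cfg.isConditionalSync);      // true
  }
}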
HiveSyncTool.java
@@ -179,7 +179,7 @@ public class HiveSyncTool extends AbstractSyncTool {
       cfg.syncAsSparkDataSourceTable = false;
     }
     // Sync schema if needed
-    syncSchema(tableName, tableExists, useRealtimeInputFormat, readAsOptimized, schema);
+    boolean schemaChanged = syncSchema(tableName, tableExists, useRealtimeInputFormat, readAsOptimized, schema);
 
     LOG.info("Schema sync complete. Syncing partitions for " + tableName);
     // Get the last time we successfully synced partitions
@@ -192,8 +192,11 @@ public class HiveSyncTool extends AbstractSyncTool {
     LOG.info("Storage partitions scan complete. Found " + writtenPartitionsSince.size());
 
     // Sync the partitions if needed
-    syncPartitions(tableName, writtenPartitionsSince);
-    hoodieHiveClient.updateLastCommitTimeSynced(tableName);
+    boolean partitionsChanged = syncPartitions(tableName, writtenPartitionsSince);
+    boolean meetSyncConditions = schemaChanged || partitionsChanged;
+    if (!cfg.isConditionalSync || meetSyncConditions) {
+      hoodieHiveClient.updateLastCommitTimeSynced(tableName);
+    }
     LOG.info("Sync complete for " + tableName);
   }
 
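The hunk above gates the write of the last-commit-time property behind a single predicate. A hypothetical helper (not part of the commit) that makes the condition explicit:

// Not in the PR: isolates the condition used in syncHoodieTable() above.
// Unconditional mode always stamps the table; conditional mode stamps it
// only when the schema changed or a partition was added/updated.
static boolean shouldUpdateLastCommitTime(boolean isConditionalSync,
                                          boolean schemaChanged,
                                          boolean partitionsChanged) {
  return !isConditionalSync || schemaChanged || partitionsChanged;
}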
@@ -204,7 +207,7 @@ public class HiveSyncTool extends AbstractSyncTool {
    * @param tableExists - does table exist
    * @param schema - extracted schema
    */
-  private void syncSchema(String tableName, boolean tableExists, boolean useRealTimeInputFormat,
+  private boolean syncSchema(String tableName, boolean tableExists, boolean useRealTimeInputFormat,
                           boolean readAsOptimized, MessageType schema) {
     // Append spark table properties & serde properties
     Map<String, String> tableProperties = ConfigUtils.toMap(cfg.tableProperties);
@@ -215,6 +218,7 @@ public class HiveSyncTool extends AbstractSyncTool {
       tableProperties.putAll(sparkTableProperties);
       serdeProperties.putAll(sparkSerdeProperties);
     }
+    boolean schemaChanged = false;
     // Check and sync schema
     if (!tableExists) {
       LOG.info("Hive table " + tableName + " is not found. Creating it");
@@ -236,6 +240,7 @@ public class HiveSyncTool extends AbstractSyncTool {
       // /ql/exec/DDLTask.java#L3488
       hoodieHiveClient.createTable(tableName, schema, inputFormatClassName,
           outputFormatClassName, serDeFormatClassName, serdeProperties, tableProperties);
+      schemaChanged = true;
     } else {
       // Check if the table schema has evolved
       Map<String, String> tableSchema = hoodieHiveClient.getTableSchema(tableName);
@@ -248,10 +253,12 @@ public class HiveSyncTool extends AbstractSyncTool {
           hoodieHiveClient.updateTableProperties(tableName, tableProperties);
           LOG.info("Sync table properties for " + tableName + ", table properties is: " + cfg.tableProperties);
         }
+        schemaChanged = true;
       } else {
         LOG.info("No Schema difference for " + tableName);
       }
     }
+    return schemaChanged;
   }
 
   /**
@@ -324,7 +331,8 @@ public class HiveSyncTool extends AbstractSyncTool {
    * Syncs the list of storage partitions passed in (checks if the partition is in hive, if not adds it or if the
    * partition path does not match, it updates the partition path).
    */
-  private void syncPartitions(String tableName, List<String> writtenPartitionsSince) {
+  private boolean syncPartitions(String tableName, List<String> writtenPartitionsSince) {
+    boolean partitionsChanged;
     try {
       List<Partition> hivePartitions = hoodieHiveClient.scanTablePartitions(tableName);
       List<PartitionEvent> partitionEvents =
@@ -335,9 +343,11 @@ public class HiveSyncTool extends AbstractSyncTool {
       List<String> updatePartitions = filterPartitions(partitionEvents, PartitionEventType.UPDATE);
       LOG.info("Changed Partitions " + updatePartitions);
       hoodieHiveClient.updatePartitionsToTable(tableName, updatePartitions);
+      partitionsChanged = !updatePartitions.isEmpty() || !newPartitions.isEmpty();
     } catch (Exception e) {
       throw new HoodieHiveSyncException("Failed to sync partitions for table " + tableName, e);
     }
+    return partitionsChanged;
   }
 
   private List<String> filterPartitions(List<PartitionEvent> events, PartitionEventType eventType) {
TestHiveSyncTool.java
@@ -1017,4 +1017,35 @@ public class TestHiveSyncTool {
         .containsValue("BIGINT"), errorMsg);
     ddlExecutor.runSQL(dropTableSql);
   }
+
+  @ParameterizedTest
+  @MethodSource("syncMode")
+  public void testSyncWithoutDiffs(String syncMode) throws Exception {
+    hiveSyncConfig.syncMode = syncMode;
+    hiveSyncConfig.isConditionalSync = true;
+    HiveTestUtil.hiveSyncConfig.batchSyncNum = 2;
+    String tableName = HiveTestUtil.hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE;
+
+    String commitTime0 = "100";
+    String commitTime1 = "101";
+    String commitTime2 = "102";
+    HiveTestUtil.createMORTable(commitTime0, commitTime1, 2, true, true);
+
+    HoodieHiveClient hiveClient =
+        new HoodieHiveClient(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
+
+    HiveSyncTool tool = new HiveSyncTool(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
+    tool.syncHoodieTable();
+
+    assertTrue(hiveClient.doesTableExist(tableName));
+    assertEquals(commitTime1, hiveClient.getLastCommitTimeSynced(tableName).get());
+
+    HiveTestUtil.addMORPartitions(0, true, true, true, ZonedDateTime.now().plusDays(2), commitTime1, commitTime2);
+
+    tool = new HiveSyncTool(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
+    tool.syncHoodieTable();
+    hiveClient = new HoodieHiveClient(HiveTestUtil.hiveSyncConfig, HiveTestUtil.getHiveConf(), HiveTestUtil.fileSystem);
+    assertEquals(commitTime1, hiveClient.getLastCommitTimeSynced(tableName).get());
+  }
+
 }