[HUDI-3029] Transaction manager: avoid deadlock when doing begin and end transactions (#4363)
* [HUDI-3029] Transaction manager: avoid deadlock when doing begin and end transactions - Transaction manager has begin and end transactions as synchronized methods. Depending on the lock provider implementation, this can lead to a deadlock when the underlying lock() calls are blocking or have a long timeout. - Fix the transaction manager's begin and end transactions so that they cannot deadlock and do not assume anything about the lock provider implementation.
This commit is contained in:
committed by
GitHub
parent
47852446e8
commit
d1d48ed494
@@ -201,7 +201,7 @@ public abstract class AbstractHoodieWriteClient<T extends HoodieRecordPayload, I
|
||||
} catch (IOException e) {
|
||||
throw new HoodieCommitException("Failed to complete commit " + config.getBasePath() + " at time " + instantTime, e);
|
||||
} finally {
|
||||
this.txnManager.endTransaction();
|
||||
this.txnManager.endTransaction(Option.of(inflightInstant));
|
||||
}
|
||||
// do this outside of lock since compaction, clustering can be time taking and we don't need a lock for the entire execution period
|
||||
runTableServicesInline(table, metadata, extraMetadata);
|
||||
@@ -1063,13 +1063,14 @@ public abstract class AbstractHoodieWriteClient<T extends HoodieRecordPayload, I
|
||||
public Option<String> scheduleTableService(String instantTime, Option<Map<String, String>> extraMetadata,
|
||||
TableServiceType tableServiceType) {
|
||||
// A lock is required to guard against race conditions between an on-going writer and scheduling a table service.
|
||||
final Option<HoodieInstant> inflightInstant = Option.of(new HoodieInstant(HoodieInstant.State.REQUESTED,
|
||||
tableServiceType.getAction(), instantTime));
|
||||
try {
|
||||
this.txnManager.beginTransaction(Option.of(new HoodieInstant(HoodieInstant.State.REQUESTED,
|
||||
tableServiceType.getAction(), instantTime)), Option.empty());
|
||||
this.txnManager.beginTransaction(inflightInstant, Option.empty());
|
||||
LOG.info("Scheduling table service " + tableServiceType);
|
||||
return scheduleTableServiceInternal(instantTime, extraMetadata, tableServiceType);
|
||||
} finally {
|
||||
this.txnManager.endTransaction();
|
||||
this.txnManager.endTransaction(inflightInstant);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -35,49 +35,64 @@ import java.io.Serializable;
|
||||
public class TransactionManager implements Serializable {
|
||||
|
||||
private static final Logger LOG = LogManager.getLogger(TransactionManager.class);
|
||||
|
||||
private final LockManager lockManager;
|
||||
private Option<HoodieInstant> currentTxnOwnerInstant;
|
||||
private Option<HoodieInstant> lastCompletedTxnOwnerInstant;
|
||||
private boolean supportsOptimisticConcurrency;
|
||||
private final boolean isOptimisticConcurrencyControlEnabled;
|
||||
private Option<HoodieInstant> currentTxnOwnerInstant = Option.empty();
|
||||
private Option<HoodieInstant> lastCompletedTxnOwnerInstant = Option.empty();
|
||||
|
||||
public TransactionManager(HoodieWriteConfig config, FileSystem fs) {
|
||||
this.lockManager = new LockManager(config, fs);
|
||||
this.supportsOptimisticConcurrency = config.getWriteConcurrencyMode().supportsOptimisticConcurrencyControl();
|
||||
this.isOptimisticConcurrencyControlEnabled = config.getWriteConcurrencyMode().supportsOptimisticConcurrencyControl();
|
||||
}
|
||||
|
||||
public synchronized void beginTransaction() {
|
||||
if (supportsOptimisticConcurrency) {
|
||||
public void beginTransaction() {
|
||||
if (isOptimisticConcurrencyControlEnabled) {
|
||||
LOG.info("Transaction starting without a transaction owner");
|
||||
lockManager.lock();
|
||||
LOG.info("Transaction started");
|
||||
LOG.info("Transaction started without a transaction owner");
|
||||
}
|
||||
}
|
||||
|
||||
public synchronized void beginTransaction(Option<HoodieInstant> currentTxnOwnerInstant, Option<HoodieInstant> lastCompletedTxnOwnerInstant) {
|
||||
if (supportsOptimisticConcurrency) {
|
||||
this.lastCompletedTxnOwnerInstant = lastCompletedTxnOwnerInstant;
|
||||
lockManager.setLatestCompletedWriteInstant(lastCompletedTxnOwnerInstant);
|
||||
LOG.info("Latest completed transaction instant " + lastCompletedTxnOwnerInstant);
|
||||
this.currentTxnOwnerInstant = currentTxnOwnerInstant;
|
||||
LOG.info("Transaction starting with transaction owner " + currentTxnOwnerInstant);
|
||||
public void beginTransaction(Option<HoodieInstant> newTxnOwnerInstant,
|
||||
Option<HoodieInstant> lastCompletedTxnOwnerInstant) {
|
||||
if (isOptimisticConcurrencyControlEnabled) {
|
||||
LOG.info("Transaction starting for " + newTxnOwnerInstant
|
||||
+ " with latest completed transaction instant " + lastCompletedTxnOwnerInstant);
|
||||
lockManager.lock();
|
||||
LOG.info("Transaction started");
|
||||
reset(currentTxnOwnerInstant, newTxnOwnerInstant, lastCompletedTxnOwnerInstant);
|
||||
LOG.info("Transaction started for " + newTxnOwnerInstant
|
||||
+ " with latest completed transaction instant " + lastCompletedTxnOwnerInstant);
|
||||
}
|
||||
}
|
||||
|
||||
public synchronized void endTransaction() {
|
||||
if (supportsOptimisticConcurrency) {
|
||||
LOG.info("Transaction ending with transaction owner " + currentTxnOwnerInstant);
|
||||
public void endTransaction() {
|
||||
if (isOptimisticConcurrencyControlEnabled) {
|
||||
LOG.info("Transaction ending without a transaction owner");
|
||||
lockManager.unlock();
|
||||
LOG.info("Transaction ended");
|
||||
this.lastCompletedTxnOwnerInstant = Option.empty();
|
||||
lockManager.resetLatestCompletedWriteInstant();
|
||||
LOG.info("Transaction ended without a transaction owner");
|
||||
}
|
||||
}
|
||||
|
||||
public void endTransaction(Option<HoodieInstant> currentTxnOwnerInstant) {
|
||||
if (isOptimisticConcurrencyControlEnabled) {
|
||||
LOG.info("Transaction ending with transaction owner " + currentTxnOwnerInstant);
|
||||
reset(currentTxnOwnerInstant, Option.empty(), Option.empty());
|
||||
lockManager.unlock();
|
||||
LOG.info("Transaction ended with transaction owner " + currentTxnOwnerInstant);
|
||||
}
|
||||
}
|
||||
|
||||
private synchronized void reset(Option<HoodieInstant> callerInstant,
|
||||
Option<HoodieInstant> newTxnOwnerInstant,
|
||||
Option<HoodieInstant> lastCompletedTxnOwnerInstant) {
|
||||
if (!this.currentTxnOwnerInstant.isPresent() || this.currentTxnOwnerInstant == callerInstant) {
|
||||
this.currentTxnOwnerInstant = newTxnOwnerInstant;
|
||||
this.lastCompletedTxnOwnerInstant = lastCompletedTxnOwnerInstant;
|
||||
}
|
||||
}
|
||||
|
||||
public void close() {
|
||||
if (supportsOptimisticConcurrency) {
|
||||
if (isOptimisticConcurrencyControlEnabled) {
|
||||
lockManager.close();
|
||||
LOG.info("Transaction manager closed");
|
||||
}
|
||||
|
||||
@@ -111,7 +111,7 @@ public class InProcessLockProvider implements LockProvider<ReentrantReadWriteLoc
|
||||
}
|
||||
|
||||
private String getLogMessage(LockState state) {
|
||||
return StringUtils.join(String.valueOf(Thread.currentThread().getId()),
|
||||
state.name(), " local process lock.");
|
||||
return StringUtils.join("Thread ", String.valueOf(Thread.currentThread().getName()), " ",
|
||||
state.name(), " in-process lock.");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -20,13 +20,10 @@ package org.apache.hudi.client.transaction.lock;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicReference;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hudi.common.config.LockConfiguration;
|
||||
import org.apache.hudi.common.config.SerializableConfiguration;
|
||||
import org.apache.hudi.common.lock.LockProvider;
|
||||
import org.apache.hudi.common.table.timeline.HoodieInstant;
|
||||
import org.apache.hudi.common.util.Option;
|
||||
import org.apache.hudi.common.util.ReflectionUtils;
|
||||
import org.apache.hudi.config.HoodieWriteConfig;
|
||||
import org.apache.hudi.exception.HoodieLockException;
|
||||
@@ -46,11 +43,8 @@ public class LockManager implements Serializable, AutoCloseable {
|
||||
private final LockConfiguration lockConfiguration;
|
||||
private final SerializableConfiguration hadoopConf;
|
||||
private volatile LockProvider lockProvider;
|
||||
// Holds the latest completed write instant to know which ones to check conflict against
|
||||
private final AtomicReference<Option<HoodieInstant>> latestCompletedWriteInstant;
|
||||
|
||||
public LockManager(HoodieWriteConfig writeConfig, FileSystem fs) {
|
||||
this.latestCompletedWriteInstant = new AtomicReference<>(Option.empty());
|
||||
this.writeConfig = writeConfig;
|
||||
this.hadoopConf = new SerializableConfiguration(fs.getConf());
|
||||
this.lockConfiguration = new LockConfiguration(writeConfig.getProps());
|
||||
@@ -100,22 +94,6 @@ public class LockManager implements Serializable, AutoCloseable {
|
||||
return lockProvider;
|
||||
}
|
||||
|
||||
public void setLatestCompletedWriteInstant(Option<HoodieInstant> instant) {
|
||||
this.latestCompletedWriteInstant.set(instant);
|
||||
}
|
||||
|
||||
public void compareAndSetLatestCompletedWriteInstant(Option<HoodieInstant> expected, Option<HoodieInstant> newValue) {
|
||||
this.latestCompletedWriteInstant.compareAndSet(expected, newValue);
|
||||
}
|
||||
|
||||
public AtomicReference<Option<HoodieInstant>> getLatestCompletedWriteInstant() {
|
||||
return latestCompletedWriteInstant;
|
||||
}
|
||||
|
||||
public void resetLatestCompletedWriteInstant() {
|
||||
this.latestCompletedWriteInstant.set(Option.empty());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
closeQuietly();
|
||||
|
||||
@@ -217,7 +217,7 @@ public class CleanActionExecutor<T extends HoodieRecordPayload, I, K, O> extends
|
||||
throw new HoodieIOException("Failed to clean up after commit", e);
|
||||
} finally {
|
||||
if (!skipLocking) {
|
||||
this.txnManager.endTransaction();
|
||||
this.txnManager.endTransaction(Option.empty());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -147,14 +147,16 @@ public abstract class BaseCommitActionExecutor<T extends HoodieRecordPayload, I,
|
||||
}
|
||||
|
||||
protected void autoCommit(Option<Map<String, String>> extraMetadata, HoodieWriteMetadata<O> result) {
|
||||
this.txnManager.beginTransaction(Option.of(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, instantTime)),
|
||||
final Option<HoodieInstant> inflightInstant = Option.of(new HoodieInstant(State.INFLIGHT,
|
||||
HoodieTimeline.COMMIT_ACTION, instantTime));
|
||||
this.txnManager.beginTransaction(inflightInstant,
|
||||
lastCompletedTxn.isPresent() ? Option.of(lastCompletedTxn.get().getLeft()) : Option.empty());
|
||||
try {
|
||||
TransactionUtils.resolveWriteConflictIfAny(table, this.txnManager.getCurrentTransactionOwner(),
|
||||
result.getCommitMetadata(), config, this.txnManager.getLastCompletedTransactionOwner());
|
||||
commit(extraMetadata, result);
|
||||
} finally {
|
||||
this.txnManager.endTransaction();
|
||||
this.txnManager.endTransaction(inflightInstant);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -112,7 +112,7 @@ public abstract class BaseRestoreActionExecutor<T extends HoodieRecordPayload, I
|
||||
this.txnManager.beginTransaction(Option.empty(), Option.empty());
|
||||
writeTableMetadata(restoreMetadata);
|
||||
} finally {
|
||||
this.txnManager.endTransaction();
|
||||
this.txnManager.endTransaction(Option.empty());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -266,7 +266,7 @@ public abstract class BaseRollbackActionExecutor<T extends HoodieRecordPayload,
|
||||
throw new HoodieIOException("Error executing rollback at instant " + instantTime, e);
|
||||
} finally {
|
||||
if (!skipLocking) {
|
||||
this.txnManager.endTransaction();
|
||||
this.txnManager.endTransaction(Option.empty());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user