From e9b7f71b80ec242c6529f4a6cb3171435debf3a7 Mon Sep 17 00:00:00 2001 From: Matt Richter Date: Thu, 9 Apr 2026 11:20:24 -0400 Subject: [PATCH 1/6] fix: use read lock for clone directory reuse check to unblock parallel plans MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When parallel_plan is enabled, all project goroutines call Clone() which acquires an exclusive write lock on the shared clone directory before checking if the repo is at the correct commit. Meanwhile, runSteps() holds a read lock on the same directory for plan execution. Since RWMutex.Lock() waits for all readers to release, each project's Clone() blocks until the previous project's plan completes — serializing parallel plans to effectively 1 at a time. Fix: add an optimistic fast path that checks the commit under a READ lock first (git rev-parse is read-only). If the directory exists and is at the correct commit, return immediately without ever acquiring the write lock. Only fall through to the write-lock path when the directory needs cloning or updating. This enables true parallel plan execution for the common case where the clone directory is already checked out at the right commit (e.g., autoplan on push, re-plan via comment). Fixes #5364 Relates to #1618 Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Matt Richter --- server/events/working_dir.go | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/server/events/working_dir.go b/server/events/working_dir.go index 65264a3491..ad7bd7515f 100644 --- a/server/events/working_dir.go +++ b/server/events/working_dir.go @@ -103,13 +103,26 @@ type FileWorkspace struct { // multiple dirs of the same repo without deleting existing plans. func (w *FileWorkspace) Clone(logger logging.SimpleLogging, headRepo models.Repo, p models.PullRequest, workspace string) (string, error) { cloneDir := w.cloneDir(p.BaseRepo, p, workspace) + c := wrappedGitContext{cloneDir, headRepo, p} - // Unconditionally wait for the clone lock here, if anyone else is doing any clone - // operation in this directory, we wait for it to finish before we check anything. + // Fast path: if the directory already exists and is at the right commit, + // verify under a READ lock so we don't block concurrent plan operations. + // isBranchAtTargetRef only runs "git rev-parse" which is read-only. + if _, err := os.Stat(cloneDir); err == nil { + gitReadUnlockFn := w.gitReadLock(cloneDir) + isUpToDate, err := w.isBranchAtTargetRef(logger, c, p.HeadCommit) + gitReadUnlockFn() + if err == nil && isUpToDate { + logger.Info("repo is at correct commit %q so will not re-clone (fast path)", p.HeadCommit) + return cloneDir, nil + } + } + + // Slow path: directory doesn't exist, is at wrong commit, or check failed. + // Acquire write lock for clone/update operations. gitWriteUnlockFn := w.gitWriteLock(cloneDir) defer gitWriteUnlockFn() - c := wrappedGitContext{cloneDir, headRepo, p} ok, err := w.attemptReuseCloneDir(logger, c, cloneDir) if ok && err == nil { return cloneDir, nil From 39875d361fbecaec93c392fb5ee37f474969d476 Mon Sep 17 00:00:00 2001 From: Matt Richter Date: Fri, 17 Apr 2026 08:49:09 -0400 Subject: [PATCH 2/6] fix: extend read-lock fast path to MergeAgain() Restructures MergeAgain() to check divergence first under read locks, only acquiring the write lock when a merge is actually needed. This mirrors the Clone() optimization in this PR and eliminates the second serialization point for parallel plans using merge checkout strategy. Changes: - MergeAgain() now calls recheckDiverged() first; write lock acquired only when divergence is detected and a merge must be performed. - recheckDiverged() now acquires its own gitRefLock + gitReadLock internally, making the locking contract explicit rather than relying on an implicit "caller holds the write lock" convention. - hasDivergedForPatterns() now continues on per-file match errors instead of returning true (avoids false positives from transient pattern errors forcing unnecessary merges). Signed-off-by: Matt Richter --- server/events/working_dir.go | 61 +++++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/server/events/working_dir.go b/server/events/working_dir.go index ad7bd7515f..b6b6c8d4bb 100644 --- a/server/events/working_dir.go +++ b/server/events/working_dir.go @@ -170,6 +170,10 @@ func (w *FileWorkspace) attemptReuseCloneDir(logger logging.SimpleLogging, c wra // MergeAgain merges again with upstream if we are using the merge checkout strategy, // and upstream has been modified since we last checked. // It returns a flag indicating whether we had to merge with upstream again. +// +// Locking strategy: recheckDiverged uses the read and ref lock to serialize git metadata +// operations (URL updates, fetch, status) without taking the repo write lock. +// The write lock is only acquired when we actually need to merge. func (w *FileWorkspace) MergeAgain( logger logging.SimpleLogging, headRepo models.Repo, @@ -181,28 +185,31 @@ func (w *FileWorkspace) MergeAgain( } cloneDir := w.cloneDir(p.BaseRepo, p, workspace) - // We atomically set the recheckRequiredMap flag here before grabbing the clone lock. - // If the flag is cleared after we grab the lock, it means some other thread - // did the necessary work late enough that we do not have to do it again. + // We atomically set the recheckRequiredMap flag here before grabbing any lock. + // If the flag is cleared after we grab the write lock, it means some other + // thread did the necessary work late enough that we do not have to do it again. recheckRequiredMap.Store(cloneDir, struct{}{}) - // Unconditionally wait for the clone lock here, if anyone else is doing any clone - // operation in this directory, we wait for it to finish before we check anything. + c := wrappedGitContext{cloneDir, headRepo, p} + + if !w.recheckDiverged(logger, p, headRepo, cloneDir) { + recheckRequiredMap.Delete(cloneDir) + return false, nil + } + + // Diverged — acquire the write lock to perform the merge, which mutates the + // working tree (git reset --hard + git merge). gitWriteUnlockFn := w.gitWriteLock(cloneDir) defer gitWriteUnlockFn() if _, exists := recheckRequiredMap.Load(cloneDir); !exists { - logger.Debug("Skipping upstream check. Some other thread has done this for us") + logger.Debug("Skipping upstream merge. Some other thread has done this for us") return false, nil } recheckRequiredMap.Delete(cloneDir) - c := wrappedGitContext{cloneDir, headRepo, p} - if w.recheckDiverged(logger, p, headRepo, cloneDir) { - logger.Info("base branch may have been updated, using merge strategy and will merge again") - return true, w.mergeAgain(logger, c) - } - return false, nil + logger.Info("base branch may have been updated, using merge strategy and will merge again") + return true, w.mergeAgain(logger, c) } // recheckDiverged returns true if the branch we're merging into has diverged @@ -212,16 +219,22 @@ func (w *FileWorkspace) MergeAgain( // and we have to perform a new merge. // If there are any errors we return true since we prefer to assume divergence // for safety. -// Locks are acquired by the caller. +// Acquires gitRefLock and gitReadLock internally to serialize git metadata operations. +// Does NOT require the caller to hold the repo read or write lock. func (w *FileWorkspace) recheckDiverged(logger logging.SimpleLogging, p models.PullRequest, headRepo models.Repo, cloneDir string) bool { if !w.CheckoutMerge { - // It only makes sense to warn that main has diverged if we're using - // the checkout merge strategy. If we're just checking out the branch, - // then it doesn't matter what's going on with main because we've - // decided to always run off the branch. return false } + // Hold the read and ref locks for the entire sequence: URL updates write .git/config, + // remote update writes .git/refs/remotes/, and hasDiverged does fetch+status. + // All of these touch shared git metadata and must be serialized with concurrent HasDiverged callers. + // Read lock is acquired before ref lock to avoid deadlock (#6409). + gitReadUnlockFn := w.gitReadLock(cloneDir) + defer gitReadUnlockFn() + unlockRef := w.gitRefLock(cloneDir) + defer unlockRef() + // Bring our remote refs up to date. // Reset the URL in case we are using github app credentials since these might have // expired and refreshed and the URL would now be different. @@ -250,9 +263,6 @@ func (w *FileWorkspace) recheckDiverged(logger logging.SimpleLogging, p models.P } } - // We already hold the write lock; take ref lock so fetch is serialized with other HasDiverged callers. - unlockRef := w.gitRefLock(cloneDir) - defer unlockRef() return w.hasDiverged(logger, cloneDir) } @@ -263,8 +273,8 @@ func (w *FileWorkspace) HasDiverged(logger logging.SimpleLogging, cloneDir strin return false } - // Hold ref lock and repo read lock for the full duration (fetch + status) so we don't race with - // clone/reset/merge. recheckDiverged does not take the read lock because it already holds the write lock. + // Hold repo read lock and ref lock for the full duration (fetch + status) so we don't race with + // clone/reset/merge. Read lock is acquired before ref lock to avoid deadlock (#6409). unlockGitReadLock := w.gitReadLock(cloneDir) defer unlockGitReadLock() @@ -278,8 +288,7 @@ func (w *FileWorkspace) HasDiverged(logger logging.SimpleLogging, cloneDir strin } // hasDiverged runs fetch and git status to detect divergence. Caller must hold -// gitRefLock(cloneDir); if not already holding the repo write lock (e.g. from recheckDiverged), -// caller must also hold gitReadLock(cloneDir). +// gitRefLock(cloneDir) and gitReadLock(cloneDir) (or the write lock). func (w *FileWorkspace) hasDiverged(logger logging.SimpleLogging, cloneDir string) bool { logger.Debug("HasDiverged: running git fetch in %s", cloneDir) cmd := exec.Command("git", "fetch") @@ -373,8 +382,8 @@ func (w *FileWorkspace) hasDivergedForPatterns(logger logging.SimpleLogging, clo for _, file := range nonEmptyChangedFiles { match, err := pm.MatchesOrParentMatches(file) if err != nil { - logger.Debug("HasDiverged: match error for file %q: %s, treating as diverged", file, err) - return true + logger.Debug("HasDiverged: match error for file %q: %s", file, err) + continue } if match { logger.Debug("HasDiverged: file %q matched patterns - branch has diverged", file) From 3743147d9e273df955a7ad90819ac45d540142d8 Mon Sep 17 00:00:00 2001 From: Matt Richter <4481324+matthewmrichter@users.noreply.github.com> Date: Fri, 24 Apr 2026 07:27:23 -0400 Subject: [PATCH 3/6] Update working_dir.go Co-authored-by: Ross Strickland Signed-off-by: Matt Richter <4481324+matthewmrichter@users.noreply.github.com> --- server/events/working_dir.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/server/events/working_dir.go b/server/events/working_dir.go index b6b6c8d4bb..ce0abc4c55 100644 --- a/server/events/working_dir.go +++ b/server/events/working_dir.go @@ -383,7 +383,8 @@ func (w *FileWorkspace) hasDivergedForPatterns(logger logging.SimpleLogging, clo match, err := pm.MatchesOrParentMatches(file) if err != nil { logger.Debug("HasDiverged: match error for file %q: %s", file, err) - continue + logger.Debug("HasDiverged: match error for file %q: %s, treating as diverged", file, err) + return true } if match { logger.Debug("HasDiverged: file %q matched patterns - branch has diverged", file) From 73e2b50bb7e4ba74634c6fab18f7c9bd38c57f40 Mon Sep 17 00:00:00 2001 From: Matt Richter <4481324+matthewmrichter@users.noreply.github.com> Date: Fri, 24 Apr 2026 12:27:52 -0400 Subject: [PATCH 4/6] Update working_dir.go Co-authored-by: Nicolas Vanheuverzwijn Signed-off-by: Matt Richter <4481324+matthewmrichter@users.noreply.github.com> --- server/events/working_dir.go | 1 - 1 file changed, 1 deletion(-) diff --git a/server/events/working_dir.go b/server/events/working_dir.go index ce0abc4c55..6fe80d37fd 100644 --- a/server/events/working_dir.go +++ b/server/events/working_dir.go @@ -382,7 +382,6 @@ func (w *FileWorkspace) hasDivergedForPatterns(logger logging.SimpleLogging, clo for _, file := range nonEmptyChangedFiles { match, err := pm.MatchesOrParentMatches(file) if err != nil { - logger.Debug("HasDiverged: match error for file %q: %s", file, err) logger.Debug("HasDiverged: match error for file %q: %s, treating as diverged", file, err) return true } From 1b8e26b7c94e043b9f1891639f7961e4fcb45278 Mon Sep 17 00:00:00 2001 From: Matt Richter <4481324+matthewmrichter@users.noreply.github.com> Date: Fri, 24 Apr 2026 12:28:51 -0400 Subject: [PATCH 5/6] Update working_dir.go Co-authored-by: Nicolas Vanheuverzwijn Signed-off-by: Matt Richter <4481324+matthewmrichter@users.noreply.github.com> --- server/events/working_dir.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/server/events/working_dir.go b/server/events/working_dir.go index 6fe80d37fd..e4a12f086f 100644 --- a/server/events/working_dir.go +++ b/server/events/working_dir.go @@ -223,6 +223,10 @@ func (w *FileWorkspace) MergeAgain( // Does NOT require the caller to hold the repo read or write lock. func (w *FileWorkspace) recheckDiverged(logger logging.SimpleLogging, p models.PullRequest, headRepo models.Repo, cloneDir string) bool { if !w.CheckoutMerge { + // It only makes sense to warn that main has diverged if we're using + // the checkout merge strategy. If we're just checking out the branch, + // then it doesn't matter what's going on with main because we've + // decided to always run off the branch. return false } From 3474bb7daaeb15c1120bb056288d4a758ed5c332 Mon Sep 17 00:00:00 2001 From: Matt Richter <4481324+matthewmrichter@users.noreply.github.com> Date: Fri, 24 Apr 2026 15:43:17 -0400 Subject: [PATCH 6/6] Update working_dir.go Co-authored-by: Nicolas Vanheuverzwijn Signed-off-by: Matt Richter <4481324+matthewmrichter@users.noreply.github.com> --- server/events/working_dir.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/server/events/working_dir.go b/server/events/working_dir.go index e4a12f086f..5c1fdf94f2 100644 --- a/server/events/working_dir.go +++ b/server/events/working_dir.go @@ -223,10 +223,10 @@ func (w *FileWorkspace) MergeAgain( // Does NOT require the caller to hold the repo read or write lock. func (w *FileWorkspace) recheckDiverged(logger logging.SimpleLogging, p models.PullRequest, headRepo models.Repo, cloneDir string) bool { if !w.CheckoutMerge { - // It only makes sense to warn that main has diverged if we're using - // the checkout merge strategy. If we're just checking out the branch, - // then it doesn't matter what's going on with main because we've - // decided to always run off the branch. + // It only makes sense to warn that main has diverged if we're using + // the checkout merge strategy. If we're just checking out the branch, + // then it doesn't matter what's going on with main because we've + // decided to always run off the branch. return false }