From 2367fb2536305011331660497885ad5d431557b6 Mon Sep 17 00:00:00 2001
From: Marvin Charles
Date: Tue, 21 Apr 2026 15:45:44 +0300
Subject: [PATCH 1/2] fix: deadlock between Task.Size and redrawProgress under parallel install
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

#558 parallelised hermit install, which lets many goroutines drive
per-package download progress concurrently. cache/http.go calls
task.Size(total) followed by task.Add(n) for each chunk received; with a
single active task the cycle below could never close, but with many it
closes in two hops.

Lock orderings:

  Task.Size:         task.lock -> ui.lock     (via UI.swapSize)
  UI.redrawProgress: ui.lock   -> task_i.lock (via liveOperations / op.status)

Goroutine A inside task_A.Size holds task_A.lock waiting on ui.lock.
Goroutine B, having just returned from its own task.Add's critical section,
enters redrawProgress holding ui.lock and blocks on task_A.lock while
iterating over the operations.

The deadlock is silent: because the UI itself is blocked, no progress output
is produced before the job is killed. This matches the symptom reported in
the wild of 'hermit install' stalling with zero output after v0.52.0.

Fix: release task.lock in Task.Size before calling UI.swapSize. The
oldSize → newSize delta is additive and commutative, so applying swapSize
outside the task's critical section loses nothing.

The regression test fans out goroutines, each interleaving Size/Add on its
own task (mirroring cache/http.go), and on deadlock fails explicitly with a
goroutine dump rather than hanging until the default test timeout.
---
 ui/task.go      | 10 ++++++--
 ui/task_test.go | 61 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+), 2 deletions(-)
 create mode 100644 ui/task_test.go

diff --git a/ui/task.go b/ui/task.go
index 5258c0bb2..7adddc6ee 100644
--- a/ui/task.go
+++ b/ui/task.go
@@ -42,13 +42,19 @@ func (o *Task) status() (progress int, size int, started bool) {
 
 // Size sets the size of the Task.
 func (o *Task) Size(n int) *Task {
+	// Lock ordering: update o.size under o.lock, then call swapSize separately.
+	// Callers of UI.redrawProgress acquire ui.lock first and then re-enter each
+	// task's lock via op.status(); if we held o.lock while calling swapSize
+	// (which takes ui.lock), two goroutines updating different tasks would
+	// deadlock — task_A.lock -> ui.lock vs. ui.lock -> task_A.lock.
 	o.lock.Lock()
-	defer o.lock.Unlock()
-	o.w.swapSize(o.size, n)
+	oldSize := o.size
 	o.size = n
 	if o.progress > o.size {
 		o.progress = o.size
 	}
+	o.lock.Unlock()
+	o.w.swapSize(oldSize, n)
 	return o
 }
 
diff --git a/ui/task_test.go b/ui/task_test.go
new file mode 100644
index 000000000..bab78d53f
--- /dev/null
+++ b/ui/task_test.go
@@ -0,0 +1,61 @@
+package ui
+
+import (
+	"runtime"
+	"sync"
+	"testing"
+	"time"
+)
+
+// TestConcurrentTaskSizeAndAddNoDeadlock is a regression test for a deadlock
+// surfaced when hermit install was parallelised in #558.
+//
+// Lock-ordering audit:
+//
+//	Task.Size:         task.lock -> ui.lock     (via swapSize)
+//	UI.redrawProgress: ui.lock   -> task_i.lock (via liveOperations / op.status)
+//
+// These orderings are inconsistent. With serial installs only one task is
+// actively updating at any moment, so the cycle never closes. Under parallel
+// installs, many goroutines drive per-package download progress concurrently,
+// each calling Task.Size followed by Task.Add for every chunk received
+// (cache/http.go), and the cycle closes: goroutine A holds task_A.lock and
+// waits on ui.lock; goroutine B holds ui.lock (inside redrawProgress after its
+// own Add) and blocks on task_A.lock while iterating over the operations.
+//
+// The test races Size and Add across many tasks and fails loudly on deadlock
+// rather than hanging until the default 10-minute Go test timeout.
+func TestConcurrentTaskSizeAndAddNoDeadlock(t *testing.T) {
+	ui, _ := NewForTesting()
+	const tasks = 8
+	const iterations = 200
+
+	var wg sync.WaitGroup
+	for i := 0; i < tasks; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			task := ui.Progress("task", 1)
+			// Mirror cache/http.go: Size() then Add() per chunk.
+			for j := 1; j <= iterations; j++ {
+				task.Size(j)
+				task.Add(1)
+			}
+			task.Done()
+		}()
+	}
+
+	done := make(chan struct{})
+	go func() {
+		wg.Wait()
+		close(done)
+	}()
+
+	select {
+	case <-done:
+	case <-time.After(10 * time.Second):
+		buf := make([]byte, 1<<20)
+		n := runtime.Stack(buf, true)
+		t.Fatalf("deadlock: concurrent Task.Size/Task.Add did not complete within 10s\n\ngoroutines:\n%s", buf[:n])
+	}
+}

From 45fe13f49e055c512cf7b6c05febe645e2d884de Mon Sep 17 00:00:00 2001
From: Josh Friend
Date: Tue, 21 Apr 2026 12:26:45 -0400
Subject: [PATCH 2/2] fix: use integer range loop to satisfy intrange linter

---
 ui/task_test.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ui/task_test.go b/ui/task_test.go
index bab78d53f..d2456f70c 100644
--- a/ui/task_test.go
+++ b/ui/task_test.go
@@ -31,7 +31,7 @@ func TestConcurrentTaskSizeAndAddNoDeadlock(t *testing.T) {
 	const iterations = 200
 
 	var wg sync.WaitGroup
-	for i := 0; i < tasks; i++ {
+	for range tasks {
 		wg.Add(1)
 		go func() {
 			defer wg.Done()