From d8c4e3d56c88ab6c9d6898a711d8d317bdcf98a0 Mon Sep 17 00:00:00 2001 From: "Jeffery (Zeyu) Zhao" Date: Mon, 18 Aug 2025 13:07:10 +0800 Subject: [PATCH] =?UTF-8?q?Fix=20rate=20limiter=20head=E2=80=91of=E2=80=91?= =?UTF-8?q?line=20blocking=20deadlock?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/rate_limiter.cc | 47 ++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/src/rate_limiter.cc b/src/rate_limiter.cc index e89c94cc2..47917fbd2 100644 --- a/src/rate_limiter.cc +++ b/src/rate_limiter.cc @@ -578,10 +578,51 @@ RateLimiter::AttemptAllocation() { std::lock_guard lk(staged_instances_mtx_); if (!staged_instances_.empty()) { - ModelInstanceContext* instance = staged_instances_.top(); - if (resource_manager_->AllocateResources(instance)) { + // Greedy allocation: Try to allocate ANY instance that has sufficient + // resources, not just the top priority one. This prevents deadlocks + // when BLS calls require resources and the child model stucks due to + // head‑of‑line blocking + + // First, try the top instance (highest priority) + ModelInstanceContext* top_instance = staged_instances_.top(); + if (resource_manager_->AllocateResources(top_instance)) { staged_instances_.pop(); - instance->Allocate(); + top_instance->Allocate(); + // After allocating one instance, recursively try to allocate more + // in case resources were freed up + AttemptAllocation(); + return; + } + + // If top instance can't be allocated, try other staged instances + // to find one that can be allocated (greedy approach) + std::vector temp_instances; + temp_instances.push_back(top_instance); + staged_instances_.pop(); + + bool allocated = false; + while (!staged_instances_.empty() && !allocated) { + ModelInstanceContext* instance = staged_instances_.top(); + staged_instances_.pop(); + + if (resource_manager_->AllocateResources(instance)) { + // Found an instance that can be allocated + instance->Allocate(); + allocated = true; + } else { + // Can't allocate this instance, save it for later + temp_instances.push_back(instance); + } + } + + // Restore unallocated instances back to the priority queue + for (auto* instance : temp_instances) { + staged_instances_.push(instance); + } + + // If we allocated an instance, try again in case more can be allocated + if (allocated) { + AttemptAllocation(); } } }