From fd953ce753add2d3f38e3f1f00505dd815539eec Mon Sep 17 00:00:00 2001
From: mvdbeek
Date: Thu, 26 Feb 2026 17:18:38 +0100
Subject: [PATCH] Fix race condition in job recovery causing external_id lookup failure

The ManagerMonitor thread was starting before active jobs had their
external IDs recovered from disk. This caused status checks to fail
with "Failed to obtain external_id for job_id" errors on startup.

Observed on usegalaxy.be with job 655740:
- Monitor thread checked job status at 17:10:56,273
- MainThread recovered external_id (292) at 17:10:56,278
- The 5ms gap caused the status check to fail

Fix: Reorder startup so __recover_jobs() runs before
__setup_bind_to_message_queue(), ensuring external IDs are loaded
before the monitor thread begins polling.
---
 pulsar/core.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pulsar/core.py b/pulsar/core.py
index c53dd3f2..6baf2c4e 100644
--- a/pulsar/core.py
+++ b/pulsar/core.py
@@ -51,8 +51,8 @@ def __init__(self, **conf):
         self.__setup_user_auth_manager(conf)
         self.__setup_managers(conf)
         self.__setup_file_cache(conf)
-        self.__setup_bind_to_message_queue(conf)
         self.__recover_jobs()
+        self.__setup_bind_to_message_queue(conf)
         self.ensure_cleanup = conf.get("ensure_cleanup", False)
 
     def shutdown(self, timeout=None):