Skip to content

Commit b633c68

Browse files
committed
ucx: meet at barrier before disconnecting, not after
Call pmix_fence while we still have connectivity to our peers, because once we disconnect, the fence may never complete.
1 parent 800b047 commit b633c68

File tree

1 file changed

+8
-2
lines changed

1 file changed

+8
-2
lines changed

opal/mca/common/ucx/common_ucx.c

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -522,9 +522,15 @@ OPAL_DECLSPEC int opal_common_ucx_del_procs(opal_common_ucx_del_proc_t *procs, s
522522
size_t my_rank, size_t max_disconnect,
523523
ucp_worker_h worker)
524524
{
525-
opal_common_ucx_del_procs_nofence(procs, count, my_rank, max_disconnect, worker);
525+
/* fence while we are still connected to our peers */
526+
int rc = opal_common_ucx_mca_pmix_fence(worker);
527+
if (OPAL_SUCCESS != rc) {
528+
MCA_COMMON_UCX_ERROR("pmix fence failed during ucx proc disconnection: %d", rc);
529+
return rc;
530+
}
526531

527-
return opal_common_ucx_mca_pmix_fence(worker);
532+
/* now that everyone is at the barrier, they are free to go their separate ways */
533+
return opal_common_ucx_del_procs_nofence(procs, count, my_rank, max_disconnect, worker);
528534
}
529535

530536
static void safety_valve(void) __opal_attribute_destructor__;

0 commit comments

Comments
 (0)