Skip to content

Commit 112b4f8

Browse files
committed
btl/uct: complete re-work of the BTL
This commit is large and contains the following changes: - Disconnect the connection memory domain from the communication domain. This allows any memory domain to be used for connections. The default is to use tcp but it can be disabled which will allow UD and others to be used. - Move tl attributes off of the tl context structure. In theory tl attributes do not differ between contexts so query them once when the tl is created not once per context. This removes the need to allocate the first context so that code has also been removed. - Change mca_btl_uct_tl_t uct_dev_contexts member to be an array. The btl always allocates the maximum number of contexts. This is not a significant amount of memory. Rather than reduce it to be based on the configured maximum number of contexts it makes sense to just make it an array and remove the extra indirection when accessing the contexts. - Do not call mca_btl_uct_endpoint_set_flag before sending a message on the connection endpoint. This method may cause the release of the connection endpoint (cached on the BTL endpoint). If this happens it would lead to a SEGV. - Flush the endpoint only when it is being released. There is no need to do so on every send. Releasing the endpoint without flushing it may lead to it being destroyed while still processing data. - Downgrade endpoint lock from recursive. Recursive locks are not needed for the endpoint lock. - Move the async context from the module to the tl. There is no real benefit from sharing the async context between tls. Given this and some other changes that will be made it makes sense to move it from the module to the tl. - Connection TLs are only used to form connections for connect-to-endpoint TLs. They do not need to belong to the same memory domain as the one they are used with so there is no need to rely on a BTL module. Moved the pending_connection_reqs to the tl and changed the code to support a NULL module for the connection tl. 
- Put active tls in a list on the mca_btl_uct_md_t structure. This simplifies the code a bit by moving mca_btl_uct_tl_t ownership to the mca_btl_uct_md_t class. - There is an issue with btl/uct which prevents the usage of the standard btl_uct_ MCA variables (eager limit, flags, etc). Because of the way the btl was written these values are all determined directly from UCT and cannot be changed using the MCA variable interface. To address this issue this commit breaks apart the initialization code and separates out the pieces that are necessary for discovery only. The discovery pieces now use a new set of variables that include the memory domain name and directly control the behavior for BTLs on that memory domain as well as enabling the usage of the btl_uct variable to control the defaults for these variables. For example, using memory domain irdma0 will create variables: btl_uct_irdma0_eager_limit, btl_uct_irdma0_max_send_size, etc. The defaults will be based on what is reported by UCT and the user can set the values to a subset of what UCT reports. For example, if the max send size for the hardware is 8192B then it can be set to anything up to and including that value. The same is true for feature flags: if the hardware supports only some btl atomics or operations the user can specify a subset of them (others will be ignored). - Move device context code to a new file. There is a specific header for device contexts so it makes sense to move the context-specific code to a matching C file. No changes in this other than moving code around. - Use uct_ep_am_short_iov for short messages. The uct_ep_am_short_iov method should allow for faster short messages than uct_ep_am_short (which can only take a single buffer). This commit moves btl/uct to the newer method which breaks compatibility with some versions of UCT. Since we already no longer support those versions this change is safe. Signed-off-by: Nathan Hjelm <hjelmn@google.com>
1 parent 3a2e908 commit 112b4f8

19 files changed

+1669
-956
lines changed

opal/mca/btl/uct/Makefile.am

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
# Copyright (c) 2017 IBM Corporation. All rights reserved.
1414
# Copyright (c) 2017-2018 Los Alamos National Security, LLC. All rights
1515
# reserved.
16+
# Copyright (c) 2025 Google, LLC. All rights reserved.
1617
# $COPYRIGHT$
1718
#
1819
# Additional copyrights may follow
@@ -24,22 +25,31 @@ AM_CPPFLAGS = $(btl_uct_CPPFLAGS)
2425

2526
amca_paramdir = $(AMCA_PARAM_SETS_DIR)
2627

27-
sources = \
28+
headers = \
2829
btl_uct.h \
30+
btl_uct_rdma.h \
31+
btl_uct_endpoint.h \
32+
btl_uct_am.h \
33+
btl_uct_frag.h \
34+
btl_uct_types.h \
35+
btl_uct_device_context.h \
36+
btl_uct_discover.h \
37+
btl_uct_modex.h \
38+
btl_uct_include_list.h
39+
40+
sources = \
2941
btl_uct_module.c \
3042
btl_uct_component.c \
31-
btl_uct_rdma.h \
3243
btl_uct_rdma.c \
33-
btl_uct_endpoint.h \
3444
btl_uct_endpoint.c \
3545
btl_uct_amo.c \
36-
btl_uct_am.h \
3746
btl_uct_am.c \
38-
btl_uct_frag.h \
3947
btl_uct_frag.c \
4048
btl_uct_tl.c \
41-
btl_uct_types.h \
42-
btl_uct_device_context.h
49+
btl_uct_discover.c \
50+
btl_uct_modex.c \
51+
btl_uct_include_list.c \
52+
btl_uct_device_context.c
4353

4454
# Make the output library in this directory, and name it either
4555
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
@@ -50,20 +60,22 @@ lib =
5060
lib_sources =
5161
component = mca_btl_uct.la
5262
component_sources = $(sources)
63+
component_headers = $(headers)
5364
else
5465
lib = libmca_btl_uct.la
5566
lib_sources = $(sources)
67+
lib_headers = ${headers}
5668
component =
5769
component_sources =
5870
endif
5971

6072
mcacomponentdir = $(opallibdir)
6173
mcacomponent_LTLIBRARIES = $(component)
62-
mca_btl_uct_la_SOURCES = $(component_sources)
74+
mca_btl_uct_la_SOURCES = $(component_sources) $(component_headers)
6375
mca_btl_uct_la_LDFLAGS = -module -avoid-version $(btl_uct_LDFLAGS)
6476
mca_btl_uct_la_LIBADD = $(top_builddir)/opal/lib@OPAL_LIB_NAME@.la $(btl_uct_LIBS)
6577

6678
noinst_LTLIBRARIES = $(lib)
67-
libmca_btl_uct_la_SOURCES = $(lib_sources)
79+
libmca_btl_uct_la_SOURCES = $(lib_sources) $(lib_headers)
6880
libmca_btl_uct_la_LDFLAGS = -module -avoid-version $(btl_uct_LDFLAGS)
6981
libmca_btl_uct_la_LIBADD = $(btl_uct_LIBS)

opal/mca/btl/uct/btl_uct.h

Lines changed: 40 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,9 @@ struct mca_btl_uct_module_t {
6464
/** base BTL interface */
6565
mca_btl_base_module_t super;
6666

67+
/** module index in the component module array */
68+
int module_index;
69+
6770
/** whether the module has been fully initialized or not */
6871
bool initialized;
6972

@@ -76,31 +79,15 @@ struct mca_btl_uct_module_t {
7679
/** mutex to protect the module */
7780
opal_recursive_mutex_t lock;
7881

79-
/** async context */
80-
ucs_async_context_t *ucs_async;
81-
8282
/** transport for active messaging */
8383
mca_btl_uct_tl_t *am_tl;
8484

8585
/** transport for RDMA/AMOs */
8686
mca_btl_uct_tl_t *rdma_tl;
8787

88-
/** transport for forming connections (if needed) */
89-
mca_btl_uct_tl_t *conn_tl;
90-
91-
/** array containing the am_tl and rdma_tl */
92-
mca_btl_uct_tl_t *comm_tls[2];
93-
94-
#if UCT_API >= UCT_VERSION(1, 7)
95-
uct_component_h uct_component;
96-
#endif
97-
9888
/** registration cache */
9989
mca_rcache_base_module_t *rcache;
10090

101-
/** name of the memory domain backing this module */
102-
char *md_name;
103-
10491
/** am and rdma share endpoints */
10592
bool shared_endpoints;
10693

@@ -119,8 +106,9 @@ struct mca_btl_uct_module_t {
119106
/** frags that were waiting on connections that are now ready to send */
120107
opal_list_t pending_frags;
121108

122-
/** pending connection requests */
123-
opal_fifo_t pending_connection_reqs;
109+
/** allowed transports */
110+
char *allowed_transports;
111+
mca_btl_uct_include_list_t allowed_transport_list;
124112
};
125113
typedef struct mca_btl_uct_module_t mca_btl_uct_module_t;
126114

@@ -133,6 +121,9 @@ struct mca_btl_uct_component_t {
133121
/** base BTL component */
134122
mca_btl_base_component_3_0_0_t super;
135123

124+
/** whether the component is initialized. controls cleanup. */
125+
bool initialized;
126+
136127
/** number of TL modules */
137128
int module_count;
138129

@@ -141,10 +132,15 @@ struct mca_btl_uct_component_t {
141132

142133
/** allowed UCT memory domains */
143134
char *memory_domains;
135+
mca_btl_uct_include_list_t memory_domain_list;
144136

145137
/** allowed transports */
146138
char *allowed_transports;
147139

140+
/** transports to consider for forming connections */
141+
char *connection_domains;
142+
mca_btl_uct_include_list_t connection_domain_list;
143+
148144
/** number of worker contexts to create */
149145
int num_contexts_per_module;
150146

@@ -158,6 +154,17 @@ struct mca_btl_uct_component_t {
158154

159155
/** connection retry timeout */
160156
unsigned int connection_retry_timeout;
157+
158+
#if UCT_API >= UCT_VERSION(1, 7)
159+
uct_component_h *uct_components;
160+
unsigned num_uct_components;
161+
#endif
162+
163+
/** list of memory domains (btl_uct_md_t) */
164+
opal_list_t md_list;
165+
166+
/** connection transport (if needed). reference is owned by conn_md */
167+
mca_btl_uct_tl_t *conn_tl;
161168
};
162169
typedef struct mca_btl_uct_component_t mca_btl_uct_component_t;
163170

@@ -293,19 +300,24 @@ ucs_status_t mca_btl_uct_am_handler(void *arg, void *data, size_t length, unsign
293300
struct mca_btl_base_endpoint_t *mca_btl_uct_get_ep(struct mca_btl_base_module_t *module,
294301
opal_proc_t *proc);
295302

296-
int mca_btl_uct_query_tls(mca_btl_uct_module_t *module, mca_btl_uct_md_t *md,
297-
uct_tl_resource_desc_t *tl_descs, unsigned tl_count);
303+
int mca_btl_uct_populate_tls(mca_btl_uct_md_t *md, uct_tl_resource_desc_t *tl_descs, unsigned tl_count);
298304
int mca_btl_uct_process_connection_request(mca_btl_uct_module_t *module,
299305
mca_btl_uct_conn_req_t *req);
300306

307+
mca_btl_uct_module_t *mca_btl_uct_alloc_module(mca_btl_uct_md_t *md,
308+
size_t registration_size);
309+
310+
int mca_btl_uct_evaluate_tl(mca_btl_uct_module_t *module, mca_btl_uct_tl_t *tl);
311+
int mca_btl_uct_enable_tl_conn(mca_btl_uct_tl_t *tl);
312+
301313
/**
302314
* @brief Checks if a tl is suitable for using for RDMA
303315
*
304316
* @param[in] tl btl/uct tl pointer
305317
*/
306318
static inline bool mca_btl_uct_tl_supports_rdma(mca_btl_uct_tl_t *tl)
307319
{
308-
return (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags
320+
return (tl->uct_iface_attr.cap.flags
309321
& (UCT_IFACE_FLAG_PUT_ZCOPY | UCT_IFACE_FLAG_GET_ZCOPY))
310322
== (UCT_IFACE_FLAG_PUT_ZCOPY | UCT_IFACE_FLAG_GET_ZCOPY);
311323
}
@@ -315,7 +327,7 @@ static inline bool mca_btl_uct_tl_supports_rdma(mca_btl_uct_tl_t *tl)
315327
*/
316328
static inline bool mca_btl_uct_tl_support_am(mca_btl_uct_tl_t *tl)
317329
{
318-
return (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags
330+
return (tl->uct_iface_attr.cap.flags
319331
& (UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_AM_BCOPY | UCT_IFACE_FLAG_AM_ZCOPY));
320332
}
321333

@@ -326,7 +338,7 @@ static inline bool mca_btl_uct_tl_support_am(mca_btl_uct_tl_t *tl)
326338
*/
327339
static inline bool mca_btl_uct_tl_supports_conn(mca_btl_uct_tl_t *tl)
328340
{
329-
return (MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags
341+
return (tl->uct_iface_attr.cap.flags
330342
& (UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_CONNECT_TO_IFACE))
331343
== (UCT_IFACE_FLAG_AM_SHORT | UCT_IFACE_FLAG_CONNECT_TO_IFACE);
332344
}
@@ -338,7 +350,11 @@ static inline bool mca_btl_uct_tl_supports_conn(mca_btl_uct_tl_t *tl)
338350
*/
339351
static inline bool mca_btl_uct_tl_requires_connection_tl(mca_btl_uct_tl_t *tl)
340352
{
341-
return !(MCA_BTL_UCT_TL_ATTR(tl, 0).cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE);
353+
if (NULL == tl) {
354+
return false;
355+
}
356+
357+
return !(tl->uct_iface_attr.cap.flags & UCT_IFACE_FLAG_CONNECT_TO_IFACE);
342358
}
343359

344360
END_C_DECLS

0 commit comments

Comments
 (0)