Skip to content

Commit 2c9b68d

Browse files
authored
Merge pull request #13462 from bwbarrett/backports/v5.0.x-13415-mtl-btl-domain-share
Backport MTL/BTL Fabric and Domain sharing patches
2 parents 2c92164 + 4f118d6 commit 2c9b68d

File tree

5 files changed

+261
-22
lines changed

5 files changed

+261
-22
lines changed

ompi/mca/mtl/ofi/mtl_ofi_component.c

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -694,6 +694,8 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
694694
}
695695

696696
hints->domain_attr->resource_mgmt = FI_RM_ENABLED;
697+
hints->domain_attr->domain = opal_common_ofi.domain;
698+
hints->fabric_attr->fabric = opal_common_ofi.fabric;
697699

698700
/**
699701
* The EFA provider in Libfabric versions prior to 1.10 contains a bug
@@ -715,10 +717,16 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
715717
hints_dup->fabric_attr->prov_name = strdup("efa");
716718

717719
ret = fi_getinfo(fi_primary_version, NULL, NULL, 0ULL, hints_dup, &providers);
720+
if (FI_ENODATA == -ret && (hints_dup->fabric_attr->fabric || hints_dup->domain_attr->domain)) {
721+
/* Retry without fabric and domain */
722+
hints_dup->fabric_attr->fabric = NULL;
723+
hints_dup->domain_attr->domain = NULL;
724+
ret = fi_getinfo(fi_primary_version, NULL, NULL, 0ULL, hints_dup, &providers);
725+
}
718726
if (FI_ENOSYS == -ret) {
719727
/* libfabric is not new enough, fallback to use older version of API */
720728
ret = fi_getinfo(fi_alternate_version, NULL, NULL, 0ULL, hints_dup, &providers);
721-
}
729+
}
722730

723731
opal_output_verbose(1, opal_common_ofi.output,
724732
"%s:%d: EFA specific fi_getinfo(): %s\n",
@@ -756,6 +764,11 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
756764
0ULL, /* Optional flag */
757765
hints, /* In: Hints to filter providers */
758766
&providers); /* Out: List of matching providers */
767+
if (FI_ENODATA == -ret && (hints->fabric_attr->fabric || hints->domain_attr->domain)) {
768+
hints->fabric_attr->fabric = NULL;
769+
hints->domain_attr->domain = NULL;
770+
ret = fi_getinfo(fi_primary_version, NULL, NULL, 0ULL, hints, &providers);
771+
}
759772
if (FI_ENOSYS == -ret) {
760773
ret = fi_getinfo(fi_alternate_version, NULL, NULL, 0ULL, hints, &providers);
761774
}
@@ -972,9 +985,8 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
972985
* instantiate the virtual or physical network. This opens a "fabric
973986
* provider". See man fi_fabric for details.
974987
*/
975-
ret = fi_fabric(prov->fabric_attr, /* In: Fabric attributes */
976-
&ompi_mtl_ofi.fabric, /* Out: Fabric handle */
977-
NULL); /* Optional context for fabric events */
988+
ret = opal_common_ofi_fi_fabric(prov->fabric_attr, /* In: Fabric attributes */
989+
&ompi_mtl_ofi.fabric); /* Out: Fabric handle */
978990
if (0 != ret) {
979991
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
980992
"fi_fabric",
@@ -988,10 +1000,9 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
9881000
* hardware port/collection of ports. Returns a domain object that can be
9891001
* used to create endpoints. See man fi_domain for details.
9901002
*/
991-
ret = fi_domain(ompi_mtl_ofi.fabric, /* In: Fabric object */
992-
prov, /* In: Provider */
993-
&ompi_mtl_ofi.domain, /* Out: Domain object */
994-
NULL); /* Optional context for domain events */
1003+
ret = opal_common_ofi_fi_domain(ompi_mtl_ofi.fabric, /* In: Fabric object */
1004+
prov, /* In: Provider */
1005+
&ompi_mtl_ofi.domain); /* Out: Domain object */
9951006
if (0 != ret) {
9961007
opal_show_help("help-mtl-ofi.txt", "OFI call fail", true,
9971008
"fi_domain",
@@ -1158,10 +1169,10 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
11581169
(void) fi_close((fid_t)ompi_mtl_ofi.ofi_ctxt[0].cq);
11591170
}
11601171
if (ompi_mtl_ofi.domain) {
1161-
(void) fi_close((fid_t)ompi_mtl_ofi.domain);
1172+
(void) opal_common_ofi_domain_release(ompi_mtl_ofi.domain);
11621173
}
11631174
if (ompi_mtl_ofi.fabric) {
1164-
(void) fi_close((fid_t)ompi_mtl_ofi.fabric);
1175+
(void) opal_common_ofi_fabric_release(ompi_mtl_ofi.fabric);
11651176
}
11661177
if (ompi_mtl_ofi.comm_to_context) {
11671178
free(ompi_mtl_ofi.comm_to_context);
@@ -1209,11 +1220,11 @@ ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl)
12091220
}
12101221
}
12111222

1212-
if ((ret = fi_close((fid_t)ompi_mtl_ofi.domain))) {
1223+
if ((ret = opal_common_ofi_domain_release(ompi_mtl_ofi.domain))) {
12131224
goto finalize_err;
12141225
}
12151226

1216-
if ((ret = fi_close((fid_t)ompi_mtl_ofi.fabric))) {
1227+
if ((ret = opal_common_ofi_fabric_release(ompi_mtl_ofi.fabric))) {
12171228
goto finalize_err;
12181229
}
12191230

opal/mca/btl/ofi/btl_ofi_component.c

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,12 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init(int *num_btl_modules,
339339
domain_attr.control_progress = progress_mode;
340340
domain_attr.data_progress = progress_mode;
341341

342+
if (enable_mpi_threads) {
343+
domain_attr.threading = FI_THREAD_SAFE;
344+
} else {
345+
domain_attr.threading = FI_THREAD_DOMAIN;
346+
}
347+
342348
/* select endpoint type */
343349
ep_attr.type = FI_EP_RDM;
344350

@@ -359,7 +365,8 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init(int *num_btl_modules,
359365
tx_attr.iov_limit = 1;
360366
rx_attr.iov_limit = 1;
361367

362-
tx_attr.op_flags = FI_DELIVERY_COMPLETE;
368+
tx_attr.op_flags = FI_DELIVERY_COMPLETE | FI_COMPLETION;
369+
rx_attr.op_flags = FI_COMPLETION;
363370

364371
mca_btl_ofi_component.module_count = 0;
365372

@@ -372,9 +379,18 @@ static mca_btl_base_module_t **mca_btl_ofi_component_init(int *num_btl_modules,
372379
no_hmem:
373380
#endif
374381

382+
hints.fabric_attr->fabric = opal_common_ofi.fabric;
383+
hints.domain_attr->domain = opal_common_ofi.domain;
384+
375385
/* Do the query. The earliest version that supports FI_HMEM hints is 1.9.
376386
* The earliest version that explicitly allows providers to call the CUDA API is 1.18 */
377387
rc = fi_getinfo(FI_VERSION(1, 18), NULL, NULL, 0, &hints, &info_list);
388+
if (FI_ENODATA == -rc && (hints.fabric_attr->fabric || hints.domain_attr->domain)) {
389+
/* Retry without fabric and domain */
390+
hints.fabric_attr->fabric = NULL;
391+
hints.domain_attr->domain = NULL;
392+
rc = fi_getinfo(FI_VERSION(1, 18), NULL, NULL, 0, &hints, &info_list);
393+
}
378394
if (FI_ENOSYS == -rc) {
379395
rc = fi_getinfo(FI_VERSION(1, 9), NULL, NULL, 0, &hints, &info_list);
380396
}
@@ -553,14 +569,14 @@ static int mca_btl_ofi_init_device(struct fi_info *info)
553569
("initializing dev:%s provider:%s", linux_device_name, info->fabric_attr->prov_name));
554570

555571
/* fabric */
556-
rc = fi_fabric(ofi_info->fabric_attr, &fabric, NULL);
572+
rc = opal_common_ofi_fi_fabric(ofi_info->fabric_attr, &fabric);
557573
if (0 != rc) {
558574
BTL_VERBOSE(("%s failed fi_fabric with err=%s", linux_device_name, fi_strerror(-rc)));
559575
goto fail;
560576
}
561577

562578
/* domain */
563-
rc = fi_domain(fabric, ofi_info, &domain, NULL);
579+
rc = opal_common_ofi_fi_domain(fabric, ofi_info, &domain);
564580
if (0 != rc) {
565581
BTL_VERBOSE(("%s failed fi_domain with err=%s", linux_device_name, fi_strerror(-rc)));
566582
goto fail;
@@ -743,11 +759,11 @@ static int mca_btl_ofi_init_device(struct fi_info *info)
743759
}
744760

745761
if (NULL != domain) {
746-
fi_close(&domain->fid);
762+
opal_common_ofi_domain_release(domain);
747763
}
748764

749765
if (NULL != fabric) {
750-
fi_close(&fabric->fid);
766+
opal_common_ofi_fabric_release(fabric);
751767
}
752768
free(module);
753769

opal/mca/btl/ofi/btl_ofi_module.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -380,11 +380,11 @@ int mca_btl_ofi_finalize(mca_btl_base_module_t *btl)
380380
}
381381

382382
if (NULL != ofi_btl->domain) {
383-
fi_close(&ofi_btl->domain->fid);
383+
opal_common_ofi_domain_release(ofi_btl->domain);
384384
}
385385

386386
if (NULL != ofi_btl->fabric) {
387-
fi_close(&ofi_btl->fabric->fid);
387+
opal_common_ofi_fabric_release(ofi_btl->fabric);
388388
}
389389

390390
if (NULL != ofi_btl->fabric_info) {

opal/mca/common/ofi/common_ofi.c

Lines changed: 159 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
* reserved.
77
* Copyright (c) 2020-2021 Cisco Systems, Inc. All rights reserved.
88
* Copyright (c) 2021-2023 Nanook Consulting. All rights reserved.
9-
* Copyright (c) 2021 Amazon.com, Inc. or its affiliates. All rights
9+
* Copyright (c) 2021-2025 Amazon.com, Inc. or its affiliates. All rights
1010
* reserved.
1111
* Copyright (c) 2023 UT-Battelle, LLC. All rights reserved.
1212
* $COPYRIGHT$
@@ -40,7 +40,11 @@
4040

4141
opal_common_ofi_module_t opal_common_ofi = {.prov_include = NULL,
4242
.prov_exclude = NULL,
43-
.output = -1};
43+
.output = -1,
44+
.fabric = NULL,
45+
.domain = NULL,
46+
.fabric_ref_count = 0,
47+
.domain_ref_count = 0};
4448
static const char default_prov_exclude_list[] = "shm,sockets,tcp,udp,rstream,usnic,net";
4549
static opal_mutex_t opal_common_ofi_mutex = OPAL_MUTEX_STATIC_INIT;
4650
static int opal_common_ofi_verbose_level = 0;
@@ -1037,3 +1041,156 @@ OPAL_DECLSPEC int opal_common_ofi_fi_getname(fid_t fid, void **addr, size_t *add
10371041
}
10381042
return ret;
10391043
}
1044+
1045+
/**
1046+
* Get or create fabric object
1047+
*
1048+
* Reuses existing fabric from fabric_attr->fabric if available,
1049+
* otherwise creates new fabric using fi_fabric().
1050+
*
1051+
* @param fabric_attr (IN) Fabric attributes
1052+
* @param fabric (OUT) Fabric object (new or existing)
1053+
*
1054+
* @return OPAL_SUCCESS or error code
1055+
*/
1056+
int opal_common_ofi_fi_fabric(struct fi_fabric_attr *fabric_attr,
1057+
struct fid_fabric **fabric)
1058+
{
1059+
int ret;
1060+
1061+
OPAL_THREAD_LOCK(&opal_common_ofi_mutex);
1062+
1063+
if (fabric_attr->fabric) {
1064+
*fabric = fabric_attr->fabric;
1065+
opal_common_ofi.fabric_ref_count++;
1066+
opal_output_verbose(1, opal_common_ofi.output, "Reusing existing fabric: %s",
1067+
fabric_attr->name);
1068+
} else {
1069+
ret = fi_fabric(fabric_attr, fabric, NULL);
1070+
if (0 != ret) {
1071+
OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex);
1072+
return ret;
1073+
}
1074+
opal_common_ofi.fabric = *fabric;
1075+
opal_common_ofi.fabric_ref_count = 1;
1076+
}
1077+
1078+
OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex);
1079+
return OPAL_SUCCESS;
1080+
}
1081+
1082+
/**
1083+
* Get or create domain object
1084+
*
1085+
* Reuses existing domain from info->domain_attr->domain if available,
1086+
* otherwise creates new domain using fi_domain().
1087+
*
1088+
* @param fabric (IN) Fabric object
1089+
* @param info (IN) Provider info
1090+
* @param domain (OUT) Domain object (new or existing)
1091+
*
1092+
* @return OPAL_SUCCESS or OPAL error code
1093+
*/
1094+
int opal_common_ofi_fi_domain(struct fid_fabric *fabric, struct fi_info *info,
1095+
struct fid_domain **domain)
1096+
{
1097+
int ret;
1098+
1099+
OPAL_THREAD_LOCK(&opal_common_ofi_mutex);
1100+
1101+
if (info->domain_attr->domain) {
1102+
*domain = info->domain_attr->domain;
1103+
opal_common_ofi.domain_ref_count++;
1104+
opal_output_verbose(1, opal_common_ofi.output, "Reusing existing domain: %s",
1105+
info->domain_attr->name);
1106+
} else {
1107+
ret = fi_domain(fabric, info, domain, NULL);
1108+
if (0 != ret) {
1109+
OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex);
1110+
return ret;
1111+
}
1112+
opal_common_ofi.domain = *domain;
1113+
opal_common_ofi.domain_ref_count = 1;
1114+
}
1115+
1116+
OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex);
1117+
return OPAL_SUCCESS;
1118+
}
1119+
1120+
/**
1121+
* Release fabric reference
1122+
*
1123+
* Decrements fabric reference count and closes fabric if count reaches zero.
1124+
*
1125+
* @param fabric (IN) Fabric object to release
1126+
*
1127+
* @return OPAL_SUCCESS or error code
1128+
*/
1129+
int opal_common_ofi_fabric_release(struct fid_fabric *fabric)
1130+
{
1131+
int ret = OPAL_SUCCESS;
1132+
1133+
OPAL_THREAD_LOCK(&opal_common_ofi_mutex);
1134+
1135+
if (fabric == opal_common_ofi.fabric && opal_common_ofi.fabric_ref_count > 0) {
1136+
opal_common_ofi.fabric_ref_count--;
1137+
if (opal_common_ofi.fabric_ref_count == 0) {
1138+
ret = fi_close(&fabric->fid);
1139+
if (0 != ret) {
1140+
opal_output_verbose(1, opal_common_ofi.output,
1141+
"%s:%d: fi_close failed for fabric: %s (%d)",
1142+
__FILE__, __LINE__, fi_strerror(-ret), ret);
1143+
}
1144+
opal_common_ofi.fabric = NULL;
1145+
}
1146+
} else {
1147+
ret = fi_close(&fabric->fid);
1148+
if (0 != ret) {
1149+
opal_output_verbose(1, opal_common_ofi.output,
1150+
"%s:%d: fi_close failed for fabric: %s (%d)",
1151+
__FILE__, __LINE__, fi_strerror(-ret), ret);
1152+
}
1153+
}
1154+
1155+
OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex);
1156+
return ret;
1157+
}
1158+
1159+
/**
1160+
* Release domain reference
1161+
*
1162+
* Decrements domain reference count and closes domain if count reaches zero.
1163+
*
1164+
* @param domain (IN) Domain object to release
1165+
*
1166+
* @return OPAL_SUCCESS or error code
1167+
*/
1168+
int opal_common_ofi_domain_release(struct fid_domain *domain)
1169+
{
1170+
int ret = OPAL_SUCCESS;
1171+
1172+
OPAL_THREAD_LOCK(&opal_common_ofi_mutex);
1173+
1174+
if (domain == opal_common_ofi.domain && opal_common_ofi.domain_ref_count > 0) {
1175+
opal_common_ofi.domain_ref_count--;
1176+
if (opal_common_ofi.domain_ref_count == 0) {
1177+
ret = fi_close(&domain->fid);
1178+
if (0 != ret) {
1179+
opal_output_verbose(1, opal_common_ofi.output,
1180+
"%s:%d: fi_close failed for domain: %s (%d)",
1181+
__FILE__, __LINE__, fi_strerror(-ret), ret);
1182+
}
1183+
opal_common_ofi.domain = NULL;
1184+
}
1185+
} else {
1186+
ret = fi_close(&domain->fid);
1187+
if (0 != ret) {
1188+
opal_output_verbose(1, opal_common_ofi.output,
1189+
"%s:%d: fi_close failed for domain: %s (%d)",
1190+
__FILE__, __LINE__, fi_strerror(-ret), ret);
1191+
}
1192+
}
1193+
1194+
OPAL_THREAD_UNLOCK(&opal_common_ofi_mutex);
1195+
return ret;
1196+
}

0 commit comments

Comments
 (0)