Skip to content

Commit a6bfe82

Browse files
committed
mctp: Add retry for one-time peer property queries on timeout
The function `query_peer_properties()` is called once during peer initialization to query basic information after the EID becomes routable. To improve reliability, this change adds a retry mechanism when the query fails with `-ETIMEDOUT`. Since these queries are one-time initialization steps, a single successful attempt is sufficient, and retrying enhances stability under transient MCTP bus contention or multi-master timing issues. Testing: add stress test for peer initialization under multi-master ``` while true; do echo "Restarting mctpd.service..." systemctl restart mctpd.service # Wait a few seconds to allow service to initialize sleep 20 done ``` After the 30 loops, the script checks mctpd.service journal for expected retry messages to verify robustness under transient MCTP bus contention. ``` root@bmc:~# journalctl -xeu mctpd.service | grep Retrying Oct 29 00:35:21 bmc mctpd[31801]: mctpd: Retrying to get endpoint types for peer eid 10 net 1 phys physaddr if 4 hw len 1 0x20 state 1. Attempt 1 Oct 29 00:39:00 bmc mctpd[32065]: mctpd: Retrying to get endpoint types for peer eid 10 net 1 phys physaddr if 4 hw len 1 0x20 state 1. Attempt 1 Oct 29 00:39:01 bmc mctpd[32065]: mctpd: Retrying to get endpoint types for peer eid 10 net 1 phys physaddr if 4 hw len 1 0x20 state 1. Attempt 2 Oct 29 00:45:08 bmc mctpd[32360]: mctpd: Retrying to get endpoint types for peer eid 10 net 1 phys physaddr if 4 hw len 1 0x20 state 1. Attempt 1 ``` Signed-off-by: Daniel Hsu <Daniel-Hsu@quantatw.com>
1 parent 7ea8652 commit a6bfe82

File tree

3 files changed

+110
-15
lines changed

3 files changed

+110
-15
lines changed

src/mctpd.c

Lines changed: 49 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2927,23 +2927,59 @@ static int method_learn_endpoint(sd_bus_message *call, void *data,
29272927
// and routable.
29282928
static int query_peer_properties(struct peer *peer)
29292929
{
2930+
const unsigned int max_retries = 4;
29302931
int rc;
29312932

2932-
rc = query_get_peer_msgtypes(peer);
2933-
if (rc < 0) {
2934-
// Warn here, it's a mandatory command code.
2935-
// It might be too noisy if some devices don't implement it.
2936-
warnx("Error getting endpoint types for %s. Ignoring error %d %s",
2937-
peer_tostr(peer), rc, strerror(-rc));
2938-
rc = 0;
2933+
for (unsigned int i = 0; i < max_retries; i++) {
2934+
rc = query_get_peer_msgtypes(peer);
2935+
2936+
// Success
2937+
if (rc == 0)
2938+
break;
2939+
2940+
// On timeout, retry
2941+
if (rc == -ETIMEDOUT) {
2942+
if (peer->ctx->verbose)
2943+
warnx("Retrying to get endpoint types for %s. Attempt %u",
2944+
peer_tostr(peer), i + 1);
2945+
rc = 0;
2946+
continue;
2947+
}
2948+
2949+
// On other errors, warn and ignore
2950+
if (rc < 0) {
2951+
if (peer->ctx->verbose)
2952+
warnx("Error getting endpoint types for %s. Ignoring error %d %s",
2953+
peer_tostr(peer), -rc, strerror(-rc));
2954+
rc = 0;
2955+
break;
2956+
}
29392957
}
29402958

2941-
rc = query_get_peer_uuid(peer);
2942-
if (rc < 0) {
2943-
if (peer->ctx->verbose)
2944-
warnx("Error getting UUID for %s. Ignoring error %d %s",
2945-
peer_tostr(peer), rc, strerror(-rc));
2946-
rc = 0;
2959+
for (unsigned int i = 0; i < max_retries; i++) {
2960+
rc = query_get_peer_uuid(peer);
2961+
2962+
// Success
2963+
if (rc == 0)
2964+
break;
2965+
2966+
// On timeout, retry
2967+
if (rc == -ETIMEDOUT) {
2968+
if (peer->ctx->verbose)
2969+
warnx("Retrying to get peer UUID for %s. Attempt %u",
2970+
peer_tostr(peer), i + 1);
2971+
rc = 0;
2972+
continue;
2973+
}
2974+
2975+
// On other errors, warn and ignore
2976+
if (rc < 0) {
2977+
if (peer->ctx->verbose)
2978+
warnx("Error getting UUID for %s. Ignoring error %d %s",
2979+
peer_tostr(peer), -rc, strerror(-rc));
2980+
rc = 0;
2981+
break;
2982+
}
29472983
}
29482984

29492985
// TODO: emit property changed? Though currently they are all const.

tests/mctpenv/__init__.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -318,14 +318,16 @@ def to_buf(self):
318318
return bytes([flags, self.cmd]) + self.data
319319

320320
class Endpoint:
321-
def __init__(self, iface, lladdr, ep_uuid = None, eid = 0, types = None):
321+
def __init__(self, iface, lladdr, ep_uuid = None, eid = 0, types = None, timeout_opcodes = set(), timeout_count = 0):
322322
self.iface = iface
323323
self.lladdr = lladdr
324324
self.uuid = ep_uuid or uuid.uuid1()
325325
self.eid = eid
326326
self.types = types or [0]
327327
self.bridged_eps = []
328328
self.allocated_pool = None # or (start, size)
329+
self.timeout_opcodes = timeout_opcodes
330+
self.timeout_count = timeout_count
329331

330332
# keyed by (type, type-specific-instance)
331333
self.commands = {}
@@ -368,6 +370,11 @@ async def handle_mctp_control(self, sock, addr, data):
368370
# Use IID from request, zero Rq and D bits
369371
hdr = [iid, opcode]
370372

373+
if opcode in self.timeout_opcodes:
374+
if self.timeout_count > 0:
375+
self.timeout_count -= 1
376+
return
377+
371378
if opcode == 1:
372379
# Set Endpoint ID
373380
(op, eid) = data[2:]
@@ -1193,7 +1200,6 @@ async def handle_control(self, nursery):
11931200
await send_fd(self.sock_local, remote.fileno())
11941201
remote.close()
11951202
nursery.start_soon(sd.run)
1196-
11971203
else:
11981204
print(f"unknown op {op}")
11991205

tests/test_mctpd.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1376,3 +1376,56 @@ async def test_register_vdm_type_support_errors(dbus, mctpd):
13761376
with pytest.raises(asyncdbus.errors.DBusError) as ex:
13771377
await mctp.call_register_vdm_type_support(0x00, v_type, 0x0001)
13781378
assert str(ex.value) == "VDM type already registered"
1379+
1380+
async def test_query_peer_properties_retry_timeout(nursery, dbus, sysnet):
1381+
1382+
# activate mctpd
1383+
mctpd = MctpdWrapper(dbus, sysnet)
1384+
await mctpd.start_mctpd(nursery)
1385+
1386+
iface = mctpd.system.interfaces[0]
1387+
mctp = await mctpd_mctp_iface_obj(dbus, iface)
1388+
1389+
# define expected message types
1390+
# add a normal endpoint to network
1391+
expected_types = [0, 1, 2]
1392+
ep = Endpoint(iface, bytes([0x1a]), eid=15, types=expected_types)
1393+
mctpd.network.add_endpoint(ep)
1394+
1395+
# call setup_endpoint on ep, which will allocate a object path for it
1396+
(eid, net, path, new) = await mctp.call_setup_endpoint(ep.lladdr)
1397+
1398+
objep = await mctpd_mctp_endpoint_common_obj(dbus, path)
1399+
objtypes = list(await objep.get_supported_message_types())
1400+
objtypes.sort()
1401+
assert objtypes == expected_types
1402+
1403+
ep.lladdr = bytes([0x1b]) # change lladdr to force retry
1404+
ep.timeout_opcodes.add(0x05) # set Message Type Support would timeout
1405+
ep.timeout_count = 2 # timeout twice before responding
1406+
1407+
# call setup_endpoint again, which will trigger query of peer properties
1408+
(eid, net, path, new) = await mctp.call_setup_endpoint(ep.lladdr)
1409+
1410+
# timeout twice does not prevent us from getting the correct message types
1411+
objep = await mctpd_mctp_endpoint_common_obj(dbus, path)
1412+
objtypes = list(await objep.get_supported_message_types())
1413+
objtypes.sort()
1414+
assert objtypes == expected_types
1415+
1416+
ep.lladdr = bytes([0x1c]) # change lladdr to force retry
1417+
ep.timeout_count = 5 # timeout five times before responding
1418+
1419+
# call setup_endpoint again, which will trigger query of peer properties
1420+
(eid, net, path, new) = await mctp.call_setup_endpoint(ep.lladdr)
1421+
1422+
# timeout five times does not prevent us from getting the correct message types
1423+
objep = await mctpd_mctp_endpoint_common_obj(dbus, path)
1424+
objtypes = list(await objep.get_supported_message_types())
1425+
objtypes.sort()
1426+
expected_types = [] # exceeded retry limit, so no types known
1427+
assert objtypes == expected_types
1428+
1429+
# exit mctpd
1430+
res = await mctpd.stop_mctpd()
1431+
assert res == 0

0 commit comments

Comments
 (0)