Skip to content

Commit 58649c4

Browse files
committed
mctp: Add retry for one-time peer property queries on timeout
The function `query_peer_properties()` is called once during peer initialization to query basic information after the EID becomes routable. To improve reliability, this change adds a retry mechanism when the query fails with `-ETIMEDOUT`. Since these queries are one-time initialization steps, a single successful attempt is sufficient, and retrying enhances stability under transient MCTP bus contention or multi-master timing issues. Testing: add stress test for peer initialization under multi-master ``` while true; do echo "Restarting mctpd.service..." systemctl restart mctpd.service # Wait a few seconds to allow service to initialize sleep 20 done ``` After the 30 loops, the script checks mctpd.service journal for expected retry messages to verify robustness under transient MCTP bus contention. ``` root@bmc:~# journalctl -xeu mctpd.service | grep Retrying Oct 29 00:35:21 bmc mctpd[31801]: mctpd: Retrying to get endpoint types for peer eid 10 net 1 phys physaddr if 4 hw len 1 0x20 state 1. Attempt 1 Oct 29 00:39:00 bmc mctpd[32065]: mctpd: Retrying to get endpoint types for peer eid 10 net 1 phys physaddr if 4 hw len 1 0x20 state 1. Attempt 1 Oct 29 00:39:01 bmc mctpd[32065]: mctpd: Retrying to get endpoint types for peer eid 10 net 1 phys physaddr if 4 hw len 1 0x20 state 1. Attempt 2 Oct 29 00:45:08 bmc mctpd[32360]: mctpd: Retrying to get endpoint types for peer eid 10 net 1 phys physaddr if 4 hw len 1 0x20 state 1. Attempt 1 ``` Signed-off-by: Daniel Hsu <Daniel-Hsu@quantatw.com>
1 parent 8b019a3 commit 58649c4

File tree

3 files changed

+94
-13
lines changed

3 files changed

+94
-13
lines changed

src/mctpd.c

Lines changed: 47 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2835,23 +2835,57 @@ static int method_learn_endpoint(sd_bus_message *call, void *data,
28352835
// and routable.
28362836
static int query_peer_properties(struct peer *peer)
28372837
{
2838+
const unsigned int max_retries = 4;
28382839
int rc;
28392840

2840-
rc = query_get_peer_msgtypes(peer);
2841-
if (rc < 0) {
2842-
// Warn here, it's a mandatory command code.
2843-
// It might be too noisy if some devices don't implement it.
2844-
warnx("Error getting endpoint types for %s. Ignoring error %d %s",
2845-
peer_tostr(peer), rc, strerror(-rc));
2846-
rc = 0;
2841+
for (unsigned int i = 0; i < max_retries; i++) {
2842+
rc = query_get_peer_msgtypes(peer);
2843+
2844+
// Success
2845+
if (rc == 0)
2846+
break;
2847+
2848+
// On timeout, retry
2849+
if (rc == -ETIMEDOUT) {
2850+
if (peer->ctx->verbose)
2851+
warnx("Retrying to get endpoint types for %s. Attempt %u",
2852+
peer_tostr(peer), i + 1);
2853+
continue;
2854+
}
2855+
2856+
// On other errors, warn and ignore
2857+
if (rc < 0) {
2858+
if (peer->ctx->verbose)
2859+
warnx("Error getting endpoint types for %s. Ignoring error %d %s",
2860+
peer_tostr(peer), -rc, strerror(-rc));
2861+
rc = 0;
2862+
break;
2863+
}
28472864
}
28482865

2849-
rc = query_get_peer_uuid(peer);
2850-
if (rc < 0) {
2851-
if (peer->ctx->verbose)
2852-
warnx("Error getting UUID for %s. Ignoring error %d %s",
2853-
peer_tostr(peer), rc, strerror(-rc));
2854-
rc = 0;
2866+
for (unsigned int i = 0; i < max_retries; i++) {
2867+
rc = query_get_peer_uuid(peer);
2868+
2869+
// Success
2870+
if (rc == 0)
2871+
break;
2872+
2873+
// On timeout, retry
2874+
if (rc == -ETIMEDOUT) {
2875+
if (peer->ctx->verbose)
2876+
warnx("Retrying to get peer UUID for %s. Attempt %u",
2877+
peer_tostr(peer), i + 1);
2878+
continue;
2879+
}
2880+
2881+
// On other errors, warn and ignore
2882+
if (rc < 0) {
2883+
if (peer->ctx->verbose)
2884+
warnx("Error getting UUID for %s. Ignoring error %d %s",
2885+
peer_tostr(peer), -rc, strerror(-rc));
2886+
rc = 0;
2887+
break;
2888+
}
28552889
}
28562890

28572891
// TODO: emit property changed? Though currently they are all const.
2.63 KB
Binary file not shown.

tests/test_mctpd.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1289,3 +1289,50 @@ async def test_get_message_types(dbus, mctpd):
12891289
cmd = MCTPControlCommand(True, 0, 0x04, bytes([0x05]))
12901290
rsp = await ep.send_control(mctpd.network.mctp_socket, cmd)
12911291
assert rsp.hex(' ') == '00 04 00 01 f4 f3 f2 f1'
1292+
1293+
async def test_query_peer_properties_retry_timeout(nursery, dbus, sysnet):
1294+
1295+
# activate mctpd
1296+
mctpd = MctpdWrapper(dbus, sysnet)
1297+
await mctpd.start_mctpd(nursery)
1298+
1299+
iface = mctpd.system.interfaces[0]
1300+
ep = mctpd.network.endpoints[0]
1301+
1302+
mctp = await mctpd_mctp_iface_obj(dbus, iface)
1303+
1304+
# add a bridged endpoint to ep
1305+
fake_ep = Endpoint(iface, b'\x12\x34', types=[0, 2])
1306+
ep.add_bridged_ep(fake_ep)
1307+
mctpd.network.add_endpoint(fake_ep)
1308+
1309+
# override fake_ep.send_control to simulate a timeout on the first call
1310+
original_send = fake_ep.send_control
1311+
fake_ep._first_timeout = True
1312+
1313+
async def fake_send(sock, cmd):
1314+
if fake_ep._first_timeout:
1315+
fake_ep._first_timeout = False
1316+
# simulate timeout
1317+
await trio.sleep(1.0)
1318+
return None
1319+
# normal response on second call
1320+
return await original_send(sock, cmd)
1321+
1322+
fake_ep.send_control = fake_send
1323+
1324+
# call assign_endpoint on ep, which will allocate a pool for fake_ep
1325+
mctp = await mctpd_mctp_iface_obj(dbus, iface)
1326+
await mctp.call_assign_endpoint(ep.lladdr)
1327+
1328+
# collect mctpd logs to verify retry occurred
1329+
logs = []
1330+
async for line in mctpd.proc.stdout:
1331+
logs.append(line.decode())
1332+
1333+
retry_found = any("Retrying to get endpoint types" in l for l in logs)
1334+
assert retry_found, "mctpd did not retry on timeout"
1335+
1336+
# exit mctpd
1337+
res = await mctpd.stop_mctpd()
1338+
assert res == 0

0 commit comments

Comments
 (0)