Skip to content

Commit 249675d

Browse files
committed
fix(core-backend): improve traces
1 parent be86c60 commit 249675d

File tree

6 files changed

+323
-271
lines changed

6 files changed

+323
-271
lines changed

packages/core-backend/CHANGELOG.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,24 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
### Changed
11+
12+
- Improve WebSocket connection lifecycle tracing in `BackendWebSocketService` ([#7101](https://github.com/MetaMask/core/pull/7101))
13+
- WebSocket connection duration is now properly reflected in trace span duration instead of only in custom data
14+
- Trace all disconnections (both manual and unexpected) to provide complete connection lifecycle visibility in traces
15+
- Omit `connectionDuration_ms` from disconnection traces when connection never established (onClose without onOpen)
16+
- Update `BackendWebSocketService` default exponential backoff options for reconnection
17+
- Increase default `reconnectDelay` from 500 milliseconds to 10 seconds
18+
- Increase default `maxReconnectDelay` from 30 seconds to 60 seconds
19+
- Simplify WebSocket disconnection code in `BackendWebSocketService`
20+
- Centralize all disconnection logic in `ws.onclose` handler for single source of truth
21+
- Centralize all state changes within `#establishConnection` method - state transitions only occur in `onopen` (CONNECTING → CONNECTED) and `onclose` (any state → DISCONNECTED)
22+
- Add `MANUAL_DISCONNECT_CODE` (4999) and `MANUAL_DISCONNECT_REASON` constants to distinguish manual from unexpected disconnects
23+
24+
### Removed
25+
26+
- Remove `BackendWebSocketService Channel Message` trace as it provided no useful performance insights
27+
1028
## [4.0.0]
1129

1230
### Changed

packages/core-backend/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -311,7 +311,7 @@ sequenceDiagram
311311
#### Key Flow Characteristics
312312

313313
1. **Initial Setup**: BackendWebSocketService establishes connection, then AccountActivityService subscribes to selected account. Backend automatically sends a system notification with all chains that are currently up. AccountActivityService tracks these chains internally and notifies TokenBalancesController, which increases polling interval to 5 min
314-
2. **Chain Status Tracking**: AccountActivityService maintains an internal set of chains that are 'up' based on system notifications. On disconnect/error, it marks all tracked chains as 'down' before clearing the set
314+
2. **Chain Status Tracking**: AccountActivityService maintains an internal set of chains that are 'up' based on system notifications. On disconnect, it marks all tracked chains as 'down' before clearing the set
315315
3. **System Notifications**: Backend automatically sends chain status updates (up/down) upon subscription and when status changes. AccountActivityService forwards these to TokenBalancesController, which adjusts polling intervals (up: 5min, down: 30s + immediate fetch)
316316
4. **User Account Changes**: When users switch accounts, AccountActivityService unsubscribes from old account and subscribes to new account. Backend sends fresh system notification with current chain status for the new account
317317
5. **Connection Resilience**: On reconnection, AccountActivityService resubscribes to selected account and receives fresh chain status via system notification. Automatic reconnection with exponential backoff

packages/core-backend/src/AccountActivityService.test.ts

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -686,11 +686,11 @@ describe('AccountActivityService', () => {
686686
timestamp: 1760344704595,
687687
});
688688

689-
// Publish WebSocket ERROR state event - should flush tracked chains as down
689+
// Publish WebSocket DISCONNECTED state event - should flush tracked chains as down
690690
rootMessenger.publish(
691691
'BackendWebSocketService:connectionStateChanged',
692692
{
693-
state: WebSocketState.ERROR,
693+
state: WebSocketState.DISCONNECTED,
694694
url: 'ws://test',
695695
reconnectAttempts: 2,
696696
timeout: 10000,
@@ -701,7 +701,7 @@ describe('AccountActivityService', () => {
701701
);
702702
await completeAsyncOperations(100);
703703

704-
// Verify that the ERROR state triggered the status change for tracked chains
704+
// Verify that the DISCONNECTED state triggered the status change for tracked chains
705705
expect(statusChangedEventListener).toHaveBeenCalledWith({
706706
chainIds: ['eip155:1', 'eip155:137', 'eip155:56'],
707707
status: 'down',
@@ -720,11 +720,11 @@ describe('AccountActivityService', () => {
720720

721721
mocks.getSelectedAccount.mockReturnValue(null);
722722

723-
// Publish WebSocket ERROR state event without any tracked chains
723+
// Publish WebSocket DISCONNECTED state event without any tracked chains
724724
rootMessenger.publish(
725725
'BackendWebSocketService:connectionStateChanged',
726726
{
727-
state: WebSocketState.ERROR,
727+
state: WebSocketState.DISCONNECTED,
728728
url: 'ws://test',
729729
reconnectAttempts: 2,
730730
timeout: 10000,

packages/core-backend/src/AccountActivityService.ts

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -451,6 +451,15 @@ export class AccountActivityService {
451451
status: data.status,
452452
timestamp,
453453
});
454+
455+
log(
456+
`WebSocket status change - Published tracked chains as ${data.status}`,
457+
{
458+
count: data.chainIds.length,
459+
chains: data.chainIds,
460+
status: data.status,
461+
},
462+
);
454463
}
455464

456465
/**
@@ -467,11 +476,8 @@ export class AccountActivityService {
467476
// WebSocket connected - resubscribe to selected account
468477
// The system notification will automatically provide the list of chains that are up
469478
await this.#subscribeToSelectedAccount();
470-
} else if (
471-
state === WebSocketState.DISCONNECTED ||
472-
state === WebSocketState.ERROR
473-
) {
474-
// On disconnect/error, flush all tracked chains as down
479+
} else if (state === WebSocketState.DISCONNECTED) {
480+
// On disconnect, flush all tracked chains as down
475481
const chainsToMarkDown = Array.from(this.#chainsUp);
476482

477483
if (chainsToMarkDown.length > 0) {
@@ -481,13 +487,10 @@ export class AccountActivityService {
481487
timestamp: Date.now(),
482488
});
483489

484-
log(
485-
'WebSocket error/disconnection - Published tracked chains as down',
486-
{
487-
count: chainsToMarkDown.length,
488-
chains: chainsToMarkDown,
489-
},
490-
);
490+
log('WebSocket disconnection - Published tracked chains as down', {
491+
count: chainsToMarkDown.length,
492+
chains: chainsToMarkDown,
493+
});
491494

492495
// Clear the tracking set since all chains are now down
493496
this.#chainsUp.clear();

packages/core-backend/src/BackendWebSocketService.test.ts

Lines changed: 159 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -102,8 +102,13 @@ class MockWebSocket extends EventTarget {
102102
MockWebSocket.instanceCount += 1;
103103
this.url = url;
104104
// TypeScript has issues with jest.spyOn on WebSocket methods, so using direct assignment
105+
// Store reference to simulateClose for use in close()
106+
const simulateCloseFn = this.simulateClose.bind(this);
105107
// eslint-disable-next-line jest/prefer-spy-on
106-
this.close = jest.fn().mockImplementation();
108+
this.close = jest.fn().mockImplementation((code = 1000, reason = '') => {
109+
// When close() is called, trigger the close event to simulate real WebSocket behavior
110+
simulateCloseFn(code, reason);
111+
});
107112
// eslint-disable-next-line jest/prefer-spy-on
108113
this.send = jest.fn().mockImplementation((data: string) => {
109114
this._lastSentMessage = data;
@@ -506,8 +511,8 @@ describe('BackendWebSocketService', () => {
506511
async ({ service }) => {
507512
expect(service.getConnectionInfo().url).toBe('ws://test.example.com');
508513
expect(service.getConnectionInfo().timeout).toBe(10000);
509-
expect(service.getConnectionInfo().reconnectDelay).toBe(500);
510-
expect(service.getConnectionInfo().maxReconnectDelay).toBe(5000);
514+
expect(service.getConnectionInfo().reconnectDelay).toBe(10000);
515+
expect(service.getConnectionInfo().maxReconnectDelay).toBe(60000);
511516
expect(service.getConnectionInfo().requestTimeout).toBe(30000);
512517
expect(service).toBeInstanceOf(BackendWebSocketService);
513518
},
@@ -877,14 +882,17 @@ describe('BackendWebSocketService', () => {
877882
mockWebSocketOptions: { autoConnect: false },
878883
},
879884
async ({ service, completeAsyncOperations }) => {
885+
// Mock Math.random to make Cockatiel's jitter deterministic
886+
jest.spyOn(Math, 'random').mockReturnValue(0);
887+
880888
// eslint-disable-next-line @typescript-eslint/no-floating-promises
881889
service.connect();
882890

883891
// Advance time past the timeout
884-
await completeAsyncOperations(150);
892+
await completeAsyncOperations(110);
885893

886-
// Should have transitioned to ERROR state after timeout
887-
expect(service.getConnectionInfo().state).toBe(WebSocketState.ERROR);
894+
// Should have transitioned to DISCONNECTED state after timeout
895+
expect(service.getConnectionInfo().state).toBe(WebSocketState.DISCONNECTED);
888896
},
889897
);
890898
});
@@ -939,13 +947,13 @@ describe('BackendWebSocketService', () => {
939947
mockWs.simulateClose(1006, 'Connection failed');
940948
await completeAsyncOperations(0);
941949

942-
// Should schedule reconnect and be in ERROR state
943-
expect(service.getConnectionInfo().state).toBe(WebSocketState.ERROR);
950+
// Should schedule reconnect and be in DISCONNECTED state
951+
expect(service.getConnectionInfo().state).toBe(WebSocketState.DISCONNECTED);
944952
},
945953
);
946954
});
947955

948-
it('should clear connection timeout in handleClose when timeout occurs then close fires', async () => {
956+
it('should clear connection timeout when timeout occurs then close fires', async () => {
949957
await withService(
950958
{
951959
options: { timeout: 100 },
@@ -965,25 +973,20 @@ describe('BackendWebSocketService', () => {
965973
WebSocketState.CONNECTING,
966974
);
967975

968-
// Let timeout fire (closes WebSocket and sets state to ERROR)
969-
await completeAsyncOperations(150);
976+
// Let timeout fire (closes WebSocket and sets state to DISCONNECTED)
977+
// Advance time past timeout but before reconnect would fire
978+
await completeAsyncOperations(100);
970979

971-
// State should be ERROR or DISCONNECTED after timeout
972-
const stateAfterTimeout = service.getConnectionInfo().state;
973-
expect([WebSocketState.ERROR, WebSocketState.DISCONNECTED]).toContain(
974-
stateAfterTimeout,
975-
);
980+
// State should be DISCONNECTED after timeout
981+
expect(service.getConnectionInfo().state).toBe(WebSocketState.DISCONNECTED);
976982

977983
// Now manually trigger close event
978-
// Since state is ERROR (not CONNECTING), onclose will call handleClose
979-
// which will clear connectionTimeout
984+
// Since state is DISCONNECTED, onclose will return early due to idempotency guard
980985
mockWs.simulateClose(1006, 'Close after timeout');
981986
await completeAsyncOperations(0);
982987

983-
// State should still be ERROR or DISCONNECTED
984-
expect([WebSocketState.ERROR, WebSocketState.DISCONNECTED]).toContain(
985-
service.getConnectionInfo().state,
986-
);
988+
// State should still be DISCONNECTED
989+
expect(service.getConnectionInfo().state).toBe(WebSocketState.DISCONNECTED);
987990
},
988991
);
989992
});
@@ -1072,10 +1075,13 @@ describe('BackendWebSocketService', () => {
10721075
await completeAsyncOperations(10);
10731076

10741077
const mockWs = getMockWebSocket();
1075-
mockWs.simulateError();
1078+
// Simulate connection failure (error is always followed by close in real WebSocket)
1079+
mockWs.simulateClose(1006, 'Connection failed');
10761080
await completeAsyncOperations(0);
10771081

10781082
const attemptsBefore = service.getConnectionInfo().reconnectAttempts;
1083+
// Should be 1 after the failure
1084+
expect(attemptsBefore).toBe(1);
10791085

10801086
// Try to force reconnection while timer is already scheduled
10811087
await service.forceReconnection();
@@ -1087,6 +1093,134 @@ describe('BackendWebSocketService', () => {
10871093
},
10881094
);
10891095
});
1096+
1097+
it('should clear reconnect timer when feature is disabled', async () => {
1098+
let isEnabled = true;
1099+
await withService(
1100+
{
1101+
options: {
1102+
isEnabled: () => isEnabled,
1103+
},
1104+
mockWebSocketOptions: { autoConnect: false },
1105+
},
1106+
async ({ service, getMockWebSocket, completeAsyncOperations }) => {
1107+
// Mock Math.random to make Cockatiel's jitter deterministic
1108+
jest.spyOn(Math, 'random').mockReturnValue(0);
1109+
1110+
// Trigger a connection failure to schedule a reconnect
1111+
// eslint-disable-next-line @typescript-eslint/no-floating-promises
1112+
service.connect();
1113+
await completeAsyncOperations(10);
1114+
1115+
const mockWs = getMockWebSocket();
1116+
// Simulate connection failure to trigger reconnect timer
1117+
mockWs.simulateClose(1006, 'Connection failed');
1118+
await completeAsyncOperations(0);
1119+
1120+
// Verify reconnect timer is scheduled
1121+
expect(service.getConnectionInfo().reconnectAttempts).toBe(1);
1122+
1123+
// Now disable the feature
1124+
isEnabled = false;
1125+
1126+
// Try to connect again with feature disabled
1127+
await service.connect();
1128+
1129+
// Reconnect attempts should be reset to 0 (timers cleared)
1130+
expect(service.getConnectionInfo().reconnectAttempts).toBe(0);
1131+
1132+
// Advance time to ensure the old reconnect timer doesn't fire
1133+
await completeAsyncOperations(10000);
1134+
1135+
// Should still be 0, confirming timer was cleared
1136+
expect(service.getConnectionInfo().reconnectAttempts).toBe(0);
1137+
},
1138+
);
1139+
});
1140+
1141+
it('should include connectionDuration_ms in trace when connection was established', async () => {
1142+
const mockTraceFn = jest.fn((_request, fn) => fn?.());
1143+
await withService(
1144+
{
1145+
options: { traceFn: mockTraceFn },
1146+
},
1147+
async ({ service, getMockWebSocket, completeAsyncOperations }) => {
1148+
// Connect and let it establish
1149+
await service.connect();
1150+
await completeAsyncOperations(10);
1151+
1152+
expect(service.getConnectionInfo().state).toBe(
1153+
WebSocketState.CONNECTED,
1154+
);
1155+
1156+
// Clear previous trace calls to focus on disconnection trace
1157+
mockTraceFn.mockClear();
1158+
1159+
// Trigger unexpected close after connection was established
1160+
const mockWs = getMockWebSocket();
1161+
mockWs.simulateClose(1006, 'Abnormal closure');
1162+
await completeAsyncOperations(10);
1163+
1164+
// Find the Disconnection trace call
1165+
const disconnectionTrace = mockTraceFn.mock.calls.find(
1166+
(call) => call[0]?.name === 'BackendWebSocketService Disconnection',
1167+
);
1168+
1169+
expect(disconnectionTrace).toBeDefined();
1170+
expect(disconnectionTrace?.[0]?.data).toHaveProperty(
1171+
'connectionDuration_ms',
1172+
);
1173+
expect(
1174+
disconnectionTrace?.[0]?.data?.connectionDuration_ms,
1175+
).toBeGreaterThan(0);
1176+
},
1177+
);
1178+
});
1179+
1180+
it('should omit connectionDuration_ms in trace when connection never established', async () => {
1181+
const mockTraceFn = jest.fn((_request, fn) => fn?.());
1182+
await withService(
1183+
{
1184+
options: { traceFn: mockTraceFn },
1185+
mockWebSocketOptions: { autoConnect: false },
1186+
},
1187+
async ({ service, getMockWebSocket, completeAsyncOperations }) => {
1188+
// Start connecting
1189+
// eslint-disable-next-line @typescript-eslint/no-floating-promises
1190+
service.connect();
1191+
await completeAsyncOperations(0);
1192+
1193+
expect(service.getConnectionInfo().state).toBe(
1194+
WebSocketState.CONNECTING,
1195+
);
1196+
1197+
// Manually disconnect before connection establishes
1198+
// This will set state to DISCONNECTING and close the WebSocket
1199+
service.disconnect();
1200+
await completeAsyncOperations(0);
1201+
1202+
// Clear trace calls from connection attempt
1203+
mockTraceFn.mockClear();
1204+
1205+
// Simulate the close event after disconnect (state is now DISCONNECTING, not CONNECTING)
1206+
// This will trigger #handleClose with connectedAt = 0
1207+
const mockWs = getMockWebSocket();
1208+
mockWs.simulateClose(1000, 'Normal closure');
1209+
await completeAsyncOperations(10);
1210+
1211+
// Find the Disconnection trace call
1212+
const disconnectionTrace = mockTraceFn.mock.calls.find(
1213+
(call) => call[0]?.name === 'BackendWebSocketService Disconnection',
1214+
);
1215+
1216+
// Trace should exist but should NOT have connectionDuration_ms
1217+
expect(disconnectionTrace).toBeDefined();
1218+
expect(disconnectionTrace?.[0]?.data).not.toHaveProperty(
1219+
'connectionDuration_ms',
1220+
);
1221+
},
1222+
);
1223+
});
10901224
});
10911225

10921226
// =====================================================
@@ -1437,7 +1571,7 @@ describe('BackendWebSocketService', () => {
14371571

14381572
service.disconnect();
14391573

1440-
await expect(requestPromise).rejects.toThrow('WebSocket disconnected');
1574+
await expect(requestPromise).rejects.toThrow('WebSocket connection closed: 4999 Internal: Manual disconnect');
14411575
});
14421576
});
14431577

@@ -1840,7 +1974,7 @@ describe('BackendWebSocketService', () => {
18401974
service.destroy();
18411975

18421976
// Pending request should be rejected
1843-
await expect(requestPromise).rejects.toThrow('Service cleanup');
1977+
await expect(requestPromise).rejects.toThrow('WebSocket connection closed: 4999 Internal: Manual disconnect');
18441978

18451979
// Verify service is in disconnected state
18461980
expect(service.getConnectionInfo().state).toBe(

0 commit comments

Comments
 (0)