Skip to content

Commit 03b0cc2

Browse files
committed
fix(core-backend): improve traces
1 parent be86c60 commit 03b0cc2

File tree

6 files changed

+347
-277
lines changed

6 files changed

+347
-277
lines changed

packages/core-backend/CHANGELOG.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,24 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
### Changed
11+
12+
- Improve WebSocket connection lifecycle tracing in `BackendWebSocketService` ([#7101](https://github.com/MetaMask/core/pull/7101))
13+
- WebSocket connection duration is now properly reflected in trace span duration instead of only in custom data
14+
- Trace all disconnections (both manual and unexpected) to provide complete connection lifecycle visibility in traces
15+
- Omit `connectionDuration_ms` from disconnection traces when connection never established (onClose without onOpen)
16+
- Update `BackendWebSocketService` default exponential backoff options for reconnection ([#7101](https://github.com/MetaMask/core/pull/7101))
17+
- Increase default `reconnectDelay` from 500 milliseconds to 10 seconds
18+
- Increase default `maxReconnectDelay` from 30 seconds to 60 seconds
19+
- Simplify WebSocket disconnection code in `BackendWebSocketService` ([#7101](https://github.com/MetaMask/core/pull/7101))
20+
- Centralize all disconnection logic in `ws.onclose` handler for single source of truth
21+
- Centralize all state changes within `#establishConnection` method - state transitions only occur in `onopen` (CONNECTING → CONNECTED) and `onclose` (any state → DISCONNECTED)
22+
- Add `MANUAL_DISCONNECT_CODE` (4999) and `MANUAL_DISCONNECT_REASON` constants to distinguish manual from unexpected disconnects
23+
24+
### Removed
25+
26+
- Remove `BackendWebSocketService Channel Message` trace as it provided no useful performance insights ([#7101](https://github.com/MetaMask/core/pull/7101))
27+
1028
## [4.0.0]
1129

1230
### Changed

packages/core-backend/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -311,7 +311,7 @@ sequenceDiagram
311311
#### Key Flow Characteristics
312312

313313
1. **Initial Setup**: BackendWebSocketService establishes connection, then AccountActivityService subscribes to selected account. Backend automatically sends a system notification with all chains that are currently up. AccountActivityService tracks these chains internally and notifies TokenBalancesController, which increases polling interval to 5 min
314-
2. **Chain Status Tracking**: AccountActivityService maintains an internal set of chains that are 'up' based on system notifications. On disconnect/error, it marks all tracked chains as 'down' before clearing the set
314+
2. **Chain Status Tracking**: AccountActivityService maintains an internal set of chains that are 'up' based on system notifications. On disconnect, it marks all tracked chains as 'down' before clearing the set
315315
3. **System Notifications**: Backend automatically sends chain status updates (up/down) upon subscription and when status changes. AccountActivityService forwards these to TokenBalancesController, which adjusts polling intervals (up: 5min, down: 30s + immediate fetch)
316316
4. **User Account Changes**: When users switch accounts, AccountActivityService unsubscribes from old account and subscribes to new account. Backend sends fresh system notification with current chain status for the new account
317317
5. **Connection Resilience**: On reconnection, AccountActivityService resubscribes to selected account and receives fresh chain status via system notification. Automatic reconnection with exponential backoff

packages/core-backend/src/AccountActivityService.test.ts

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -686,11 +686,11 @@ describe('AccountActivityService', () => {
686686
timestamp: 1760344704595,
687687
});
688688

689-
// Publish WebSocket ERROR state event - should flush tracked chains as down
689+
// Publish WebSocket DISCONNECTED state event - should flush tracked chains as down
690690
rootMessenger.publish(
691691
'BackendWebSocketService:connectionStateChanged',
692692
{
693-
state: WebSocketState.ERROR,
693+
state: WebSocketState.DISCONNECTED,
694694
url: 'ws://test',
695695
reconnectAttempts: 2,
696696
timeout: 10000,
@@ -701,7 +701,7 @@ describe('AccountActivityService', () => {
701701
);
702702
await completeAsyncOperations(100);
703703

704-
// Verify that the ERROR state triggered the status change for tracked chains
704+
// Verify that the DISCONNECTED state triggered the status change for tracked chains
705705
expect(statusChangedEventListener).toHaveBeenCalledWith({
706706
chainIds: ['eip155:1', 'eip155:137', 'eip155:56'],
707707
status: 'down',
@@ -720,11 +720,11 @@ describe('AccountActivityService', () => {
720720

721721
mocks.getSelectedAccount.mockReturnValue(null);
722722

723-
// Publish WebSocket ERROR state event without any tracked chains
723+
// Publish WebSocket DISCONNECTED state event without any tracked chains
724724
rootMessenger.publish(
725725
'BackendWebSocketService:connectionStateChanged',
726726
{
727-
state: WebSocketState.ERROR,
727+
state: WebSocketState.DISCONNECTED,
728728
url: 'ws://test',
729729
reconnectAttempts: 2,
730730
timeout: 10000,

packages/core-backend/src/AccountActivityService.ts

Lines changed: 15 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -451,6 +451,15 @@ export class AccountActivityService {
451451
status: data.status,
452452
timestamp,
453453
});
454+
455+
log(
456+
`WebSocket status change - Published tracked chains as ${data.status}`,
457+
{
458+
count: data.chainIds.length,
459+
chains: data.chainIds,
460+
status: data.status,
461+
},
462+
);
454463
}
455464

456465
/**
@@ -467,11 +476,8 @@ export class AccountActivityService {
467476
// WebSocket connected - resubscribe to selected account
468477
// The system notification will automatically provide the list of chains that are up
469478
await this.#subscribeToSelectedAccount();
470-
} else if (
471-
state === WebSocketState.DISCONNECTED ||
472-
state === WebSocketState.ERROR
473-
) {
474-
// On disconnect/error, flush all tracked chains as down
479+
} else if (state === WebSocketState.DISCONNECTED) {
480+
// On disconnect, flush all tracked chains as down
475481
const chainsToMarkDown = Array.from(this.#chainsUp);
476482

477483
if (chainsToMarkDown.length > 0) {
@@ -481,13 +487,10 @@ export class AccountActivityService {
481487
timestamp: Date.now(),
482488
});
483489

484-
log(
485-
'WebSocket error/disconnection - Published tracked chains as down',
486-
{
487-
count: chainsToMarkDown.length,
488-
chains: chainsToMarkDown,
489-
},
490-
);
490+
log('WebSocket disconnection - Published tracked chains as down', {
491+
count: chainsToMarkDown.length,
492+
chains: chainsToMarkDown,
493+
});
491494

492495
// Clear the tracking set since all chains are now down
493496
this.#chainsUp.clear();

packages/core-backend/src/BackendWebSocketService.test.ts

Lines changed: 169 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -102,8 +102,13 @@ class MockWebSocket extends EventTarget {
102102
MockWebSocket.instanceCount += 1;
103103
this.url = url;
104104
// TypeScript has issues with jest.spyOn on WebSocket methods, so using direct assignment
105+
// Store reference to simulateClose for use in close()
106+
const simulateCloseFn = this.simulateClose.bind(this);
105107
// eslint-disable-next-line jest/prefer-spy-on
106-
this.close = jest.fn().mockImplementation();
108+
this.close = jest.fn().mockImplementation((code = 1000, reason = '') => {
109+
// When close() is called, trigger the close event to simulate real WebSocket behavior
110+
simulateCloseFn(code, reason);
111+
});
107112
// eslint-disable-next-line jest/prefer-spy-on
108113
this.send = jest.fn().mockImplementation((data: string) => {
109114
this._lastSentMessage = data;
@@ -506,8 +511,8 @@ describe('BackendWebSocketService', () => {
506511
async ({ service }) => {
507512
expect(service.getConnectionInfo().url).toBe('ws://test.example.com');
508513
expect(service.getConnectionInfo().timeout).toBe(10000);
509-
expect(service.getConnectionInfo().reconnectDelay).toBe(500);
510-
expect(service.getConnectionInfo().maxReconnectDelay).toBe(5000);
514+
expect(service.getConnectionInfo().reconnectDelay).toBe(10000);
515+
expect(service.getConnectionInfo().maxReconnectDelay).toBe(60000);
511516
expect(service.getConnectionInfo().requestTimeout).toBe(30000);
512517
expect(service).toBeInstanceOf(BackendWebSocketService);
513518
},
@@ -877,14 +882,19 @@ describe('BackendWebSocketService', () => {
877882
mockWebSocketOptions: { autoConnect: false },
878883
},
879884
async ({ service, completeAsyncOperations }) => {
885+
// Mock Math.random to make Cockatiel's jitter deterministic
886+
jest.spyOn(Math, 'random').mockReturnValue(0);
887+
880888
// eslint-disable-next-line @typescript-eslint/no-floating-promises
881889
service.connect();
882890

883891
// Advance time past the timeout
884-
await completeAsyncOperations(150);
892+
await completeAsyncOperations(110);
885893

886-
// Should have transitioned to ERROR state after timeout
887-
expect(service.getConnectionInfo().state).toBe(WebSocketState.ERROR);
894+
// Should have transitioned to DISCONNECTED state after timeout
895+
expect(service.getConnectionInfo().state).toBe(
896+
WebSocketState.DISCONNECTED,
897+
);
888898
},
889899
);
890900
});
@@ -939,13 +949,15 @@ describe('BackendWebSocketService', () => {
939949
mockWs.simulateClose(1006, 'Connection failed');
940950
await completeAsyncOperations(0);
941951

942-
// Should schedule reconnect and be in ERROR state
943-
expect(service.getConnectionInfo().state).toBe(WebSocketState.ERROR);
952+
// Should schedule reconnect and be in DISCONNECTED state
953+
expect(service.getConnectionInfo().state).toBe(
954+
WebSocketState.DISCONNECTED,
955+
);
944956
},
945957
);
946958
});
947959

948-
it('should clear connection timeout in handleClose when timeout occurs then close fires', async () => {
960+
it('should clear connection timeout when timeout occurs then close fires', async () => {
949961
await withService(
950962
{
951963
options: { timeout: 100 },
@@ -965,24 +977,23 @@ describe('BackendWebSocketService', () => {
965977
WebSocketState.CONNECTING,
966978
);
967979

968-
// Let timeout fire (closes WebSocket and sets state to ERROR)
969-
await completeAsyncOperations(150);
980+
// Let timeout fire (closes WebSocket and sets state to DISCONNECTED)
981+
// Advance time past timeout but before reconnect would fire
982+
await completeAsyncOperations(100);
970983

971-
// State should be ERROR or DISCONNECTED after timeout
972-
const stateAfterTimeout = service.getConnectionInfo().state;
973-
expect([WebSocketState.ERROR, WebSocketState.DISCONNECTED]).toContain(
974-
stateAfterTimeout,
984+
// State should be DISCONNECTED after timeout
985+
expect(service.getConnectionInfo().state).toBe(
986+
WebSocketState.DISCONNECTED,
975987
);
976988

977989
// Now manually trigger close event
978-
// Since state is ERROR (not CONNECTING), onclose will call handleClose
979-
// which will clear connectionTimeout
990+
// Since state is DISCONNECTED, onclose will return early due to idempotency guard
980991
mockWs.simulateClose(1006, 'Close after timeout');
981992
await completeAsyncOperations(0);
982993

983-
// State should still be ERROR or DISCONNECTED
984-
expect([WebSocketState.ERROR, WebSocketState.DISCONNECTED]).toContain(
985-
service.getConnectionInfo().state,
994+
// State should still be DISCONNECTED
995+
expect(service.getConnectionInfo().state).toBe(
996+
WebSocketState.DISCONNECTED,
986997
);
987998
},
988999
);
@@ -1072,10 +1083,13 @@ describe('BackendWebSocketService', () => {
10721083
await completeAsyncOperations(10);
10731084

10741085
const mockWs = getMockWebSocket();
1075-
mockWs.simulateError();
1086+
// Simulate connection failure (error is always followed by close in real WebSocket)
1087+
mockWs.simulateClose(1006, 'Connection failed');
10761088
await completeAsyncOperations(0);
10771089

10781090
const attemptsBefore = service.getConnectionInfo().reconnectAttempts;
1091+
// Should be 1 after the failure
1092+
expect(attemptsBefore).toBe(1);
10791093

10801094
// Try to force reconnection while timer is already scheduled
10811095
await service.forceReconnection();
@@ -1087,6 +1101,134 @@ describe('BackendWebSocketService', () => {
10871101
},
10881102
);
10891103
});
1104+
1105+
it('should clear reconnect timer when feature is disabled', async () => {
1106+
let isEnabled = true;
1107+
await withService(
1108+
{
1109+
options: {
1110+
isEnabled: () => isEnabled,
1111+
},
1112+
mockWebSocketOptions: { autoConnect: false },
1113+
},
1114+
async ({ service, getMockWebSocket, completeAsyncOperations }) => {
1115+
// Mock Math.random to make Cockatiel's jitter deterministic
1116+
jest.spyOn(Math, 'random').mockReturnValue(0);
1117+
1118+
// Trigger a connection failure to schedule a reconnect
1119+
// eslint-disable-next-line @typescript-eslint/no-floating-promises
1120+
service.connect();
1121+
await completeAsyncOperations(10);
1122+
1123+
const mockWs = getMockWebSocket();
1124+
// Simulate connection failure to trigger reconnect timer
1125+
mockWs.simulateClose(1006, 'Connection failed');
1126+
await completeAsyncOperations(0);
1127+
1128+
// Verify reconnect timer is scheduled
1129+
expect(service.getConnectionInfo().reconnectAttempts).toBe(1);
1130+
1131+
// Now disable the feature
1132+
isEnabled = false;
1133+
1134+
// Try to connect again with feature disabled
1135+
await service.connect();
1136+
1137+
// Reconnect attempts should be reset to 0 (timers cleared)
1138+
expect(service.getConnectionInfo().reconnectAttempts).toBe(0);
1139+
1140+
// Advance time to ensure the old reconnect timer doesn't fire
1141+
await completeAsyncOperations(10000);
1142+
1143+
// Should still be 0, confirming timer was cleared
1144+
expect(service.getConnectionInfo().reconnectAttempts).toBe(0);
1145+
},
1146+
);
1147+
});
1148+
1149+
it('should include connectionDuration_ms in trace when connection was established', async () => {
1150+
const mockTraceFn = jest.fn((_request, fn) => fn?.());
1151+
await withService(
1152+
{
1153+
options: { traceFn: mockTraceFn },
1154+
},
1155+
async ({ service, getMockWebSocket, completeAsyncOperations }) => {
1156+
// Connect and let it establish
1157+
await service.connect();
1158+
await completeAsyncOperations(10);
1159+
1160+
expect(service.getConnectionInfo().state).toBe(
1161+
WebSocketState.CONNECTED,
1162+
);
1163+
1164+
// Clear previous trace calls to focus on disconnection trace
1165+
mockTraceFn.mockClear();
1166+
1167+
// Trigger unexpected close after connection was established
1168+
const mockWs = getMockWebSocket();
1169+
mockWs.simulateClose(1006, 'Abnormal closure');
1170+
await completeAsyncOperations(10);
1171+
1172+
// Find the Disconnection trace call
1173+
const disconnectionTrace = mockTraceFn.mock.calls.find(
1174+
(call) => call[0]?.name === 'BackendWebSocketService Disconnection',
1175+
);
1176+
1177+
expect(disconnectionTrace).toBeDefined();
1178+
expect(disconnectionTrace?.[0]?.data).toHaveProperty(
1179+
'connectionDuration_ms',
1180+
);
1181+
expect(
1182+
disconnectionTrace?.[0]?.data?.connectionDuration_ms,
1183+
).toBeGreaterThan(0);
1184+
},
1185+
);
1186+
});
1187+
1188+
it('should omit connectionDuration_ms in trace when connection never established', async () => {
1189+
const mockTraceFn = jest.fn((_request, fn) => fn?.());
1190+
await withService(
1191+
{
1192+
options: { traceFn: mockTraceFn },
1193+
mockWebSocketOptions: { autoConnect: false },
1194+
},
1195+
async ({ service, getMockWebSocket, completeAsyncOperations }) => {
1196+
// Start connecting
1197+
// eslint-disable-next-line @typescript-eslint/no-floating-promises
1198+
service.connect();
1199+
await completeAsyncOperations(0);
1200+
1201+
expect(service.getConnectionInfo().state).toBe(
1202+
WebSocketState.CONNECTING,
1203+
);
1204+
1205+
// Manually disconnect before connection establishes
1206+
// This will set state to DISCONNECTING and close the WebSocket
1207+
service.disconnect();
1208+
await completeAsyncOperations(0);
1209+
1210+
// Clear trace calls from connection attempt
1211+
mockTraceFn.mockClear();
1212+
1213+
// Simulate the close event after disconnect (state is now DISCONNECTING, not CONNECTING)
1214+
// This will trigger #handleClose with connectedAt = 0
1215+
const mockWs = getMockWebSocket();
1216+
mockWs.simulateClose(1000, 'Normal closure');
1217+
await completeAsyncOperations(10);
1218+
1219+
// Find the Disconnection trace call
1220+
const disconnectionTrace = mockTraceFn.mock.calls.find(
1221+
(call) => call[0]?.name === 'BackendWebSocketService Disconnection',
1222+
);
1223+
1224+
// Trace should exist but should NOT have connectionDuration_ms
1225+
expect(disconnectionTrace).toBeDefined();
1226+
expect(disconnectionTrace?.[0]?.data).not.toHaveProperty(
1227+
'connectionDuration_ms',
1228+
);
1229+
},
1230+
);
1231+
});
10901232
});
10911233

10921234
// =====================================================
@@ -1437,7 +1579,9 @@ describe('BackendWebSocketService', () => {
14371579

14381580
service.disconnect();
14391581

1440-
await expect(requestPromise).rejects.toThrow('WebSocket disconnected');
1582+
await expect(requestPromise).rejects.toThrow(
1583+
'WebSocket connection closed: 4999 Internal: Manual disconnect',
1584+
);
14411585
});
14421586
});
14431587

@@ -1840,7 +1984,9 @@ describe('BackendWebSocketService', () => {
18401984
service.destroy();
18411985

18421986
// Pending request should be rejected
1843-
await expect(requestPromise).rejects.toThrow('Service cleanup');
1987+
await expect(requestPromise).rejects.toThrow(
1988+
'WebSocket connection closed: 4999 Internal: Manual disconnect',
1989+
);
18441990

18451991
// Verify service is in disconnected state
18461992
expect(service.getConnectionInfo().state).toBe(

0 commit comments

Comments
 (0)