From 3ca158b90ce0902b7f8f9b9656f85e1b479ff95e Mon Sep 17 00:00:00 2001
From: Justin Azoff <justin.azoff@oxide.computer>
Date: Thu, 12 Mar 2026 15:49:06 -0400
Subject: [PATCH] improve logging around unhealthy clocks

We have seen this error:

    clock synchronization error: this node is more than 500ms away from at least half of the known nodes

but when this happens it's not clear what the real issue is. Are the
clocks 501ms away, or 5000ms?

This logs an additional error, including the measured offset, any time
a remote node's clock offset is found to be unhealthy, for example:

    E260312 20:20:10.978114 15 2@rpc/clock_offset.go:256  [-] 3  node 3 is not healthy: clock offset is off=91ns, err=31ns, at=1970-01-01 00:00:00 +0000 UTC
---
 pkg/rpc/clock_offset.go | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pkg/rpc/clock_offset.go b/pkg/rpc/clock_offset.go
index 0d21f7265c1..008b632131c 100644
--- a/pkg/rpc/clock_offset.go
+++ b/pkg/rpc/clock_offset.go
@@ -252,6 +252,8 @@ func (r *RemoteClockMonitor) VerifyClockOffset(ctx context.Context) error {
 			offsets = append(offsets, float64(offset.Offset-offset.Uncertainty))
 			if offset.isHealthy(ctx, maxOffset) {
 				healthyOffsetCount++
+			} else {
+				log.Health.Errorf(ctx, "node %s is not healthy: clock offset is %s", addr, offset)
 			}
 		}
 		numClocks := len(r.mu.offsets)