From bf882377be266fa99de106d80d7f28924a8b40e1 Mon Sep 17 00:00:00 2001 From: Daniel Flook Date: Mon, 2 Feb 2026 13:45:07 +0000 Subject: [PATCH 1/3] Rewrite health-sync checks to fix excessive registrations and missing container handling This rewrites syncChecks and setChecksCritical to fix two bugs: 1. Dataplane health checks were sent to Consul on every sync cycle regardless of whether status changed, causing excessive load on Consul servers (#309) 2. Missing containers were not considered when evaluating overall dataplane health, causing traffic to be routed before services were ready (#300) The new implementation structures syncChecks as three phases: 1. Gather state - fetch container health from ECS task metadata 2. Compute checks - transform container health into Consul check statuses 3. Update Consul - send updates only for checks whose status changed New helper functions: - getContainerHealthStatuses: replaces findContainersToSync, marks missing containers as UNHEALTHY in the same map as present containers - computeOverallDataplaneHealth: computes aggregate health - computeCheckStatuses: maps container health to Consul check IDs/statuses, handling the dataplane container's special case (service + proxy checks) setChecksCritical is updated to use computeCheckStatuses for consistency. handleHealthForDataplaneContainer is removed as its logic is now in computeCheckStatuses. 
--- subcommand/health-sync/checks.go | 244 +++++++++++------------ subcommand/health-sync/checks_test.go | 275 +++++++++++++++++++++++--- 2 files changed, 360 insertions(+), 159 deletions(-) diff --git a/subcommand/health-sync/checks.go b/subcommand/health-sync/checks.go index f071e327..737012ff 100644 --- a/subcommand/health-sync/checks.go +++ b/subcommand/health-sync/checks.go @@ -62,163 +62,136 @@ func (c *Command) fetchHealthChecks(consulClient *api.Client, taskMeta awsutil.E return healthCheckMap, nil } -// setChecksCritical sets checks for all of the containers to critical -func (c *Command) setChecksCritical(consulClient *api.Client, taskMeta awsutil.ECSTaskMeta, clusterARN string, parsedContainerNames []string) error { +// setChecksCritical sets checks for all of the containers to critical. +// Used during graceful shutdown (SIGTERM). +func (c *Command) setChecksCritical(consulClient *api.Client, taskMeta awsutil.ECSTaskMeta, clusterARN string, containerNames []string) error { var result error - taskID := taskMeta.TaskID() serviceName := c.constructServiceName(taskMeta.Family) + serviceID := makeServiceID(serviceName, taskMeta.TaskID()) + + // Create a map with all containers as unhealthy to get all check IDs + containerStatuses := make(map[string]string) + for _, name := range containerNames { + containerStatuses[name] = ecs.HealthStatusUnhealthy + } + + // Use computeCheckStatuses to get all check IDs + checkStatuses := c.computeCheckStatuses(serviceID, containerNames, containerStatuses) - for _, containerName := range parsedContainerNames { - var err error - if containerName == config.ConsulDataplaneContainerName { - err = c.handleHealthForDataplaneContainer(consulClient, taskID, serviceName, clusterARN, containerName, ecs.HealthStatusUnhealthy) + // Update all checks to critical + for checkID := range checkStatuses { + err := c.updateConsulHealthStatus(consulClient, checkID, clusterARN, api.HealthCritical) + if err != nil { + c.log.Warn("failed to set 
Consul health status to critical", "err", err, "checkID", checkID) + result = multierror.Append(result, err) } else { - checkID := constructCheckID(makeServiceID(serviceName, taskID), containerName) - err = c.updateConsulHealthStatus(consulClient, checkID, clusterARN, ecs.HealthStatusUnhealthy) + c.log.Info("set Consul health status to critical", "checkID", checkID) } + } - if err == nil { - c.log.Info("set Consul health status to critical", - "container", containerName) + return result +} + + +// computeOverallDataplaneHealth computes the aggregate health status. +// Returns UNHEALTHY if any container is unhealthy. +func computeOverallDataplaneHealth(containerStatuses map[string]string) string { + if len(containerStatuses) == 0 { + // This should not be possible in practice since containerNames always + // includes at least the dataplane container. Treat as unhealthy to be safe. + return ecs.HealthStatusUnhealthy + } + + for _, status := range containerStatuses { + if status != ecs.HealthStatusHealthy { + return ecs.HealthStatusUnhealthy + } + } + return ecs.HealthStatusHealthy +} + +// computeCheckStatuses computes the desired Consul health status for each check. +// Returns a map of checkID -> Consul health status (api.HealthPassing or api.HealthCritical). 
+func (c *Command) computeCheckStatuses(serviceID string, containerNames []string, containerStatuses map[string]string) map[string]string { + checkStatuses := make(map[string]string) + + // Overall dataplane health is the aggregate of all container statuses + overallHealth := ecsHealthToConsulHealth(computeOverallDataplaneHealth(containerStatuses)) + + for _, name := range containerNames { + if name == config.ConsulDataplaneContainerName { + // Dataplane container maps to overall health on service check + serviceCheckID := constructCheckID(serviceID, name) + checkStatuses[serviceCheckID] = overallHealth + + // Non-gateways also have a proxy check + if !c.config.IsGateway() { + proxySvcID, _ := makeProxySvcIDAndName(serviceID, "") + proxyCheckID := constructCheckID(proxySvcID, name) + checkStatuses[proxyCheckID] = overallHealth + } } else { - c.log.Warn("failed to set Consul health status to critical", - "err", err, - "container", containerName) - result = multierror.Append(result, err) + // Non-dataplane containers map directly to their individual check + checkID := constructCheckID(serviceID, name) + checkStatuses[checkID] = ecsHealthToConsulHealth(containerStatuses[name]) } } - return result + return checkStatuses } -// syncChecks fetches metadata for the ECS task and uses that metadata to -// updates the Consul TTL checks for the containers specified in -// `parsedContainerNames`. Checks are only updated if they have changed since -// the last invocation of this function. +// syncChecks fetches ECS task metadata and updates Consul health checks +// for the specified containers. Checks are only updated when their status +// has changed since the last invocation. 
func (c *Command) syncChecks(consulClient *api.Client, - currentStatuses map[string]string, + previousStatuses map[string]string, clusterARN string, - parsedContainerNames []string) map[string]string { - // Fetch task metadata to get latest health of the containers + containerNames []string) map[string]string { + + // Phase 1: Gather current container state taskMeta, err := awsutil.ECSTaskMetadata() if err != nil { c.log.Error("unable to get task metadata", "err", err) - return currentStatuses + return previousStatuses } serviceName := c.constructServiceName(taskMeta.Family) - containersToSync, missingContainers := findContainersToSync(parsedContainerNames, taskMeta) + serviceID := makeServiceID(serviceName, taskMeta.TaskID()) + containerStatuses := getContainerHealthStatuses(containerNames, taskMeta) - // Mark the Consul health status as critical for missing containers - for _, name := range missingContainers { - checkID := constructCheckID(makeServiceID(serviceName, taskMeta.TaskID()), name) - c.log.Debug("marking container as unhealthy since it wasn't found in the task metadata", "name", name) + // Phase 2: Turn ecs container health into consul checks + currentStatuses := c.computeCheckStatuses(serviceID, containerNames, containerStatuses) - var err error - if name == config.ConsulDataplaneContainerName { - err = c.handleHealthForDataplaneContainer(consulClient, taskMeta.TaskID(), serviceName, clusterARN, name, ecs.HealthStatusUnhealthy) - } else { - err = c.updateConsulHealthStatus(consulClient, checkID, clusterARN, ecs.HealthStatusUnhealthy) + // Phase 3: Update Consul for any checks that have changed + for checkID, status := range currentStatuses { + previousStatus := previousStatuses[checkID] + if status == previousStatus { + continue } + err := c.updateConsulHealthStatus(consulClient, checkID, clusterARN, status) if err != nil { - c.log.Error("failed to update Consul health status for missing container", "err", err, "container", name) + c.log.Warn("failed 
to update Consul health status", "err", err, "checkID", checkID) + // Keep the previous status on error so we retry next cycle + currentStatuses[checkID] = previousStatus } else { - c.log.Info("container health check updated in Consul for missing container", "container", name) - currentStatuses[name] = api.HealthCritical + c.log.Info("health check updated in Consul", "checkID", checkID, "status", status) } } - parsedContainers := make(map[string]string) - // iterate over parse - for _, container := range containersToSync { - c.log.Debug("updating Consul check from ECS container health", - "name", container.Name, - "status", container.Health.Status, - "statusSince", container.Health.StatusSince, - "exitCode", container.Health.ExitCode, - ) - parsedContainers[container.Name] = container.Health.Status - previousStatus := currentStatuses[container.Name] - if container.Health.Status != previousStatus { - var err error - if container.Name != config.ConsulDataplaneContainerName { - checkID := constructCheckID(makeServiceID(serviceName, taskMeta.TaskID()), container.Name) - err = c.updateConsulHealthStatus(consulClient, checkID, clusterARN, container.Health.Status) - } - - if err != nil { - c.log.Warn("failed to update Consul health status", "err", err) - } else { - c.log.Info("container health check updated in Consul", - "name", container.Name, - "status", container.Health.Status, - "statusSince", container.Health.StatusSince, - "exitCode", container.Health.ExitCode, - ) - currentStatuses[container.Name] = container.Health.Status - } - } - - } - overallDataplaneHealthStatus, ok := parsedContainers[config.ConsulDataplaneContainerName] - // if dataplane container exist and healthy then proceed to checking the other containers health - if ok && overallDataplaneHealthStatus == ecs.HealthStatusHealthy { - // - for _, healthStatus := range parsedContainers { - // as soon as we find any unhealthy container, we can set the dataplane health to unhealthy - if healthStatus != 
ecs.HealthStatusHealthy { - overallDataplaneHealthStatus = ecs.HealthStatusUnhealthy - break - } - } - } else { - // If no dataplane container or dataplane container is not healthy set overall health to unhealthy - overallDataplaneHealthStatus = ecs.HealthStatusUnhealthy - } - - err = c.handleHealthForDataplaneContainer(consulClient, taskMeta.TaskID(), serviceName, clusterARN, config.ConsulDataplaneContainerName, overallDataplaneHealthStatus) - if err != nil { - c.log.Warn("failed to update Consul health status", "err", err) - } - + // Phase 4: Return current status return currentStatuses } -// handleHealthForDataplaneContainer takes care of the special handling needed for syncing -// the health of consul-dataplane container. We register two checks (one for the service -// and the other for proxy) when registering a typical service to the catalog. Updates -// should also happen twice in such cases. -func (c *Command) handleHealthForDataplaneContainer(consulClient *api.Client, taskID, serviceName, clusterARN, containerName, ecsHealthStatus string) error { - var checkID string - serviceID := makeServiceID(serviceName, taskID) - if c.config.IsGateway() { - checkID = constructCheckID(serviceID, containerName) - return c.updateConsulHealthStatus(consulClient, checkID, clusterARN, ecsHealthStatus) - } - - checkID = constructCheckID(serviceID, containerName) - err := c.updateConsulHealthStatus(consulClient, checkID, clusterARN, ecsHealthStatus) - if err != nil { - return err - } - - proxySvcID, _ := makeProxySvcIDAndName(serviceID, "") - checkID = constructCheckID(proxySvcID, containerName) - return c.updateConsulHealthStatus(consulClient, checkID, clusterARN, ecsHealthStatus) -} - -func (c *Command) updateConsulHealthStatus(consulClient *api.Client, checkID string, clusterARN string, ecsHealthStatus string) error { - consulHealthStatus := ecsHealthToConsulHealth(ecsHealthStatus) - +func (c *Command) updateConsulHealthStatus(consulClient *api.Client, checkID string, 
clusterARN string, consulHealthStatus string) error { check, ok := c.checks[checkID] if !ok { return fmt.Errorf("unable to find check with ID %s", checkID) } check.Status = consulHealthStatus - check.Output = fmt.Sprintf("ECS health status is %q for container %q", ecsHealthStatus, checkID) + check.Output = fmt.Sprintf("Consul health status is %q for check %q", consulHealthStatus, checkID) c.checks[checkID] = check updateCheckReq := &api.CatalogRegistration{ @@ -245,24 +218,27 @@ func constructCheckID(serviceID, containerName string) string { return fmt.Sprintf("%s-%s", serviceID, containerName) } -func findContainersToSync(containerNames []string, taskMeta awsutil.ECSTaskMeta) ([]awsutil.ECSTaskMetaContainer, []string) { - var ecsContainers []awsutil.ECSTaskMetaContainer - var missing []string - - for _, container := range containerNames { - found := false - for _, ecsContainer := range taskMeta.Containers { - if ecsContainer.Name == container { - ecsContainers = append(ecsContainers, ecsContainer) - found = true - break - } - } - if !found { - missing = append(missing, container) +// getContainerHealthStatuses builds a map of container name to ECS health status. +// Missing containers are assigned ecs.HealthStatusUnhealthy. 
+func getContainerHealthStatuses(containerNames []string, taskMeta awsutil.ECSTaskMeta) map[string]string { + statuses := make(map[string]string) + + // Build a lookup map from task metadata + taskContainers := make(map[string]string) + for _, container := range taskMeta.Containers { + taskContainers[container.Name] = container.Health.Status + } + + // Map each requested container to its status + for _, name := range containerNames { + if status, found := taskContainers[name]; found { + statuses[name] = status + } else { + statuses[name] = ecs.HealthStatusUnhealthy } } - return ecsContainers, missing + + return statuses } func ecsHealthToConsulHealth(ecsHealth string) string { diff --git a/subcommand/health-sync/checks_test.go b/subcommand/health-sync/checks_test.go index 6b022fe0..ca7b3107 100644 --- a/subcommand/health-sync/checks_test.go +++ b/subcommand/health-sync/checks_test.go @@ -8,6 +8,7 @@ import ( "github.com/aws/aws-sdk-go/service/ecs" "github.com/hashicorp/consul-ecs/awsutil" + "github.com/hashicorp/consul-ecs/config" "github.com/hashicorp/consul/api" "github.com/stretchr/testify/require" ) @@ -19,52 +20,276 @@ func TestEcsHealthToConsulHealth(t *testing.T) { require.Equal(t, api.HealthCritical, ecsHealthToConsulHealth("")) } -func TestFindContainersToSync(t *testing.T) { - taskMetaContainer1 := awsutil.ECSTaskMetaContainer{ - Name: "container1", - } - +func TestGetContainerHealthStatuses(t *testing.T) { cases := map[string]struct { containerNames []string taskMeta awsutil.ECSTaskMeta - missing []string - found []awsutil.ECSTaskMetaContainer + expected map[string]string }{ - "A container isn't in the metadata": { - containerNames: []string{"container1"}, + "all containers present and healthy": { + containerNames: []string{"app", "sidecar"}, + taskMeta: awsutil.ECSTaskMeta{ + Containers: []awsutil.ECSTaskMetaContainer{ + {Name: "app", Health: awsutil.ECSTaskMetaHealth{Status: ecs.HealthStatusHealthy}}, + {Name: "sidecar", Health: 
awsutil.ECSTaskMetaHealth{Status: ecs.HealthStatusHealthy}}, + }, + }, + expected: map[string]string{ + "app": ecs.HealthStatusHealthy, + "sidecar": ecs.HealthStatusHealthy, + }, + }, + "one container unhealthy": { + containerNames: []string{"app", "sidecar"}, + taskMeta: awsutil.ECSTaskMeta{ + Containers: []awsutil.ECSTaskMetaContainer{ + {Name: "app", Health: awsutil.ECSTaskMetaHealth{Status: ecs.HealthStatusHealthy}}, + {Name: "sidecar", Health: awsutil.ECSTaskMetaHealth{Status: ecs.HealthStatusUnhealthy}}, + }, + }, + expected: map[string]string{ + "app": ecs.HealthStatusHealthy, + "sidecar": ecs.HealthStatusUnhealthy, + }, + }, + "container missing from metadata": { + containerNames: []string{"app", "sidecar"}, + taskMeta: awsutil.ECSTaskMeta{ + Containers: []awsutil.ECSTaskMetaContainer{ + {Name: "app", Health: awsutil.ECSTaskMetaHealth{Status: ecs.HealthStatusHealthy}}, + }, + }, + expected: map[string]string{ + "app": ecs.HealthStatusHealthy, + "sidecar": ecs.HealthStatusUnhealthy, + }, + }, + "all containers missing": { + containerNames: []string{"app", "sidecar"}, taskMeta: awsutil.ECSTaskMeta{}, - missing: []string{"container1"}, - found: nil, + expected: map[string]string{ + "app": ecs.HealthStatusUnhealthy, + "sidecar": ecs.HealthStatusUnhealthy, + }, }, - "The metadata has an extra container": { + "empty container list": { containerNames: []string{}, taskMeta: awsutil.ECSTaskMeta{ Containers: []awsutil.ECSTaskMetaContainer{ - taskMetaContainer1, + {Name: "app", Health: awsutil.ECSTaskMetaHealth{Status: ecs.HealthStatusHealthy}}, }, }, - missing: nil, - found: nil, + expected: map[string]string{}, }, - "some found and some not found": { - containerNames: []string{"container1", "container2"}, + "extra containers in metadata ignored": { + containerNames: []string{"app"}, taskMeta: awsutil.ECSTaskMeta{ Containers: []awsutil.ECSTaskMetaContainer{ - taskMetaContainer1, + {Name: "app", Health: awsutil.ECSTaskMetaHealth{Status: ecs.HealthStatusHealthy}}, + 
{Name: "extra", Health: awsutil.ECSTaskMetaHealth{Status: ecs.HealthStatusHealthy}}, }, }, - missing: []string{"container2"}, - found: []awsutil.ECSTaskMetaContainer{ - taskMetaContainer1, + expected: map[string]string{ + "app": ecs.HealthStatusHealthy, + }, + }, + "unknown status preserved": { + containerNames: []string{"app"}, + taskMeta: awsutil.ECSTaskMeta{ + Containers: []awsutil.ECSTaskMetaContainer{ + {Name: "app", Health: awsutil.ECSTaskMetaHealth{Status: ecs.HealthStatusUnknown}}, + }, + }, + expected: map[string]string{ + "app": ecs.HealthStatusUnknown, + }, + }, + } + + for name, tc := range cases { + t.Run(name, func(t *testing.T) { + result := getContainerHealthStatuses(tc.containerNames, tc.taskMeta) + require.Equal(t, tc.expected, result) + }) + } +} + +func TestComputeOverallDataplaneHealth(t *testing.T) { + cases := map[string]struct { + containerStatuses map[string]string + expected string + }{ + "all healthy": { + containerStatuses: map[string]string{ + "app": ecs.HealthStatusHealthy, + "sidecar": ecs.HealthStatusHealthy, + }, + expected: ecs.HealthStatusHealthy, + }, + "one unhealthy": { + containerStatuses: map[string]string{ + "app": ecs.HealthStatusHealthy, + "sidecar": ecs.HealthStatusUnhealthy, + }, + expected: ecs.HealthStatusUnhealthy, + }, + "one unknown": { + containerStatuses: map[string]string{ + "app": ecs.HealthStatusHealthy, + "sidecar": ecs.HealthStatusUnknown, + }, + expected: ecs.HealthStatusUnhealthy, + }, + "all unhealthy": { + containerStatuses: map[string]string{ + "app": ecs.HealthStatusUnhealthy, + "sidecar": ecs.HealthStatusUnhealthy, + }, + expected: ecs.HealthStatusUnhealthy, + }, + "empty map treated as unhealthy": { + // This should not happen in practice since containerNames always + // includes at least the dataplane container. Treated as unhealthy to be safe. 
+ containerStatuses: map[string]string{}, + expected: ecs.HealthStatusUnhealthy, + }, + "single healthy": { + containerStatuses: map[string]string{ + "app": ecs.HealthStatusHealthy, }, + expected: ecs.HealthStatusHealthy, + }, + "single unhealthy": { + containerStatuses: map[string]string{ + "app": ecs.HealthStatusUnhealthy, + }, + expected: ecs.HealthStatusUnhealthy, }, } - for name, testData := range cases { + for name, tc := range cases { t.Run(name, func(t *testing.T) { - found, missing := findContainersToSync(testData.containerNames, testData.taskMeta) - require.Equal(t, testData.missing, missing) - require.Equal(t, testData.found, found) + result := computeOverallDataplaneHealth(tc.containerStatuses) + require.Equal(t, tc.expected, result) + }) + } +} + +func TestComputeCheckStatuses(t *testing.T) { + const ( + serviceID = "test-service-12345" + dataplaneContainer = config.ConsulDataplaneContainerName + ) + + // Expected check IDs for non-gateway + serviceCheckID := constructCheckID(serviceID, dataplaneContainer) + proxySvcID, _ := makeProxySvcIDAndName(serviceID, "") + proxyCheckID := constructCheckID(proxySvcID, dataplaneContainer) + appCheckID := constructCheckID(serviceID, "app") + + cases := map[string]struct { + isGateway bool + containerNames []string + containerStatuses map[string]string + expectedChecks map[string]string + }{ + "non-gateway all healthy": { + isGateway: false, + containerNames: []string{"app", dataplaneContainer}, + containerStatuses: map[string]string{ + "app": ecs.HealthStatusHealthy, + dataplaneContainer: ecs.HealthStatusHealthy, + }, + expectedChecks: map[string]string{ + appCheckID: api.HealthPassing, + serviceCheckID: api.HealthPassing, + proxyCheckID: api.HealthPassing, + }, + }, + "non-gateway app unhealthy affects overall health": { + isGateway: false, + containerNames: []string{"app", dataplaneContainer}, + containerStatuses: map[string]string{ + "app": ecs.HealthStatusUnhealthy, + dataplaneContainer: 
ecs.HealthStatusHealthy, + }, + expectedChecks: map[string]string{ + appCheckID: api.HealthCritical, + serviceCheckID: api.HealthCritical, + proxyCheckID: api.HealthCritical, + }, + }, + "non-gateway dataplane unhealthy": { + isGateway: false, + containerNames: []string{"app", dataplaneContainer}, + containerStatuses: map[string]string{ + "app": ecs.HealthStatusHealthy, + dataplaneContainer: ecs.HealthStatusUnhealthy, + }, + expectedChecks: map[string]string{ + appCheckID: api.HealthPassing, + serviceCheckID: api.HealthCritical, + proxyCheckID: api.HealthCritical, + }, + }, + "non-gateway dataplane only": { + isGateway: false, + containerNames: []string{dataplaneContainer}, + containerStatuses: map[string]string{ + dataplaneContainer: ecs.HealthStatusHealthy, + }, + expectedChecks: map[string]string{ + serviceCheckID: api.HealthPassing, + proxyCheckID: api.HealthPassing, + }, + }, + "gateway healthy": { + isGateway: true, + containerNames: []string{dataplaneContainer}, + containerStatuses: map[string]string{ + dataplaneContainer: ecs.HealthStatusHealthy, + }, + expectedChecks: map[string]string{ + serviceCheckID: api.HealthPassing, + }, + }, + "gateway unhealthy": { + isGateway: true, + containerNames: []string{dataplaneContainer}, + containerStatuses: map[string]string{ + dataplaneContainer: ecs.HealthStatusUnhealthy, + }, + expectedChecks: map[string]string{ + serviceCheckID: api.HealthCritical, + }, + }, + "gateway no proxy check": { + isGateway: true, + containerNames: []string{"app", dataplaneContainer}, + containerStatuses: map[string]string{ + "app": ecs.HealthStatusHealthy, + dataplaneContainer: ecs.HealthStatusHealthy, + }, + expectedChecks: map[string]string{ + appCheckID: api.HealthPassing, + serviceCheckID: api.HealthPassing, + }, + }, + } + + for name, tc := range cases { + t.Run(name, func(t *testing.T) { + cmd := &Command{ + config: &config.Config{}, + } + if tc.isGateway { + cmd.config.Gateway = &config.GatewayRegistration{ + Kind: 
api.ServiceKindMeshGateway, + } + } + + result := cmd.computeCheckStatuses(serviceID, tc.containerNames, tc.containerStatuses) + require.Equal(t, tc.expectedChecks, result) }) } } From 62d3167dbeba3a1114db3c2ad6483933ea78b11e Mon Sep 17 00:00:00 2001 From: Daniel Flook Date: Mon, 2 Feb 2026 13:45:13 +0000 Subject: [PATCH 2/3] Add change detection integration tests for syncChecks Add integration tests that verify: - Checks are updated correctly when container health changes - Checks are not updated when health remains the same (no spurious updates) - Missing containers are treated as unhealthy - Recovery from unhealthy to healthy works correctly - Gateway services work correctly (no proxy check) Tests use log output parsing to verify that only expected checks were updated, ensuring the change detection logic works correctly. --- subcommand/health-sync/command_test.go | 519 ++++++++++++++++++++++++- 1 file changed, 515 insertions(+), 4 deletions(-) diff --git a/subcommand/health-sync/command_test.go b/subcommand/health-sync/command_test.go index 3f4322c8..7f65278b 100644 --- a/subcommand/health-sync/command_test.go +++ b/subcommand/health-sync/command_test.go @@ -4,11 +4,13 @@ package healthsync import ( + "bytes" "context" "encoding/json" "fmt" "os" "path/filepath" + "regexp" "sync/atomic" "syscall" "testing" @@ -22,6 +24,7 @@ import ( "github.com/hashicorp/consul-ecs/testutil" "github.com/hashicorp/consul-server-connection-manager/discovery" "github.com/hashicorp/consul/api" + "github.com/hashicorp/go-hclog" "github.com/hashicorp/serf/testutil/retry" "github.com/mitchellh/cli" "github.com/stretchr/testify/require" @@ -145,7 +148,7 @@ func TestRun(t *testing.T) { status: ecs.HealthStatusUnhealthy, }, }, - expectedDataplaneHealthStatus: api.HealthPassing, + expectedDataplaneHealthStatus: api.HealthCritical, consulLogin: consulLoginCfg, }, "two unhealthy health sync containers": { @@ -201,10 +204,10 @@ func TestRun(t *testing.T) { }, "container-2": { missing: true, - 
status: ecs.HealthStatusUnhealthy, + status: ecs.HealthStatusHealthy, }, }, - expectedDataplaneHealthStatus: api.HealthPassing, + expectedDataplaneHealthStatus: api.HealthCritical, shouldMissingContainersReappear: true, consulLogin: consulLoginCfg, }, @@ -381,6 +384,7 @@ func TestRun(t *testing.T) { if expCheck.CheckID == checkID { if hsc.missing { expCheck.Status = api.HealthCritical + markDataplaneContainerUnhealthy = true } else { expCheck.Status = ecsHealthToConsulHealth(hsc.status) // If there are multiple health sync containers and one of them is unhealthy @@ -446,8 +450,9 @@ func TestRun(t *testing.T) { if c.shouldMissingContainersReappear { // Mark all containers as non missing c.missingDataplaneContainer = false - for _, hsc := range c.healthSyncContainers { + for name, hsc := range c.healthSyncContainers { hsc.missing = false + c.healthSyncContainers[name] = hsc } // Add the containers data into task meta response @@ -841,6 +846,18 @@ func injectContainersIntoTaskMetaResponse(t *testing.T, taskMetadataResponse *aw return taskMetaRespStr } +func buildTaskMetaWithContainers(t *testing.T, taskMetadataResponse *awsutil.ECSTaskMeta, containers map[string]string) string { + var taskMetaContainersResponse []awsutil.ECSTaskMetaContainer + for name, status := range containers { + taskMetaContainersResponse = append(taskMetaContainersResponse, constructContainerResponse(name, status)) + } + taskMetadataResponse.Containers = taskMetaContainersResponse + taskMetaRespStr, err := constructTaskMetaResponseString(taskMetadataResponse) + require.NoError(t, err) + + return taskMetaRespStr +} + func constructContainerResponse(name, health string) awsutil.ECSTaskMetaContainer { return awsutil.ECSTaskMetaContainer{ Name: name, @@ -926,3 +943,497 @@ func registerNode(t *testing.T, consulClient *api.Client, taskMeta awsutil.ECSTa _, err = consulClient.Catalog().Register(payload, nil) require.NoError(t, err) } + +// expectedCheck represents the expected state of a health check 
+type expectedCheck struct { + serviceName string + checkID string + status string // api.HealthPassing or api.HealthCritical +} + +// assertCheckStatuses verifies all expected checks exist and have the expected status, with retries +func assertCheckStatuses(t *testing.T, client *api.Client, expectedChecks []expectedCheck, opts *api.QueryOptions) { + timer := &retry.Timer{Timeout: 5 * time.Second, Wait: 500 * time.Millisecond} + retry.RunWith(timer, t, func(r *retry.R) { + for _, exp := range expectedChecks { + filter := fmt.Sprintf("CheckID == `%s`", exp.checkID) + queryOpts := &api.QueryOptions{ + Filter: filter, + Namespace: opts.Namespace, + Partition: opts.Partition, + } + checks, _, err := client.Health().Checks(exp.serviceName, queryOpts) + require.NoError(r, err) + require.Len(r, checks, 1, "expected exactly one check with ID %s", exp.checkID) + require.Equal(r, exp.status, checks[0].Status, "check %s has unexpected status", exp.checkID) + } + }) +} + +// extractUpdatedCheckIDs parses log output and returns a set of checkIDs that were updated +func extractUpdatedCheckIDs(logContent string) map[string]bool { + updated := make(map[string]bool) + // Match log lines like: health check updated in Consul: checkID=xxx status=yyy + re := regexp.MustCompile(`health check updated in Consul: checkID=([^\s]+)`) + matches := re.FindAllStringSubmatch(logContent, -1) + for _, match := range matches { + if len(match) >= 2 { + updated[match[1]] = true + } + } + return updated +} + +// getExpectedUpdatedCheckIDs returns the set of checkIDs that should have been updated +// (i.e., those whose status changed between before and after) +func getExpectedUpdatedCheckIDs(before, after []expectedCheck) map[string]bool { + beforeStatus := make(map[string]string) + for _, c := range before { + beforeStatus[c.checkID] = c.status + } + + expected := make(map[string]bool) + for _, c := range after { + if beforeStatus[c.checkID] != c.status { + expected[c.checkID] = true + } + } + return 
expected +} + +// assertExpectedUpdates verifies that exactly the expected checks were updated (no more, no less) +func assertExpectedUpdates(t *testing.T, logContent string, expectedBefore, expectedAfter []expectedCheck) { + actualUpdates := extractUpdatedCheckIDs(logContent) + expectedUpdates := getExpectedUpdatedCheckIDs(expectedBefore, expectedAfter) + + // Check for missing updates (expected but not found in logs) + for checkID := range expectedUpdates { + require.True(t, actualUpdates[checkID], + "expected check %s to be updated but no log entry found", checkID) + } + + // Check for unexpected updates (found in logs but not expected) + for checkID := range actualUpdates { + require.True(t, expectedUpdates[checkID], + "check %s was updated but should not have been", checkID) + } +} + +type syncChecksTestConfig struct { + service *config.ServiceRegistration + proxy *config.AgentServiceConnectProxyConfig + gateway *config.GatewayRegistration + healthSyncContainers []string +} + +type syncChecksTestEnvironment struct { + consulClient *api.Client + apiQueryOptions *api.QueryOptions + cmd *Command + clusterARN string + containerNames []string + taskMetadataResponse *awsutil.ECSTaskMeta + currentTaskMetaResp *atomic.Value + logBuffer *bytes.Buffer +} + +func setupSyncChecksTest(t *testing.T, cfg syncChecksTestConfig) *syncChecksTestEnvironment { + var ( + partition = "" + namespace = "" + ) + + if testutil.EnterpriseFlag() { + partition = "foo" + namespace = "default" + } + + server, consulCfg := testutil.ConsulServer(t, nil) + consulClient, err := api.NewClient(consulCfg) + require.NoError(t, err) + + if testutil.EnterpriseFlag() { + createPartitionAndNamespace(t, consulClient, partition, namespace) + } + + _, serverGRPCPort := testutil.GetHostAndPortFromAddress(server.GRPCAddr) + _, serverHTTPPort := testutil.GetHostAndPortFromAddress(server.HTTPAddr) + + taskMetadataResponse := &awsutil.ECSTaskMeta{ + Cluster: "arn:aws:ecs:us-east-1:123456789:cluster/test", + 
TaskARN: "arn:aws:ecs:us-east-1:123456789:task/test/abcdef", + Family: "test-family", + } + taskMetaRespStr, err := constructTaskMetaResponseString(taskMetadataResponse) + require.NoError(t, err) + + var currentTaskMetaResp atomic.Value + currentTaskMetaResp.Store(taskMetaRespStr) + testutil.TaskMetaServer(t, testutil.TaskMetaHandlerFn(t, + func() string { + return currentTaskMetaResp.Load().(string) + }, + )) + + envoyBootstrapDir := testutil.TempDir(t) + + consulEcsConfig := config.Config{ + LogLevel: "DEBUG", + BootstrapDir: envoyBootstrapDir, + HealthSyncContainers: cfg.healthSyncContainers, + ConsulServers: config.ConsulServers{ + Hosts: "127.0.0.1", + GRPC: config.GRPCSettings{ + Port: serverGRPCPort, + }, + HTTP: config.HTTPSettings{ + Port: serverHTTPPort, + }, + SkipServerWatch: true, + }, + } + + if cfg.gateway != nil { + consulEcsConfig.Gateway = cfg.gateway + if testutil.EnterpriseFlag() { + consulEcsConfig.Gateway.Namespace = namespace + consulEcsConfig.Gateway.Partition = partition + } + } else { + consulEcsConfig.Service = *cfg.service + consulEcsConfig.Proxy = cfg.proxy + if testutil.EnterpriseFlag() { + consulEcsConfig.Service.Namespace = namespace + consulEcsConfig.Service.Partition = partition + } + } + + testutil.SetECSConfigEnvVar(t, &consulEcsConfig) + + // Run mesh-init to register services and create health checks + ui := cli.NewMockUi() + ctrlPlaneCmd := meshinit.Command{UI: ui} + code := ctrlPlaneCmd.Run(nil) + require.Equal(t, 0, code, ui.ErrorWriter.String()) + + // Set up the Command with a logger that writes to a buffer for testing + logBuf := &bytes.Buffer{} + logger := hclog.New(&hclog.LoggerOptions{ + Level: hclog.LevelFromString(consulEcsConfig.LogLevel), + Output: logBuf, + }) + + cmd := &Command{UI: ui} + cmd.config = &consulEcsConfig + cmd.log = logger + + taskMeta, err := awsutil.ECSTaskMetadata() + require.NoError(t, err) + + cmd.checks, err = cmd.fetchHealthChecks(consulClient, taskMeta) + require.NoError(t, err) + + 
clusterARN, err := taskMeta.ClusterARN() + require.NoError(t, err) + + return &syncChecksTestEnvironment{ + consulClient: consulClient, + apiQueryOptions: &api.QueryOptions{ + Namespace: namespace, + Partition: partition, + }, + cmd: cmd, + clusterARN: clusterARN, + containerNames: append(cfg.healthSyncContainers, config.ConsulDataplaneContainerName), + taskMetadataResponse: taskMetadataResponse, + currentTaskMetaResp: &currentTaskMetaResp, + logBuffer: logBuf, + } +} + +// TestSyncChecks_ChangeDetection tests that syncChecks correctly updates Consul +// health checks and only updates when status actually changes. +func TestSyncChecks_ChangeDetection(t *testing.T) { + serviceName := "test-service" + proxyServiceName := fmt.Sprintf("%s-sidecar-proxy", serviceName) + servicePort := 8080 + taskID := "abcdef" + + cases := map[string]struct { + healthSyncContainers []string + startingContainers map[string]string + expectedChecksBeforeUpdate []expectedCheck + updatedContainers map[string]string + expectedChecksAfterUpdate []expectedCheck + }{ + "no change should not update consul": { + healthSyncContainers: []string{"app"}, + startingContainers: map[string]string{ + "app": ecs.HealthStatusHealthy, + config.ConsulDataplaneContainerName: ecs.HealthStatusHealthy, + }, + expectedChecksBeforeUpdate: []expectedCheck{ + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-app", serviceName, taskID), status: api.HealthPassing}, + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-consul-dataplane", serviceName, taskID), status: api.HealthPassing}, + {serviceName: proxyServiceName, checkID: fmt.Sprintf("%s-%s-sidecar-proxy-consul-dataplane", serviceName, taskID), status: api.HealthPassing}, + }, + updatedContainers: map[string]string{ + "app": ecs.HealthStatusHealthy, + config.ConsulDataplaneContainerName: ecs.HealthStatusHealthy, + }, + expectedChecksAfterUpdate: []expectedCheck{ + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-app", serviceName, taskID), status: 
api.HealthPassing}, + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-consul-dataplane", serviceName, taskID), status: api.HealthPassing}, + {serviceName: proxyServiceName, checkID: fmt.Sprintf("%s-%s-sidecar-proxy-consul-dataplane", serviceName, taskID), status: api.HealthPassing}, + }, + }, + "healthy to unhealthy should update all checks": { + healthSyncContainers: []string{"app"}, + startingContainers: map[string]string{ + "app": ecs.HealthStatusHealthy, + config.ConsulDataplaneContainerName: ecs.HealthStatusHealthy, + }, + expectedChecksBeforeUpdate: []expectedCheck{ + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-app", serviceName, taskID), status: api.HealthPassing}, + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-consul-dataplane", serviceName, taskID), status: api.HealthPassing}, + {serviceName: proxyServiceName, checkID: fmt.Sprintf("%s-%s-sidecar-proxy-consul-dataplane", serviceName, taskID), status: api.HealthPassing}, + }, + updatedContainers: map[string]string{ + "app": ecs.HealthStatusUnhealthy, + config.ConsulDataplaneContainerName: ecs.HealthStatusHealthy, + }, + expectedChecksAfterUpdate: []expectedCheck{ + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-app", serviceName, taskID), status: api.HealthCritical}, + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-consul-dataplane", serviceName, taskID), status: api.HealthCritical}, + {serviceName: proxyServiceName, checkID: fmt.Sprintf("%s-%s-sidecar-proxy-consul-dataplane", serviceName, taskID), status: api.HealthCritical}, + }, + }, + "one of two containers becomes unhealthy": { + healthSyncContainers: []string{"app1", "app2"}, + startingContainers: map[string]string{ + "app1": ecs.HealthStatusHealthy, + "app2": ecs.HealthStatusHealthy, + config.ConsulDataplaneContainerName: ecs.HealthStatusHealthy, + }, + expectedChecksBeforeUpdate: []expectedCheck{ + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-app1", serviceName, taskID), status: 
api.HealthPassing}, + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-app2", serviceName, taskID), status: api.HealthPassing}, + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-consul-dataplane", serviceName, taskID), status: api.HealthPassing}, + {serviceName: proxyServiceName, checkID: fmt.Sprintf("%s-%s-sidecar-proxy-consul-dataplane", serviceName, taskID), status: api.HealthPassing}, + }, + updatedContainers: map[string]string{ + "app1": ecs.HealthStatusUnhealthy, + "app2": ecs.HealthStatusHealthy, + config.ConsulDataplaneContainerName: ecs.HealthStatusHealthy, + }, + expectedChecksAfterUpdate: []expectedCheck{ + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-app1", serviceName, taskID), status: api.HealthCritical}, + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-app2", serviceName, taskID), status: api.HealthPassing}, + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-consul-dataplane", serviceName, taskID), status: api.HealthCritical}, + {serviceName: proxyServiceName, checkID: fmt.Sprintf("%s-%s-sidecar-proxy-consul-dataplane", serviceName, taskID), status: api.HealthCritical}, + }, + }, + "unhealthy to healthy recovery": { + healthSyncContainers: []string{"app"}, + startingContainers: map[string]string{ + "app": ecs.HealthStatusUnhealthy, + config.ConsulDataplaneContainerName: ecs.HealthStatusHealthy, + }, + expectedChecksBeforeUpdate: []expectedCheck{ + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-app", serviceName, taskID), status: api.HealthCritical}, + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-consul-dataplane", serviceName, taskID), status: api.HealthCritical}, + {serviceName: proxyServiceName, checkID: fmt.Sprintf("%s-%s-sidecar-proxy-consul-dataplane", serviceName, taskID), status: api.HealthCritical}, + }, + updatedContainers: map[string]string{ + "app": ecs.HealthStatusHealthy, + config.ConsulDataplaneContainerName: ecs.HealthStatusHealthy, + }, + expectedChecksAfterUpdate: 
[]expectedCheck{ + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-app", serviceName, taskID), status: api.HealthPassing}, + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-consul-dataplane", serviceName, taskID), status: api.HealthPassing}, + {serviceName: proxyServiceName, checkID: fmt.Sprintf("%s-%s-sidecar-proxy-consul-dataplane", serviceName, taskID), status: api.HealthPassing}, + }, + }, + "dataplane container goes missing": { + healthSyncContainers: []string{"app"}, + startingContainers: map[string]string{ + "app": ecs.HealthStatusHealthy, + config.ConsulDataplaneContainerName: ecs.HealthStatusHealthy, + }, + expectedChecksBeforeUpdate: []expectedCheck{ + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-app", serviceName, taskID), status: api.HealthPassing}, + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-consul-dataplane", serviceName, taskID), status: api.HealthPassing}, + {serviceName: proxyServiceName, checkID: fmt.Sprintf("%s-%s-sidecar-proxy-consul-dataplane", serviceName, taskID), status: api.HealthPassing}, + }, + updatedContainers: map[string]string{ + "app": ecs.HealthStatusHealthy, + // dataplane container is missing from task metadata + }, + expectedChecksAfterUpdate: []expectedCheck{ + // app check remains passing since app container is still healthy + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-app", serviceName, taskID), status: api.HealthPassing}, + // dataplane checks become critical because dataplane container is missing + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-consul-dataplane", serviceName, taskID), status: api.HealthCritical}, + {serviceName: proxyServiceName, checkID: fmt.Sprintf("%s-%s-sidecar-proxy-consul-dataplane", serviceName, taskID), status: api.HealthCritical}, + }, + }, + "missing container treated as unhealthy": { + healthSyncContainers: []string{"app"}, + startingContainers: map[string]string{ + config.ConsulDataplaneContainerName: ecs.HealthStatusHealthy, + }, + 
expectedChecksBeforeUpdate: []expectedCheck{ + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-app", serviceName, taskID), status: api.HealthCritical}, + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-consul-dataplane", serviceName, taskID), status: api.HealthCritical}, + {serviceName: proxyServiceName, checkID: fmt.Sprintf("%s-%s-sidecar-proxy-consul-dataplane", serviceName, taskID), status: api.HealthCritical}, + }, + updatedContainers: map[string]string{ + config.ConsulDataplaneContainerName: ecs.HealthStatusHealthy, + }, + expectedChecksAfterUpdate: []expectedCheck{ + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-app", serviceName, taskID), status: api.HealthCritical}, + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-consul-dataplane", serviceName, taskID), status: api.HealthCritical}, + {serviceName: proxyServiceName, checkID: fmt.Sprintf("%s-%s-sidecar-proxy-consul-dataplane", serviceName, taskID), status: api.HealthCritical}, + }, + }, + "container reappears healthy": { + healthSyncContainers: []string{"app"}, + startingContainers: map[string]string{ + config.ConsulDataplaneContainerName: ecs.HealthStatusHealthy, + }, + expectedChecksBeforeUpdate: []expectedCheck{ + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-app", serviceName, taskID), status: api.HealthCritical}, + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-consul-dataplane", serviceName, taskID), status: api.HealthCritical}, + {serviceName: proxyServiceName, checkID: fmt.Sprintf("%s-%s-sidecar-proxy-consul-dataplane", serviceName, taskID), status: api.HealthCritical}, + }, + updatedContainers: map[string]string{ + "app": ecs.HealthStatusHealthy, + config.ConsulDataplaneContainerName: ecs.HealthStatusHealthy, + }, + expectedChecksAfterUpdate: []expectedCheck{ + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-app", serviceName, taskID), status: api.HealthPassing}, + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-consul-dataplane", 
serviceName, taskID), status: api.HealthPassing}, + {serviceName: proxyServiceName, checkID: fmt.Sprintf("%s-%s-sidecar-proxy-consul-dataplane", serviceName, taskID), status: api.HealthPassing}, + }, + }, + } + + for name, tc := range cases { + tc := tc + t.Run(name, func(t *testing.T) { + env := setupSyncChecksTest(t, syncChecksTestConfig{ + service: &config.ServiceRegistration{ + Name: serviceName, + Port: servicePort, + }, + proxy: &config.AgentServiceConnectProxyConfig{ + PublicListenerPort: config.DefaultPublicListenerPort, + }, + healthSyncContainers: tc.healthSyncContainers, + }) + + // Set up initial container state + env.currentTaskMetaResp.Store(buildTaskMetaWithContainers(t, env.taskMetadataResponse, tc.startingContainers)) + + // First syncChecks call + statuses := env.cmd.syncChecks(env.consulClient, map[string]string{}, env.clusterARN, env.containerNames) + + // Assert expected checks after first call + assertCheckStatuses(t, env.consulClient, tc.expectedChecksBeforeUpdate, env.apiQueryOptions) + + // Update container state for second call + env.currentTaskMetaResp.Store(buildTaskMetaWithContainers(t, env.taskMetadataResponse, tc.updatedContainers)) + + // Clear log buffer before second call to isolate its log output + env.logBuffer.Reset() + + // Second syncChecks call + statuses = env.cmd.syncChecks(env.consulClient, statuses, env.clusterARN, env.containerNames) + + // Assert expected checks after second call + assertCheckStatuses(t, env.consulClient, tc.expectedChecksAfterUpdate, env.apiQueryOptions) + + // Verify exactly the expected checks were updated (no more, no less) + assertExpectedUpdates(t, env.logBuffer.String(), tc.expectedChecksBeforeUpdate, tc.expectedChecksAfterUpdate) + }) + } +} + +// TestSyncChecks_Gateway_ChangeDetection tests syncChecks change detection for gateway services +func TestSyncChecks_Gateway_ChangeDetection(t *testing.T) { + serviceName := "test-mesh-gateway" + taskID := "abcdef" + + cases := map[string]struct { + 
startingContainers map[string]string + expectedChecksBeforeUpdate []expectedCheck + updatedContainers map[string]string + expectedChecksAfterUpdate []expectedCheck + }{ + "no change should not update consul": { + startingContainers: map[string]string{ + config.ConsulDataplaneContainerName: ecs.HealthStatusHealthy, + }, + expectedChecksBeforeUpdate: []expectedCheck{ + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-consul-dataplane", serviceName, taskID), status: api.HealthPassing}, + }, + updatedContainers: map[string]string{ + config.ConsulDataplaneContainerName: ecs.HealthStatusHealthy, + }, + expectedChecksAfterUpdate: []expectedCheck{ + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-consul-dataplane", serviceName, taskID), status: api.HealthPassing}, + }, + }, + "healthy to missing dataplane should update": { + startingContainers: map[string]string{ + config.ConsulDataplaneContainerName: ecs.HealthStatusHealthy, + }, + expectedChecksBeforeUpdate: []expectedCheck{ + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-consul-dataplane", serviceName, taskID), status: api.HealthPassing}, + }, + updatedContainers: map[string]string{}, + expectedChecksAfterUpdate: []expectedCheck{ + {serviceName: serviceName, checkID: fmt.Sprintf("%s-%s-consul-dataplane", serviceName, taskID), status: api.HealthCritical}, + }, + }, + } + + for name, tc := range cases { + tc := tc + t.Run(name, func(t *testing.T) { + env := setupSyncChecksTest(t, syncChecksTestConfig{ + gateway: &config.GatewayRegistration{ + Kind: api.ServiceKindMeshGateway, + Name: serviceName, + LanAddress: &config.GatewayAddress{ + Port: 12345, + }, + }, + }) + + // Set up initial container state + env.currentTaskMetaResp.Store(buildTaskMetaWithContainers(t, env.taskMetadataResponse, tc.startingContainers)) + + // First syncChecks call + statuses := env.cmd.syncChecks(env.consulClient, map[string]string{}, env.clusterARN, env.containerNames) + + // Assert expected checks after first call + 
assertCheckStatuses(t, env.consulClient, tc.expectedChecksBeforeUpdate, env.apiQueryOptions) + + // Update container state for second call + env.currentTaskMetaResp.Store(buildTaskMetaWithContainers(t, env.taskMetadataResponse, tc.updatedContainers)) + + // Clear log buffer before second call to isolate its log output + env.logBuffer.Reset() + + // Second syncChecks call + statuses = env.cmd.syncChecks(env.consulClient, statuses, env.clusterARN, env.containerNames) + + // Assert expected checks after second call + assertCheckStatuses(t, env.consulClient, tc.expectedChecksAfterUpdate, env.apiQueryOptions) + + // Verify exactly the expected checks were updated (no more, no less) + assertExpectedUpdates(t, env.logBuffer.String(), tc.expectedChecksBeforeUpdate, tc.expectedChecksAfterUpdate) + }) + } +} From 8bb719d6593e35eeadda9d1ec575302ee57087f9 Mon Sep 17 00:00:00 2001 From: Daniel Flook Date: Tue, 3 Feb 2026 13:53:36 +0000 Subject: [PATCH 3/3] Fix health check output message regression The health-sync rewrite changed the output message from referencing an ECS health check that may or may not be accurate, to a generic consul health status: Before (original): ECS health status is "HEALTHY" for container "..." After (rewrite): Consul health status is "passing" for check "..." The check output message will be improved in the future with distinct messages for missing containers, aggregate health, and graceful shutdown, but for now this just reverts to the original output messaging in a way that can be built on in the future. It makes use of a distinct checkStatus struct which should prevent mixup with ECS health status strings. 
--- subcommand/health-sync/checks.go | 47 ++++++++++++++++++-------- subcommand/health-sync/checks_test.go | 44 +++++++++++++++++------- subcommand/health-sync/command.go | 2 +- subcommand/health-sync/command_test.go | 4 +-- 4 files changed, 67 insertions(+), 30 deletions(-) diff --git a/subcommand/health-sync/checks.go b/subcommand/health-sync/checks.go index 737012ff..0ec14cc4 100644 --- a/subcommand/health-sync/checks.go +++ b/subcommand/health-sync/checks.go @@ -13,6 +13,12 @@ import ( "github.com/hashicorp/go-multierror" ) +// checkStatus holds the computed status for a Consul health check. +type checkStatus struct { + consulStatus string // Consul health status (api.HealthPassing or api.HealthCritical) + output string // the message to display in Consul +} + // fetchHealthChecks fetches the Consul health checks for both the service // and proxy registrations func (c *Command) fetchHealthChecks(consulClient *api.Client, taskMeta awsutil.ECSTaskMeta) (map[string]*api.HealthCheck, error) { @@ -80,8 +86,8 @@ func (c *Command) setChecksCritical(consulClient *api.Client, taskMeta awsutil.E checkStatuses := c.computeCheckStatuses(serviceID, containerNames, containerStatuses) // Update all checks to critical - for checkID := range checkStatuses { - err := c.updateConsulHealthStatus(consulClient, checkID, clusterARN, api.HealthCritical) + for checkID, status := range checkStatuses { + err := c.updateConsulHealthStatus(consulClient, checkID, clusterARN, status) if err != nil { c.log.Warn("failed to set Consul health status to critical", "err", err, "checkID", checkID) result = multierror.Append(result, err) @@ -112,29 +118,40 @@ func computeOverallDataplaneHealth(containerStatuses map[string]string) string { } // computeCheckStatuses computes the desired Consul health status for each check. -// Returns a map of checkID -> Consul health status (api.HealthPassing or api.HealthCritical). 
-func (c *Command) computeCheckStatuses(serviceID string, containerNames []string, containerStatuses map[string]string) map[string]string { - checkStatuses := make(map[string]string) +// Returns a map of checkID -> checkStatus containing both Consul status and output message. +func (c *Command) computeCheckStatuses(serviceID string, containerNames []string, containerStatuses map[string]string) map[string]checkStatus { + checkStatuses := make(map[string]checkStatus) // Overall dataplane health is the aggregate of all container statuses - overallHealth := ecsHealthToConsulHealth(computeOverallDataplaneHealth(containerStatuses)) + overallECSHealth := computeOverallDataplaneHealth(containerStatuses) + overallConsulHealth := ecsHealthToConsulHealth(overallECSHealth) for _, name := range containerNames { if name == config.ConsulDataplaneContainerName { // Dataplane container maps to overall health on service check serviceCheckID := constructCheckID(serviceID, name) - checkStatuses[serviceCheckID] = overallHealth + checkStatuses[serviceCheckID] = checkStatus{ + consulStatus: overallConsulHealth, + output: fmt.Sprintf("ECS health status is %q for container %q", overallECSHealth, serviceCheckID), + } // Non-gateways also have a proxy check if !c.config.IsGateway() { proxySvcID, _ := makeProxySvcIDAndName(serviceID, "") proxyCheckID := constructCheckID(proxySvcID, name) - checkStatuses[proxyCheckID] = overallHealth + checkStatuses[proxyCheckID] = checkStatus{ + consulStatus: overallConsulHealth, + output: fmt.Sprintf("ECS health status is %q for container %q", overallECSHealth, proxyCheckID), + } } } else { // Non-dataplane containers map directly to their individual check checkID := constructCheckID(serviceID, name) - checkStatuses[checkID] = ecsHealthToConsulHealth(containerStatuses[name]) + ecsHealth := containerStatuses[name] + checkStatuses[checkID] = checkStatus{ + consulStatus: ecsHealthToConsulHealth(ecsHealth), + output: fmt.Sprintf("ECS health status is %q for 
container %q", ecsHealth, checkID), + } } } @@ -145,9 +162,9 @@ func (c *Command) computeCheckStatuses(serviceID string, containerNames []string // for the specified containers. Checks are only updated when their status // has changed since the last invocation. func (c *Command) syncChecks(consulClient *api.Client, - previousStatuses map[string]string, + previousStatuses map[string]checkStatus, clusterARN string, - containerNames []string) map[string]string { + containerNames []string) map[string]checkStatus { // Phase 1: Gather current container state taskMeta, err := awsutil.ECSTaskMetadata() @@ -176,7 +193,7 @@ func (c *Command) syncChecks(consulClient *api.Client, // Keep the previous status on error so we retry next cycle currentStatuses[checkID] = previousStatus } else { - c.log.Info("health check updated in Consul", "checkID", checkID, "status", status) + c.log.Info("health check updated in Consul", "checkID", checkID, "status", status.consulStatus) } } @@ -184,14 +201,14 @@ func (c *Command) syncChecks(consulClient *api.Client, return currentStatuses } -func (c *Command) updateConsulHealthStatus(consulClient *api.Client, checkID string, clusterARN string, consulHealthStatus string) error { +func (c *Command) updateConsulHealthStatus(consulClient *api.Client, checkID string, clusterARN string, status checkStatus) error { check, ok := c.checks[checkID] if !ok { return fmt.Errorf("unable to find check with ID %s", checkID) } - check.Status = consulHealthStatus - check.Output = fmt.Sprintf("Consul health status is %q for check %q", consulHealthStatus, checkID) + check.Status = status.consulStatus + check.Output = status.output c.checks[checkID] = check updateCheckReq := &api.CatalogRegistration{ diff --git a/subcommand/health-sync/checks_test.go b/subcommand/health-sync/checks_test.go index ca7b3107..779f6eef 100644 --- a/subcommand/health-sync/checks_test.go +++ b/subcommand/health-sync/checks_test.go @@ -4,6 +4,7 @@ package healthsync import ( + "fmt" 
"testing" "github.com/aws/aws-sdk-go/service/ecs" @@ -188,10 +189,11 @@ func TestComputeCheckStatuses(t *testing.T) { appCheckID := constructCheckID(serviceID, "app") cases := map[string]struct { - isGateway bool - containerNames []string - containerStatuses map[string]string - expectedChecks map[string]string + isGateway bool + containerNames []string + containerStatuses map[string]string + expectedConsulStatuses map[string]string + expectedOutputs map[string]string // optional, only checked if non-nil }{ "non-gateway all healthy": { isGateway: false, @@ -200,11 +202,14 @@ func TestComputeCheckStatuses(t *testing.T) { "app": ecs.HealthStatusHealthy, dataplaneContainer: ecs.HealthStatusHealthy, }, - expectedChecks: map[string]string{ + expectedConsulStatuses: map[string]string{ appCheckID: api.HealthPassing, serviceCheckID: api.HealthPassing, proxyCheckID: api.HealthPassing, }, + expectedOutputs: map[string]string{ + appCheckID: fmt.Sprintf("ECS health status is %q for container %q", ecs.HealthStatusHealthy, appCheckID), + }, }, "non-gateway app unhealthy affects overall health": { isGateway: false, @@ -213,11 +218,14 @@ func TestComputeCheckStatuses(t *testing.T) { "app": ecs.HealthStatusUnhealthy, dataplaneContainer: ecs.HealthStatusHealthy, }, - expectedChecks: map[string]string{ + expectedConsulStatuses: map[string]string{ appCheckID: api.HealthCritical, serviceCheckID: api.HealthCritical, proxyCheckID: api.HealthCritical, }, + expectedOutputs: map[string]string{ + appCheckID: fmt.Sprintf("ECS health status is %q for container %q", ecs.HealthStatusUnhealthy, appCheckID), + }, }, "non-gateway dataplane unhealthy": { isGateway: false, @@ -226,7 +234,7 @@ func TestComputeCheckStatuses(t *testing.T) { "app": ecs.HealthStatusHealthy, dataplaneContainer: ecs.HealthStatusUnhealthy, }, - expectedChecks: map[string]string{ + expectedConsulStatuses: map[string]string{ appCheckID: api.HealthPassing, serviceCheckID: api.HealthCritical, proxyCheckID: api.HealthCritical, @@ 
-238,7 +246,7 @@ func TestComputeCheckStatuses(t *testing.T) { containerStatuses: map[string]string{ dataplaneContainer: ecs.HealthStatusHealthy, }, - expectedChecks: map[string]string{ + expectedConsulStatuses: map[string]string{ serviceCheckID: api.HealthPassing, proxyCheckID: api.HealthPassing, }, @@ -249,7 +257,7 @@ func TestComputeCheckStatuses(t *testing.T) { containerStatuses: map[string]string{ dataplaneContainer: ecs.HealthStatusHealthy, }, - expectedChecks: map[string]string{ + expectedConsulStatuses: map[string]string{ serviceCheckID: api.HealthPassing, }, }, @@ -259,7 +267,7 @@ func TestComputeCheckStatuses(t *testing.T) { containerStatuses: map[string]string{ dataplaneContainer: ecs.HealthStatusUnhealthy, }, - expectedChecks: map[string]string{ + expectedConsulStatuses: map[string]string{ serviceCheckID: api.HealthCritical, }, }, @@ -270,7 +278,7 @@ func TestComputeCheckStatuses(t *testing.T) { "app": ecs.HealthStatusHealthy, dataplaneContainer: ecs.HealthStatusHealthy, }, - expectedChecks: map[string]string{ + expectedConsulStatuses: map[string]string{ appCheckID: api.HealthPassing, serviceCheckID: api.HealthPassing, }, @@ -289,7 +297,19 @@ func TestComputeCheckStatuses(t *testing.T) { } result := cmd.computeCheckStatuses(serviceID, tc.containerNames, tc.containerStatuses) - require.Equal(t, tc.expectedChecks, result) + + // Check consul statuses + for checkID, expectedStatus := range tc.expectedConsulStatuses { + require.Equal(t, expectedStatus, result[checkID].consulStatus, "consul status mismatch for %s", checkID) + } + + // Check output messages if specified + for checkID, expectedOutput := range tc.expectedOutputs { + require.Equal(t, expectedOutput, result[checkID].output, "output mismatch for %s", checkID) + } + + // Verify no extra checks + require.Len(t, result, len(tc.expectedConsulStatuses)) }) } } diff --git a/subcommand/health-sync/command.go b/subcommand/health-sync/command.go index a57a6303..ebdff2da 100644 --- 
a/subcommand/health-sync/command.go +++ b/subcommand/health-sync/command.go @@ -136,7 +136,7 @@ func (c *Command) realRun() error { var healthSyncContainers []string healthSyncContainers = append(healthSyncContainers, c.config.HealthSyncContainers...) healthSyncContainers = append(healthSyncContainers, config.ConsulDataplaneContainerName) - currentHealthStatuses := make(map[string]string) + currentHealthStatuses := make(map[string]checkStatus) c.checks, err = c.fetchHealthChecks(consulClient, taskMeta) if err != nil { diff --git a/subcommand/health-sync/command_test.go b/subcommand/health-sync/command_test.go index 7f65278b..3407fa0a 100644 --- a/subcommand/health-sync/command_test.go +++ b/subcommand/health-sync/command_test.go @@ -1336,7 +1336,7 @@ func TestSyncChecks_ChangeDetection(t *testing.T) { env.currentTaskMetaResp.Store(buildTaskMetaWithContainers(t, env.taskMetadataResponse, tc.startingContainers)) // First syncChecks call - statuses := env.cmd.syncChecks(env.consulClient, map[string]string{}, env.clusterARN, env.containerNames) + statuses := env.cmd.syncChecks(env.consulClient, map[string]checkStatus{}, env.clusterARN, env.containerNames) // Assert expected checks after first call assertCheckStatuses(t, env.consulClient, tc.expectedChecksBeforeUpdate, env.apiQueryOptions) @@ -1415,7 +1415,7 @@ func TestSyncChecks_Gateway_ChangeDetection(t *testing.T) { env.currentTaskMetaResp.Store(buildTaskMetaWithContainers(t, env.taskMetadataResponse, tc.startingContainers)) // First syncChecks call - statuses := env.cmd.syncChecks(env.consulClient, map[string]string{}, env.clusterARN, env.containerNames) + statuses := env.cmd.syncChecks(env.consulClient, map[string]checkStatus{}, env.clusterARN, env.containerNames) // Assert expected checks after first call assertCheckStatuses(t, env.consulClient, tc.expectedChecksBeforeUpdate, env.apiQueryOptions)