diff --git a/docs/cluster-readiness.md b/docs/cluster-readiness.md index 059774f..fec7607 100644 --- a/docs/cluster-readiness.md +++ b/docs/cluster-readiness.md @@ -424,10 +424,12 @@ Missing pieces are warnings, not errors — the command surfaces them so you can `./bin/mcp-runtime cluster doctor` runs post-install diagnostics: - Detects your distribution (k3s / kind / minikube / docker-desktop / generic). -- Checks the installed MCP Runtime namespaces, CRDs, operator, Traefik ingress, registry, Sentinel, and MCPServer reconciliation path. +- Checks the installed MCP Runtime namespaces, CRDs, operator, Traefik ingress, registry, Sentinel, and MCPServer reconciliation path, including readiness of the temporary smoke deployment. +- Prefers k3s' bundled Traefik in `kube-system/traefik` when the active cluster is k3s, then falls back to the repo-managed `traefik/traefik` install. - Verifies registry reachability, registry image-pull smoke behavior, and common pod image-pull failures. - Reports `http: server gave HTTP response to HTTPS client` when kubelet/containerd tried HTTPS against the HTTP dev registry, including the affected pod and image where possible. -- Prints the distribution-specific remediation checklist from this document. +- Streams the current check before running it, including helper pod probes and waits, so a slow run shows what it is doing. +- Prints the distribution-specific registry remediation hint only when registry or image-pull checks fail; Traefik and Sentinel failures use their own check-specific remedies. Run `bootstrap` before `setup` on a fresh cluster. Run `cluster doctor` after setup, or when debugging `ImagePullBackOff` on an installed MCP Runtime stack. diff --git a/docs/internals/go-package-reference.md b/docs/internals/go-package-reference.md index d3cde71..83d90fa 100644 --- a/docs/internals/go-package-reference.md +++ b/docs/internals/go-package-reference.md @@ -1762,6 +1762,18 @@ type DoctorCheck struct { } DoctorCheck is a single preflight check result. +type DoctorCheckProgress func(DoctorCheckProgressEvent) func(DoctorCheck) + DoctorCheckProgress is called before each doctor check starts. It returns an + optional completion callback that receives the finished check result. + +type DoctorCheckProgressEvent struct { + Name string + Detail string + Index int + Total int +} + DoctorCheckProgressEvent describes the check that is about to run. + type DoctorReport struct { Distribution Distribution Checks []DoctorCheck @@ -1771,6 +1783,13 @@ type DoctorReport struct { func RunDoctor(kubectl KubectlRunner) DoctorReport RunDoctor executes cluster diagnostics and returns a report. +func RunDoctorAndPrint(kubectl KubectlRunner) DoctorReport + RunDoctorAndPrint streams doctor progress and results as checks execute. + +func RunDoctorWithProgress(kubectl KubectlRunner, progress DoctorCheckProgress) DoctorReport + RunDoctorWithProgress executes cluster diagnostics and calls progress hooks + before and after each check. It is useful for UIs that need live feedback. + func (r DoctorReport) AllOK() bool AllOK reports whether every check passed. diff --git a/internal/cli/cluster_doctor.go b/internal/cli/cluster_doctor.go index 7f3800f..52177ef 100644 --- a/internal/cli/cluster_doctor.go +++ b/internal/cli/cluster_doctor.go @@ -7,6 +7,8 @@ package cli // full list of per-distribution prerequisites. import ( + "bufio" + "bytes" "encoding/base64" "fmt" "strconv" @@ -41,11 +43,31 @@ type DoctorReport struct { Checks []DoctorCheck } +// DoctorCheckProgress is called before each doctor check starts. It returns an +// optional completion callback that receives the finished check result. +type DoctorCheckProgress func(DoctorCheckProgressEvent) func(DoctorCheck) + +// DoctorCheckProgressEvent describes the check that is about to run. +type DoctorCheckProgressEvent struct { + Name string + Detail string + Index int + Total int +} + +type doctorCheckSpec struct { + Name string + Detail string + Run func() DoctorCheck +} + const ( doctorMCPServersNamespace = "mcp-servers" doctorTraefikNamespace = "traefik" + doctorK3sTraefikNamespace = "kube-system" doctorTraefikServiceName = "traefik" doctorTraefikWebPort = 8000 + doctorK3sTraefikWebPort = 80 doctorSentinelNamespace = "mcp-sentinel" doctorSentinelAPIService = "mcp-sentinel-api" @@ -63,6 +85,19 @@ const ( imagePullDescribeLimit = 8 ) +type doctorTraefikEndpoint struct { + Namespace string + Name string + WebPort int + Source string +} + +type doctorServicePort struct { + Name string + Port int + NodePort string +} + // AllOK reports whether every check passed. func (r DoctorReport) AllOK() bool { for _, c := range r.Checks { @@ -81,8 +116,7 @@ func (m *ClusterManager) newClusterDoctorCmd() *cobra.Command { "operator/CRD prerequisites, ingress (Traefik) wiring, image pulls, Sentinel, and MCPServer reconciliation are healthy. Prints remediation steps for your distribution " + "when something is missing. See docs/cluster-readiness.md for the full per-distribution checklist.", RunE: func(cmd *cobra.Command, args []string) error { - report := RunDoctor(m.kubectl) - PrintDoctorReport(report) + report := RunDoctorAndPrint(m.kubectl) if !report.AllOK() { return newWithSentinel(ErrSetupStepFailed, "cluster doctor found unmet prerequisites; see docs/cluster-readiness.md") } @@ -94,32 +128,113 @@ func (m *ClusterManager) newClusterDoctorCmd() *cobra.Command { // RunDoctor executes cluster diagnostics and returns a report. func RunDoctor(kubectl KubectlRunner) DoctorReport { distro := DetectDistribution(kubectl) + return runDoctorChecks(kubectl, distro, nil) +} + +// RunDoctorWithProgress executes cluster diagnostics and calls progress hooks +// before and after each check. It is useful for UIs that need live feedback. +func RunDoctorWithProgress(kubectl KubectlRunner, progress DoctorCheckProgress) DoctorReport { + distro := DetectDistribution(kubectl) + return runDoctorChecks(kubectl, distro, progress) +} + +// RunDoctorAndPrint streams doctor progress and results as checks execute. +func RunDoctorAndPrint(kubectl KubectlRunner) DoctorReport { + Section("Cluster Doctor") + Info("Detecting Kubernetes distribution — reading node kubelet versions, node names, and current context") + distro := DetectDistribution(kubectl) + Info(fmt.Sprintf("Distribution: %s", distro)) + + report := runDoctorChecks(kubectl, distro, printDoctorCheckProgress) + printDoctorReportFooter(report) + return report +} + +func runDoctorChecks(kubectl KubectlRunner, distro Distribution, progress DoctorCheckProgress) DoctorReport { + specs := doctorCheckSpecs(kubectl, distro) + checks := make([]DoctorCheck, 0, len(specs)) + for i, spec := range specs { + finish := func(DoctorCheck) {} + if progress != nil { + event := DoctorCheckProgressEvent{ + Name: spec.Name, + Detail: spec.Detail, + Index: i + 1, + Total: len(specs), + } + if progressFinish := progress(event); progressFinish != nil { + finish = progressFinish + } + } + check := spec.Run() + if check.Name == "" { + check.Name = spec.Name + } + finish(check) + checks = append(checks, check) + } return DoctorReport{ Distribution: distro, - Checks: []DoctorCheck{ - checkNamespaceExists(kubectl, doctorMCPServersNamespace), - checkNamespaceDefaultServiceAccount(kubectl, doctorMCPServersNamespace), - checkNamespacePolicyGuardrails(kubectl, doctorMCPServersNamespace), - checkNamespacePodAdmission(kubectl, doctorMCPServersNamespace), - checkMCPServerCRD(kubectl), - checkOperatorReady(kubectl), - checkOperatorRecentReconcileErrors(kubectl), - checkTraefikIngressClass(kubectl), - checkTraefikDeploymentReady(kubectl), - checkTraefikWebEntrypoint(kubectl), - checkTraefikServiceExposure(kubectl), - checkMCPServersDNSAndNetwork(kubectl), - checkIngressRouteProbe(kubectl, doctorMCPServersNamespace), - checkRegistryService(kubectl), - checkRegistryReachableFromCluster(kubectl), - checkMCPServersImagePullSecrets(kubectl, doctorMCPServersNamespace), - checkMCPServersImagePullSmoke(kubectl, doctorMCPServersNamespace), - checkRegistryHTTPPullMismatch(kubectl), - checkSentinelSecrets(kubectl), - checkSentinelAPIAuthProbe(kubectl), - checkNodeCapacity(kubectl), - checkPendingPodsByNamespace(kubectl), - checkMCPServerReconcileSmoke(kubectl, doctorMCPServersNamespace), + Checks: checks, + } +} + +func doctorCheckSpecs(kubectl KubectlRunner, distro Distribution) []doctorCheckSpec { + return []doctorCheckSpec{ + { + Name: fmt.Sprintf("namespace %s", doctorMCPServersNamespace), + Detail: "reading namespace metadata from the Kubernetes API", + Run: func() DoctorCheck { return checkNamespaceExists(kubectl, doctorMCPServersNamespace) }, + }, + { + Name: fmt.Sprintf("namespace %s default serviceaccount", doctorMCPServersNamespace), + Detail: "confirming pods in the runtime namespace have a default service account", + Run: func() DoctorCheck { return checkNamespaceDefaultServiceAccount(kubectl, doctorMCPServersNamespace) }, + }, + { + Name: fmt.Sprintf("namespace %s quota/limitrange", doctorMCPServersNamespace), + Detail: "listing ResourceQuota and LimitRange objects that can block smoke pods", + Run: func() DoctorCheck { return checkNamespacePolicyGuardrails(kubectl, doctorMCPServersNamespace) }, + }, + { + Name: fmt.Sprintf("namespace %s pod admission", doctorMCPServersNamespace), + Detail: "submitting a server-side dry-run pod to exercise admission webhooks and quota", + Run: func() DoctorCheck { return checkNamespacePodAdmission(kubectl, doctorMCPServersNamespace) }, + }, + {Name: "MCPServer CRD", Detail: "checking that the MCPServer API type is installed", Run: func() DoctorCheck { return checkMCPServerCRD(kubectl) }}, + {Name: "operator readiness", Detail: "reading ready and desired replicas for the operator deployment", Run: func() DoctorCheck { return checkOperatorReady(kubectl) }}, + {Name: "operator reconcile errors (last 10m)", Detail: "scanning recent operator logs for reconcile failure patterns", Run: func() DoctorCheck { return checkOperatorRecentReconcileErrors(kubectl) }}, + {Name: "traefik ingressClass", Detail: "checking that the traefik IngressClass exists", Run: func() DoctorCheck { return checkTraefikIngressClass(kubectl) }}, + {Name: "traefik deployment readiness", Detail: "reading ready and desired replicas for Traefik", Run: func() DoctorCheck { return checkTraefikDeploymentReady(kubectl, distro) }}, + {Name: "traefik web entrypoint", Detail: "checking the Traefik Service ports for the web entrypoint", Run: func() DoctorCheck { return checkTraefikWebEntrypoint(kubectl, distro) }}, + {Name: "traefik service exposure", Detail: "checking LoadBalancer or NodePort exposure for the web entrypoint", Run: func() DoctorCheck { return checkTraefikServiceExposure(kubectl, distro) }}, + {Name: "mcp-servers DNS/network", Detail: "launching a temporary curl pod in mcp-servers to reach the registry service", Run: func() DoctorCheck { return checkMCPServersDNSAndNetwork(kubectl) }}, + { + Name: "ingress route probe", + Detail: "reading the first MCP ingress and launching a temporary curl pod against Traefik", + Run: func() DoctorCheck { return checkIngressRouteProbe(kubectl, doctorMCPServersNamespace, distro) }, + }, + {Name: "registry Service", Detail: "checking the bundled registry Service and NodePort", Run: func() DoctorCheck { return checkRegistryService(kubectl) }}, + {Name: "registry reachability (in-cluster)", Detail: "launching a temporary curl pod in registry to call /v2/ over cluster DNS", Run: func() DoctorCheck { return checkRegistryReachableFromCluster(kubectl) }}, + { + Name: "mcp-servers imagePullSecrets", + Detail: "reading default service account pull secrets and verifying referenced Secret objects", + Run: func() DoctorCheck { return checkMCPServersImagePullSecrets(kubectl, doctorMCPServersNamespace) }, + }, + { + Name: "mcp-servers image pull smoke", + Detail: "creating a temporary pod and waiting up to 90s for kubelet image pull readiness", + Run: func() DoctorCheck { return checkMCPServersImagePullSmoke(kubectl, doctorMCPServersNamespace) }, + }, + {Name: "registry HTTP pull mismatch", Detail: "listing pods and inspecting image-pull failures for HTTP-vs-HTTPS registry errors", Run: func() DoctorCheck { return checkRegistryHTTPPullMismatch(kubectl) }}, + {Name: "sentinel secrets", Detail: "reading Sentinel API_KEYS and UI_API_KEY from mcp-sentinel-secrets", Run: func() DoctorCheck { return checkSentinelSecrets(kubectl) }}, + {Name: "sentinel API auth probe", Detail: "launching a temporary curl pod with UI_API_KEY against the Sentinel API", Run: func() DoctorCheck { return checkSentinelAPIAuthProbe(kubectl) }}, + {Name: "node capacity", Detail: "checking node metrics, then falling back to allocatable resources if metrics-server is absent", Run: func() DoctorCheck { return checkNodeCapacity(kubectl) }}, + {Name: "pending pods", Detail: "listing Pending pods across all namespaces", Run: func() DoctorCheck { return checkPendingPodsByNamespace(kubectl) }}, + { + Name: "MCPServer reconcile smoke", + Detail: "applying a temporary MCPServer and waiting up to 150s for deployment/service/ingress resources", + Run: func() DoctorCheck { return checkMCPServerReconcileSmoke(kubectl, doctorMCPServersNamespace) }, }, } } @@ -423,10 +538,18 @@ func checkOperatorRecentReconcileErrors(kubectl KubectlRunner) DoctorCheck { Remedy: "inspect operator logs directly and fix reconcile failures", } } - logs := strings.ToLower(string(out)) patterns := []string{"reconciler error", "failed to reconcile", "error syncing"} - for _, p := range patterns { - if strings.Contains(logs, p) { + scanner := bufio.NewScanner(bytes.NewReader(out)) + scanner.Buffer(make([]byte, 0, 64*1024), 1024*1024) + for scanner.Scan() { + line := strings.ToLower(scanner.Text()) + if strings.Contains(line, "doctor-smoke-") { + continue + } + for _, p := range patterns { + if !strings.Contains(line, p) { + continue + } return DoctorCheck{ Name: "operator reconcile errors (last 10m)", OK: false, @@ -435,6 +558,14 @@ func checkOperatorRecentReconcileErrors(kubectl KubectlRunner) DoctorCheck { } } } + if err := scanner.Err(); err != nil { + return DoctorCheck{ + Name: "operator reconcile errors (last 10m)", + OK: false, + Detail: fmt.Sprintf("failed scanning operator logs: %v", err), + Remedy: "inspect `kubectl logs -n mcp-runtime deploy/mcp-runtime-operator-controller-manager --since=10m`", + } + } return DoctorCheck{ Name: "operator reconcile errors (last 10m)", OK: true, @@ -469,14 +600,68 @@ func checkTraefikIngressClass(kubectl KubectlRunner) DoctorCheck { } } -func checkTraefikDeploymentReady(kubectl KubectlRunner) DoctorCheck { - cmd, err := kubectl.CommandArgs([]string{"get", "deploy", "-n", doctorTraefikNamespace, doctorTraefikServiceName, "-o", "jsonpath={.status.readyReplicas}/{.spec.replicas}"}) +func doctorTraefikEndpoints(distro Distribution) []doctorTraefikEndpoint { + if distro == DistroK3s { + return []doctorTraefikEndpoint{ + { + Namespace: doctorK3sTraefikNamespace, + Name: doctorTraefikServiceName, + WebPort: doctorK3sTraefikWebPort, + Source: "k3s bundled Traefik", + }, + { + Namespace: doctorTraefikNamespace, + Name: doctorTraefikServiceName, + WebPort: doctorTraefikWebPort, + Source: "repo-managed Traefik", + }, + } + } + return []doctorTraefikEndpoint{ + { + Namespace: doctorTraefikNamespace, + Name: doctorTraefikServiceName, + WebPort: doctorTraefikWebPort, + Source: "repo-managed Traefik", + }, + } +} + +func (e doctorTraefikEndpoint) label() string { + return fmt.Sprintf("%s %s/%s", e.Source, e.Namespace, e.Name) +} + +func traefikRemedy(distro Distribution) string { + if distro == DistroK3s { + return "k3s usually installs Traefik as `kube-system/traefik`; verify it is enabled with `kubectl -n kube-system get deploy,svc traefik`, or install the repo ingress overlay." + } + return "install Traefik deployment/service in namespace `traefik`, or run setup with the repo ingress overlay" +} + +func checkTraefikDeploymentReady(kubectl KubectlRunner, distro Distribution) DoctorCheck { + failures := make([]string, 0, len(doctorTraefikEndpoints(distro))) + for _, endpoint := range doctorTraefikEndpoints(distro) { + check := checkTraefikDeploymentReadyAt(kubectl, endpoint) + if check.OK { + return check + } + failures = append(failures, fmt.Sprintf("%s: %s", endpoint.label(), check.Detail)) + } + return DoctorCheck{ + Name: "traefik deployment readiness", + OK: false, + Detail: strings.Join(failures, "; "), + Remedy: traefikRemedy(distro), + } +} + +func checkTraefikDeploymentReadyAt(kubectl KubectlRunner, endpoint doctorTraefikEndpoint) DoctorCheck { + cmd, err := kubectl.CommandArgs([]string{"get", "deploy", "-n", endpoint.Namespace, endpoint.Name, "-o", "jsonpath={.status.readyReplicas}/{.spec.replicas}"}) if err != nil { return DoctorCheck{ Name: "traefik deployment readiness", OK: false, Detail: fmt.Sprintf("kubectl error: %v", err), - Remedy: "install Traefik deployment in namespace `traefik`", } } out, execErr := cmd.Output() @@ -485,8 +670,7 @@ func checkTraefikDeploymentReady(kubectl KubectlRunner) DoctorCheck { return DoctorCheck{ Name: "traefik deployment readiness", OK: false, - Detail: fmt.Sprintf("deployment %s/%s not found", doctorTraefikNamespace, doctorTraefikServiceName), - Remedy: "install Traefik deployment in namespace `traefik`", + Detail: fmt.Sprintf("deployment %s/%s not found", endpoint.Namespace, endpoint.Name), } } parts := strings.SplitN(pair, "/", 2) @@ -495,7 +679,6 @@ func checkTraefikDeploymentReady(kubectl KubectlRunner) DoctorCheck { Name: "traefik deployment readiness", OK: false, Detail: fmt.Sprintf("unexpected replica status %q", pair), - Remedy: "inspect `kubectl -n traefik get deploy traefik -o wide`", } } ready, readyErr := strconv.Atoi(strings.TrimSpace(parts[0])) @@ -504,63 +687,88 @@ func checkTraefikDeploymentReady(kubectl KubectlRunner) DoctorCheck { return DoctorCheck{ Name: "traefik deployment readiness", OK: false, - Detail: fmt.Sprintf("%s replicas ready", pair), - Remedy: "check Traefik pods and events: `kubectl -n traefik get pods`", + Detail: fmt.Sprintf("%s replicas ready at %s/%s", pair, endpoint.Namespace, endpoint.Name), } } return DoctorCheck{ Name: "traefik deployment readiness", OK: true, - Detail: fmt.Sprintf("%s replicas ready", pair), + Detail: fmt.Sprintf("%s replicas ready at %s/%s (%s)", pair, endpoint.Namespace, endpoint.Name, endpoint.Source), } } -func checkTraefikWebEntrypoint(kubectl KubectlRunner) DoctorCheck { - cmd, err := kubectl.CommandArgs([]string{"get", "svc", "-n", doctorTraefikNamespace, doctorTraefikServiceName, "-o", "jsonpath={range .spec.ports[*]}{.name}:{.port}:{.nodePort}{\"\\n\"}{end}"}) - if err != nil { +func checkTraefikWebEntrypoint(kubectl KubectlRunner, distro Distribution) DoctorCheck { + endpoint, ports, ok := resolveDoctorTraefikWebEndpoint(kubectl, distro) + if ok { return DoctorCheck{ Name: "traefik web entrypoint", - OK: false, - Detail: fmt.Sprintf("kubectl error: %v", err), - Remedy: "install Traefik service in namespace `traefik`", + OK: true, + Detail: fmt.Sprintf("service %s/%s exposes web entrypoint on port %d (%s)", endpoint.Namespace, endpoint.Name, endpoint.WebPort, endpoint.Source), } } + return DoctorCheck{ + Name: "traefik web entrypoint", + OK: false, + Detail: ports, + Remedy: traefikRemedy(distro), + } +} + +func resolveDoctorTraefikWebEndpoint(kubectl KubectlRunner, distro Distribution) (doctorTraefikEndpoint, string, bool) { + failures := make([]string, 0, len(doctorTraefikEndpoints(distro))) + for _, endpoint := range doctorTraefikEndpoints(distro) { + ports, err := readTraefikServicePorts(kubectl, endpoint) + if err != nil { + failures = append(failures, fmt.Sprintf("%s: %v", endpoint.label(), err)) + continue + } + webPort, ok := findTraefikWebPort(ports) + if !ok { + failures = append(failures, fmt.Sprintf("%s ports: %q", endpoint.label(), strings.TrimSpace(ports))) + continue + } + endpoint.WebPort = webPort.Port + return endpoint, ports, true + } + return doctorTraefikEndpoint{}, strings.Join(failures, "; "), false +} + +func readTraefikServicePorts(kubectl KubectlRunner, endpoint doctorTraefikEndpoint) (string, error) { + cmd, err := kubectl.CommandArgs([]string{"get", "svc", "-n", endpoint.Namespace, endpoint.Name, "-o", "jsonpath={range .spec.ports[*]}{.name}:{.port}:{.nodePort}{\"\\n\"}{end}"}) + if err != nil { + return "", fmt.Errorf("kubectl error: %v", err) + } out, err := cmd.Output() if err != nil { - return DoctorCheck{ - Name: "traefik web entrypoint", - OK: false, - Detail: "service traefik/traefik not found", - Remedy: "install Traefik service in namespace `traefik`", - } + return "", fmt.Errorf("service %s/%s not found", endpoint.Namespace, endpoint.Name) } - ports := strings.TrimSpace(string(out)) - for _, line := range strings.Split(ports, "\n") { - line = strings.TrimSpace(line) - if strings.Contains(line, fmt.Sprintf(":%d:", doctorTraefikWebPort)) { - return DoctorCheck{ - Name: "traefik web entrypoint", - OK: true, - Detail: fmt.Sprintf("service %s/%s exposes port %d (web)", doctorTraefikNamespace, doctorTraefikServiceName, doctorTraefikWebPort), - } + return strings.TrimSpace(string(out)), nil +} + +func checkTraefikServiceExposure(kubectl KubectlRunner, distro Distribution) DoctorCheck { + failures := make([]string, 0, len(doctorTraefikEndpoints(distro))) + for _, endpoint := range doctorTraefikEndpoints(distro) { + check := checkTraefikServiceExposureAt(kubectl, endpoint) + if check.OK { + return check } + failures = append(failures, fmt.Sprintf("%s: %s", endpoint.label(), check.Detail)) } return DoctorCheck{ - Name: "traefik web entrypoint", + Name: "traefik service exposure", OK: false, - Detail: fmt.Sprintf("service traefik/traefik ports: %q", ports), - Remedy: "ensure Traefik `web` entrypoint is exposed on service port 8000", + Detail: strings.Join(failures, "; "), + Remedy: "ensure the active Traefik service has an external LoadBalancer address or NodePort for the web entrypoint", } } -func checkTraefikServiceExposure(kubectl KubectlRunner) DoctorCheck { - cmd, err := kubectl.CommandArgs([]string{"get", "svc", "-n", doctorTraefikNamespace, doctorTraefikServiceName, "-o", "jsonpath={.spec.type}|{.status.loadBalancer.ingress[0].ip}|{.status.loadBalancer.ingress[0].hostname}|{range .spec.ports[*]}{.port}:{.nodePort}{\",\"}{end}"}) +func checkTraefikServiceExposureAt(kubectl KubectlRunner, endpoint doctorTraefikEndpoint) DoctorCheck { + cmd, err := kubectl.CommandArgs([]string{"get", "svc", "-n", endpoint.Namespace, endpoint.Name, "-o", "jsonpath={.spec.type}|{.status.loadBalancer.ingress[0].ip}|{.status.loadBalancer.ingress[0].hostname}|{range .spec.ports[*]}{.name}:{.port}:{.nodePort}{\",\"}{end}"}) if err != nil { return DoctorCheck{ Name: "traefik service exposure", OK: false, Detail: fmt.Sprintf("kubectl error: %v", err), - Remedy: "ensure traefik service exists", } } out, execErr := cmd.Output() @@ -568,8 +776,7 @@ func checkTraefikServiceExposure(kubectl KubectlRunner) DoctorCheck { return DoctorCheck{ Name: "traefik service exposure", OK: false, - Detail: "failed reading traefik service exposure fields", - Remedy: "inspect `kubectl -n traefik get svc traefik -o wide`", + Detail: fmt.Sprintf("failed reading service exposure fields for %s/%s", endpoint.Namespace, endpoint.Name), } } parts := strings.SplitN(strings.TrimSpace(string(out)), "|", 4) @@ -578,13 +785,20 @@ func checkTraefikServiceExposure(kubectl KubectlRunner) DoctorCheck { Name: "traefik service exposure", OK: false, Detail: fmt.Sprintf("unexpected service exposure payload %q", strings.TrimSpace(string(out))), - Remedy: "inspect `kubectl -n traefik get svc traefik -o yaml`", } } svcType := strings.TrimSpace(parts[0]) lbIP := strings.TrimSpace(parts[1]) lbHost := strings.TrimSpace(parts[2]) ports := strings.TrimSpace(parts[3]) + webPort, hasWebPort := findTraefikWebPort(ports) + if !hasWebPort { + return DoctorCheck{ + Name: "traefik service exposure", + OK: false, + Detail: fmt.Sprintf("service type=%s has no web entrypoint port (ports=%q)", svcType, ports), + } + } if svcType == "LoadBalancer" && (lbIP != "" || lbHost != "") { addr := lbIP if addr == "" { @@ -593,21 +807,20 @@ func checkTraefikServiceExposure(kubectl KubectlRunner) DoctorCheck { return DoctorCheck{ Name: "traefik service exposure", OK: true, - Detail: fmt.Sprintf("LoadBalancer ready at %s", addr), + Detail: fmt.Sprintf("%s/%s LoadBalancer ready at %s (%s)", endpoint.Namespace, endpoint.Name, addr, endpoint.Source), } } - if strings.Contains(ports, fmt.Sprintf("%d:", doctorTraefikWebPort)) { + if webPort.NodePort != "" && webPort.NodePort != "0" { return DoctorCheck{ Name: "traefik service exposure", OK: true, - Detail: fmt.Sprintf("%s service exposes nodePort for %d", svcType, doctorTraefikWebPort), + Detail: fmt.Sprintf("%s/%s %s service exposes nodePort %s for web port %d (%s)", endpoint.Namespace, endpoint.Name, svcType, webPort.NodePort, webPort.Port, endpoint.Source), } } return DoctorCheck{ Name: "traefik service exposure", OK: false, Detail: fmt.Sprintf("service type=%s exposure not ready (lbIP=%q lbHost=%q ports=%q)", svcType, lbIP, lbHost, ports), - Remedy: "ensure Traefik service has an external LoadBalancer address or NodePort for web entrypoint", } } @@ -656,7 +869,7 @@ func checkMCPServersDNSAndNetwork(kubectl KubectlRunner) DoctorCheck { } } -func checkIngressRouteProbe(kubectl KubectlRunner, namespace string) DoctorCheck { +func checkIngressRouteProbe(kubectl KubectlRunner, namespace string, distro Distribution) DoctorCheck { ingressName, err := readKubectlOutput(kubectl, []string{"get", "ingress", "-n", namespace, "-o", "jsonpath={.items[0].metadata.name}"}) if err != nil { return DoctorCheck{ @@ -696,6 +909,15 @@ func checkIngressRouteProbe(kubectl KubectlRunner, namespace string) DoctorCheck if path == "" { path = "/" } + traefik, traefikDetail, ok := resolveDoctorTraefikWebEndpoint(kubectl, distro) + if !ok { + return DoctorCheck{ + Name: "ingress route probe", + OK: false, + Detail: fmt.Sprintf("failed resolving active Traefik service for route probe: %s", traefikDetail), + Remedy: traefikRemedy(distro), + } + } podName := fmt.Sprintf("mcp-runtime-doctor-ingress-%d", time.Now().UnixNano()) curlArgs := []string{ "run", "-n", namespace, @@ -705,7 +927,7 @@ func checkIngressRouteProbe(kubectl KubectlRunner, namespace string) DoctorCheck "--image=curlimages/curl:8.7.1", podName, "--command", "--", "curl", - "-sS", "-o", "/dev/null", + "-sS", "-o", "doctor-response", "-w", "%{http_code}", "--connect-timeout", "5", "--max-time", "20", @@ -718,7 +940,7 @@ func checkIngressRouteProbe(kubectl KubectlRunner, namespace string) DoctorCheck } curlArgs = append(curlArgs, "-d", `{"jsonrpc":"2.0","id":1,"method":"initialize","params":{}}`, - fmt.Sprintf("http://traefik.%s.svc.cluster.local:%d%s", doctorTraefikNamespace, doctorTraefikWebPort, path), + fmt.Sprintf("http://%s.%s.svc.cluster.local:%d%s", traefik.Name, traefik.Namespace, traefik.WebPort, path), ) cmd, err := kubectl.CommandArgs(curlArgs) if err != nil { @@ -758,7 +980,7 @@ func checkIngressRouteProbe(kubectl KubectlRunner, namespace string) DoctorCheck return DoctorCheck{ Name: "ingress route probe", OK: true, - Detail: fmt.Sprintf("ingress %s returned HTTP %s for %s", ingressName, status, path), + Detail: fmt.Sprintf("ingress %s returned HTTP %s for %s via %s/%s", ingressName, status, path, traefik.Namespace, traefik.Name), } } @@ -993,7 +1215,7 @@ func checkSentinelAPIAuthProbe(kubectl KubectlRunner) DoctorCheck { "--image=curlimages/curl:8.7.1", podName, "--command", "--", "curl", - "-sS", "-o", "/dev/null", + "-sS", "-o", "doctor-response", "-w", "%{http_code}", "--connect-timeout", "5", "--max-time", "20", @@ -1345,35 +1567,83 @@ spec: } } - if err := kubectl.Run([]string{"rollout", "status", "deployment/" + name, "-n", namespace, "--timeout=150s"}); err != nil { + if err := waitForDoctorResource(kubectl, "deployment", name, namespace, 150*time.Second); err != nil { + return DoctorCheck{ + Name: "MCPServer reconcile smoke", + OK: false, + Detail: fmt.Sprintf("deployment was not created: %v", err), + Remedy: "inspect operator reconcile errors and MCPServer status", + } + } + if err := waitForDoctorDeploymentReady(kubectl, name, namespace, 150*time.Second); err != nil { return DoctorCheck{ Name: "MCPServer reconcile smoke", OK: false, Detail: fmt.Sprintf("deployment did not become ready: %v", err), - Remedy: "inspect operator reconcile and deployment events", + Remedy: "inspect operator reconcile and smoke deployment events", } } - if _, err := readKubectlOutput(kubectl, []string{"get", "svc", name, "-n", namespace, "-o", "jsonpath={.metadata.name}"}); err != nil { + if err := waitForDoctorResource(kubectl, "svc", name, namespace, 150*time.Second); err != nil { return DoctorCheck{ Name: "MCPServer reconcile smoke", OK: false, - Detail: "service not created for smoke MCPServer", + Detail: fmt.Sprintf("service not created for smoke MCPServer: %v", err), Remedy: "inspect operator service reconciliation", } } - if _, err := readKubectlOutput(kubectl, []string{"get", "ingress", name, "-n", namespace, "-o", "jsonpath={.metadata.name}"}); err != nil { + if err := waitForDoctorResource(kubectl, "ingress", name, namespace, 150*time.Second); err != nil { return DoctorCheck{ Name: "MCPServer reconcile smoke", OK: false, - Detail: "ingress not created for smoke MCPServer", + Detail: fmt.Sprintf("ingress not created for smoke MCPServer: %v", err), Remedy: "inspect operator ingress reconciliation", } } return DoctorCheck{ Name: "MCPServer reconcile smoke", OK: true, - Detail: fmt.Sprintf("temporary MCPServer %s reconciled (deployment/service/ingress) using %s", name, imageSource), + Detail: fmt.Sprintf("temporary MCPServer %s reconciled ready deployment/service/ingress using %s", name, imageSource), + } +} + +func waitForDoctorResource(kubectl KubectlRunner, resource, name, namespace string, timeout time.Duration) error { + timeoutTimer := time.NewTimer(timeout) + defer timeoutTimer.Stop() + ticker := time.NewTicker(2 * time.Second) + defer ticker.Stop() + + var lastErr error + for { + if _, err := readKubectlOutput(kubectl, []string{"get", resource, name, "-n", namespace, "-o", "jsonpath={.metadata.name}"}); err == nil { + return nil + } else { + lastErr = err + } + select { + case <-timeoutTimer.C: + if lastErr != nil { + return lastErr + } + return fmt.Errorf("%s/%s not found before timeout", resource, name) + case <-ticker.C: + } + } +} + +func waitForDoctorDeploymentReady(kubectl KubectlRunner, name, namespace string, timeout time.Duration) error { + cmd, err := kubectl.CommandArgs([]string{"rollout", "status", "deployment/" + name, "-n", namespace, "--timeout=" + timeout.String()}) + if err != nil { + return err + } + out, runErr := cmd.CombinedOutput() + if runErr == nil { + return nil + } + detail := strings.TrimSpace(string(out)) + if detail == "" { + return runErr } + return fmt.Errorf("%w: %s", runErr, detail) } func hasHTTP200Status(body string) bool { @@ -1452,6 +1722,57 @@ func filterNonEmptyLines(value string) []string { return out } +func parseDoctorServicePorts(value string) []doctorServicePort { + entries := strings.FieldsFunc(value, func(r rune) bool { + return r == '\n' || r == ',' + }) + ports := make([]doctorServicePort, 0, len(entries)) + for _, entry := range entries { + entry = strings.TrimSpace(entry) + if entry == "" { + continue + } + parts := strings.Split(entry, ":") + switch len(parts) { + case 2: + port, err := strconv.Atoi(strings.TrimSpace(parts[0])) + if err != nil { + continue + } + ports = append(ports, doctorServicePort{ + Port: port, + NodePort: strings.TrimSpace(parts[1]), + }) + case 3: + port, err := strconv.Atoi(strings.TrimSpace(parts[1])) + if err != nil { + continue + } + ports = append(ports, doctorServicePort{ + Name: strings.TrimSpace(parts[0]), + Port: port, + NodePort: strings.TrimSpace(parts[2]), + }) + } + } + return ports +} + +func findTraefikWebPort(value string) (doctorServicePort, bool) { + ports := parseDoctorServicePorts(value) + for _, port := range ports { + if port.Name == "web" && port.Port > 0 { + return port, true + } + } + for _, port := range ports { + if port.Port == doctorTraefikWebPort || port.Port == doctorK3sTraefikWebPort { + return port, true + } + } + return doctorServicePort{}, false +} + func doctorNormalizePath(value string) string { trimmed := strings.TrimSpace(value) if trimmed == "" { @@ -1491,20 +1812,69 @@ func PrintDoctorReport(r DoctorReport) { Section("Cluster Doctor") Info(fmt.Sprintf("Distribution: %s", r.Distribution)) for _, c := range r.Checks { - if c.OK { - Success(fmt.Sprintf("%s — %s", c.Name, c.Detail)) - continue - } - Error(fmt.Sprintf("%s — %s", c.Name, c.Detail)) - if c.Remedy != "" { - Info(" Remedy: " + c.Remedy) - } + printDoctorCheckResult(c) + } + printDoctorReportFooter(r) +} + +func printDoctorCheckProgress(event DoctorCheckProgressEvent) func(DoctorCheck) { + Info(doctorCheckProgressMessage(event)) + return func(c DoctorCheck) { + printDoctorCheckResult(c) } +} + +func doctorCheckProgressMessage(event DoctorCheckProgressEvent) string { + prefix := "Checking" + if event.Total > 0 { + prefix = fmt.Sprintf("Checking %d/%d", event.Index, event.Total) + } + if event.Detail == "" { + return fmt.Sprintf("%s %s", prefix, event.Name) + } + return fmt.Sprintf("%s %s — %s", prefix, event.Name, event.Detail) +} + +func printDoctorCheckResult(c DoctorCheck) { + if c.OK { + Success(doctorCheckMessage(c)) + return + } + Error(doctorCheckMessage(c)) + if c.Remedy != "" { + Info(" Remedy: " + c.Remedy) + } +} + +func doctorCheckMessage(c DoctorCheck) string { + return fmt.Sprintf("%s — %s", c.Name, c.Detail) +} + +func printDoctorReportFooter(r DoctorReport) { if !r.AllOK() { Info("") Info("Full remediation steps per distribution are in docs/cluster-readiness.md.") - Info(remediationHint(r.Distribution)) + if reportHasRegistryOrPullFailure(r) { + Info(remediationHint(r.Distribution)) + } + } +} + +func reportHasRegistryOrPullFailure(r DoctorReport) bool { + for _, check := range r.Checks { + if check.OK { + continue + } + switch check.Name { + case "registry Service", + "registry reachability (in-cluster)", + "mcp-servers imagePullSecrets", + "mcp-servers image pull smoke", + "registry HTTP pull mismatch": + return true + } } + return false } func remediationHint(d Distribution) string { diff --git a/internal/cli/cluster_doctor_test.go b/internal/cli/cluster_doctor_test.go index dbe68c9..27e8ebf 100644 --- a/internal/cli/cluster_doctor_test.go +++ b/internal/cli/cluster_doctor_test.go @@ -222,29 +222,51 @@ func TestCheckTraefikIngressClass(t *testing.T) { } func TestCheckTraefikWebEntrypoint(t *testing.T) { - t.Run("ok when service exposes 8000", func(t *testing.T) { + t.Run("ok when service exposes named web entrypoint", func(t *testing.T) { mock := &MockExecutor{ CommandFunc: func(spec ExecSpec) *MockCommand { return &MockCommand{OutputData: []byte("web:8000:32080\nwebsecure:8443:32443\n")} }, } kubectl := &KubectlClient{exec: mock, validators: nil} - check := checkTraefikWebEntrypoint(kubectl) + check := checkTraefikWebEntrypoint(kubectl, DistroGeneric) if !check.OK { t.Fatalf("expected OK, got detail=%q", check.Detail) } }) - t.Run("fails when service does not expose 8000", func(t *testing.T) { + t.Run("ok with k3s bundled traefik service", func(t *testing.T) { mock := &MockExecutor{ CommandFunc: func(spec ExecSpec) *MockCommand { - return &MockCommand{OutputData: []byte("web:80:32080\n")} + switch { + case contains(spec.Args, "kube-system"): + return &MockCommand{OutputData: []byte("web:80:0\nwebsecure:443:0\n")} + case contains(spec.Args, "traefik"): + return &MockCommand{OutputErr: errors.New("not found")} + } + return &MockCommand{} }, } kubectl := &KubectlClient{exec: mock, validators: nil} - check := checkTraefikWebEntrypoint(kubectl) + check := checkTraefikWebEntrypoint(kubectl, DistroK3s) + if !check.OK { + t.Fatalf("expected OK for k3s bundled Traefik, got detail=%q", check.Detail) + } + if !strings.Contains(check.Detail, "k3s bundled Traefik") { + t.Fatalf("detail should mention k3s bundled Traefik, got %q", check.Detail) + } + }) + + t.Run("fails when service does not expose web entrypoint", func(t *testing.T) { + mock := &MockExecutor{ + CommandFunc: func(spec ExecSpec) *MockCommand { + return &MockCommand{OutputData: []byte("admin:9000:32090\n")} + }, + } + kubectl := &KubectlClient{exec: mock, validators: nil} + check := checkTraefikWebEntrypoint(kubectl, DistroGeneric) if check.OK { - t.Fatal("expected failure when port 8000 is not exposed") + t.Fatal("expected failure when web entrypoint is not exposed") } }) } @@ -572,6 +594,121 @@ func TestRunDoctorAggregates(t *testing.T) { } } +func TestRunDoctorWithProgressReportsEachCheck(t *testing.T) { + mock := &MockExecutor{ + CommandFunc: func(spec ExecSpec) *MockCommand { + switch { + case contains(spec.Args, "jsonpath={.items[*].status.nodeInfo.kubeletVersion}"): + return &MockCommand{OutputData: []byte("v1.34.6+k3s1")} + case contains(spec.Args, "namespace mcp-servers"): + return &MockCommand{OutputData: []byte("mcp-servers")} + case contains(spec.Args, "crd mcpservers.mcpruntime.org"): + return &MockCommand{OutputData: []byte("mcpservers.mcpruntime.org")} + case contains(spec.Args, "mcp-runtime-operator-controller-manager"): + return &MockCommand{OutputData: []byte("1/1")} + case contains(spec.Args, "ingressclass traefik"): + return &MockCommand{OutputData: []byte("traefik")} + case contains(spec.Args, "svc -n traefik traefik"): + return &MockCommand{OutputData: []byte("web:8000:32080\n")} + case contains(spec.Args, "jsonpath={.spec.ports[0].nodePort}"): + return &MockCommand{OutputData: []byte("32000")} + case contains(spec.Args, "curl"): + return &MockCommand{OutputData: []byte("HTTP/1.1 503 Service Unavailable\n")} + } + return &MockCommand{} + }, + } + kubectl := &KubectlClient{exec: mock, validators: nil} + var events []string + report := RunDoctorWithProgress(kubectl, func(event DoctorCheckProgressEvent) func(DoctorCheck) { + if event.Index <= 0 || event.Total <= 0 { + t.Fatalf("progress event has invalid position: %+v", event) + } + if event.Detail == "" { + t.Fatalf("progress event for %q should describe what the check is doing", event.Name) + } + events = append(events, "start:"+event.Name) + return func(check DoctorCheck) { + events = append(events, "finish:"+check.Name) + } + }) + + if len(report.Checks) == 0 { + t.Fatal("expected checks") + } + if len(events) != len(report.Checks)*2 { + t.Fatalf("got %d progress events for %d checks", len(events), len(report.Checks)) + } + for i, check := range report.Checks { + start := events[i*2] + finish := events[i*2+1] + if !strings.HasPrefix(start, "start:") { + t.Fatalf("event %d = %q, want start event", i*2, start) + } + if finish != "finish:"+check.Name { + t.Fatalf("finish event for check %d = %q, want %q", i, finish, "finish:"+check.Name) + } + } +} + +func TestDoctorCurlProbesPassPathValidator(t *testing.T) { + validators := []ExecValidator{NoControlChars(), PathUnder("/workspace")} + + t.Run("ingress route probe", func(t *testing.T) { + mock := &MockExecutor{ + CommandFunc: func(spec ExecSpec) *MockCommand { + switch { + case contains(spec.Args, "jsonpath={.items[0].metadata.name}"): + return &MockCommand{OutputData: []byte("demo")} + case contains(spec.Args, "jsonpath={.spec.rules[0].host}"): + return &MockCommand{} + case contains(spec.Args, "jsonpath={.spec.rules[0].http.paths[0].path}"): + return &MockCommand{OutputData: []byte("/demo/mcp")} + case contains(spec.Args, "svc"): + return &MockCommand{OutputData: []byte("web:8000:32080\n")} + case contains(spec.Args, "curl"): + if contains(spec.Args, "/dev/null") { + t.Fatal("doctor curl helper should not pass /dev/null through kubectl validators") + } + return &MockCommand{OutputData: []byte("200")} + default: + return &MockCommand{} + } + }, + } + kubectl := &KubectlClient{exec: mock, validators: validators} + check := checkIngressRouteProbe(kubectl, "mcp-servers", DistroGeneric) + if !check.OK { + t.Fatalf("expected OK, got detail=%q", check.Detail) + } + }) + + t.Run("sentinel API auth probe", func(t *testing.T) { + mock := &MockExecutor{ + CommandFunc: func(spec ExecSpec) *MockCommand { + switch { + case contains(spec.Args, "namespace"): + return &MockCommand{OutputData: []byte(doctorSentinelNamespace)} + case contains(spec.Args, "jsonpath={.data.UI_API_KEY}"): + return &MockCommand{OutputData: []byte("dGVzdA==")} + case contains(spec.Args, "curl"): + if contains(spec.Args, "/dev/null") { + t.Fatal("doctor curl helper should not pass /dev/null through kubectl validators") + } + return &MockCommand{OutputData: []byte("200")} + default: + return &MockCommand{} + } + }, + } + kubectl := &KubectlClient{exec: mock, validators: validators} + check := checkSentinelAPIAuthProbe(kubectl) + if !check.OK { + t.Fatalf("expected OK, got detail=%q", check.Detail) + } + }) +} + func TestRemediationHintPerDistro(t *testing.T) { for _, d := range []Distribution{DistroK3s, DistroKind, DistroMinikube, DistroDockerDesktop, DistroGeneric} { hint := remediationHint(d) @@ -581,6 +718,15 @@ func TestRemediationHintPerDistro(t *testing.T) { } } +func TestReportHasRegistryOrPullFailure(t *testing.T) { + if reportHasRegistryOrPullFailure(DoctorReport{Checks: []DoctorCheck{{Name: "sentinel secrets", OK: false}}}) { + t.Fatal("sentinel-only failures should not print registry remediation") + } + if !reportHasRegistryOrPullFailure(DoctorReport{Checks: []DoctorCheck{{Name: "registry HTTP pull mismatch", OK: false}}}) { + t.Fatal("registry pull failures should print registry remediation") + } +} + func TestCheckNamespacePodAdmission(t *testing.T) { t.Run("ok on dry-run success", func(t *testing.T) { mock := &MockExecutor{ @@ -641,12 +787,34 @@ func TestCheckTraefikDeploymentReady(t *testing.T) { }, } kubectl := &KubectlClient{exec: mock, validators: nil} - check := checkTraefikDeploymentReady(kubectl) + check := checkTraefikDeploymentReady(kubectl, DistroGeneric) if check.OK != tc.wantOK { t.Fatalf("OK=%v want %v; detail=%q", check.OK, tc.wantOK, check.Detail) } }) } + + t.Run("ok with k3s bundled traefik deployment", func(t *testing.T) { + mock := &MockExecutor{ + CommandFunc: func(spec ExecSpec) *MockCommand { + switch { + case contains(spec.Args, "kube-system"): + return &MockCommand{OutputData: []byte("1/1")} + case contains(spec.Args, "traefik"): + return &MockCommand{OutputErr: errors.New("not found")} + } + return &MockCommand{} + }, + } + kubectl := &KubectlClient{exec: mock, validators: nil} + check := checkTraefikDeploymentReady(kubectl, DistroK3s) + if !check.OK { + t.Fatalf("expected OK for k3s bundled Traefik, got detail=%q", check.Detail) + } + if !strings.Contains(check.Detail, "k3s bundled Traefik") { + t.Fatalf("detail should mention k3s bundled Traefik, got %q", check.Detail) + } + }) } func TestCheckTraefikServiceExposure(t *testing.T) { @@ -697,7 +865,7 @@ func TestCheckTraefikServiceExposure(t *testing.T) { }, } kubectl := &KubectlClient{exec: mock, validators: nil} - check := checkTraefikServiceExposure(kubectl) + check := checkTraefikServiceExposure(kubectl, DistroGeneric) if check.OK != tc.wantOK { t.Fatalf("OK=%v want %v; detail=%q", check.OK, tc.wantOK, check.Detail) } @@ -706,6 +874,28 @@ func TestCheckTraefikServiceExposure(t *testing.T) { } }) } + + t.Run("ok with k3s bundled traefik service", func(t *testing.T) { + mock := &MockExecutor{ + CommandFunc: func(spec ExecSpec) *MockCommand { + switch { + case contains(spec.Args, "kube-system"): + return &MockCommand{OutputData: []byte("LoadBalancer|10.1.2.3||web:80:0,websecure:443:0,")} + case contains(spec.Args, "traefik"): + return &MockCommand{OutputErr: errors.New("not found")} + } + return &MockCommand{} + }, + } + kubectl := &KubectlClient{exec: mock, validators: nil} + check := checkTraefikServiceExposure(kubectl, DistroK3s) + if !check.OK { + t.Fatalf("expected OK for k3s bundled Traefik, got detail=%q", check.Detail) + } + if !strings.Contains(check.Detail, "k3s bundled Traefik") { + t.Fatalf("detail should mention k3s bundled Traefik, got %q", check.Detail) + } + }) } func TestCheckOperatorRecentReconcileErrors(t *testing.T) { @@ -720,6 +910,7 @@ func TestCheckOperatorRecentReconcileErrors(t *testing.T) { {name: "failed to reconcile pattern", logs: "msg=\"failed to reconcile\" server=foo\n", wantOK: false}, {name: "error syncing pattern", logs: "level=error error syncing mcpserver/foo\n", wantOK: false}, {name: "case-insensitive match", logs: "FAILED TO RECONCILE\n", wantOK: false}, + {name: "ignores doctor smoke transient errors", logs: "ERROR Reconciler error mcpserver=doctor-smoke-123\n", wantOK: true}, {name: "no logs, OK", logs: "", wantOK: true}, {name: "kubectl error surfaces", outErr: errors.New("no such deploy"), wantOK: false}, } @@ -739,6 +930,76 @@ func TestCheckOperatorRecentReconcileErrors(t *testing.T) { } } +func TestCheckMCPServerReconcileSmoke(t *testing.T) { + t.Run("waits for deployment rollout readiness", func(t *testing.T) { + sawRollout := false + mock := &MockExecutor{ + CommandFunc: func(spec ExecSpec) *MockCommand { + switch { + case contains(spec.Args, "apply"): + return &MockCommand{} + case contains(spec.Args, "rollout"): + sawRollout = true + if !contains(spec.Args, "--timeout=2m30s") { + t.Fatalf("rollout status args %v missing timeout", spec.Args) + } + return &MockCommand{} + case contains(spec.Args, "get"): + return &MockCommand{OutputData: []byte("doctor-smoke")} + case contains(spec.Args, "delete"): + return &MockCommand{} + } + return &MockCommand{OutputErr: fmt.Errorf("unexpected command: %v", spec.Args)} + }, + } + kubectl := &KubectlClient{exec: mock, validators: nil} + + check := checkMCPServerReconcileSmoke(kubectl, "mcp-servers") + if !check.OK { + t.Fatalf("expected OK, got detail=%q remedy=%q", check.Detail, check.Remedy) + } + if !sawRollout { + t.Fatal("expected smoke check to wait for deployment rollout readiness") + } + if !strings.Contains(check.Detail, "ready deployment/service/ingress") { + t.Fatalf("detail should mention ready deployment resources, got %q", check.Detail) + } + }) + + t.Run("fails when deployment rollout does not become ready", func(t *testing.T) { + mock := &MockExecutor{ + CommandFunc: func(spec ExecSpec) *MockCommand { + switch { + case contains(spec.Args, "apply"): + return &MockCommand{} + case contains(spec.Args, "rollout"): + return &MockCommand{ + OutputData: []byte("deployment \"doctor-smoke\" exceeded its progress deadline"), + OutputErr: errors.New("rollout timed out"), + } + case contains(spec.Args, "get"): + return &MockCommand{OutputData: []byte("doctor-smoke")} + case contains(spec.Args, "delete"): + return &MockCommand{} + } + return &MockCommand{OutputErr: fmt.Errorf("unexpected command: %v", spec.Args)} + }, + } + kubectl := &KubectlClient{exec: mock, validators: nil} + + check := checkMCPServerReconcileSmoke(kubectl, "mcp-servers") + if check.OK { + t.Fatalf("expected failure when rollout fails; detail=%q", check.Detail) + } + if !strings.Contains(check.Detail, "deployment did not become ready") { + t.Fatalf("detail should describe rollout readiness failure, got %q", check.Detail) + } + if !strings.Contains(check.Detail, "exceeded its progress deadline") { + t.Fatalf("detail should include rollout output, got %q", check.Detail) + } + }) +} + func TestCheckNodeCapacity(t *testing.T) { t.Run("metrics-server healthy", func(t *testing.T) { mock := &MockExecutor{