From 5e1106a7f790d6528969da86b447fe08f37dd0a2 Mon Sep 17 00:00:00 2001
From: Jay Geng
Date: Wed, 19 Nov 2025 14:14:58 -0500
Subject: [PATCH 1/2] Pass network nonce to pcv2 helm release name

---
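Note (editorial, not part of the commit message): this patch derives the Helm
release name from the per-run network nonce instead of the fixed
"parallel-catchup" string, so concurrent pcv2 runs in the same namespace get
distinct release names. A rough sketch of the effect, using made-up values
purely for illustration (the real nonce comes from MakeNetworkNonce and is not
spelled out here):

    # NONCE and NAMESPACE below are hypothetical placeholders
    NONCE=ssc-1a2b3c4d
    RELEASE="parallel-catchup-${NONCE}"
    helm status "${RELEASE}" -n "${NAMESPACE}"   # each run now owns its own release
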
 .../MissionHistoryPubnetParallelCatchupV2.fs  | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/FSLibrary/MissionHistoryPubnetParallelCatchupV2.fs b/src/FSLibrary/MissionHistoryPubnetParallelCatchupV2.fs
index bfa320c0..a4dda933 100644
--- a/src/FSLibrary/MissionHistoryPubnetParallelCatchupV2.fs
+++ b/src/FSLibrary/MissionHistoryPubnetParallelCatchupV2.fs
@@ -25,7 +25,6 @@ open k8s
 open CSLibrary

 // Constants
-let helmReleaseName = "parallel-catchup"
 let helmChartPath = "/supercluster/src/MissionParallelCatchup/parallel_catchup_helm"

 // Comment out the path below for local testing
@@ -46,6 +45,7 @@ let failedJobLogFileLineCount = 10000
 let failedJobLogStreamLineCount = 1000

 let mutable nonce : String = ""
+let mutable helmReleaseName : String = ""

 let jobMonitorHostName (context: MissionContext) =
     match context.jobMonitorExternalHost with
@@ -111,7 +111,7 @@ let serviceAccountAnnotationsToHelmIndexed (index: int) (key: string, value: str
     sprintf "service_account.annotations[%d].key=%s,service_account.annotations[%d].value=%s" index key index value

 let installProject (context: MissionContext) =
-    LogInfo "Installing Helm chart..."
+    LogInfo "Installing Helm chart with release name: %s" helmReleaseName

     // install the project with default values from the file and overridden values from the commandline
     let setOptions = ResizeArray()
@@ -302,7 +302,7 @@ let collectLogsFromPods (context: MissionContext) =
 let cleanup (context: MissionContext) =
     if toPerformCleanup then
         toPerformCleanup <- false
-        LogInfo "Cleaning up resources..."
+        LogInfo "Cleaning up resources for release: %s" helmReleaseName

         // Try to collect logs from all worker pods before cleanup
         try
@@ -373,12 +373,13 @@ let historyPubnetParallelCatchupV2 (context: MissionContext) =
     LogInfo "Running parallel catchup v2 ..."

     nonce <- (MakeNetworkNonce context.tag).ToString()
-    LogDebug "nonce: '%s'" nonce
+    helmReleaseName <- sprintf "parallel-catchup-%s" nonce
+    LogDebug "nonce: '%s', release name: '%s'" nonce helmReleaseName

     // Set cleanup context so cleanup handlers can access it
     cleanupContext <- Some context

-    installProject (context)
+    installProject context

     let mutable allJobsFinished = false
     let mutable timeoutLeft = jobMonitorStatusCheckTimeOutSecs

From 214ced06592832b07491b18b59d3c98e48c87dd4 Mon Sep 17 00:00:00 2001
From: Jay Geng
Date: Tue, 25 Nov 2025 17:26:37 -0500
Subject: [PATCH 2/2] Make k8s entity names unique per helm-release

---
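Note (editorial, not part of the commit message): every object the chart
creates (StatefulSet, Services, ConfigMaps, Job, Ingress, ServiceAccount) is
now prefixed with {{ .Release.Name }}, and the Redis Service name is passed in
per release (as <nonce>-redis rather than the release prefix), so resources
from concurrent runs can coexist in one namespace and be inspected per
release. Illustrative only, with hypothetical placeholder values:

    # RELEASE and NAMESPACE below are hypothetical placeholders
    RELEASE=parallel-catchup-ssc-1a2b3c4d
    kubectl get pods -n "${NAMESPACE}" -l app="${RELEASE}-stellar-core"   # worker pods for this run
    kubectl get cm,svc -n "${NAMESPACE}" | grep "${RELEASE}"              # release-prefixed ConfigMaps and Services

Worker pod names become ${RELEASE}-stellar-core-0, -1, ..., which is why
worker.sh now takes the last hyphen-separated segment of POD_NAME as the core
id and the job monitor path includes the release name.
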
 .../MissionHistoryPubnetParallelCatchupV2.fs  | 12 ++++---
 .../parallel_catchup_helm/files/worker.sh     |  4 ++-
 .../templates/catchup_workers.yaml            | 36 +++++++++----------
 .../templates/job_monitor.yaml                | 30 ++++++++--------
 .../templates/job_preload_redis.yaml          | 18 +++++-----
 .../templates/redis_queue.yaml                | 16 ++++-----
 .../parallel_catchup_helm/values.yaml         |  2 +-
 7 files changed, 62 insertions(+), 56 deletions(-)

diff --git a/src/FSLibrary/MissionHistoryPubnetParallelCatchupV2.fs b/src/FSLibrary/MissionHistoryPubnetParallelCatchupV2.fs
index a4dda933..1ab6c4b4 100644
--- a/src/FSLibrary/MissionHistoryPubnetParallelCatchupV2.fs
+++ b/src/FSLibrary/MissionHistoryPubnetParallelCatchupV2.fs
@@ -117,6 +117,10 @@ let installProject (context: MissionContext) =
     let setOptions = ResizeArray()
     setOptions.Add(sprintf "worker.stellar_core_image=%s" context.image)
     setOptions.Add(sprintf "worker.replicas=%d" context.pubnetParallelCatchupNumWorkers)
+
+    // Set Redis hostname to be unique per release
+    setOptions.Add(sprintf "redis.hostname=%s-redis" nonce)
+
     setOptions.Add(sprintf "range_generator.params.starting_ledger=%d" context.pubnetParallelCatchupStartingLedger)

     let endLedger =
@@ -190,7 +194,7 @@ let installProject (context: MissionContext) =
     | None -> ()

     setOptions.Add(sprintf "monitor.hostname=%s" (jobMonitorHostName context))
-    setOptions.Add(sprintf "monitor.path=/%s/(.*)" context.namespaceProperty)
+    setOptions.Add(sprintf "monitor.path=/%s/%s/(.*)" context.namespaceProperty helmReleaseName)
     setOptions.Add(sprintf "monitor.logging_interval_seconds=%d" jobMonitorLoggingIntervalSecs)

     // Set ASAN_OPTIONS if provided
@@ -259,10 +263,10 @@ let installProject (context: MissionContext) =
 // 3. Creates a tar.gz archive and copies it to context.destination directory
 let collectLogsFromPods (context: MissionContext) =
     // Generate pod names based on number of workers
-    // Pod names follow the pattern: stellar-core-0, stellar-core-1, etc.
+    // Pod names follow the pattern: <release-name>-stellar-core-0, <release-name>-stellar-core-1, etc.
     let podNames =
         [ 0 .. context.pubnetParallelCatchupNumWorkers - 1 ]
-        |> List.map (fun i -> sprintf "stellar-core-%d" i)
+        |> List.map (fun i -> sprintf "%s-stellar-core-%d" helmReleaseName i)

     LogInfo "Collecting logs from %d worker pods to directory: %s" (List.length podNames) context.destination.Path

@@ -384,7 +388,7 @@ let historyPubnetParallelCatchupV2 (context: MissionContext) =
     let mutable allJobsFinished = false
     let mutable timeoutLeft = jobMonitorStatusCheckTimeOutSecs
     let mutable timeBeforeNextMetricsCheck = jobMonitorMetricsCheckIntervalSecs
-    let jobMonitorPath = "/" + context.namespaceProperty
+    let jobMonitorPath = "/" + context.namespaceProperty + "/" + helmReleaseName

     while not allJobsFinished do
         Thread.Sleep(jobMonitorStatusCheckIntervalSecs * 1000)
diff --git a/src/MissionParallelCatchup/parallel_catchup_helm/files/worker.sh b/src/MissionParallelCatchup/parallel_catchup_helm/files/worker.sh
index fb57018d..56cd29a1 100644
--- a/src/MissionParallelCatchup/parallel_catchup_helm/files/worker.sh
+++ b/src/MissionParallelCatchup/parallel_catchup_helm/files/worker.sh
@@ -69,7 +69,8 @@ if [ $LMOVE_EXIT_CODE -eq 0 ] && [ -n "$JOB_KEY" ]; then
     fi

     # Push metrics to redis in a transaction to ensure data consistency. Retry for 5min on failures
-    core_id=$(echo "$POD_NAME" | grep -o '[0-9]\+')
+    # Extract the pod ordinal (last hyphen-separated segment) from pod name like "release-name-stellar-core-0"
+    core_id=$(echo "$POD_NAME" | awk -F'-' '{print $NF}')
     # Validate core_id was extracted successfully
     if [ -z "$core_id" ]; then
         echo "Error: Failed to extract core_id from POD_NAME: $POD_NAME"
@@ -103,6 +104,7 @@ else
     # Either Redis command failed OR queue is empty
     if [ $LMOVE_EXIT_CODE -ne 0 ]; then
         echo "Error: Failed to connect to Redis at $REDIS_HOST:$REDIS_PORT"
+        echo "Exit code=$LMOVE_EXIT_CODE, Output: $JOB_KEY"
     else
         echo "$(date) No more jobs in the queue."
     fi
diff --git a/src/MissionParallelCatchup/parallel_catchup_helm/templates/catchup_workers.yaml b/src/MissionParallelCatchup/parallel_catchup_helm/templates/catchup_workers.yaml
index 7121bd36..178a7ddc 100644
--- a/src/MissionParallelCatchup/parallel_catchup_helm/templates/catchup_workers.yaml
+++ b/src/MissionParallelCatchup/parallel_catchup_helm/templates/catchup_workers.yaml
@@ -1,11 +1,11 @@
 apiVersion: v1
 kind: Service
 metadata:
-  name: stellar-core
+  name: {{ .Release.Name }}-stellar-core
 spec:
   clusterIP: None
   selector:
-    app: stellar-core
+    app: {{ .Release.Name }}-stellar-core
   ports:
     - port: 11626
       targetPort: 11626
@@ -13,7 +13,7 @@ spec:
 apiVersion: v1
 kind: ServiceAccount
 metadata:
-  name: stellar-supercluster
+  name: {{ .Release.Name }}-sa
   {{- if .Values.service_account.annotations }}
   annotations:
     {{- range .Values.service_account.annotations }}
@@ -24,22 +24,22 @@ metadata:
 apiVersion: apps/v1
 kind: StatefulSet
 metadata:
-  name: stellar-core
+  name: {{ .Release.Name }}-stellar-core
   labels:
-    app: stellar-core
+    app: {{ .Release.Name }}-stellar-core
 spec:
-  serviceName: "stellar-core"
+  serviceName: "{{ .Release.Name }}-stellar-core"
   podManagementPolicy: Parallel
   replicas: {{ .Values.worker.replicas }}
   selector:
     matchLabels:
-      app: stellar-core
+      app: {{ .Release.Name }}-stellar-core
   template:
     metadata:
       labels:
-        app: stellar-core
+        app: {{ .Release.Name }}-stellar-core
     spec:
-      serviceAccountName: stellar-supercluster
+      serviceAccountName: {{ .Release.Name }}-sa
       {{- if or .Values.worker.requireNodeLabels .Values.worker.avoidNodeLabels }}
       affinity:
         nodeAffinity:
@@ -93,7 +93,7 @@ spec:
             value: {{ .Values.worker.asanOptions | quote }}
           envFrom:
             - configMapRef:
-                name: worker-config
+                name: {{ .Release.Name }}-worker-config
           volumeMounts:
             - name: config
               mountPath: /config
@@ -104,17 +104,17 @@ spec:
       volumes:
         - name: config
           configMap:
-            name: stellar-core-config
+            name: {{ .Release.Name }}-stellar-core-config
         - name: script
          configMap:
-            name: worker-script
+            name: {{ .Release.Name }}-worker-script
         - emptyDir: {}
           name: data-volume
       {{- if not .Values.worker.unevenSched }}
       topologySpreadConstraints:
         - labelSelector:
             matchLabels:
-              app: stellar-core
+              app: {{ .Release.Name }}-stellar-core
           # Note: maxSkew affects dynamic node scheduling with karpenter
           # See https://github.com/stellar/supercluster/issues/330
           maxSkew: 2
@@ -125,7 +125,7 @@ spec:
 apiVersion: v1
 kind: ConfigMap
 metadata:
-  name: worker-script
+  name: {{ .Release.Name }}-worker-script
 data:
   worker.sh: |-
     {{- (.Files.Get "files/worker.sh") | nindent 4 }}
@@ -133,7 +133,7 @@ data:
 apiVersion: v1
 kind: ConfigMap
 metadata:
-  name: stellar-core-config
+  name: {{ .Release.Name }}-stellar-core-config
 data:
   stellar-core.cfg: |-
     {{- if .Values.worker.catchup_skip_known_results_for_testing }}
@@ -154,10 +154,10 @@ data:
 apiVersion: v1
 kind: ConfigMap
 metadata:
-  name: worker-config
+  name: {{ .Release.Name }}-worker-config
 data:
-  REDIS_HOST: "{{ .Values.redis.hostname}}"
-  REDIS_PORT: "{{ .Values.redis.port}}"
+  REDIS_HOST: "{{ .Values.redis.hostname }}"
+  REDIS_PORT: "{{ .Values.redis.port }}"
   JOB_QUEUE: "{{ .Values.redis.job_queue }}"
   SUCCESS_QUEUE: "{{ .Values.redis.success_queue }}"
   FAILED_QUEUE: "{{ .Values.redis.failed_queue }}"
diff --git a/src/MissionParallelCatchup/parallel_catchup_helm/templates/job_monitor.yaml b/src/MissionParallelCatchup/parallel_catchup_helm/templates/job_monitor.yaml
index 11e49816..47f8d708 100644
--- a/src/MissionParallelCatchup/parallel_catchup_helm/templates/job_monitor.yaml
+++ b/src/MissionParallelCatchup/parallel_catchup_helm/templates/job_monitor.yaml
@@ -1,7 +1,7 @@
 apiVersion: networking.k8s.io/v1
 kind: Ingress
 metadata:
-  name: job-monitor-ingress
+  name: {{ .Release.Name }}-job-monitor-ingress
   annotations:
     nginx.ingress.kubernetes.io/use-regex: "true"
     nginx.ingress.kubernetes.io/rewrite-target: /$1
@@ -15,35 +15,35 @@ spec:
             pathType: Prefix
             backend:
               service:
-                name: job-monitor
+                name: {{ .Release.Name }}-job-monitor
                 port:
                   number: 8080
 ---
 apiVersion: v1
 kind: Service
 metadata:
-  name: job-monitor
+  name: {{ .Release.Name }}-job-monitor
 spec:
   type: ClusterIP
   ports:
     - port: 8080
       targetPort: 8080
   selector:
-    app: job-monitor
+    app: {{ .Release.Name }}-job-monitor
 ---
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: job-monitor
+  name: {{ .Release.Name }}-job-monitor
 spec:
   replicas: 1
   selector:
     matchLabels:
-      app: job-monitor
+      app: {{ .Release.Name }}-job-monitor
   template:
     metadata:
       labels:
-        app: job-monitor
+        app: {{ .Release.Name }}-job-monitor
       annotations:
         # Add annotations to tell prometheus service discovery to scrape metrics.
         # In k8s clusters without prometheus the annotations will be dormant
@@ -68,7 +68,7 @@ spec:
                   fieldPath: metadata.namespace
           envFrom:
             - configMapRef:
-                name: job-monitor-config
+                name: {{ .Release.Name }}-job-monitor-config
       initContainers:
         - name: wait-for-preload
           image: redis:7
@@ -76,7 +76,7 @@ spec:
           args:
             - |-
              # Wait until the job queue has items
-              until [ $(redis-cli -h {{ .Values.redis.hostname}} -p {{ .Values.redis.port}} LLEN {{ .Values.redis.job_queue }}) -gt 0 ]; do
+              until [ $(redis-cli -h {{ .Values.redis.hostname }} -p {{ .Values.redis.port }} LLEN {{ .Values.redis.job_queue }}) -gt 0 ]; do
                echo waiting for preload
                sleep 2
              done
@@ -85,16 +85,16 @@ spec:
 apiVersion: v1
 kind: ConfigMap
 metadata:
-  name: job-monitor-config
+  name: {{ .Release.Name }}-job-monitor-config
 data:
-  REDIS_HOST: "{{ .Values.redis.hostname}}"
-  REDIS_PORT: "{{ .Values.redis.port}}"
+  REDIS_HOST: "{{ .Values.redis.hostname }}"
+  REDIS_PORT: "{{ .Values.redis.port }}"
   JOB_QUEUE: "{{ .Values.redis.job_queue }}"
   SUCCESS_QUEUE: "{{ .Values.redis.success_queue }}"
   FAILED_QUEUE: "{{ .Values.redis.failed_queue }}"
   PROGRESS_QUEUE: "{{ .Values.redis.progress_queue }}"
   METRICS: "{{ .Values.redis.metrics }}"
-  WORKER_PREFIX: "stellar-core"
+  WORKER_PREFIX: "{{ .Release.Name }}-stellar-core"
   WORKER_COUNT: "{{ .Values.worker.replicas }}"
-  LOGGING_INTERVAL_SECONDS: "{{ .Values.monitor.logging_interval_seconds}}"
-  LOGGING_LEVEL: "{{ .Values.monitor.logging_level}}"
+  LOGGING_INTERVAL_SECONDS: "{{ .Values.monitor.logging_interval_seconds }}"
+  LOGGING_LEVEL: "{{ .Values.monitor.logging_level }}"
diff --git a/src/MissionParallelCatchup/parallel_catchup_helm/templates/job_preload_redis.yaml b/src/MissionParallelCatchup/parallel_catchup_helm/templates/job_preload_redis.yaml
index 35998368..14de3a7b 100644
--- a/src/MissionParallelCatchup/parallel_catchup_helm/templates/job_preload_redis.yaml
+++ b/src/MissionParallelCatchup/parallel_catchup_helm/templates/job_preload_redis.yaml
@@ -1,7 +1,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: preload-redis
+  name: {{ .Release.Name }}-preload-redis
 spec:
   template:
     spec:
@@ -9,7 +9,7 @@ spec:
         - name: preload
           image: redis:7
           command: ["/bin/sh", "-c"]
-          args: 
+          args:
            - |-
              case "$STRATEGY" in
                "uniform")
@@ -24,24 +24,24 @@ spec:
               esac
           envFrom:
             - configMapRef:
-                name: range-generator-config
+                name: {{ .Release.Name }}-range-generator-config
           volumeMounts:
             - name: script
               mountPath: /scripts
       initContainers:
         - name: wait-for-redis
           image: redis:7
-          command: ['sh', '-c', "until redis-cli -h {{ .Values.redis.hostname}} -p {{ .Values.redis.port}} ping; do echo waiting for redis; sleep 2; done;"]
+          command: ['sh', '-c', "until redis-cli -h {{ .Values.redis.hostname }} -p {{ .Values.redis.port }} ping; do echo waiting for redis; sleep 2; done;"]
       restartPolicy: OnFailure
       volumes:
         - name: script
           configMap:
-            name: generator-script
+            name: {{ .Release.Name }}-generator-script
 ---
 apiVersion: v1
 kind: ConfigMap
 metadata:
-  name: generator-script
+  name: {{ .Release.Name }}-generator-script
 data:
   uniform_range_generator.sh: |-
     {{- (.Files.Get "files/uniform_range_generator.sh") | nindent 4 }}
@@ -51,7 +51,7 @@ data:
 apiVersion: v1
 kind: ConfigMap
 metadata:
-  name: range-generator-config
+  name: {{ .Release.Name }}-range-generator-config
 data:
   STRATEGY: "{{ .Values.range_generator.strategy }}"
   STARTING_LEDGER: "{{ .Values.range_generator.params.starting_ledger }}"
@@ -60,5 +60,5 @@ data:
   LEDGERS_PER_JOB: "{{ .Values.range_generator.params.uniform_ledgers_per_job }}"
   LOGARITHMIC_FLOOR_LEDGERS: "{{ .Values.range_generator.params.logarithmic_floor_ledgers }}"
   NUM_PARALLELISM: "192"
-  REDIS_HOST: "{{ .Values.redis.hostname}}"
-  REDIS_PORT: "{{ .Values.redis.port}}"
+  REDIS_HOST: "{{ .Values.redis.hostname }}"
+  REDIS_PORT: "{{ .Values.redis.port }}"
diff --git a/src/MissionParallelCatchup/parallel_catchup_helm/templates/redis_queue.yaml b/src/MissionParallelCatchup/parallel_catchup_helm/templates/redis_queue.yaml
index d3660a61..aaba65fb 100644
--- a/src/MissionParallelCatchup/parallel_catchup_helm/templates/redis_queue.yaml
+++ b/src/MissionParallelCatchup/parallel_catchup_helm/templates/redis_queue.yaml
@@ -2,28 +2,28 @@
 apiVersion: v1
 kind: Service
 metadata:
-  name: "{{ .Values.redis.hostname}}"
+  name: "{{ .Values.redis.hostname }}"
 spec:
   type: ClusterIP
   ports:
-    - port: {{ .Values.redis.port}}
+    - port: {{ .Values.redis.port }}
       targetPort: 6379
   selector:
-    app: redis
+    app: {{ .Values.redis.hostname }}
 ---
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: redis
+  name: {{ .Values.redis.hostname }}
 spec:
   replicas: 1
   selector:
     matchLabels:
-      app: redis
+      app: {{ .Values.redis.hostname }}
   template:
     metadata:
       labels:
-        app: redis
+        app: {{ .Values.redis.hostname }}
     spec:
       containers:
         - name: redis
@@ -33,5 +33,5 @@ spec:
           command: ["redis-server"]
           resources:
             requests:
-              cpu: "{{ .Values.redis.resources.requests.cpu}}"
-              memory: "{{ .Values.redis.resources.requests.memory}}"
\ No newline at end of file
+              cpu: "{{ .Values.redis.resources.requests.cpu }}"
+              memory: "{{ .Values.redis.resources.requests.memory }}"
\ No newline at end of file
diff --git a/src/MissionParallelCatchup/parallel_catchup_helm/values.yaml b/src/MissionParallelCatchup/parallel_catchup_helm/values.yaml
index db9b914a..d2a158fe 100644
--- a/src/MissionParallelCatchup/parallel_catchup_helm/values.yaml
+++ b/src/MissionParallelCatchup/parallel_catchup_helm/values.yaml
@@ -1,5 +1,5 @@
 redis:
-  hostname: "redis"
+  hostname: "" # to be set by the mission
   port: 6379
   job_queue: "ranges"
   success_queue: "succeeded"