From 5e1106a7f790d6528969da86b447fe08f37dd0a2 Mon Sep 17 00:00:00 2001
From: Jay Geng
Date: Wed, 19 Nov 2025 14:14:58 -0500
Subject: [PATCH 1/2] Pass network nonce to pcv2 helm release name

---
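Note (editorial, not part of the commit message): this patch derives the Helm
release name from the per-run network nonce instead of the fixed
"parallel-catchup" string, so concurrent pcv2 runs in the same namespace get
distinct release names. A rough sketch of the effect, using made-up values
purely for illustration (the real nonce comes from MakeNetworkNonce and is not
spelled out here):

    # NONCE and NAMESPACE below are hypothetical placeholders
    NONCE=ssc-1a2b3c4d
    RELEASE="parallel-catchup-${NONCE}"
    helm status "${RELEASE}" -n "${NAMESPACE}"   # each run now owns its own release
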
 .../MissionHistoryPubnetParallelCatchupV2.fs  | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/FSLibrary/MissionHistoryPubnetParallelCatchupV2.fs b/src/FSLibrary/MissionHistoryPubnetParallelCatchupV2.fs
index bfa320c0..a4dda933 100644
--- a/src/FSLibrary/MissionHistoryPubnetParallelCatchupV2.fs
+++ b/src/FSLibrary/MissionHistoryPubnetParallelCatchupV2.fs
@@ -25,7 +25,6 @@ open k8s
 open CSLibrary

 // Constants
-let helmReleaseName = "parallel-catchup"
 let helmChartPath = "/supercluster/src/MissionParallelCatchup/parallel_catchup_helm"

 // Comment out the path below for local testing
@@ -46,6 +45,7 @@ let failedJobLogFileLineCount = 10000
 let failedJobLogStreamLineCount = 1000

 let mutable nonce : String = ""
+let mutable helmReleaseName : String = ""

 let jobMonitorHostName (context: MissionContext) =
     match context.jobMonitorExternalHost with
@@ -111,7 +111,7 @@ let serviceAccountAnnotationsToHelmIndexed (index: int) (key: string, value: str
     sprintf "service_account.annotations[%d].key=%s,service_account.annotations[%d].value=%s" index key index value

 let installProject (context: MissionContext) =
-    LogInfo "Installing Helm chart..."
+    LogInfo "Installing Helm chart with release name: %s" helmReleaseName

     // install the project with default values from the file and overridden values from the commandline
     let setOptions = ResizeArray()
@@ -302,7 +302,7 @@ let collectLogsFromPods (context: MissionContext) =
 let cleanup (context: MissionContext) =
     if toPerformCleanup then
         toPerformCleanup <- false
-        LogInfo "Cleaning up resources..."
+        LogInfo "Cleaning up resources for release: %s" helmReleaseName

         // Try to collect logs from all worker pods before cleanup
         try
@@ -373,12 +373,13 @@ let historyPubnetParallelCatchupV2 (context: MissionContext) =
     LogInfo "Running parallel catchup v2 ..."

     nonce <- (MakeNetworkNonce context.tag).ToString()
-    LogDebug "nonce: '%s'" nonce
+    helmReleaseName <- sprintf "parallel-catchup-%s" nonce
+    LogDebug "nonce: '%s', release name: '%s'" nonce helmReleaseName

     // Set cleanup context so cleanup handlers can access it
     cleanupContext <- Some context

-    installProject (context)
+    installProject context

     let mutable allJobsFinished = false
     let mutable timeoutLeft = jobMonitorStatusCheckTimeOutSecs

From 214ced06592832b07491b18b59d3c98e48c87dd4 Mon Sep 17 00:00:00 2001
From: Jay Geng
Date: Tue, 25 Nov 2025 17:26:37 -0500
Subject: [PATCH 2/2] Make k8s entity names unique per helm-release

---
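Note (editorial, not part of the commit message): every object the chart
creates (StatefulSet, Services, ConfigMaps, Job, Ingress, ServiceAccount) is
now prefixed with {{ .Release.Name }}, and the Redis Service name is passed in
per release (as <nonce>-redis rather than the release prefix), so resources
from concurrent runs can coexist in one namespace and be inspected per
release. Illustrative only, with hypothetical placeholder values:

    # RELEASE and NAMESPACE below are hypothetical placeholders
    RELEASE=parallel-catchup-ssc-1a2b3c4d
    kubectl get pods -n "${NAMESPACE}" -l app="${RELEASE}-stellar-core"   # worker pods for this run
    kubectl get cm,svc -n "${NAMESPACE}" | grep "${RELEASE}"              # release-prefixed ConfigMaps and Services

Worker pod names become ${RELEASE}-stellar-core-0, -1, ..., which is why
worker.sh now takes the last hyphen-separated segment of POD_NAME as the core
id and the job monitor path includes the release name.
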
 .../MissionHistoryPubnetParallelCatchupV2.fs  | 12 ++++---
 .../parallel_catchup_helm/files/worker.sh     |  4 ++-
 .../templates/catchup_workers.yaml            | 36 +++++++++----------
 .../templates/job_monitor.yaml                | 30 ++++++++--------
 .../templates/job_preload_redis.yaml          | 18 +++++-----
 .../templates/redis_queue.yaml                | 16 ++++-----
 .../parallel_catchup_helm/values.yaml         |  2 +-
 7 files changed, 62 insertions(+), 56 deletions(-)

diff --git a/src/FSLibrary/MissionHistoryPubnetParallelCatchupV2.fs b/src/FSLibrary/MissionHistoryPubnetParallelCatchupV2.fs
index a4dda933..1ab6c4b4 100644
--- a/src/FSLibrary/MissionHistoryPubnetParallelCatchupV2.fs
+++ b/src/FSLibrary/MissionHistoryPubnetParallelCatchupV2.fs
@@ -117,6 +117,10 @@ let installProject (context: MissionContext) =
     let setOptions = ResizeArray()
     setOptions.Add(sprintf "worker.stellar_core_image=%s" context.image)
     setOptions.Add(sprintf "worker.replicas=%d" context.pubnetParallelCatchupNumWorkers)
+
+    // Set Redis hostname to be unique per release
+    setOptions.Add(sprintf "redis.hostname=%s-redis" nonce)
+
     setOptions.Add(sprintf "range_generator.params.starting_ledger=%d" context.pubnetParallelCatchupStartingLedger)

     let endLedger =
@@ -190,7 +194,7 @@ let installProject (context: MissionContext) =
     | None -> ()

     setOptions.Add(sprintf "monitor.hostname=%s" (jobMonitorHostName context))
-    setOptions.Add(sprintf "monitor.path=/%s/(.*)" context.namespaceProperty)
+    setOptions.Add(sprintf "monitor.path=/%s/%s/(.*)" context.namespaceProperty helmReleaseName)
     setOptions.Add(sprintf "monitor.logging_interval_seconds=%d" jobMonitorLoggingIntervalSecs)

     // Set ASAN_OPTIONS if provided
@@ -259,10 +263,10 @@ let installProject (context: MissionContext) =
 // 3. Creates a tar.gz archive and copies it to context.destination directory
 let collectLogsFromPods (context: MissionContext) =
     // Generate pod names based on number of workers
-    // Pod names follow the pattern: stellar-core-0, stellar-core-1, etc.
+    // Pod names follow the pattern: <release-name>-stellar-core-0, <release-name>-stellar-core-1, etc.
     let podNames =
         [ 0 .. context.pubnetParallelCatchupNumWorkers - 1 ]
-        |> List.map (fun i -> sprintf "stellar-core-%d" i)
+        |> List.map (fun i -> sprintf "%s-stellar-core-%d" helmReleaseName i)

     LogInfo "Collecting logs from %d worker pods to directory: %s" (List.length podNames) context.destination.Path

@@ -384,7 +388,7 @@ let historyPubnetParallelCatchupV2 (context: MissionContext) =
     let mutable allJobsFinished = false
     let mutable timeoutLeft = jobMonitorStatusCheckTimeOutSecs
     let mutable timeBeforeNextMetricsCheck = jobMonitorMetricsCheckIntervalSecs
-    let jobMonitorPath = "/" + context.namespaceProperty
+    let jobMonitorPath = "/" + context.namespaceProperty + "/" + helmReleaseName

     while not allJobsFinished do
         Thread.Sleep(jobMonitorStatusCheckIntervalSecs * 1000)
diff --git a/src/MissionParallelCatchup/parallel_catchup_helm/files/worker.sh b/src/MissionParallelCatchup/parallel_catchup_helm/files/worker.sh
index fb57018d..56cd29a1 100644
--- a/src/MissionParallelCatchup/parallel_catchup_helm/files/worker.sh
+++ b/src/MissionParallelCatchup/parallel_catchup_helm/files/worker.sh
@@ -69,7 +69,8 @@ if [ $LMOVE_EXIT_CODE -eq 0 ] && [ -n "$JOB_KEY" ]; then
     fi

     # Push metrics to redis in a transaction to ensure data consistency. Retry for 5min on failures
-    core_id=$(echo "$POD_NAME" | grep -o '[0-9]\+')
+    # Extract the pod ordinal (last hyphen-separated segment) from pod name like "release-name-stellar-core-0"
+    core_id=$(echo "$POD_NAME" | awk -F'-' '{print $NF}')
     # Validate core_id was extracted successfully
     if [ -z "$core_id" ]; then
         echo "Error: Failed to extract core_id from POD_NAME: $POD_NAME"
@@ -103,6 +104,7 @@ else
     # Either Redis command failed OR queue is empty
     if [ $LMOVE_EXIT_CODE -ne 0 ]; then
         echo "Error: Failed to connect to Redis at $REDIS_HOST:$REDIS_PORT"
+        echo "Exit code=$LMOVE_EXIT_CODE, Output: $JOB_KEY"
     else
         echo "$(date) No more jobs in the queue."
     fi
diff --git a/src/MissionParallelCatchup/parallel_catchup_helm/templates/catchup_workers.yaml b/src/MissionParallelCatchup/parallel_catchup_helm/templates/catchup_workers.yaml
index 7121bd36..178a7ddc 100644
--- a/src/MissionParallelCatchup/parallel_catchup_helm/templates/catchup_workers.yaml
+++ b/src/MissionParallelCatchup/parallel_catchup_helm/templates/catchup_workers.yaml
@@ -1,11 +1,11 @@
 apiVersion: v1
 kind: Service
 metadata:
-  name: stellar-core
+  name: {{ .Release.Name }}-stellar-core
 spec:
   clusterIP: None
   selector:
-    app: stellar-core
+    app: {{ .Release.Name }}-stellar-core
   ports:
     - port: 11626
       targetPort: 11626
@@ -13,7 +13,7 @@ spec:
 apiVersion: v1
 kind: ServiceAccount
 metadata:
-  name: stellar-supercluster
+  name: {{ .Release.Name }}-sa
   {{- if .Values.service_account.annotations }}
   annotations:
     {{- range .Values.service_account.annotations }}
@@ -24,22 +24,22 @@ metadata:
 apiVersion: apps/v1
 kind: StatefulSet
 metadata:
-  name: stellar-core
+  name: {{ .Release.Name }}-stellar-core
   labels:
-    app: stellar-core
+    app: {{ .Release.Name }}-stellar-core
 spec:
-  serviceName: "stellar-core"
+  serviceName: "{{ .Release.Name }}-stellar-core"
   podManagementPolicy: Parallel
   replicas: {{ .Values.worker.replicas }}
   selector:
     matchLabels:
-      app: stellar-core
+      app: {{ .Release.Name }}-stellar-core
   template:
     metadata:
       labels:
-        app: stellar-core
+        app: {{ .Release.Name }}-stellar-core
     spec:
-      serviceAccountName: stellar-supercluster
+      serviceAccountName: {{ .Release.Name }}-sa
       {{- if or .Values.worker.requireNodeLabels .Values.worker.avoidNodeLabels }}
       affinity:
         nodeAffinity:
@@ -93,7 +93,7 @@ spec:
             value: {{ .Values.worker.asanOptions | quote }}
           envFrom:
             - configMapRef:
-                name: worker-config
+                name: {{ .Release.Name }}-worker-config
           volumeMounts:
             - name: config
               mountPath: /config
@@ -104,17 +104,17 @@ spec:
       volumes:
         - name: config
           configMap:
-            name: stellar-core-config
+            name: {{ .Release.Name }}-stellar-core-config
         - name: script
          configMap:
-            name: worker-script
+            name: {{ .Release.Name }}-worker-script
         - emptyDir: {}
           name: data-volume
       {{- if not .Values.worker.unevenSched }}
       topologySpreadConstraints:
         - labelSelector:
             matchLabels:
-              app: stellar-core
+              app: {{ .Release.Name }}-stellar-core
           # Note: maxSkew affects dynamic node scheduling with karpenter
           # See https://github.com/stellar/supercluster/issues/330
           maxSkew: 2
@@ -125,7 +125,7 @@ spec:
 apiVersion: v1
 kind: ConfigMap
 metadata:
-  name: worker-script
+  name: {{ .Release.Name }}-worker-script
 data:
   worker.sh: |-
     {{- (.Files.Get "files/worker.sh") | nindent 4 }}
@@ -133,7 +133,7 @@ data:
 apiVersion: v1
 kind: ConfigMap
 metadata:
-  name: stellar-core-config
+  name: {{ .Release.Name }}-stellar-core-config
 data:
   stellar-core.cfg: |-
     {{- if .Values.worker.catchup_skip_known_results_for_testing }}
@@ -154,10 +154,10 @@ data:
 apiVersion: v1
 kind: ConfigMap
 metadata:
-  name: worker-config
+  name: {{ .Release.Name }}-worker-config
 data:
-  REDIS_HOST: "{{ .Values.redis.hostname}}"
-  REDIS_PORT: "{{ .Values.redis.port}}"
+  REDIS_HOST: "{{ .Values.redis.hostname }}"
+  REDIS_PORT: "{{ .Values.redis.port }}"
   JOB_QUEUE: "{{ .Values.redis.job_queue }}"
   SUCCESS_QUEUE: "{{ .Values.redis.success_queue }}"
   FAILED_QUEUE: "{{ .Values.redis.failed_queue }}"
diff --git a/src/MissionParallelCatchup/parallel_catchup_helm/templates/job_monitor.yaml b/src/MissionParallelCatchup/parallel_catchup_helm/templates/job_monitor.yaml
index 11e49816..47f8d708 100644
--- a/src/MissionParallelCatchup/parallel_catchup_helm/templates/job_monitor.yaml
+++ b/src/MissionParallelCatchup/parallel_catchup_helm/templates/job_monitor.yaml
@@ -1,7 +1,7 @@
 apiVersion: networking.k8s.io/v1
 kind: Ingress
 metadata:
-  name: job-monitor-ingress
+  name: {{ .Release.Name }}-job-monitor-ingress
   annotations:
     nginx.ingress.kubernetes.io/use-regex: "true"
     nginx.ingress.kubernetes.io/rewrite-target: /$1
@@ -15,35 +15,35 @@ spec:
             pathType: Prefix
             backend:
               service:
-                name: job-monitor
+                name: {{ .Release.Name }}-job-monitor
                 port:
                   number: 8080
 ---
 apiVersion: v1
 kind: Service
 metadata:
-  name: job-monitor
+  name: {{ .Release.Name }}-job-monitor
 spec:
   type: ClusterIP
   ports:
     - port: 8080
       targetPort: 8080
   selector:
-    app: job-monitor
+    app: {{ .Release.Name }}-job-monitor
 ---
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: job-monitor
+  name: {{ .Release.Name }}-job-monitor
 spec:
   replicas: 1
   selector:
     matchLabels:
-      app: job-monitor
+      app: {{ .Release.Name }}-job-monitor
   template:
     metadata:
       labels:
-        app: job-monitor
+        app: {{ .Release.Name }}-job-monitor
       annotations:
         # Add annotations to tell prometheus service discovery to scrape metrics.
         # In k8s clusters without prometheus the annotations will be dormant
@@ -68,7 +68,7 @@ spec:
                   fieldPath: metadata.namespace
           envFrom:
             - configMapRef:
-                name: job-monitor-config
+                name: {{ .Release.Name }}-job-monitor-config
       initContainers:
         - name: wait-for-preload
           image: redis:7
@@ -76,7 +76,7 @@ spec:
           args:
             - |-
              # Wait until the job queue has items
-              until [ $(redis-cli -h {{ .Values.redis.hostname}} -p {{ .Values.redis.port}} LLEN {{ .Values.redis.job_queue }}) -gt 0 ]; do
+              until [ $(redis-cli -h {{ .Values.redis.hostname }} -p {{ .Values.redis.port }} LLEN {{ .Values.redis.job_queue }}) -gt 0 ]; do
                echo waiting for preload
                sleep 2
              done
@@ -85,16 +85,16 @@ spec:
 apiVersion: v1
 kind: ConfigMap
 metadata:
-  name: job-monitor-config
+  name: {{ .Release.Name }}-job-monitor-config
 data:
-  REDIS_HOST: "{{ .Values.redis.hostname}}"
-  REDIS_PORT: "{{ .Values.redis.port}}"
+  REDIS_HOST: "{{ .Values.redis.hostname }}"
+  REDIS_PORT: "{{ .Values.redis.port }}"
   JOB_QUEUE: "{{ .Values.redis.job_queue }}"
   SUCCESS_QUEUE: "{{ .Values.redis.success_queue }}"
   FAILED_QUEUE: "{{ .Values.redis.failed_queue }}"
   PROGRESS_QUEUE: "{{ .Values.redis.progress_queue }}"
   METRICS: "{{ .Values.redis.metrics }}"
-  WORKER_PREFIX: "stellar-core"
+  WORKER_PREFIX: "{{ .Release.Name }}-stellar-core"
   WORKER_COUNT: "{{ .Values.worker.replicas }}"
-  LOGGING_INTERVAL_SECONDS: "{{ .Values.monitor.logging_interval_seconds}}"
-  LOGGING_LEVEL: "{{ .Values.monitor.logging_level}}"
+  LOGGING_INTERVAL_SECONDS: "{{ .Values.monitor.logging_interval_seconds }}"
+  LOGGING_LEVEL: "{{ .Values.monitor.logging_level }}"
diff --git a/src/MissionParallelCatchup/parallel_catchup_helm/templates/job_preload_redis.yaml b/src/MissionParallelCatchup/parallel_catchup_helm/templates/job_preload_redis.yaml
index 35998368..14de3a7b 100644
--- a/src/MissionParallelCatchup/parallel_catchup_helm/templates/job_preload_redis.yaml
+++ b/src/MissionParallelCatchup/parallel_catchup_helm/templates/job_preload_redis.yaml
@@ -1,7 +1,7 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: preload-redis
+  name: {{ .Release.Name }}-preload-redis
 spec:
   template:
     spec:
@@ -9,7 +9,7 @@ spec:
         - name: preload
           image: redis:7
           command: ["/bin/sh", "-c"]
-          args: 
+          args:
            - |-
              case "$STRATEGY" in
                "uniform")
@@ -24,24 +24,24 @@ spec:
               esac
           envFrom:
             - configMapRef:
-                name: range-generator-config
+                name: {{ .Release.Name }}-range-generator-config
           volumeMounts:
             - name: script
               mountPath: /scripts
       initContainers:
         - name: wait-for-redis
           image: redis:7
-          command: ['sh', '-c', "until redis-cli -h {{ .Values.redis.hostname}} -p {{ .Values.redis.port}} ping; do echo waiting for redis; sleep 2; done;"]
+          command: ['sh', '-c', "until redis-cli -h {{ .Values.redis.hostname }} -p {{ .Values.redis.port }} ping; do echo waiting for redis; sleep 2; done;"]
       restartPolicy: OnFailure
       volumes:
         - name: script
           configMap:
-            name: generator-script
+            name: {{ .Release.Name }}-generator-script
 ---
 apiVersion: v1
 kind: ConfigMap
 metadata:
-  name: generator-script
+  name: {{ .Release.Name }}-generator-script
 data:
   uniform_range_generator.sh: |-
     {{- (.Files.Get "files/uniform_range_generator.sh") | nindent 4 }}
@@ -51,7 +51,7 @@ data:
 apiVersion: v1
 kind: ConfigMap
 metadata:
-  name: range-generator-config
+  name: {{ .Release.Name }}-range-generator-config
 data:
   STRATEGY: "{{ .Values.range_generator.strategy }}"
   STARTING_LEDGER: "{{ .Values.range_generator.params.starting_ledger }}"
@@ -60,5 +60,5 @@ data:
   LEDGERS_PER_JOB: "{{ .Values.range_generator.params.uniform_ledgers_per_job }}"
   LOGARITHMIC_FLOOR_LEDGERS: "{{ .Values.range_generator.params.logarithmic_floor_ledgers }}"
   NUM_PARALLELISM: "192"
-  REDIS_HOST: "{{ .Values.redis.hostname}}"
-  REDIS_PORT: "{{ .Values.redis.port}}"
+  REDIS_HOST: "{{ .Values.redis.hostname }}"
+  REDIS_PORT: "{{ .Values.redis.port }}"
diff --git a/src/MissionParallelCatchup/parallel_catchup_helm/templates/redis_queue.yaml b/src/MissionParallelCatchup/parallel_catchup_helm/templates/redis_queue.yaml
index d3660a61..aaba65fb 100644
--- a/src/MissionParallelCatchup/parallel_catchup_helm/templates/redis_queue.yaml
+++ b/src/MissionParallelCatchup/parallel_catchup_helm/templates/redis_queue.yaml
@@ -2,28 +2,28 @@
 apiVersion: v1
 kind: Service
 metadata:
-  name: "{{ .Values.redis.hostname}}"
+  name: "{{ .Values.redis.hostname }}"
 spec:
   type: ClusterIP
   ports:
-    - port: {{ .Values.redis.port}}
+    - port: {{ .Values.redis.port }}
       targetPort: 6379
   selector:
-    app: redis
+    app: {{ .Values.redis.hostname }}
 ---
 apiVersion: apps/v1
 kind: Deployment
 metadata:
-  name: redis
+  name: {{ .Values.redis.hostname }}
 spec:
   replicas: 1
   selector:
     matchLabels:
-      app: redis
+      app: {{ .Values.redis.hostname }}
   template:
     metadata:
       labels:
-        app: redis
+        app: {{ .Values.redis.hostname }}
     spec:
       containers:
         - name: redis
@@ -33,5 +33,5 @@ spec:
           command: ["redis-server"]
           resources:
             requests:
-              cpu: "{{ .Values.redis.resources.requests.cpu}}"
-              memory: "{{ .Values.redis.resources.requests.memory}}"
\ No newline at end of file
+              cpu: "{{ .Values.redis.resources.requests.cpu }}"
+              memory: "{{ .Values.redis.resources.requests.memory }}"
\ No newline at end of file
diff --git a/src/MissionParallelCatchup/parallel_catchup_helm/values.yaml b/src/MissionParallelCatchup/parallel_catchup_helm/values.yaml
index db9b914a..d2a158fe 100644
--- a/src/MissionParallelCatchup/parallel_catchup_helm/values.yaml
+++ b/src/MissionParallelCatchup/parallel_catchup_helm/values.yaml
@@ -1,5 +1,5 @@
 redis:
-  hostname: "redis"
+  hostname: "" # to be set by the mission
   port: 6379
   job_queue: "ranges"
   success_queue: "succeeded"