Merged
23 changes: 14 additions & 9 deletions src/FSLibrary/MissionHistoryPubnetParallelCatchupV2.fs
@@ -25,7 +25,6 @@ open k8s
open CSLibrary

// Constants
let helmReleaseName = "parallel-catchup"
let helmChartPath = "/supercluster/src/MissionParallelCatchup/parallel_catchup_helm"

// Comment out the path below for local testing
@@ -46,6 +45,7 @@ let failedJobLogFileLineCount = 10000
let failedJobLogStreamLineCount = 1000

let mutable nonce : String = ""
let mutable helmReleaseName : String = ""

let jobMonitorHostName (context: MissionContext) =
match context.jobMonitorExternalHost with
@@ -111,12 +111,16 @@ let serviceAccountAnnotationsToHelmIndexed (index: int) (key: string, value: str
sprintf "service_account.annotations[%d].key=%s,service_account.annotations[%d].value=%s" index key index value

let installProject (context: MissionContext) =
LogInfo "Installing Helm chart..."
LogInfo "Installing Helm chart with release name: %s" helmReleaseName

// install the project with default values from the file and overridden values from the commandline
let setOptions = ResizeArray<string>()
setOptions.Add(sprintf "worker.stellar_core_image=%s" context.image)
setOptions.Add(sprintf "worker.replicas=%d" context.pubnetParallelCatchupNumWorkers)

// Set Redis hostname to be unique per release
setOptions.Add(sprintf "redis.hostname=%s-redis" nonce)

setOptions.Add(sprintf "range_generator.params.starting_ledger=%d" context.pubnetParallelCatchupStartingLedger)

let endLedger =
Expand Down Expand Up @@ -190,7 +194,7 @@ let installProject (context: MissionContext) =
| None -> ()

setOptions.Add(sprintf "monitor.hostname=%s" (jobMonitorHostName context))
setOptions.Add(sprintf "monitor.path=/%s/(.*)" context.namespaceProperty)
setOptions.Add(sprintf "monitor.path=/%s/%s/(.*)" context.namespaceProperty helmReleaseName)
setOptions.Add(sprintf "monitor.logging_interval_seconds=%d" jobMonitorLoggingIntervalSecs)

// Set ASAN_OPTIONS if provided
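
Taken together, the release-scoped overrides assembled in installProject amount to roughly the helm install sketched below. How the mission actually invokes Helm is not shown in this diff, and the nonce, namespace, image, and worker count here are placeholders, not values taken from the change.

    # Illustrative only: nonce "abc123", namespace "pubnet-catchup", 100 workers.
    helm install parallel-catchup-abc123 \
        /supercluster/src/MissionParallelCatchup/parallel_catchup_helm \
        --set worker.stellar_core_image=stellar/stellar-core:latest \
        --set worker.replicas=100 \
        --set redis.hostname=abc123-redis \
        --set "monitor.path=/pubnet-catchup/parallel-catchup-abc123/(.*)"
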
@@ -259,10 +263,10 @@ let installProject (context: MissionContext) =
// 3. Creates a tar.gz archive and copies it to context.destination directory
let collectLogsFromPods (context: MissionContext) =
// Generate pod names based on number of workers
// Pod names follow the pattern: stellar-core-0, stellar-core-1, etc.
// Pod names follow the pattern: <helmReleaseName>-stellar-core-0, <helmReleaseName>-stellar-core-1, etc.
let podNames =
[ 0 .. context.pubnetParallelCatchupNumWorkers - 1 ]
|> List.map (fun i -> sprintf "stellar-core-%d" i)
|> List.map (fun i -> sprintf "%s-stellar-core-%d" helmReleaseName i)

LogInfo "Collecting logs from %d worker pods to directory: %s" (List.length podNames) context.destination.Path

@@ -302,7 +306,7 @@ let collectLogsFromPods (context: MissionContext) =
let cleanup (context: MissionContext) =
if toPerformCleanup then
toPerformCleanup <- false
LogInfo "Cleaning up resources..."
LogInfo "Cleaning up resources for release: %s" helmReleaseName

// Try to collect logs from all worker pods before cleanup
try
@@ -373,17 +377,18 @@ let historyPubnetParallelCatchupV2 (context: MissionContext) =
LogInfo "Running parallel catchup v2 ..."

nonce <- (MakeNetworkNonce context.tag).ToString()
LogDebug "nonce: '%s'" nonce
helmReleaseName <- sprintf "parallel-catchup-%s" nonce
LogDebug "nonce: '%s', release name: '%s'" nonce helmReleaseName

// Set cleanup context so cleanup handlers can access it
cleanupContext <- Some context

installProject (context)
installProject context

let mutable allJobsFinished = false
let mutable timeoutLeft = jobMonitorStatusCheckTimeOutSecs
let mutable timeBeforeNextMetricsCheck = jobMonitorMetricsCheckIntervalSecs
let jobMonitorPath = "/" + context.namespaceProperty
let jobMonitorPath = "/" + context.namespaceProperty + "/" + helmReleaseName

while not allJobsFinished do
Thread.Sleep(jobMonitorStatusCheckIntervalSecs * 1000)
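
Because jobMonitorPath now carries the release name, and the job-monitor ingress later in this diff matches /<namespace>/<release>/(.*) with rewrite-target /$1, status polls are scoped per release. A rough sketch, where the host, namespace, nonce, and the /status endpoint are all placeholders rather than values confirmed by this diff:

    curl "http://job-monitor.example.com/pubnet-catchup/parallel-catchup-abc123/status"
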
@@ -69,7 +69,8 @@ if [ $LMOVE_EXIT_CODE -eq 0 ] && [ -n "$JOB_KEY" ]; then
fi

# Push metrics to redis in a transaction to ensure data consistency. Retry for 5min on failures
core_id=$(echo "$POD_NAME" | grep -o '[0-9]\+')
# Extract the pod ordinal (last hyphen-separated segment) from pod name like "release-name-stellar-core-0"
core_id=$(echo "$POD_NAME" | awk -F'-' '{print $NF}')
# Validate core_id was extracted successfully
if [ -z "$core_id" ]; then
echo "Error: Failed to extract core_id from POD_NAME: $POD_NAME"
@@ -103,6 +104,7 @@ else
# Either Redis command failed OR queue is empty
if [ $LMOVE_EXIT_CODE -ne 0 ]; then
echo "Error: Failed to connect to Redis at $REDIS_HOST:$REDIS_PORT"
echo "Exit code=$LMOVE_EXIT_CODE, Output: $JOB_KEY"
else
echo "$(date) No more jobs in the queue."
fi
@@ -1,19 +1,19 @@
apiVersion: v1
kind: Service
metadata:
name: stellar-core
name: {{ .Release.Name }}-stellar-core
spec:
clusterIP: None
selector:
app: stellar-core
app: {{ .Release.Name }}-stellar-core
ports:
- port: 11626
targetPort: 11626
---
apiVersion: v1
kind: ServiceAccount
metadata:
name: stellar-supercluster
name: {{ .Release.Name }}-sa
{{- if .Values.service_account.annotations }}
annotations:
{{- range .Values.service_account.annotations }}
@@ -24,22 +24,22 @@ metadata:
apiVersion: apps/v1
kind: StatefulSet
metadata:
name: stellar-core
name: {{ .Release.Name }}-stellar-core
labels:
app: stellar-core
app: {{ .Release.Name }}-stellar-core
spec:
serviceName: "stellar-core"
serviceName: "{{ .Release.Name }}-stellar-core"
podManagementPolicy: Parallel
replicas: {{ .Values.worker.replicas }}
selector:
matchLabels:
app: stellar-core
app: {{ .Release.Name }}-stellar-core
template:
metadata:
labels:
app: stellar-core
app: {{ .Release.Name }}-stellar-core
spec:
serviceAccountName: stellar-supercluster
serviceAccountName: {{ .Release.Name }}-sa
{{- if or .Values.worker.requireNodeLabels .Values.worker.avoidNodeLabels }}
affinity:
nodeAffinity:
@@ -93,7 +93,7 @@ spec:
value: {{ .Values.worker.asanOptions | quote }}
envFrom:
- configMapRef:
name: worker-config
name: {{ .Release.Name }}-worker-config
volumeMounts:
- name: config
mountPath: /config
@@ -104,17 +104,17 @@ spec:
volumes:
- name: config
configMap:
name: stellar-core-config
name: {{ .Release.Name }}-stellar-core-config
- name: script
configMap:
name: worker-script
name: {{ .Release.Name }}-worker-script
- emptyDir: {}
name: data-volume
{{- if not .Values.worker.unevenSched }}
topologySpreadConstraints:
- labelSelector:
matchLabels:
app: stellar-core
app: {{ .Release.Name }}-stellar-core
# Note: maxSkew affects dynamic node scheduling with karpenter
# See https://github.com/stellar/supercluster/issues/330
maxSkew: 2
@@ -125,15 +125,15 @@ spec:
apiVersion: v1
kind: ConfigMap
metadata:
name: worker-script
name: {{ .Release.Name }}-worker-script
data:
worker.sh: |-
{{- (.Files.Get "files/worker.sh") | nindent 4 }}
---
apiVersion: v1
kind: ConfigMap
metadata:
name: stellar-core-config
name: {{ .Release.Name }}-stellar-core-config
data:
stellar-core.cfg: |-
{{- if .Values.worker.catchup_skip_known_results_for_testing }}
@@ -154,10 +154,10 @@ data:
apiVersion: v1
kind: ConfigMap
metadata:
name: worker-config
name: {{ .Release.Name }}-worker-config
data:
REDIS_HOST: "{{ .Values.redis.hostname}}"
REDIS_PORT: "{{ .Values.redis.port}}"
REDIS_HOST: "{{ .Values.redis.hostname }}"
REDIS_PORT: "{{ .Values.redis.port }}"
JOB_QUEUE: "{{ .Values.redis.job_queue }}"
SUCCESS_QUEUE: "{{ .Values.redis.success_queue }}"
FAILED_QUEUE: "{{ .Values.redis.failed_queue }}"
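
Renaming both the headless Service (clusterIP: None) and the StatefulSet's serviceName gives every worker a release-scoped stable network identity, so two catchup releases in the same namespace no longer collide on DNS. With an illustrative release and namespace, worker 0 resolves as:

    # Standard StatefulSet DNS: <pod>.<service>.<namespace>.svc.cluster.local
    nslookup parallel-catchup-abc123-stellar-core-0.parallel-catchup-abc123-stellar-core.pubnet-catchup.svc.cluster.local
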
@@ -1,7 +1,7 @@
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
name: job-monitor-ingress
name: {{ .Release.Name }}-job-monitor-ingress
annotations:
nginx.ingress.kubernetes.io/use-regex: "true"
nginx.ingress.kubernetes.io/rewrite-target: /$1
@@ -15,35 +15,35 @@ spec:
pathType: Prefix
backend:
service:
name: job-monitor
name: {{ .Release.Name }}-job-monitor
port:
number: 8080
---
apiVersion: v1
kind: Service
metadata:
name: job-monitor
name: {{ .Release.Name }}-job-monitor
spec:
type: ClusterIP
ports:
- port: 8080
targetPort: 8080
selector:
app: job-monitor
app: {{ .Release.Name }}-job-monitor
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: job-monitor
name: {{ .Release.Name }}-job-monitor
spec:
replicas: 1
selector:
matchLabels:
app: job-monitor
app: {{ .Release.Name }}-job-monitor
template:
metadata:
labels:
app: job-monitor
app: {{ .Release.Name }}-job-monitor
annotations:
# Add annotations to tell prometheus service discovery to scrape metrics.
# In k8s clusters without prometheus the annotations will be dormant
@@ -68,15 +68,15 @@ spec:
fieldPath: metadata.namespace
envFrom:
- configMapRef:
name: job-monitor-config
name: {{ .Release.Name }}-job-monitor-config
initContainers:
- name: wait-for-preload
image: redis:7
command: ['/bin/sh', '-c']
args:
- |-
# Wait until the job queue has items
until [ $(redis-cli -h {{ .Values.redis.hostname}} -p {{ .Values.redis.port}} LLEN {{ .Values.redis.job_queue }}) -gt 0 ]; do
until [ $(redis-cli -h {{ .Values.redis.hostname }} -p {{ .Values.redis.port }} LLEN {{ .Values.redis.job_queue }}) -gt 0 ]; do
echo waiting for preload
sleep 2
done
@@ -85,16 +85,16 @@ spec:
apiVersion: v1
kind: ConfigMap
metadata:
name: job-monitor-config
name: {{ .Release.Name }}-job-monitor-config
data:
REDIS_HOST: "{{ .Values.redis.hostname}}"
REDIS_PORT: "{{ .Values.redis.port}}"
REDIS_HOST: "{{ .Values.redis.hostname }}"
REDIS_PORT: "{{ .Values.redis.port }}"
JOB_QUEUE: "{{ .Values.redis.job_queue }}"
SUCCESS_QUEUE: "{{ .Values.redis.success_queue }}"
FAILED_QUEUE: "{{ .Values.redis.failed_queue }}"
PROGRESS_QUEUE: "{{ .Values.redis.progress_queue }}"
METRICS: "{{ .Values.redis.metrics }}"
WORKER_PREFIX: "stellar-core"
WORKER_PREFIX: "{{ .Release.Name }}-stellar-core"
WORKER_COUNT: "{{ .Values.worker.replicas }}"
LOGGING_INTERVAL_SECONDS: "{{ .Values.monitor.logging_interval_seconds}}"
LOGGING_LEVEL: "{{ .Values.monitor.logging_level}}"
LOGGING_INTERVAL_SECONDS: "{{ .Values.monitor.logging_interval_seconds }}"
LOGGING_LEVEL: "{{ .Values.monitor.logging_level }}"
@@ -1,15 +1,15 @@
apiVersion: batch/v1
kind: Job
metadata:
name: preload-redis
name: {{ .Release.Name }}-preload-redis
spec:
template:
spec:
containers:
- name: preload
image: redis:7
command: ["/bin/sh", "-c"]
args:
args:
- |-
case "$STRATEGY" in
"uniform")
@@ -24,24 +24,24 @@ spec:
esac
envFrom:
- configMapRef:
name: range-generator-config
name: {{ .Release.Name }}-range-generator-config
volumeMounts:
- name: script
mountPath: /scripts
initContainers:
- name: wait-for-redis
image: redis:7
command: ['sh', '-c', "until redis-cli -h {{ .Values.redis.hostname}} -p {{ .Values.redis.port}} ping; do echo waiting for redis; sleep 2; done;"]
command: ['sh', '-c', "until redis-cli -h {{ .Values.redis.hostname }} -p {{ .Values.redis.port }} ping; do echo waiting for redis; sleep 2; done;"]
restartPolicy: OnFailure
volumes:
- name: script
configMap:
name: generator-script
name: {{ .Release.Name }}-generator-script
---
apiVersion: v1
kind: ConfigMap
metadata:
name: generator-script
name: {{ .Release.Name }}-generator-script
data:
uniform_range_generator.sh: |-
{{- (.Files.Get "files/uniform_range_generator.sh") | nindent 4 }}
@@ -51,7 +51,7 @@ data:
apiVersion: v1
kind: ConfigMap
metadata:
name: range-generator-config
name: {{ .Release.Name }}-range-generator-config
data:
STRATEGY: "{{ .Values.range_generator.strategy }}"
STARTING_LEDGER: "{{ .Values.range_generator.params.starting_ledger }}"
@@ -60,5 +60,5 @@ data:
LEDGERS_PER_JOB: "{{ .Values.range_generator.params.uniform_ledgers_per_job }}"
LOGARITHMIC_FLOOR_LEDGERS: "{{ .Values.range_generator.params.logarithmic_floor_ledgers }}"
NUM_PARALLELISM: "192"
REDIS_HOST: "{{ .Values.redis.hostname}}"
REDIS_PORT: "{{ .Values.redis.port}}"
REDIS_HOST: "{{ .Values.redis.hostname }}"
REDIS_PORT: "{{ .Values.redis.port }}"
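
The preload job, the wait-for-redis init container, and the job monitor's wait-for-preload check all target the same release-scoped Redis host, so a manual spot check that ranges were enqueued can reuse the environment injected by the rendered config maps. The snippet assumes it runs in a pod that receives REDIS_HOST, REDIS_PORT, and JOB_QUEUE from the worker-config or job-monitor-config above:

    # Number of catchup ranges still waiting in this release's job queue.
    redis-cli -h "$REDIS_HOST" -p "$REDIS_PORT" LLEN "$JOB_QUEUE"
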