Skip to content

Commit 70474d0

Browse files
committed
Add prometheus Helm chart
Signed-off-by: Todd Short <tshort@redhat.com>
1 parent 5f5b6ba commit 70474d0

15 files changed

+359
-9
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -294,7 +294,7 @@ test-experimental-e2e: run-internal image-registry prometheus experimental-e2e e
294294
prometheus: PROMETHEUS_NAMESPACE := olmv1-system
295295
prometheus: PROMETHEUS_VERSION := v0.83.0
296296
prometheus: $(KUSTOMIZE) #EXHELP Deploy Prometheus into specified namespace
297-
./hack/test/install-prometheus.sh $(PROMETHEUS_NAMESPACE) $(PROMETHEUS_VERSION) $(KUSTOMIZE) $(VERSION)
297+
./hack/test/install-prometheus.sh $(PROMETHEUS_NAMESPACE) $(PROMETHEUS_VERSION) $(VERSION)
298298

299299
.PHONY: test-extension-developer-e2e
300300
test-extension-developer-e2e: SOURCE_MANIFEST := $(STANDARD_E2E_MANIFEST)

hack/test/install-prometheus.sh

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,42 @@
11
#!/bin/bash
22

3+
source ".bingo/variables.env"
4+
35
set -euo pipefail
46

57
help="install-prometheus.sh is used to set up prometheus monitoring for e2e testing.
68
Usage:
7-
install-prometheus.sh [PROMETHEUS_NAMESPACE] [PROMETHEUS_VERSION] [KUSTOMIZE] [GIT_VERSION]
9+
install-prometheus.sh [PROMETHEUS_NAMESPACE] [PROMETHEUS_VERSION] [GIT_VERSION]
810
"
911

10-
if [[ "$#" -ne 4 ]]; then
12+
if [[ "$#" -ne 3 ]]; then
1113
echo "Illegal number of arguments passed"
1214
echo "${help}"
1315
exit 1
1416
fi
1517

1618
PROMETHEUS_NAMESPACE="$1"
1719
PROMETHEUS_VERSION="$2"
18-
KUSTOMIZE="$3"
19-
GIT_VERSION="$4"
20+
GIT_VERSION="$3"
2021

2122
TMPDIR="$(mktemp -d)"
22-
trap 'echo "Cleaning up $TMPDIR"; rm -rf "$TMPDIR"' EXIT
23+
# Remove the temporary download directory on any exit path (success, failure under `set -e`, or signal-triggered exit).
trap 'echo "Cleaning up $TMPDIR"; rm -rf "$TMPDIR"' EXIT
2324

2425
echo "Downloading Prometheus resources..."
2526
curl -s "https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/refs/tags/${PROMETHEUS_VERSION}/kustomization.yaml" > "${TMPDIR}/kustomization.yaml"
2627
curl -s "https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/refs/tags/${PROMETHEUS_VERSION}/bundle.yaml" > "${TMPDIR}/bundle.yaml"
2728

2829
echo "Patching namespace to ${PROMETHEUS_NAMESPACE}..."
29-
(cd "$TMPDIR" && $KUSTOMIZE edit set namespace "$PROMETHEUS_NAMESPACE")
30+
(cd "$TMPDIR" && ${KUSTOMIZE} edit set namespace "$PROMETHEUS_NAMESPACE")
3031

3132
echo "Applying Prometheus base..."
3233
kubectl apply -k "$TMPDIR" --server-side
3334

3435
echo "Waiting for Prometheus Operator pod to become ready..."
3536
kubectl wait --for=condition=Ready pod -n "$PROMETHEUS_NAMESPACE" -l app.kubernetes.io/name=prometheus-operator
3637

37-
echo "Applying overlay config..."
38-
$KUSTOMIZE build config/overlays/prometheus | sed "s/cert-git-version/cert-${VERSION}/g" | kubectl apply -f -
38+
echo "Applying prometheus Helm chart..."
39+
# Render the chart locally, rewrite the placeholder "cert-git-version" to carry the
# version given as the script's third argument (GIT_VERSION), and apply the result.
# HELM is not set in this script; presumably it comes from the sourced .bingo/variables.env.
${HELM} template prometheus helm/prometheus | sed "s/cert-git-version/cert-${GIT_VERSION}/g" | kubectl apply -f -
3940

4041
echo "Waiting for metrics scraper to become ready..."
4142
kubectl wait --for=create pods -n "$PROMETHEUS_NAMESPACE" prometheus-prometheus-0 --timeout=60s

helm/prometheus/Chart.yaml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
apiVersion: v2
2+
name: prometheus
3+
description: A Helm chart of Prometheus resources for OLMv1
4+
5+
# A chart can be either an 'application' or a 'library' chart.
6+
#
7+
# Application charts are a collection of templates that can be packaged into versioned archives
8+
# to be deployed.
9+
#
10+
# Library charts provide useful utilities or functions for the chart developer. They're included as
11+
# a dependency of application charts to inject those utilities and functions into the rendering
12+
# pipeline. Library charts do not define any templates and therefore cannot be deployed.
13+
type: application
14+
15+
# This is the chart version. This version number should be incremented each time you make changes
16+
# to the chart and its templates, including the app version.
17+
# Versions are expected to follow Semantic Versioning (https://semver.org/)
18+
version: 0.1.0
19+
20+
# This is the version number of the application being deployed. This version number should be
21+
# incremented each time you make changes to the application. Versions are not expected to
22+
# follow Semantic Versioning. They should reflect the version the application is using.
23+
# It is recommended to use it with quotes.
24+
appVersion: "1.3.0"
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
---
2+
apiVersion: rbac.authorization.k8s.io/v1
3+
kind: ClusterRole
4+
metadata:
5+
name: prometheus
6+
rules:
7+
- apiGroups:
8+
- ""
9+
resources:
10+
- nodes
11+
- nodes/metrics
12+
- services
13+
- endpoints
14+
- pods
15+
verbs:
16+
- get
17+
- list
18+
- watch
19+
- apiGroups:
20+
- ""
21+
resources:
22+
- configmaps
23+
verbs:
24+
- get
25+
- apiGroups:
26+
- discovery.k8s.io
27+
resources:
28+
- endpointslices
29+
verbs:
30+
- get
31+
- list
32+
- watch
33+
- apiGroups:
34+
- networking.k8s.io
35+
resources:
36+
- ingresses
37+
verbs:
38+
- get
39+
- list
40+
- watch
41+
- nonResourceURLs:
42+
- /metrics
43+
verbs:
44+
- get
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
---
2+
apiVersion: rbac.authorization.k8s.io/v1
3+
kind: ClusterRoleBinding
4+
metadata:
5+
name: prometheus
6+
roleRef:
7+
apiGroup: rbac.authorization.k8s.io
8+
kind: ClusterRole
9+
name: prometheus
10+
subjects:
11+
- kind: ServiceAccount
12+
name: prometheus
13+
namespace: {{ .Values.namespaces.olmv1.name }}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
---
2+
apiVersion: networking.k8s.io/v1
3+
kind: NetworkPolicy
4+
metadata:
5+
name: prometheus
6+
namespace: {{ .Values.namespaces.olmv1.name }}
7+
spec:
8+
egress:
9+
- {}
10+
ingress:
11+
- {}
12+
podSelector:
13+
matchLabels:
14+
app.kubernetes.io/name: prometheus
15+
policyTypes:
16+
- Egress
17+
- Ingress
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
---
2+
apiVersion: monitoring.coreos.com/v1
3+
kind: Prometheus
4+
metadata:
5+
name: prometheus
6+
namespace: {{ .Values.namespaces.olmv1.name }}
7+
spec:
8+
logLevel: debug
9+
ruleSelector: {}
10+
scrapeInterval: 1m
11+
scrapeTimeout: 30s
12+
securityContext:
13+
runAsNonRoot: true
14+
runAsUser: 65534
15+
seccompProfile:
16+
type: RuntimeDefault
17+
serviceAccountName: prometheus
18+
serviceDiscoveryRole: EndpointSlice
19+
serviceMonitorSelector: {}
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
---
2+
apiVersion: monitoring.coreos.com/v1
3+
kind: PrometheusRule
4+
metadata:
5+
name: controller-alerts
6+
namespace: {{ .Values.namespaces.olmv1.name }}
7+
spec:
8+
groups:
9+
- name: controller-panic
10+
rules:
11+
- alert: reconciler-panic
12+
annotations:
13+
description: controller of pod {{`{{ $labels.pod }}`}} experienced panic(s); count={{`{{ $value }}`}}
14+
expr: controller_runtime_reconcile_panics_total{} > 0
15+
- alert: webhook-panic
16+
annotations:
17+
description: controller webhook of pod {{`{{ $labels.pod }}`}} experienced panic(s); count={{`{{ $value }}`}}
18+
expr: controller_runtime_webhook_panics_total{} > 0
19+
- name: resource-usage
20+
rules:
21+
- alert: oom-events
22+
annotations:
23+
description: container {{`{{ $labels.container }}`}} of pod {{`{{ $labels.pod }}`}} experienced OOM event(s); count={{`{{ $value }}`}}
24+
expr: container_oom_events_total > 0
25+
- alert: operator-controller-memory-growth
26+
annotations:
27+
description: 'operator-controller pod memory usage growing at a high rate for 5 minutes: {{`{{ $value | humanize }}`}}B/sec'
28+
expr: deriv(sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"})[5m:]) > 100_000
29+
for: 5m
30+
keep_firing_for: 1d
31+
- alert: catalogd-memory-growth
32+
annotations:
33+
description: 'catalogd pod memory usage growing at a high rate for 5 minutes: {{`{{ $value | humanize }}`}}B/sec'
34+
expr: deriv(sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"})[5m:]) > 100_000
35+
for: 5m
36+
keep_firing_for: 1d
37+
- alert: operator-controller-memory-usage
38+
annotations:
39+
description: 'operator-controller pod using high memory resources for the last 5 minutes: {{`{{ $value | humanize }}`}}B'
40+
expr: sum(container_memory_working_set_bytes{pod=~"operator-controller.*",container="manager"}) > 100_000_000
41+
for: 5m
42+
keep_firing_for: 1d
43+
- alert: catalogd-memory-usage
44+
annotations:
45+
description: 'catalogd pod using high memory resources for the last 5 minutes: {{`{{ $value | humanize }}`}}B'
46+
expr: sum(container_memory_working_set_bytes{pod=~"catalogd.*",container="manager"}) > 75_000_000
47+
for: 5m
48+
keep_firing_for: 1d
49+
- alert: operator-controller-cpu-usage
50+
annotations:
51+
description: 'operator-controller using high cpu resource for 5 minutes: {{`{{ $value | printf "%.2f" }}`}}%'
52+
expr: rate(container_cpu_usage_seconds_total{pod=~"operator-controller.*",container="manager"}[5m]) * 100 > 20
53+
for: 5m
54+
keep_firing_for: 1d
55+
- alert: catalogd-cpu-usage
56+
annotations:
57+
description: 'catalogd using high cpu resources for 5 minutes: {{`{{ $value | printf "%.2f" }}`}}%'
58+
expr: rate(container_cpu_usage_seconds_total{pod=~"catalogd.*",container="manager"}[5m]) * 100 > 20
59+
for: 5m
60+
keep_firing_for: 1d
61+
- alert: operator-controller-api-call-rate
62+
annotations:
63+
description: 'operator-controller making excessive API calls for 5 minutes: {{`{{ $value | printf "%.2f" }}`}}/sec'
64+
expr: sum(rate(rest_client_requests_total{job=~"operator-controller-service"}[5m])) > 10
65+
for: 5m
66+
keep_firing_for: 1d
67+
- alert: catalogd-api-call-rate
68+
annotations:
69+
description: 'catalogd making excessive API calls for 5 minutes: {{`{{ $value | printf "%.2f" }}`}}/sec'
70+
expr: sum(rate(rest_client_requests_total{job=~"catalogd-service"}[5m])) > 5
71+
for: 5m
72+
keep_firing_for: 1d
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
---
2+
apiVersion: v1
3+
kind: Secret
4+
metadata:
5+
annotations:
6+
kubernetes.io/service-account.name: prometheus
7+
name: prometheus-metrics-token
8+
namespace: {{ .Values.namespaces.olmv1.name }}
9+
type: kubernetes.io/service-account-token
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
---
2+
apiVersion: v1
3+
kind: Service
4+
metadata:
5+
name: prometheus-service
6+
namespace: {{ .Values.namespaces.prometheus.name }}
7+
spec:
8+
ports:
9+
- name: web
10+
nodePort: 30900
11+
port: 9090
12+
protocol: TCP
13+
targetPort: web
14+
selector:
15+
prometheus: prometheus
16+
type: NodePort

0 commit comments

Comments
 (0)