ITISFoundation · YuryHrytsuk · Sep 29, 2025 · Oct 1, 2025 · Oct 1, 2025 · Oct 1, 2025
@@ -4,49 +4,62 @@ REPO_BASE_DIR := $(shell git rev-parse --show-toplevel)
 include ${REPO_BASE_DIR}/scripts/common.Makefile
 include $(REPO_CONFIG_LOCATION)
 
+#
+# Vars
+#
+
 CONFIG_DIR := $(shell dirname $(REPO_CONFIG_LOCATION))
 CHART_DIRS := $(wildcard $(REPO_BASE_DIR)/charts/*/)
 
+HELMFILE_EXTRA_ARGS ?=
+
+HELMFILE := helmfile $(HELMFILE_EXTRA_ARGS)
+
+#
+# Help Targets
+#
+
 .PHONY: .check-helmfile-installed
+# Guard target: aborts the build with a non-zero status when `helmfile` is not on PATH,
+# so dependent targets stop here instead of failing later with a less helpful error.
 .check-helmfile-installed: ## Checks if helmfile is installed
 	@if ! command -v helmfile >/dev/null 2>&1; then \
 			echo "'helmfile' is not installed. Install it to continue ...";\
+			exit 1; \
 	fi
 
+#
+# Artifacts
+#
+
 helmfile.yaml: simcore-charts/helmfile.yaml ## Copies the helmfile.yaml to the charts directory
 	cp $(CONFIG_DIR)/$@ $(REPO_BASE_DIR)/charts/helmfile.yaml
 
 simcore-charts/helmfile.yaml: ## Copies the simcore helmfile to the charts directory
 	cp $(CONFIG_DIR)/helmfile.simcore.yaml $(REPO_BASE_DIR)/charts/$@
 
+#
+# Targets
+#
+
 .PHONY: helmfile-lint
+# Uses the same explicit helmfile path as the apply/sync/diff/delete targets so that
+# linting does not depend on the directory make is invoked from.
 helmfile-lint: .check-helmfile-installed helmfile.yaml ## Lints the helmfile
 	set -a; source $(REPO_CONFIG_LOCATION); set +a; \
-	helmfile lint
+	$(HELMFILE) -f $(REPO_BASE_DIR)/charts/helmfile.yaml lint
 
 .PHONY: helmfile-apply
 helmfile-apply: .check-helmfile-installed helmfile.yaml ## Applies the helmfile configuration
 	set -a; source $(REPO_CONFIG_LOCATION); set +a; \
-	helmfile -f $(REPO_BASE_DIR)/charts/helmfile.yaml apply
+	$(HELMFILE) -f $(REPO_BASE_DIR)/charts/helmfile.yaml apply
 
 .PHONY: helmfile-sync
 helmfile-sync: .check-helmfile-installed helmfile.yaml ## Syncs the helmfile configuration (use `helmfile-apply` to deploy the app)
 	set -a; source $(REPO_CONFIG_LOCATION); set +a; \
-	helmfile -f $(REPO_BASE_DIR)/charts/helmfile.yaml sync
+	$(HELMFILE) -f $(REPO_BASE_DIR)/charts/helmfile.yaml sync
 
 .PHONY: helmfile-diff
 helmfile-diff: .check-helmfile-installed helmfile.yaml ## Shows the differences that would be applied by helmfile
 	@set -a; source $(REPO_CONFIG_LOCATION); set +a; \
-	helmfile -f $(REPO_BASE_DIR)/charts/helmfile.yaml diff
+	$(HELMFILE) -f $(REPO_BASE_DIR)/charts/helmfile.yaml diff
 
 .PHONY: helmfile-delete
 helmfile-delete: .check-helmfile-installed helmfile.yaml ## Deletes the helmfile configuration
 	@set -a; source $(REPO_CONFIG_LOCATION); set +a; \
-	helmfile -f $(REPO_BASE_DIR)/charts/helmfile.yaml delete
-
-.PHONY: up
-up: helmfile-apply ## Start the stack
-
-.PHONY: leave
-leave: ## Leaves kind cluster
-	kind delete clusters kind
+	$(HELMFILE) -f $(REPO_BASE_DIR)/charts/helmfile.yaml delete
@@ -31,7 +31,7 @@ spec:
           - 172.16.0.0/12
           - 192.168.0.0/16
         ports:
-          - 6443
+          - {{ .Values.kubeApiServerPort }}
     # 6. TCP: cert-manager (controller) -> DNS API endpoints (for ACME DNS01)
     - action: Allow
       protocol: TCP

@@ -1,3 +1,5 @@
+kubeApiServerPort: {{ .Values.kubeApiServerPort }}
+
 cert-manager:
   crds:
     enabled: true

@@ -0,0 +1,19 @@
+
+## High Availability
+
+Prometheus Server
+* Issue asking how to configure it in `kube-prometheus-stack` https://github.com/prometheus-community/helm-charts/issues/6184
+* Prometheus Operator Documentation https://github.com/prometheus-operator/prometheus-operator/blob/v0.85.0/Documentation/platform/high-availability.md#prometheus
+
+Prometheus Operator
+* Not needed. See https://github.com/prometheus-operator/prometheus-operator/issues/2491
+
+## FAQ
+
+How to expose workload metrics
+* Use ServiceMonitor, PodMonitor or Running exporters. See https://github.com/prometheus-community/helm-charts/blob/kube-prometheus-stack-77.12.0/charts/kube-prometheus-stack/README.md#prometheusioscrape
+* Make sure the network policies of Prometheus and of the workload allow all necessary ingress and egress
+  - Prometheus must be able to egress to the metrics port, and the workload must allow ingress on it
+
+Pod Monitor vs Service Monitor:
+* https://github.com/prometheus-operator/prometheus-operator/issues/3119
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: monitoring
+  labels:
+    pod-security.kubernetes.io/enforce: restricted
+
+---
+
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: monitoring-privileged
+  labels:
+    pod-security.kubernetes.io/enforce: privileged
@@ -0,0 +1,9 @@
+prometheus:
+  prometheusSpec:
+    storageSpec:
+      volumeClaimTemplate:
+        spec:
+          resources:
+            requests:
+              storage: 100Gi
+          storageClassName: "{{ .Values.ebsStorageClassName }}"
@@ -0,0 +1,9 @@
+prometheus:
+  prometheusSpec:
+    storageSpec:
+      volumeClaimTemplate:
+        spec:
+          resources:
+            requests:
+              storage: 10Gi
+          storageClassName: "{{ .Values.kindDefaultStorageClassName }}"
@@ -0,0 +1,9 @@
+prometheus:
+  prometheusSpec:
+    storageSpec:
+      volumeClaimTemplate:
+        spec:
+          resources:
+            requests:
+              storage: 100Gi
+          storageClassName: "{{ .Values.topolvmStorageClassName }}"
@@ -0,0 +1,166 @@
+alertmanager:
+  enabled: false
+
+defaultRules:
+  create: false
+
+grafana:
+  enabled: false
+
+kubeApiServer:
+  enabled: false
+
+# container metrics (cpu / memory)
+kubelet:
+  enabled: true
+
+kubeControllerManager:
+  enabled: false
+
+coreDns:
+  enabled: false
+
+kubeEtcd:
+  enabled: false
+
+kubeScheduler:
+  enabled: false
+
+kubeDns:
+  enabled: false
+
+kubeProxy:
+  enabled: false
+
+kubeStateMetrics:
+  enabled: false
+
+nodeExporter:
+  enabled: true
+
+prometheus-node-exporter:
+  namespaceOverride: "{{ .Release.Namespace }}-privileged"
+
+thanosRuler:
+  enabled: false
+
+prometheusOperator:
+  enabled: true
+
+  networkPolicy:
+    enabled: true
+    flavor: kubernetes
+
+  resources:
+    limits:
+      cpu: 1
+      memory: 1Gi
+    requests:
+      cpu: 0.1
+      memory: 256Mi
+
+  nodeSelector:
+    ops: "true"
+
+  tls:
+    internalPort: &prometheusOperatorInternalPort 10250
+
+prometheus:
+  enabled: true
+
+  service:
+    port: &prometheusServicePort 9090
+
+  networkPolicy:
+    enabled: true
+    flavor: kubernetes
+
+    ingress:
+      - from:
+        - namespaceSelector:
+            matchLabels:
+              kubernetes.io/metadata.name: traefik
+          podSelector:
+            matchLabels:
+              app.kubernetes.io/name: traefik
+        ports:
+          - port: *prometheusServicePort
+            protocol: TCP
+    egress:
+      - ports:
+          # generic port for metrics
+          - port: 9100
+            protocol: TCP
+      - ports:
+          # prometheus operator
+          - port: *prometheusOperatorInternalPort
+            protocol: TCP
+      - ports:
+          # kube api server
+          - port: {{ .Values.kubeApiServerPort }}
+            protocol: TCP
+
+  # enable once object storage needed
+  thanosService:
+    enabled: false
+
+  ingress:
+    enabled: true
+    ingressClassName: ""
+    annotations:
+        namespace: "{{ .Release.Namespace }}"
+        cert-manager.io/cluster-issuer: "cert-issuer"
+        traefik.ingress.kubernetes.io/router.entrypoints: websecure
+        traefik.ingress.kubernetes.io/router.middlewares: traefik-traefik-basic-auth@kubernetescrd # namespace + middleware name
+    tls:
+      - secretName: monitoring-tls
+        hosts:
+          - {{ requiredEnv "K8S_MONITORING_FQDN" }}
+    hosts:
+      - {{ requiredEnv "K8S_MONITORING_FQDN" }}
+    paths:
+      - &pathprefix /prometheus
+    pathType: Prefix
+
+  prometheusSpec:
+    # Use 2+ for HA
+    replicas: 1
+
+    # Done for HA
+    # Needed to differentiate Prometheus instances that share the same setup
+    # https://github.com/prometheus-operator/prometheus-operator/blob/v0.85.0/Documentation/platform/high-availability.md#prometheus
+    # External labels do not show up in metrics. See https://github.com/prometheus-operator/prometheus-operator/issues/2918#issuecomment-567009499
+    replicaExternalLabelName: "prometheus_replica"
+
+    # Done for HA
+    # Enforce replicas running on different nodes
+    # Otherwise it does not make sense from HA perspective
+    podAntiAffinity: "hard"
+
+    routePrefix: *pathprefix
+
+    retention: 90d
+
+    # Size-based retention must stay below the volume capacity, otherwise the disk
+    # can fill up before old blocks are pruned (Prometheus recommends at most
+    # 80-85% of the allocated disk). The EBS and topolvm overlays request 100Gi.
+    # NOTE(review): the kind overlay requests only 10Gi and needs its own, smaller value.
+    retentionSize: 80GiB
+
+    scrapeInterval: 30s
+    scrapeTimeout: 10s
+
+    nodeSelector:
+      ops: "true"
+
+    # https://github.com/prometheus-community/helm-charts/blob/kube-prometheus-stack-77.12.0/charts/kube-prometheus-stack/README.md#prometheusioscrape
+    podMonitorSelectorNilUsesHelmValues: false
+    serviceMonitorSelectorNilUsesHelmValues: false
+
+    resources:
+      requests:
+        memory: 2Gi
+        cpu: 1
+      limits:
+        memory: 4Gi
+        cpu: 2
+
+    persistentVolumeClaimRetentionPolicy:
+      whenDeleted: Retain
+      whenScaled: Retain
@@ -13,7 +13,7 @@ spec:
       # connect to the Kubernetes API server
       destination:
         ports:
-          - 6443
+          - {{ .Values.kubeApiServerPort }}
         nets:
           - 10.0.0.0/8
           - 172.16.0.0/12

@@ -1,5 +1,7 @@
 servicePort: &servicePort 9000
 
+kubeApiServerPort: {{ .Values.kubeApiServerPort }}
+
 portainer:
   replicaCount: 1
 

@@ -35,3 +35,10 @@ affinity:  # https://github.com/traefik/traefik-helm-chart/blob/v28.2.0/traefik/
             app.kubernetes.io/name: '{{`{{ template "traefik.name" . }}`}}'
             app.kubernetes.io/instance: '{{ .Release.Name }}'
         topologyKey: kubernetes.io/hostname
+
+metrics:
+  prometheus:
+    service:
+      enabled: true
+    serviceMonitor:
+      enabled: true