Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 25 additions & 12 deletions charts/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,49 +4,62 @@ REPO_BASE_DIR := $(shell git rev-parse --show-toplevel)
include ${REPO_BASE_DIR}/scripts/common.Makefile
include $(REPO_CONFIG_LOCATION)

#
# Vars
#

# Directory that contains the repo config file (REPO_CONFIG_LOCATION is
# included above); the helmfile sources are copied from here.
CONFIG_DIR := $(shell dirname $(REPO_CONFIG_LOCATION))
# Every sub-directory of charts/ (each entry keeps its trailing slash).
CHART_DIRS := $(wildcard $(REPO_BASE_DIR)/charts/*/)

# Extra command-line arguments for helmfile; empty unless the caller sets it
# (environment or `make HELMFILE_EXTRA_ARGS=...`).
HELMFILE_EXTRA_ARGS ?=

# helmfile invocation used by the helmfile-* targets, with the extra
# arguments appended.
HELMFILE := helmfile $(HELMFILE_EXTRA_ARGS)

#
# Help Targets
#

# Guard target: every helmfile-* target lists it as a prerequisite.
# It must FAIL when the binary is missing; printing the hint alone would let
# the dependent recipe start and die later with a less helpful error.
.PHONY: .check-helmfile-installed
.check-helmfile-installed: ## Checks if helmfile is installed
	@if ! command -v helmfile >/dev/null 2>&1; then \
		echo "'helmfile' is not installed. Install it to continue ..." >&2; \
		exit 1; \
	fi

#
# Artifacts
#

# Copies $(CONFIG_DIR)/helmfile.yaml to charts/helmfile.yaml after the simcore
# helmfile has been put in place.
# NOTE(review): the source file in $(CONFIG_DIR) is not listed as a
# prerequisite, so an existing copy is not refreshed when the source changes.
# The target name is relative while the copy destination is absolute, which
# presumably assumes make is run from charts/ - confirm.
helmfile.yaml: simcore-charts/helmfile.yaml ## Copies the helmfile.yaml to the charts directory
cp $(CONFIG_DIR)/$@ $(REPO_BASE_DIR)/charts/helmfile.yaml

# Copies $(CONFIG_DIR)/helmfile.simcore.yaml to charts/simcore-charts/helmfile.yaml.
# The rule has no prerequisites, so it only runs while the target file is missing.
simcore-charts/helmfile.yaml: ## Copies the simcore helmfile to the charts directory
cp $(CONFIG_DIR)/helmfile.simcore.yaml $(REPO_BASE_DIR)/charts/$@

#
# Targets
#

# Lints the helmfile with the repo configuration exported to the environment.
# `set -a` marks every variable defined while sourcing the config for export;
# helmfile must run in that same shell (hence the backslash continuation),
# because each recipe line otherwise gets a fresh shell without those variables.
# A single $(HELMFILE) call is used so HELMFILE_EXTRA_ARGS is honoured and lint
# is not run twice.
# NOTE(review): `source` is a bash builtin - presumably SHELL is set to bash in
# scripts/common.Makefile; confirm.
.PHONY: helmfile-lint
helmfile-lint: .check-helmfile-installed helmfile.yaml ## Lints the helmfile
	set -a; source $(REPO_CONFIG_LOCATION); set +a; \
	$(HELMFILE) lint

# Applies charts/helmfile.yaml with the repo configuration exported to the
# environment. helmfile runs in the same shell that sourced the config
# (backslash continuation); a separate recipe line would not see those
# variables. Only $(HELMFILE) is invoked so HELMFILE_EXTRA_ARGS is honoured and
# the apply is not executed twice.
.PHONY: helmfile-apply
helmfile-apply: .check-helmfile-installed helmfile.yaml ## Applies the helmfile configuration
	set -a; source $(REPO_CONFIG_LOCATION); set +a; \
	$(HELMFILE) -f $(REPO_BASE_DIR)/charts/helmfile.yaml apply

# Syncs charts/helmfile.yaml with the repo configuration exported to the
# environment. helmfile runs in the same shell that sourced the config
# (backslash continuation); a separate recipe line would not see those
# variables. Only $(HELMFILE) is invoked so HELMFILE_EXTRA_ARGS is honoured and
# the sync is not executed twice.
.PHONY: helmfile-sync
helmfile-sync: .check-helmfile-installed helmfile.yaml ## Syncs the helmfile configuration (use `helmfile-apply` to deploy the app)
	set -a; source $(REPO_CONFIG_LOCATION); set +a; \
	$(HELMFILE) -f $(REPO_BASE_DIR)/charts/helmfile.yaml sync

# Shows what helmfile would change for charts/helmfile.yaml, with the repo
# configuration exported to the environment. The leading `@` keeps the sourced
# command line out of the output. helmfile runs in the same shell that sourced
# the config; only $(HELMFILE) is invoked so HELMFILE_EXTRA_ARGS is honoured
# and the diff is not computed twice.
.PHONY: helmfile-diff
helmfile-diff: .check-helmfile-installed helmfile.yaml ## Shows the differences that would be applied by helmfile
	@set -a; source $(REPO_CONFIG_LOCATION); set +a; \
	$(HELMFILE) -f $(REPO_BASE_DIR)/charts/helmfile.yaml diff

# Deletes the releases described by charts/helmfile.yaml, with the repo
# configuration exported to the environment. $(HELMFILE) is used (instead of a
# hard-coded `helmfile`) so HELMFILE_EXTRA_ARGS is honoured, and it runs in the
# same shell that sourced the config.
.PHONY: helmfile-delete
helmfile-delete: .check-helmfile-installed helmfile.yaml ## Deletes the helmfile configuration
	@set -a; source $(REPO_CONFIG_LOCATION); set +a; \
	$(HELMFILE) -f $(REPO_BASE_DIR)/charts/helmfile.yaml delete

# Convenience alias: starting the stack is the same as applying the helmfile.
.PHONY: up
up: helmfile-apply ## Start the stack

# Removes the kind cluster named "kind". No helmfile call belongs here: once
# the cluster is gone there is nothing left for helmfile to delete.
.PHONY: leave
leave: ## Leaves kind cluster
	kind delete clusters kind
2 changes: 1 addition & 1 deletion charts/cert-manager/templates/networkpolicy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ spec:
- 172.16.0.0/12
- 192.168.0.0/16
ports:
- 6443
- {{ .Values.kubeApiServerPort }}
# 6. TCP: cert-manager (controller) -> DNS API endpoints (for ACME DNS01)
- action: Allow
protocol: TCP
Expand Down
2 changes: 2 additions & 0 deletions charts/cert-manager/values.common.yaml.gotmpl
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
kubeApiServerPort: {{ .Values.kubeApiServerPort }}

cert-manager:
crds:
enabled: true
Expand Down
19 changes: 19 additions & 0 deletions charts/kube-prometheus-stack/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@

## High Availability

Prometheus Server
* Issue asking how to configure it in `kube-prometheus-stack` https://github.com/prometheus-community/helm-charts/issues/6184
* Prometheus Operator Documentation https://github.com/prometheus-operator/prometheus-operator/blob/v0.85.0/Documentation/platform/high-availability.md#prometheus

Prometheus Operator
* Not needed. See https://github.com/prometheus-operator/prometheus-operator/issues/2491

## FAQ

How to expose workload metrics
* Use ServiceMonitor, PodMonitor or Running exporters. See https://github.com/prometheus-community/helm-charts/blob/kube-prometheus-stack-77.12.0/charts/kube-prometheus-stack/README.md#prometheusioscrape
* Make sure the network policies of Prometheus and the workload allow all necessary ingress and egress
- prometheus shall be able to egress for metrics and workload should allow ingress for metrics

Pod Monitor vs Service Monitor:
* https://github.com/prometheus-operator/prometheus-operator/issues/3119
15 changes: 15 additions & 0 deletions charts/kube-prometheus-stack/namespaces.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
apiVersion: v1
kind: Namespace
metadata:
name: monitoring
labels:
pod-security.kubernetes.io/enforce: restricted

---

apiVersion: v1
kind: Namespace
metadata:
name: monitoring-privileged
labels:
pod-security.kubernetes.io/enforce: privileged
9 changes: 9 additions & 0 deletions charts/kube-prometheus-stack/values.ebs-storage.yaml.gotmpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
prometheus:
prometheusSpec:
storageSpec:
volumeClaimTemplate:
spec:
resources:
requests:
storage: 100Gi
storageClassName: "{{ .Values.ebsStorageClassName }}"
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
prometheus:
prometheusSpec:
storageSpec:
volumeClaimTemplate:
spec:
resources:
requests:
storage: 10Gi
storageClassName: "{{ .Values.kindDefaultStorageClassName }}"
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
prometheus:
prometheusSpec:
storageSpec:
volumeClaimTemplate:
spec:
resources:
requests:
storage: 100Gi
storageClassName: "{{ .Values.topolvmStorageClassName }}"
166 changes: 166 additions & 0 deletions charts/kube-prometheus-stack/values.yaml.gotmpl
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
alertmanager:
enabled: false

defaultRules:
create: false

grafana:
enabled: false

kubeApiServer:
enabled: false

# container metrics (cpu / memory)
kubelet:
enabled: true

kubeControllerManager:
enabled: false

coreDns:
enabled: false

kubeEtcd:
enabled: false

kubeScheduler:
enabled: false

kubeDns:
enabled: false

kubeProxy:
enabled: false

kubeStateMetrics:
enabled: false

nodeExporter:
enabled: true

prometheus-node-exporter:
namespaceOverride: "{{ .Release.Namespace }}-privileged"

thanosRuler:
enabled: false

prometheusOperator:
enabled: true

networkPolicy:
enabled: true
flavor: kubernetes

resources:
limits:
cpu: 1
memory: 1Gi
requests:
cpu: 0.1
memory: 256Mi

nodeSelector:
ops: "true"

tls:
internalPort: &prometheusOperatorInternalPort 10250

prometheus:
enabled: true

service:
port: &prometheusServicePort 9090

networkPolicy:
enabled: true
flavor: kubernetes

ingress:
- from:
- namespaceSelector:
matchLabels:
kubernetes.io/metadata.name: traefik
podSelector:
matchLabels:
app.kubernetes.io/name: traefik
ports:
- port: *prometheusServicePort
protocol: TCP
egress:
- ports:
# generic port for metrics
- port: 9100
protocol: TCP
- ports:
# prometheus operator
- port: *prometheusOperatorInternalPort
protocol: TCP
- ports:
# kube api server
- port: {{ .Values.kubeApiServerPort }}
protocol: TCP

# enable once object storage needed
thanosService:
enabled: false

ingress:
enabled: true
ingressClassName: ""
annotations:
namespace: "{{ .Release.Namespace }}"
cert-manager.io/cluster-issuer: "cert-issuer"
traefik.ingress.kubernetes.io/router.entrypoints: websecure
traefik.ingress.kubernetes.io/router.middlewares: traefik-traefik-basic-auth@kubernetescrd # namespace + middleware name
tls:
- secretName: monitoring-tls
hosts:
- {{ requiredEnv "K8S_MONITORING_FQDN" }}
hosts:
- {{ requiredEnv "K8S_MONITORING_FQDN" }}
paths:
- &pathprefix /prometheus
pathType: Prefix

prometheusSpec:
# Use 2+ for HA
replicas: 1

# Done for HA
# Needs to differentiate prometheus instances with the same setup
# https://github.com/prometheus-operator/prometheus-operator/blob/v0.85.0/Documentation/platform/high-availability.md#prometheus
# External Labels do not show up metrics. See https://github.com/prometheus-operator/prometheus-operator/issues/2918#issuecomment-567009499
replicaExternalLabelName: "prometheus_replica"

# Done for HA
# Enforce replicas running on different nodes
# Otherwise it does not make sense from HA perspective
podAntiAffinity: "hard"

routePrefix: *pathprefix

retention: 90d

retentionSize: 100GiB

scrapeInterval: 30s
scrapeTimeout: 10s

nodeSelector:
ops: "true"

# https://github.com/prometheus-community/helm-charts/blob/kube-prometheus-stack-77.12.0/charts/kube-prometheus-stack/README.md#prometheusioscrape
podMonitorSelectorNilUsesHelmValues: false
serviceMonitorSelectorNilUsesHelmValues: false

resources:
requests:
memory: 2Gi
cpu: 1
limits:
memory: 4Gi
cpu: 2

persistentVolumeClaimRetentionPolicy:
whenDeleted: Retain
whenScaled: Retain
2 changes: 1 addition & 1 deletion charts/portainer/templates/networkpolicy.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ spec:
# connect to the Kubernetes API server
destination:
ports:
- 6443
- {{ .Values.kubeApiServerPort }}
nets:
- 10.0.0.0/8
- 172.16.0.0/12
Expand Down
2 changes: 2 additions & 0 deletions charts/portainer/values.yaml.gotmpl
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
servicePort: &servicePort 9000

kubeApiServerPort: {{ .Values.kubeApiServerPort }}

portainer:
replicaCount: 1

Expand Down
7 changes: 7 additions & 0 deletions charts/traefik/values.common.yaml.gotmpl
Original file line number Diff line number Diff line change
Expand Up @@ -35,3 +35,10 @@ affinity: # https://github.com/traefik/traefik-helm-chart/blob/v28.2.0/traefik/
app.kubernetes.io/name: '{{`{{ template "traefik.name" . }}`}}'
app.kubernetes.io/instance: '{{ .Release.Name }}'
topologyKey: kubernetes.io/hostname

metrics:
prometheus:
service:
enabled: true
serviceMonitor:
enabled: true