From ac6b70e50c66eb65bd2c2e51f8d8005bd623dcf6 Mon Sep 17 00:00:00 2001 From: Yaroslav Kasatikov Date: Thu, 6 Apr 2023 10:22:20 +0300 Subject: [PATCH 1/6] Add pgexporter --- .../grafana-dashboard.yaml | 0 postgres/helm/Chart.yaml | 7 + .../helm/templates/cnpg-prometheusrule.yaml | 2 + .../pgexporter-monitoring-rules.yaml | 207 ++++++++++++++++++ postgres/helm/values.yaml | 14 +- 5 files changed, 228 insertions(+), 2 deletions(-) rename postgres/{helm/templates => grafana-dashboard}/grafana-dashboard.yaml (100%) create mode 100644 postgres/helm/templates/pgexporter-monitoring-rules.yaml diff --git a/postgres/helm/templates/grafana-dashboard.yaml b/postgres/grafana-dashboard/grafana-dashboard.yaml similarity index 100% rename from postgres/helm/templates/grafana-dashboard.yaml rename to postgres/grafana-dashboard/grafana-dashboard.yaml diff --git a/postgres/helm/Chart.yaml b/postgres/helm/Chart.yaml index bbd96aa..494bad2 100644 --- a/postgres/helm/Chart.yaml +++ b/postgres/helm/Chart.yaml @@ -22,3 +22,10 @@ version: 0.1.2 # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. appVersion: "1.16.0" + +dependencies: +- name: prometheus-postgres-exporter + repository: https://prometheus-community.github.io/helm-charts + version: 4.4.0 + condition: cluster.monitoring.enabled + diff --git a/postgres/helm/templates/cnpg-prometheusrule.yaml b/postgres/helm/templates/cnpg-prometheusrule.yaml index 13f8e08..75c5ddf 100644 --- a/postgres/helm/templates/cnpg-prometheusrule.yaml +++ b/postgres/helm/templates/cnpg-prometheusrule.yaml @@ -3,6 +3,8 @@ apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: name: {{ include "vault-secret.fullname" . 
}}cnpg-default-alerts + labels: + openshift.io/prometheus-rule-evaluation-scope: leaf-prometheus apiVersion: monitoring.coreos.com/v1 spec: groups: diff --git a/postgres/helm/templates/pgexporter-monitoring-rules.yaml b/postgres/helm/templates/pgexporter-monitoring-rules.yaml new file mode 100644 index 0000000..31c6fa6 --- /dev/null +++ b/postgres/helm/templates/pgexporter-monitoring-rules.yaml @@ -0,0 +1,207 @@ +{{- if .Values.cluster.monitoring.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ include "vault-secret.fullname" . }}pg-exp-alerts + labels: + openshift.io/prometheus-rule-evaluation-scope: leaf-prometheus +apiVersion: monitoring.coreos.com/v1 +spec: + - alert: PostgresqlDown + expr: pg_up == 0 + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql down (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "Postgresql instance is down\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + - alert: PostgresqlRestarted + expr: time() - pg_postmaster_start_time_seconds < 60 + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql restarted (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "Postgresql restarted\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + - alert: PostgresqlExporterError + expr: pg_exporter_last_scrape_error > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql exporter error (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "Postgresql exporter is showing errors. 
A query may be buggy in query.yaml\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + + - alert: PostgresqlTableNotAutoVacuumed + expr: (pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10 + for: 0m + labels: + severity: warning + annotations: + summary: Postgresql table not auto vacuumed (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "Table {{"{{"}} $labels.relname {{"}}"}} has not been auto vacuumed for 10 days\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + + - alert: PostgresqlTableNotAutoAnalyzed + expr: (pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10 + for: 0m + labels: + severity: warning + annotations: + summary: Postgresql table not auto analyzed (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "Table {{"{{"}} $labels.relname {{"}}"}} has not been auto analyzed for 10 days\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + - alert: PostgresqlTooManyConnections + expr: sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) > pg_settings_max_connections * 0.8 + for: 2m + labels: + severity: warning + annotations: + summary: Postgresql too many connections (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "PostgreSQL instance has too many connections (> 80%).\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + + - alert: PostgresqlNotEnoughConnections + expr: sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5 + for: 2m + labels: + severity: warning + annotations: + summary: Postgresql not enough connections (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "PostgreSQL instance should have more connections (> 5)\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + + - alert: PostgresqlDeadLocks + expr: 
increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5 + for: 0m + labels: + severity: warning + annotations: + summary: Postgresql dead locks (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "PostgreSQL has dead-locks\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + + - alert: PostgresqlHighRollbackRate + expr: sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02 + for: 0m + labels: + severity: warning + annotations: + summary: Postgresql high rollback rate (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + + - alert: PostgresqlCommitRateLow + expr: rate(pg_stat_database_xact_commit[1m]) < 10 + for: 2m + labels: + severity: critical + annotations: + summary: Postgresql commit rate low (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "Postgresql seems to be processing very few transactions\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + - alert: PostgresqlLowXidConsumption + expr: rate(pg_txid_current[1m]) < 5 + for: 2m + labels: + severity: warning + annotations: + summary: Postgresql low XID consumption (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "Postgresql seems to be consuming transaction IDs very slowly\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + - alert: PostgresqlHighRateStatementTimeout + expr: rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3 + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql high rate statement timeout (instance {{"{{"}} $labels.instance {{"}}"}}) + 
description: "Postgres transactions showing high rate of statement timeouts\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + - alert: PostgresqlHighRateDeadlock + expr: increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1 + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql high rate deadlock (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "Postgres detected deadlocks\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + - alert: PostgresqlUnusedReplicationSlot + expr: pg_replication_slots_active == 0 + for: 1m + labels: + severity: warning + annotations: + summary: Postgresql unused replication slot (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "Unused Replication Slots\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + - alert: PostgresqlTooManyDeadTuples + expr: ((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1 + for: 2m + labels: + severity: warning + annotations: + summary: Postgresql too many dead tuples (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "PostgreSQL dead tuples is too large\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + + - alert: PostgresqlSslCompressionActive + expr: sum(pg_stat_ssl_compression) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql SSL compression active (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "Database connections with SSL compression enabled. This may add significant jitter in replication delay. 
Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + + - alert: PostgresqlTooManyLocksAcquired + expr: ((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20 + for: 2m + labels: + severity: critical + annotations: + summary: Postgresql too many locks acquired (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + - alert: PostgresqlBloatIndexHigh(>80%) + expr: pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000) + for: 1h + labels: + severity: warning + annotations: + summary: Postgresql bloat index high (> 80%) (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "The index {{"{{"}} $labels.idxname {{"}}"}} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{"{{"}} $labels.idxname {{"}}"}};`\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + - alert: PostgresqlBloatTableHigh(>80%) + expr: pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000) + for: 1h + labels: + severity: warning + annotations: + summary: Postgresql bloat table high (> 80%) (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "The table {{"{{"}} $labels.relname {{"}}"}} is bloated. 
You should execute `VACUUM {{"{{"}} $labels.relname {{"}}"}};`\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + - alert: PostgresqlHighDbSize + expr: pg_database_size_bytes / (1024 * 1024 * 1024) > 100 # this value depends on available disk size + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql DB size is more than 100 GB (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "Postgresql DB size is more than 100 GB\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + - alert: PostgresqlReplicationLag + expr: pg_replication_lag > 30 and ON(instance) pg_replication_is_replica == 1 + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql replication lag (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "PostgreSQL replication lag is going up (> 30s)\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + +{{- end }} diff --git a/postgres/helm/values.yaml b/postgres/helm/values.yaml index 9870c1a..fc7c80b 100644 --- a/postgres/helm/values.yaml +++ b/postgres/helm/values.yaml @@ -1,6 +1,16 @@ # Default values for vault-secret. # This is a YAML-formatted file. # Declare variables to be passed into your templates. 
+prometheus-postgres-exporter: + config: + datasource: + host: grafana-psql-pgcluster-r.grafana.svc + passwordSecret: + name: grafana-psql-pgcluster-superuser + key: password + + serviceMonitor: + enabled: true kubeClusterName: jwt cluster: @@ -8,8 +18,8 @@ cluster: enabled: true initDB: enabled: true - dbName: app - username: app + dbName: grafana + username: grafana password: superpassword backup: enabled: false From 7ea4dd09166152b54511d26f50c2f8881b8ae923 Mon Sep 17 00:00:00 2001 From: Yaroslav Kasatikov Date: Tue, 11 Apr 2023 10:28:04 +0300 Subject: [PATCH 2/6] PodMonitor + memcached --- postgres/helm/templates/pgexporter-monitoring-rules.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/postgres/helm/templates/pgexporter-monitoring-rules.yaml b/postgres/helm/templates/pgexporter-monitoring-rules.yaml index 31c6fa6..b549cc8 100644 --- a/postgres/helm/templates/pgexporter-monitoring-rules.yaml +++ b/postgres/helm/templates/pgexporter-monitoring-rules.yaml @@ -7,6 +7,7 @@ metadata: openshift.io/prometheus-rule-evaluation-scope: leaf-prometheus apiVersion: monitoring.coreos.com/v1 spec: + groups: - alert: PostgresqlDown expr: pg_up == 0 for: 0m From b20dc1e9c49b85794d1d999ecd67df1a82f0e55a Mon Sep 17 00:00:00 2001 From: Yaroslav Kasatikov Date: Tue, 11 Apr 2023 10:29:00 +0300 Subject: [PATCH 3/6] PodMonitor + memcached --- .../pgexporter-monitoring-rules.yaml | 396 +++++++++--------- 1 file changed, 199 insertions(+), 197 deletions(-) diff --git a/postgres/helm/templates/pgexporter-monitoring-rules.yaml b/postgres/helm/templates/pgexporter-monitoring-rules.yaml index b549cc8..56041a4 100644 --- a/postgres/helm/templates/pgexporter-monitoring-rules.yaml +++ b/postgres/helm/templates/pgexporter-monitoring-rules.yaml @@ -8,201 +8,203 @@ metadata: apiVersion: monitoring.coreos.com/v1 spec: groups: - - alert: PostgresqlDown - expr: pg_up == 0 - for: 0m - labels: - severity: critical - annotations: - summary: Postgresql down (instance {{"{{"}} $labels.instance 
{{"}}"}}) - description: "Postgresql instance is down\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" - - alert: PostgresqlRestarted - expr: time() - pg_postmaster_start_time_seconds < 60 - for: 0m - labels: - severity: critical - annotations: - summary: Postgresql restarted (instance {{"{{"}} $labels.instance {{"}}"}}) - description: "Postgresql restarted\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" - - - alert: PostgresqlExporterError - expr: pg_exporter_last_scrape_error > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Postgresql exporter error (instance {{"{{"}} $labels.instance {{"}}"}}) - description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" - - - - alert: PostgresqlTableNotAutoVacuumed - expr: (pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10 - for: 0m - labels: - severity: warning - annotations: - summary: Postgresql table not auto vacuumed (instance {{"{{"}} $labels.instance {{"}}"}}) - description: "Table {{"{{"}} $labels.relname {{"}}"}} has not been auto vacuumed for 10 days\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" - - - - alert: PostgresqlTableNotAutoAnalyzed - expr: (pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10 - for: 0m - labels: - severity: warning - annotations: - summary: Postgresql table not auto analyzed (instance {{"{{"}} $labels.instance {{"}}"}}) - description: "Table {{"{{"}} $labels.relname {{"}}"}} has not been auto analyzed for 10 days\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" - - - alert: PostgresqlTooManyConnections - expr: sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) > pg_settings_max_connections * 0.8 - for: 2m - labels: - severity: 
warning - annotations: - summary: Postgresql too many connections (instance {{"{{"}} $labels.instance {{"}}"}}) - description: "PostgreSQL instance has too many connections (> 80%).\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" - - - - alert: PostgresqlNotEnoughConnections - expr: sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5 - for: 2m - labels: - severity: warning - annotations: - summary: Postgresql not enough connections (instance {{"{{"}} $labels.instance {{"}}"}}) - description: "PostgreSQL instance should have more connections (> 5)\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" - - - - alert: PostgresqlDeadLocks - expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5 - for: 0m - labels: - severity: warning - annotations: - summary: Postgresql dead locks (instance {{"{{"}} $labels.instance {{"}}"}}) - description: "PostgreSQL has dead-locks\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" - - - - alert: PostgresqlHighRollbackRate - expr: sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02 - for: 0m - labels: - severity: warning - annotations: - summary: Postgresql high rollback rate (instance {{"{{"}} $labels.instance {{"}}"}}) - description: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" - - - - alert: PostgresqlCommitRateLow - expr: rate(pg_stat_database_xact_commit[1m]) < 10 - for: 2m - labels: - severity: critical - annotations: - summary: Postgresql commit rate low (instance {{"{{"}} $labels.instance {{"}}"}}) - description: "Postgresql seems to be processing very few transactions\n 
VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" - - - alert: PostgresqlLowXidConsumption - expr: rate(pg_txid_current[1m]) < 5 - for: 2m - labels: - severity: warning - annotations: - summary: Postgresql low XID consumption (instance {{"{{"}} $labels.instance {{"}}"}}) - description: "Postgresql seems to be consuming transaction IDs very slowly\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" - - - alert: PostgresqlHighRateStatementTimeout - expr: rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3 - for: 0m - labels: - severity: critical - annotations: - summary: Postgresql high rate statement timeout (instance {{"{{"}} $labels.instance {{"}}"}}) - description: "Postgres transactions showing high rate of statement timeouts\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" - - - alert: PostgresqlHighRateDeadlock - expr: increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1 - for: 0m - labels: - severity: critical - annotations: - summary: Postgresql high rate deadlock (instance {{"{{"}} $labels.instance {{"}}"}}) - description: "Postgres detected deadlocks\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" - - - alert: PostgresqlUnusedReplicationSlot - expr: pg_replication_slots_active == 0 - for: 1m - labels: - severity: warning - annotations: - summary: Postgresql unused replication slot (instance {{"{{"}} $labels.instance {{"}}"}}) - description: "Unused Replication Slots\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" - - - alert: PostgresqlTooManyDeadTuples - expr: ((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1 - for: 2m - labels: - severity: warning - annotations: - summary: Postgresql too many dead tuples (instance {{"{{"}} $labels.instance {{"}}"}}) - description: "PostgreSQL dead tuples is too large\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = 
{{"{{"}} $labels {{"}}"}}" - - - - alert: PostgresqlSslCompressionActive - expr: sum(pg_stat_ssl_compression) > 0 - for: 0m - labels: - severity: critical - annotations: - summary: Postgresql SSL compression active (instance {{"{{"}} $labels.instance {{"}}"}}) - description: "Database connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" - - - - alert: PostgresqlTooManyLocksAcquired - expr: ((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20 - for: 2m - labels: - severity: critical - annotations: - summary: Postgresql too many locks acquired (instance {{"{{"}} $labels.instance {{"}}"}}) - description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" - - - alert: PostgresqlBloatIndexHigh(>80%) - expr: pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000) - for: 1h - labels: - severity: warning - annotations: - summary: Postgresql bloat index high (> 80%) (instance {{"{{"}} $labels.instance {{"}}"}}) - description: "The index {{"{{"}} $labels.idxname {{"}}"}} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{"{{"}} $labels.idxname {{"}}"}};`\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" - - - alert: PostgresqlBloatTableHigh(>80%) - expr: pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000) - for: 1h - labels: - severity: warning - annotations: - summary: Postgresql bloat table high (> 80%) (instance {{"{{"}} $labels.instance {{"}}"}}) - description: "The table {{"{{"}} $labels.relname {{"}}"}} is bloated. 
You should execute `VACUUM {{"{{"}} $labels.relname {{"}}"}};`\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" - - - alert: PostgresqlHighDbSize - expr: pg_database_size_bytes / (1024 * 1024 * 1024) > 100 # this value depends on available disk size - for: 0m - labels: - severity: critical - annotations: - summary: Postgresql DB size is more than 100 GB (instance {{"{{"}} $labels.instance {{"}}"}}) - description: "Postgresql DB size is more than 100 GB\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" - - - alert: PostgresqlReplicationLag - expr: pg_replication_lag > 30 and ON(instance) pg_replication_is_replica == 1 - for: 0m - labels: - severity: critical - annotations: - summary: Postgresql replication lag (instance {{"{{"}} $labels.instance {{"}}"}}) - description: "PostgreSQL replication lag is going up (> 30s)\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" - - + - name: cnp-default.rules + rules: + - alert: PostgresqlDown + expr: pg_up == 0 + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql down (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "Postgresql instance is down\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + - alert: PostgresqlRestarted + expr: time() - pg_postmaster_start_time_seconds < 60 + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql restarted (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "Postgresql restarted\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + - alert: PostgresqlExporterError + expr: pg_exporter_last_scrape_error > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql exporter error (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "Postgresql exporter is showing errors. 
A query may be buggy in query.yaml\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + + - alert: PostgresqlTableNotAutoVacuumed + expr: (pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10 + for: 0m + labels: + severity: warning + annotations: + summary: Postgresql table not auto vacuumed (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "Table {{"{{"}} $labels.relname {{"}}"}} has not been auto vacuumed for 10 days\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + + - alert: PostgresqlTableNotAutoAnalyzed + expr: (pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10 + for: 0m + labels: + severity: warning + annotations: + summary: Postgresql table not auto analyzed (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "Table {{"{{"}} $labels.relname {{"}}"}} has not been auto analyzed for 10 days\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + - alert: PostgresqlTooManyConnections + expr: sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) > pg_settings_max_connections * 0.8 + for: 2m + labels: + severity: warning + annotations: + summary: Postgresql too many connections (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "PostgreSQL instance has too many connections (> 80%).\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + + - alert: PostgresqlNotEnoughConnections + expr: sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5 + for: 2m + labels: + severity: warning + annotations: + summary: Postgresql not enough connections (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "PostgreSQL instance should have more connections (> 5)\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + + - alert: PostgresqlDeadLocks + expr: 
increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5 + for: 0m + labels: + severity: warning + annotations: + summary: Postgresql dead locks (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "PostgreSQL has dead-locks\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + + - alert: PostgresqlHighRollbackRate + expr: sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02 + for: 0m + labels: + severity: warning + annotations: + summary: Postgresql high rollback rate (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + + - alert: PostgresqlCommitRateLow + expr: rate(pg_stat_database_xact_commit[1m]) < 10 + for: 2m + labels: + severity: critical + annotations: + summary: Postgresql commit rate low (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "Postgresql seems to be processing very few transactions\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + - alert: PostgresqlLowXidConsumption + expr: rate(pg_txid_current[1m]) < 5 + for: 2m + labels: + severity: warning + annotations: + summary: Postgresql low XID consumption (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "Postgresql seems to be consuming transaction IDs very slowly\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + - alert: PostgresqlHighRateStatementTimeout + expr: rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3 + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql high rate statement timeout (instance {{"{{"}} $labels.instance {{"}}"}}) + 
description: "Postgres transactions showing high rate of statement timeouts\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + - alert: PostgresqlHighRateDeadlock + expr: increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1 + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql high rate deadlock (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "Postgres detected deadlocks\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + - alert: PostgresqlUnusedReplicationSlot + expr: pg_replication_slots_active == 0 + for: 1m + labels: + severity: warning + annotations: + summary: Postgresql unused replication slot (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "Unused Replication Slots\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + - alert: PostgresqlTooManyDeadTuples + expr: ((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1 + for: 2m + labels: + severity: warning + annotations: + summary: Postgresql too many dead tuples (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "PostgreSQL dead tuples is too large\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + + - alert: PostgresqlSslCompressionActive + expr: sum(pg_stat_ssl_compression) > 0 + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql SSL compression active (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "Database connections with SSL compression enabled. This may add significant jitter in replication delay. 
Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + + - alert: PostgresqlTooManyLocksAcquired + expr: ((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20 + for: 2m + labels: + severity: critical + annotations: + summary: Postgresql too many locks acquired (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + - alert: PostgresqlBloatIndexHigh(>80%) + expr: pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000) + for: 1h + labels: + severity: warning + annotations: + summary: Postgresql bloat index high (> 80%) (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "The index {{"{{"}} $labels.idxname {{"}}"}} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{"{{"}} $labels.idxname {{"}}"}};`\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + - alert: PostgresqlBloatTableHigh(>80%) + expr: pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000) + for: 1h + labels: + severity: warning + annotations: + summary: Postgresql bloat table high (> 80%) (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "The table {{"{{"}} $labels.relname {{"}}"}} is bloated. 
You should execute `VACUUM {{"{{"}} $labels.relname {{"}}"}};`\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + - alert: PostgresqlHighDbSize + expr: pg_database_size_bytes / (1024 * 1024 * 1024) > 100 # this value depends on available disk size + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql DB size is more than 100 GB (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "Postgresql DB size is more than 100 GB\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + - alert: PostgresqlReplicationLag + expr: pg_replication_lag > 30 and ON(instance) pg_replication_is_replica == 1 + for: 0m + labels: + severity: critical + annotations: + summary: Postgresql replication lag (instance {{"{{"}} $labels.instance {{"}}"}}) + description: "PostgreSQL replication lag is going up (> 30s)\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" + + {{- end }} From 6ff7a97c413189d42213894788ac2ab11c39109b Mon Sep 17 00:00:00 2001 From: Yaroslav Kasatikov Date: Tue, 11 Apr 2023 10:34:06 +0300 Subject: [PATCH 4/6] PodMonitor + memcached --- .../pgexporter-monitoring-rules.yaml | 63 ++++++++++++------- 1 file changed, 42 insertions(+), 21 deletions(-) diff --git a/postgres/helm/templates/pgexporter-monitoring-rules.yaml b/postgres/helm/templates/pgexporter-monitoring-rules.yaml index 56041a4..0412943 100644 --- a/postgres/helm/templates/pgexporter-monitoring-rules.yaml +++ b/postgres/helm/templates/pgexporter-monitoring-rules.yaml @@ -11,7 +11,8 @@ spec: - name: cnp-default.rules rules: - alert: PostgresqlDown - expr: pg_up == 0 + expr: |- + pg_up == 0 for: 0m labels: severity: critical @@ -19,7 +20,8 @@ spec: summary: Postgresql down (instance {{"{{"}} $labels.instance {{"}}"}}) description: "Postgresql instance is down\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" - alert: PostgresqlRestarted - expr: time() - pg_postmaster_start_time_seconds < 60 + 
expr: |- + time() - pg_postmaster_start_time_seconds < 60 for: 0m labels: severity: critical @@ -28,7 +30,8 @@ spec: description: "Postgresql restarted\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" - alert: PostgresqlExporterError - expr: pg_exporter_last_scrape_error > 0 + expr: |- + pg_exporter_last_scrape_error > 0 for: 0m labels: severity: critical @@ -38,7 +41,8 @@ spec: - alert: PostgresqlTableNotAutoVacuumed - expr: (pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10 + expr: |- + (pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10 for: 0m labels: severity: warning @@ -48,7 +52,8 @@ spec: - alert: PostgresqlTableNotAutoAnalyzed - expr: (pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10 + expr: |- + (pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10 for: 0m labels: severity: warning @@ -57,7 +62,8 @@ spec: description: "Table {{"{{"}} $labels.relname {{"}}"}} has not been auto analyzed for 10 days\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" - alert: PostgresqlTooManyConnections - expr: sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) > pg_settings_max_connections * 0.8 + expr: |- + sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) > pg_settings_max_connections * 0.8 for: 2m labels: severity: warning @@ -67,7 +73,8 @@ spec: - alert: PostgresqlNotEnoughConnections - expr: sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5 + expr: |- + sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5 for: 2m labels: severity: warning @@ -77,7 +84,8 @@ spec: - alert: PostgresqlDeadLocks - expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5 + expr: 
|- + increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5 for: 0m labels: severity: warning @@ -87,7 +95,8 @@ spec: - alert: PostgresqlHighRollbackRate - expr: sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02 + expr: |- + sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02 for: 0m labels: severity: warning @@ -97,7 +106,8 @@ spec: - alert: PostgresqlCommitRateLow - expr: rate(pg_stat_database_xact_commit[1m]) < 10 + expr: |- + rate(pg_stat_database_xact_commit[1m]) < 10 for: 2m labels: severity: critical @@ -106,7 +116,8 @@ spec: description: "Postgresql seems to be processing very few transactions\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" - alert: PostgresqlLowXidConsumption - expr: rate(pg_txid_current[1m]) < 5 + expr: |- + rate(pg_txid_current[1m]) < 5 for: 2m labels: severity: warning @@ -115,7 +126,8 @@ spec: description: "Postgresql seems to be consuming transaction IDs very slowly\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" - alert: PostgresqlHighRateStatementTimeout - expr: rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3 + expr: |- + rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3 for: 0m labels: severity: critical @@ -124,7 +136,8 @@ spec: description: "Postgres transactions showing high rate of statement timeouts\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" - alert: PostgresqlHighRateDeadlock - expr: 
increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1 + expr: |- + increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1 for: 0m labels: severity: critical @@ -133,7 +146,8 @@ spec: description: "Postgres detected deadlocks\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" - alert: PostgresqlUnusedReplicationSlot - expr: pg_replication_slots_active == 0 + expr: |- + pg_replication_slots_active == 0 for: 1m labels: severity: warning @@ -142,7 +156,8 @@ spec: description: "Unused Replication Slots\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" - alert: PostgresqlTooManyDeadTuples - expr: ((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1 + expr: |- + ((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1 for: 2m labels: severity: warning @@ -152,7 +167,8 @@ spec: - alert: PostgresqlSslCompressionActive - expr: sum(pg_stat_ssl_compression) > 0 + expr: |- + sum(pg_stat_ssl_compression) > 0 for: 0m labels: severity: critical @@ -162,7 +178,8 @@ spec: - alert: PostgresqlTooManyLocksAcquired - expr: ((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20 + expr: |- + ((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20 for: 2m labels: severity: critical @@ -171,7 +188,8 @@ spec: description: "Too many locks acquired on the database. 
If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" - alert: PostgresqlBloatIndexHigh(>80%) - expr: pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000) + expr: |- + pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000) for: 1h labels: severity: warning @@ -180,7 +198,8 @@ spec: description: "The index {{"{{"}} $labels.idxname {{"}}"}} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{"{{"}} $labels.idxname {{"}}"}};`\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" - alert: PostgresqlBloatTableHigh(>80%) - expr: pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000) + expr: |- + pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000) for: 1h labels: severity: warning @@ -189,7 +208,8 @@ spec: description: "The table {{"{{"}} $labels.relname {{"}}"}} is bloated. 
You should execute `VACUUM {{"{{"}} $labels.relname {{"}}"}};`\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" - alert: PostgresqlHighDbSize - expr: pg_database_size_bytes / (1024 * 1024 * 1024) > 100 # this value depends on available disk size + expr: |- + pg_database_size_bytes / (1024 * 1024 * 1024) > 100 # this value depends on available disk size for: 0m labels: severity: critical @@ -198,7 +218,8 @@ spec: description: "Postgresql DB size is more than 100 GB\n VALUE = {{"{{"}} $value {{"}}"}}\n LABELS = {{"{{"}} $labels {{"}}"}}" - alert: PostgresqlReplicationLag - expr: pg_replication_lag > 30 and ON(instance) pg_replication_is_replica == 1 + expr: |- + pg_replication_lag > 30 and ON(instance) pg_replication_is_replica == 1 for: 0m labels: severity: critical From 7ed5a09fef870b16651afa9704df46c672523df3 Mon Sep 17 00:00:00 2001 From: Yaroslav Kasatikov Date: Tue, 20 Jun 2023 14:18:50 +0300 Subject: [PATCH 5/6] podLogs --- postgres/helm/values.yaml | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/postgres/helm/values.yaml b/postgres/helm/values.yaml index fc7c80b..ecd5a36 100644 --- a/postgres/helm/values.yaml +++ b/postgres/helm/values.yaml @@ -4,18 +4,24 @@ prometheus-postgres-exporter: config: datasource: - host: grafana-psql-pgcluster-r.grafana.svc + host: grafana-psql-pgcluster passwordSecret: name: grafana-psql-pgcluster-superuser key: password serviceMonitor: - enabled: true + enabled: false + +podLogs: + enabled: true + intstanceSelector: + app.kubernetes.io/instance: loki + app.kubernetes.io/name: loki kubeClusterName: jwt cluster: monitoring: - enabled: true + enabled: false initDB: enabled: true dbName: grafana From aaebedea1290f5b59eedb048bbb170ddd8112785 Mon Sep 17 00:00:00 2001 From: Yaroslav Kasatikov Date: Thu, 22 Jun 2023 10:58:27 +0300 Subject: [PATCH 6/6] Postgres Logs --- postgres/helm/templates/podlogs.yaml | 47 ++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 
100644 postgres/helm/templates/podlogs.yaml

diff --git a/postgres/helm/templates/podlogs.yaml b/postgres/helm/templates/podlogs.yaml
new file mode 100644
index 0000000..6919283
--- /dev/null
+++ b/postgres/helm/templates/podlogs.yaml
@@ -0,0 +1,60 @@
+{{- /*
+PodLogs resource selecting the pods labelled cnpg.io/cluster=FULLNAME in the
+release namespace. Rendered only when .Values.podLogs.enabled is truthy.
+*/ -}}
+{{- if .Values.podLogs.enabled }}
+apiVersion: monitoring.grafana.com/v1alpha1
+kind: PodLogs
+metadata:
+  name: {{ include "vault-secret.fullname" . }}-cnpg-podlogs
+  # Labels are copied from values. The correctly spelled key
+  # `instanceSelector` wins; the misspelled `intstanceSelector` is still
+  # read so existing values files keep working.
+  {{- with (.Values.podLogs.instanceSelector | default .Values.podLogs.intstanceSelector) }}
+  labels:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+spec:
+  namespaceSelector:
+    matchNames:
+      # Quoted so an all-digit namespace name is not parsed as an integer.
+      - {{ .Release.Namespace | quote }}
+  pipelineStages:
+    - cri: {}
+  relabelings:
+    # __host__ = name of the node the pod runs on
+    - action: replace
+      sourceLabels:
+        - __meta_kubernetes_pod_node_name
+      targetLabel: __host__
+    # copy every pod label, named by the regex capture group
+    - action: labelmap
+      regex: __meta_kubernetes_pod_label_(.+)
+    # __service__ = app name and app component joined by "-"
+    - action: replace
+      replacement: $1
+      separator: '-'
+      sourceLabels:
+        - __meta_kubernetes_pod_label_app_kubernetes_io_name
+        - __meta_kubernetes_pod_label_app_kubernetes_io_component
+      targetLabel: __service__
+    # job = namespace and __service__ joined by "/"
+    - action: replace
+      replacement: $1
+      separator: /
+      sourceLabels:
+        - __meta_kubernetes_namespace
+        - __service__
+      targetLabel: job
+    - action: replace
+      sourceLabels:
+        - __meta_kubernetes_pod_container_name
+      targetLabel: container
+    # static cluster label
+    - action: replace
+      replacement: loki
+      targetLabel: cluster
+  selector:
+    matchLabels:
+      cnpg.io/cluster: {{ include "vault-secret.fullname" . | quote }}
+{{- end }}