From b741ca9cbf97bf9d2fd5ca89a83ef9bd39ef34c9 Mon Sep 17 00:00:00 2001 From: Chris Stockton Date: Fri, 28 Nov 2025 11:12:34 -0700 Subject: [PATCH 1/2] fix: set restart limits to 0 to prevent being marked as failed The systemd default is 10s / 5 for these values with a DefaultRestartUSec of 100ms. Most services set a RestartSec of 3s; under most circumstances it takes 15s to restart 5 times, so the limit of 10s is not exceeded. However, if other system processes (salt, cloud-init) restart it explicitly, or recovering system services within the --before chain trigger a restart, the limit can be exceeded, causing it to be marked as failed. Since no services mark gotrue.service as required, it will remain offline until the next explicit restart is issued. Setting these values to 0 with Restart=always and RestartSec=3 will prevent gotrue from being marked as failed. --- ansible/files/gotrue.service.j2 | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/ansible/files/gotrue.service.j2 b/ansible/files/gotrue.service.j2 index 144448cc6..dbcbd03fe 100644 --- a/ansible/files/gotrue.service.j2 +++ b/ansible/files/gotrue.service.j2 @@ -40,9 +40,19 @@ After=network-online.target systemd-resolved.service Wants=postgresql.service After=postgresql.service -# Lower start limit ival and burst to prevent the noisy flapping -StartLimitIntervalSec=10 -StartLimitBurst=5 +# The systemd default is 10s / 5 for these values with a DefaultRestartUSec of +# 100ms. Most services set a RestartSec limit of 3, under most circumstances it +# takes 15s to restart 5 times so the limit of 10s is not exceeded. However if +# other system processes (salt, cloud init) restart it explicitly, or recovering +# system services within the --before chain trigger a restart the limit can be +# exceeded causing it to be marked as failed. Since no services mark +# gotrue.service as required it will remain offline until the next explicit +# restart is issued. 
+# +# Setting these values to 0 with Restart=always and RestartSec=3 will prevent +# gotrue from being marked as failed. +StartLimitIntervalSec=0 +StartLimitBurst=0 [Service] Type=exec From 3ef31ba81249cf1dddcbff6a85380eccbed5fa1e Mon Sep 17 00:00:00 2001 From: Chris Stockton Date: Mon, 1 Dec 2025 15:59:23 -0700 Subject: [PATCH 2/2] chore: set StartLimits for persistent services. I've noticed all non-oneshot services set a `RestartSec` of `3s` and we use the systemd defaults of `StartLimitBurst=5` and `StartLimitIntervalSec=10s`. Together these mean that under typical conditions a service will be restarted indefinitely until it comes back up, because `(3s * 5) > 10s`, but it is still possible for a service to enter a failed state under some scenarios. This change defensively sets them to 0/0 to keep them in restart loops. --- ansible/files/adminapi.service.j2 | 5 ++--- ansible/files/nginx.service.j2 | 3 +++ ansible/files/pg_egress_collect.service.j2 | 3 +++ ansible/files/postgres_exporter.service.j2 | 3 +++ ansible/files/postgrest.service.j2 | 3 +++ ansible/files/vector.service.j2 | 3 +++ 6 files changed, 17 insertions(+), 3 deletions(-) diff --git a/ansible/files/adminapi.service.j2 b/ansible/files/adminapi.service.j2 index cc1e9dc2a..305d1ac62 100644 --- a/ansible/files/adminapi.service.j2 +++ b/ansible/files/adminapi.service.j2 @@ -3,9 +3,8 @@ Description=AdminAPI Requires=network-online.target After=network-online.target -# Move this to the Service section if on systemd >=250 -StartLimitIntervalSec=60 -StartLimitBurst=10 +StartLimitIntervalSec=0 +StartLimitBurst=0 [Service] Type=simple diff --git a/ansible/files/nginx.service.j2 b/ansible/files/nginx.service.j2 index 872e3346a..a43c3df60 100644 --- a/ansible/files/nginx.service.j2 +++ b/ansible/files/nginx.service.j2 @@ -3,6 +3,9 @@ Description=nginx server After=postgrest.service gotrue.service adminapi.service Wants=postgrest.service gotrue.service adminapi.service +StartLimitIntervalSec=0 
+StartLimitBurst=0 + [Service] Type=forking ExecStart=/usr/local/nginx/sbin/nginx -c /etc/nginx/nginx.conf diff --git a/ansible/files/pg_egress_collect.service.j2 b/ansible/files/pg_egress_collect.service.j2 index 7ac04f47d..36e1b2074 100644 --- a/ansible/files/pg_egress_collect.service.j2 +++ b/ansible/files/pg_egress_collect.service.j2 @@ -1,6 +1,9 @@ [Unit] Description=Postgres Egress Collector +StartLimitIntervalSec=0 +StartLimitBurst=0 + [Service] Type=simple ExecStart=/bin/bash -c "tcpdump -s 128 -Q out -nn -tt -vv -p -l 'tcp and (port 5432 or port 6543)' | perl /root/pg_egress_collect.pl" diff --git a/ansible/files/postgres_exporter.service.j2 b/ansible/files/postgres_exporter.service.j2 index 6baa18c0d..dcb107cb7 100644 --- a/ansible/files/postgres_exporter.service.j2 +++ b/ansible/files/postgres_exporter.service.j2 @@ -1,6 +1,9 @@ [Unit] Description=Postgres Exporter +StartLimitIntervalSec=0 +StartLimitBurst=0 + [Service] Type=simple ExecStart=/opt/postgres_exporter/postgres_exporter --disable-settings-metrics --extend.query-path="/opt/postgres_exporter/queries.yml" --disable-default-metrics --no-collector.locks --no-collector.replication --no-collector.replication_slot --no-collector.stat_bgwriter --no-collector.stat_database --no-collector.stat_user_tables --no-collector.statio_user_tables --no-collector.wal {% if qemu_mode is defined and qemu_mode %}--no-collector.database {% endif %} diff --git a/ansible/files/postgrest.service.j2 b/ansible/files/postgrest.service.j2 index 290f07720..61102cb42 100644 --- a/ansible/files/postgrest.service.j2 +++ b/ansible/files/postgrest.service.j2 @@ -3,6 +3,9 @@ Description=PostgREST Requires=postgrest-optimizations.service After=postgrest-optimizations.service +StartLimitIntervalSec=0 +StartLimitBurst=0 + [Service] Type=simple # We allow the base config (sent from the worker) to override the generated config diff --git a/ansible/files/vector.service.j2 b/ansible/files/vector.service.j2 index 1c88baa20..05c11e453 
100644 --- a/ansible/files/vector.service.j2 +++ b/ansible/files/vector.service.j2 @@ -4,6 +4,9 @@ Documentation=https://vector.dev After=network-online.target Requires=network-online.target +StartLimitIntervalSec=0 +StartLimitBurst=0 + [Service] User=vector Group=vector