From 045b3893f90c8ace7a46888c177953e692441e8d Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 15:49:48 +0200 Subject: [PATCH 01/44] createdisk: remove trailing spaces --- createdisk.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/createdisk.sh b/createdisk.sh index 84cfd0fa9..c09cb70c1 100755 --- a/createdisk.sh +++ b/createdisk.sh @@ -52,7 +52,7 @@ wait_for_ssh ${VM_NAME} ${VM_IP} if [ ${BUNDLE_TYPE} != "microshift" ]; then # Disable kubelet service ${SSH} core@${VM_IP} -- sudo systemctl disable kubelet - + # Stop the kubelet service so it will not reprovision the pods ${SSH} core@${VM_IP} -- sudo systemctl stop kubelet fi From d8819d476833be8f7754eac43a55eef6455b4ab2 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:00:32 +0200 Subject: [PATCH 02/44] systemd/*.service: Ensure that crc-env exists before starting This commit enforces that the CRC services that require the CRC configuration file (`/etc/sysconfig/crc-env`) don't start before the file has been populated. Most of the services actually have this line: ``` ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ``` which requires `/etc/sysconfig/crc-env`, so the synchronization point ``` After=crc-env-file-exists.service ``` is present in most of the services. This will go away when the self-sufficient bundle becomes the default path. 
--- systemd/crc-cluster-status.service | 1 + systemd/crc-custom.target | 1 + systemd/crc-dnsmasq.service | 1 + systemd/crc-env-file-exists.service | 20 ++++++++++++++++++++ systemd/crc-no-tap.service | 1 + systemd/crc-pullsecret.service | 1 + systemd/crc-routes-controller.service | 1 + systemd/crc-wait-apiserver-up.service | 1 + systemd/ocp-clusterid.service | 1 + systemd/ocp-custom-domain.service | 1 + systemd/ocp-growfs.service | 1 + systemd/ocp-mco-sshkey.service | 1 + systemd/ocp-userpasswords.service | 1 + 13 files changed, 32 insertions(+) create mode 100644 systemd/crc-env-file-exists.service diff --git a/systemd/crc-cluster-status.service b/systemd/crc-cluster-status.service index 92d73dffe..a30379fb3 100644 --- a/systemd/crc-cluster-status.service +++ b/systemd/crc-cluster-status.service @@ -1,5 +1,6 @@ [Unit] Description=CRC Unit checking if cluster is ready +After=crc-env-file-exists.service After=crc-wait-apiserver-up.service crc-pullsecret.service After=ocp-mco-sshkey.service ocp-cluster-ca.service After=ocp-custom-domain.service ocp-userpasswords.service diff --git a/systemd/crc-custom.target b/systemd/crc-custom.target index 206d482fa..cffb4d5b9 100644 --- a/systemd/crc-custom.target +++ b/systemd/crc-custom.target @@ -1,4 +1,5 @@ [Unit] Description=crc custom target Requires=kubelet-dependencies.target +Requires=crc-env-file-exists.service After=kubelet-dependencies.target diff --git a/systemd/crc-dnsmasq.service b/systemd/crc-dnsmasq.service index 42d45a93d..17346235b 100644 --- a/systemd/crc-dnsmasq.service +++ b/systemd/crc-dnsmasq.service @@ -1,6 +1,7 @@ [Unit] Description=CRC Unit for configuring dnsmasq Wants=ovs-configuration.service +After=crc-env-file-exists.service After=ovs-configuration.service Before=kubelet-dependencies.target StartLimitIntervalSec=30 diff --git a/systemd/crc-env-file-exists.service b/systemd/crc-env-file-exists.service new file mode 100644 index 000000000..35a9fb344 --- /dev/null +++ 
b/systemd/crc-env-file-exists.service @@ -0,0 +1,20 @@ +[Unit] +Description=Wait for /etc/sysconfig/crc-env file to be populated + +[Service] +# This service runs a command once and then exits. +Type=oneshot + +# This is the magic part. It keeps the service in an 'active' state +# after the command exits, so other services can see it succeeded. +RemainAfterExit=yes + +# This is the command that waits for the file. +# It checks every second if the file does not exist ('! -f'). +# Once the file is found, the loop exits, the command succeeds, and the service is 'active'. +ExecStart=/bin/sh -c 'while [ ! -f /etc/sysconfig/crc-env ]; do sleep 1; done' +TimeoutStartSec=300 + +[Install] +# Ensure this service is started during the normal boot process. +WantedBy=crc-custom.target diff --git a/systemd/crc-no-tap.service b/systemd/crc-no-tap.service index fe215e2c4..318467d54 100644 --- a/systemd/crc-no-tap.service +++ b/systemd/crc-no-tap.service @@ -2,6 +2,7 @@ Description=Ensure that tap0 network configuration is absent on Apple Virtualization Before=NetworkManager.service After=local-fs.target +After=crc-env-file-exists.service RequiresMountsFor=/etc/NetworkManager/system-connections [Service] diff --git a/systemd/crc-pullsecret.service b/systemd/crc-pullsecret.service index 4c88531b7..79c86d53f 100644 --- a/systemd/crc-pullsecret.service +++ b/systemd/crc-pullsecret.service @@ -1,5 +1,6 @@ [Unit] Description=CRC Unit for adding pull secret to cluster +After=crc-env-file-exists.service After=crc-wait-apiserver-up.service StartLimitIntervalSec=450 StartLimitBurst=10 diff --git a/systemd/crc-routes-controller.service b/systemd/crc-routes-controller.service index fe56fde2a..4ac3408a0 100644 --- a/systemd/crc-routes-controller.service +++ b/systemd/crc-routes-controller.service @@ -1,5 +1,6 @@ [Unit] Description=CRC Unit starting routes controller +After=crc-env-file-exists.service After=crc-wait-apiserver-up.service StartLimitIntervalSec=450 StartLimitBurst=10 diff --git 
a/systemd/crc-wait-apiserver-up.service b/systemd/crc-wait-apiserver-up.service index 7cf21e000..477692345 100644 --- a/systemd/crc-wait-apiserver-up.service +++ b/systemd/crc-wait-apiserver-up.service @@ -2,6 +2,7 @@ Description=CRC Unit waiting till k8s API server is up Requires=kubelet.service After=kubelet.service +After=crc-env-file-exists.service Before=ocp-delete-mco-leases.service [Service] diff --git a/systemd/ocp-clusterid.service b/systemd/ocp-clusterid.service index 19479bb8c..9c4ec761e 100644 --- a/systemd/ocp-clusterid.service +++ b/systemd/ocp-clusterid.service @@ -1,5 +1,6 @@ [Unit] Description=CRC Unit setting random cluster ID +After=crc-env-file-exists.service After=crc-wait-apiserver-up.service StartLimitIntervalSec=450 StartLimitBurst=10 diff --git a/systemd/ocp-custom-domain.service b/systemd/ocp-custom-domain.service index 6ec401c64..273ec7950 100644 --- a/systemd/ocp-custom-domain.service +++ b/systemd/ocp-custom-domain.service @@ -1,5 +1,6 @@ [Unit] Description=CRC Unit setting nip.io domain for cluster +After=crc-env-file-exists.service After=crc-wait-apiserver-up.service StartLimitIntervalSec=450 StartLimitBurst=10 diff --git a/systemd/ocp-growfs.service b/systemd/ocp-growfs.service index ff92d99cd..307bdbece 100644 --- a/systemd/ocp-growfs.service +++ b/systemd/ocp-growfs.service @@ -1,6 +1,7 @@ [Unit] Description=CRC Unit to grow the root filesystem Requires=crc-custom.target +After=crc-env-file-exists.service [Service] Type=oneshot diff --git a/systemd/ocp-mco-sshkey.service b/systemd/ocp-mco-sshkey.service index 85aaa170e..81e0fc1c2 100644 --- a/systemd/ocp-mco-sshkey.service +++ b/systemd/ocp-mco-sshkey.service @@ -1,5 +1,6 @@ [Unit] Description=CRC Unit patching the MachineConfig to add new ssh key +After=crc-env-file-exists.service After=crc-wait-apiserver-up.service StartLimitIntervalSec=450 StartLimitBurst=10 diff --git a/systemd/ocp-userpasswords.service b/systemd/ocp-userpasswords.service index 57937762f..e5e30af1a 100644 --- 
a/systemd/ocp-userpasswords.service +++ b/systemd/ocp-userpasswords.service @@ -1,6 +1,7 @@ [Unit] Description=CRC Unit setting the developer and kubeadmin user password Before=ocp-cluster-ca.service +After=crc-env-file-exists.service After=crc-wait-apiserver-up.service StartLimitIntervalSec=450 StartLimitBurst=10 From d27e42540110a13088b760894038289fb1a63073 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:00:32 +0200 Subject: [PATCH 03/44] systemd/*.service: don't use EnvironmentFile when not needed The `EnvironmentFile` makes sense when the `unit.service` file relies on these environment variables, which isn't the case for CRC services. Instead, the relevant scripts should call `source /etc/sysconfig/crc-env` to be self-sufficient. This will be done in a follow up commit. --- systemd/crc-cluster-status.service | 1 - systemd/crc-dnsmasq.service | 1 - systemd/crc-no-tap.service | 1 - systemd/crc-pullsecret.service | 1 - systemd/crc-routes-controller.service | 1 - systemd/crc-wait-apiserver-up.service | 1 - systemd/ocp-cluster-ca.service | 1 - systemd/ocp-clusterid.service | 1 - systemd/ocp-custom-domain.service | 1 - systemd/ocp-growfs.service | 1 - systemd/ocp-mco-sshkey.service | 1 - systemd/ocp-userpasswords.service | 1 - 12 files changed, 12 deletions(-) diff --git a/systemd/crc-cluster-status.service b/systemd/crc-cluster-status.service index a30379fb3..74c8af9a0 100644 --- a/systemd/crc-cluster-status.service +++ b/systemd/crc-cluster-status.service @@ -12,7 +12,6 @@ StartLimitBurst=10 Type=oneshot Restart=on-failure RestartSec=40 -EnvironmentFile=-/etc/sysconfig/crc-env ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/crc-cluster-status.sh RemainAfterExit=true diff --git a/systemd/crc-dnsmasq.service b/systemd/crc-dnsmasq.service index 17346235b..a01107210 100644 --- a/systemd/crc-dnsmasq.service +++ b/systemd/crc-dnsmasq.service @@ -9,7 +9,6 @@ StartLimitIntervalSec=30 [Service] Type=oneshot 
Restart=on-failure -EnvironmentFile=-/etc/sysconfig/crc-env ExecStartPre=/bin/systemctl start ovs-configuration.service ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/crc-dnsmasq.sh diff --git a/systemd/crc-no-tap.service b/systemd/crc-no-tap.service index 318467d54..3f7c64e59 100644 --- a/systemd/crc-no-tap.service +++ b/systemd/crc-no-tap.service @@ -7,7 +7,6 @@ RequiresMountsFor=/etc/NetworkManager/system-connections [Service] Type=oneshot -EnvironmentFile=-/etc/sysconfig/crc-env ExecStart=/usr/local/bin/crc-no-tap.sh [Install] diff --git a/systemd/crc-pullsecret.service b/systemd/crc-pullsecret.service index 79c86d53f..a76b36584 100644 --- a/systemd/crc-pullsecret.service +++ b/systemd/crc-pullsecret.service @@ -10,7 +10,6 @@ ConditionPathExists=!/opt/crc/%n.done Type=oneshot Restart=on-failure RestartSec=40 -EnvironmentFile=-/etc/sysconfig/crc-env ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/crc-pullsecret.sh ExecStartPost=-touch /opt/crc/%n.done diff --git a/systemd/crc-routes-controller.service b/systemd/crc-routes-controller.service index 4ac3408a0..a1f3c53c4 100644 --- a/systemd/crc-routes-controller.service +++ b/systemd/crc-routes-controller.service @@ -9,7 +9,6 @@ StartLimitBurst=10 Type=oneshot Restart=on-failure RestartSec=40 -EnvironmentFile=-/etc/sysconfig/crc-env ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/crc-routes-controller.sh diff --git a/systemd/crc-wait-apiserver-up.service b/systemd/crc-wait-apiserver-up.service index 477692345..2a6061917 100644 --- a/systemd/crc-wait-apiserver-up.service +++ b/systemd/crc-wait-apiserver-up.service @@ -8,7 +8,6 @@ Before=ocp-delete-mco-leases.service [Service] Type=oneshot Restart=on-failure -EnvironmentFile=-/etc/sysconfig/crc-env ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/crc-wait-apiserver-up.sh diff --git a/systemd/ocp-cluster-ca.service 
b/systemd/ocp-cluster-ca.service index 374383fca..8f5a8d2ef 100644 --- a/systemd/ocp-cluster-ca.service +++ b/systemd/ocp-cluster-ca.service @@ -9,7 +9,6 @@ ConditionPathExists=!/opt/crc/%n.done Type=oneshot Restart=on-failure RestartSec=40 -EnvironmentFile=-/etc/sysconfig/crc-env ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/ocp-cluster-ca.sh ExecStartPost=-touch /opt/crc/%n.done diff --git a/systemd/ocp-clusterid.service b/systemd/ocp-clusterid.service index 9c4ec761e..f901f8160 100644 --- a/systemd/ocp-clusterid.service +++ b/systemd/ocp-clusterid.service @@ -9,7 +9,6 @@ StartLimitBurst=10 Type=oneshot Restart=on-failure RestartSec=40 -EnvironmentFile=-/etc/sysconfig/crc-env ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/ocp-clusterid.sh diff --git a/systemd/ocp-custom-domain.service b/systemd/ocp-custom-domain.service index 273ec7950..93d644ce0 100644 --- a/systemd/ocp-custom-domain.service +++ b/systemd/ocp-custom-domain.service @@ -10,7 +10,6 @@ ConditionPathExists=!/opt/crc/%n.done Type=oneshot Restart=on-failure RestartSec=40 -EnvironmentFile=-/etc/sysconfig/crc-env ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/ocp-custom-domain.sh ExecStartPost=-touch /opt/crc/%n.done diff --git a/systemd/ocp-growfs.service b/systemd/ocp-growfs.service index 307bdbece..0790e4dd7 100644 --- a/systemd/ocp-growfs.service +++ b/systemd/ocp-growfs.service @@ -5,7 +5,6 @@ After=crc-env-file-exists.service [Service] Type=oneshot -EnvironmentFile=-/etc/sysconfig/crc-env ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/ocp-growfs.sh diff --git a/systemd/ocp-mco-sshkey.service b/systemd/ocp-mco-sshkey.service index 81e0fc1c2..42b4b5587 100644 --- a/systemd/ocp-mco-sshkey.service +++ b/systemd/ocp-mco-sshkey.service @@ -9,7 +9,6 @@ StartLimitBurst=10 Type=oneshot Restart=on-failure RestartSec=40 -EnvironmentFile=-/etc/sysconfig/crc-env 
ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/ocp-mco-sshkey.sh RemainAfterExit=true diff --git a/systemd/ocp-userpasswords.service b/systemd/ocp-userpasswords.service index e5e30af1a..b47b9fcf1 100644 --- a/systemd/ocp-userpasswords.service +++ b/systemd/ocp-userpasswords.service @@ -11,7 +11,6 @@ ConditionPathExists=!/opt/crc/%n.done Type=oneshot Restart=on-failure RestartSec=40 -EnvironmentFile=-/etc/sysconfig/crc-env ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStartPre=/usr/bin/sleep 5 ExecStart=/usr/local/bin/ocp-userpasswords.sh From 9a036844c65e0076d3ce311460da5c882f75dcfd Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:20:47 +0200 Subject: [PATCH 04/44] systemd: Improve the handling of the tap networking This commit clarifies the enablement/disablement of the TAP networking. * `systemd/crc-needs-tap.sh` this script tells if the TAP networking should be enabled or not. The choice is currently done by checking the CRC configuration file. * `systemd/crc-conditionally-disable-tap.sh` this script checks if the TAP networking should be disabled or not. I had to use a script to make the choice, as using the SystemD directives would have failed the service. * `systemd/crc-disable-tap.sh` this script disables the TAP networking, by deactivating the GV proxy and the `tap0` network configuration. 
* `crc-self-sufficient-env.sh` this script tells if VM is running a self-sufficient bundle * `crc-user-mode-networking.sh` this script checks if the user-mode networking should be enabled --- createdisk.sh | 4 ++ ...c-no-tap.service => crc-check-tap.service} | 6 +- systemd/crc-conditionally-disable-tap.sh | 17 +++++ systemd/crc-disable-tap.sh | 14 ++++ systemd/crc-needs-tap.sh | 44 +++++++++++++ systemd/crc-no-tap.sh | 10 --- systemd/crc-self-sufficient-env.sh | 18 +++-- systemd/crc-user-mode-networking.sh | 65 +++++++++++++++++++ 8 files changed, 161 insertions(+), 17 deletions(-) rename systemd/{crc-no-tap.service => crc-check-tap.service} (50%) create mode 100644 systemd/crc-conditionally-disable-tap.sh create mode 100644 systemd/crc-disable-tap.sh create mode 100644 systemd/crc-needs-tap.sh delete mode 100644 systemd/crc-no-tap.sh create mode 100644 systemd/crc-user-mode-networking.sh diff --git a/createdisk.sh b/createdisk.sh index c09cb70c1..9ce573438 100755 --- a/createdisk.sh +++ b/createdisk.sh @@ -109,11 +109,15 @@ ${SSH} core@${VM_IP} 'sudo bash -x -s' < vfkit doesn't need tap0 +# --> other platforms do need it + +virt="$(systemd-detect-virt || true)" + +if [[ -z "$virt" ]]; then + echo "ERROR: systemd couldn't detect the virtualization :/" >&2 + exit "$EXIT_ERROR" +fi + +if [[ "${virt}" == apple ]] ; then + echo "Running with vfkit ($virt) virtualization. Don't need tap0." + exit "$EXIT_DONT_NEED_TAP" +fi + +echo "Running with '$virt' virtualization. Need tap0." 
+ +exit "$EXIT_NEED_TAP" diff --git a/systemd/crc-no-tap.sh b/systemd/crc-no-tap.sh deleted file mode 100644 index 1f0410221..000000000 --- a/systemd/crc-no-tap.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash - -# Return true if running under Apple Virtualization or CRC_SELF_SUFFICIENT is set, otherwise false - -if systemd-detect-virt | grep -q '^apple$' || [ -n "$CRC_SELF_SUFFICIENT" ]; then - rm -f /etc/NetworkManager/system-connections/tap0.nmconnection - systemctl disable --now gv-user-network@tap0.service -fi - -exit 0 diff --git a/systemd/crc-self-sufficient-env.sh b/systemd/crc-self-sufficient-env.sh index 32dde4294..a19f7c7d4 100644 --- a/systemd/crc-self-sufficient-env.sh +++ b/systemd/crc-self-sufficient-env.sh @@ -1,8 +1,16 @@ #!/bin/bash -set -euo pipefail -# Optional: load env if unit forgot EnvironmentFile -[ -r /etc/sysconfig/crc-env ] && . /etc/sysconfig/crc-env -if [ "${CRC_SELF_SUFFICIENT:-}" = "1" ] || [ "${CRC_CLOUD:-}" = "1" ]; then + +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace + +source /etc/sysconfig/crc-env || echo "WARNING: crc-env not found" + +if (( ${CRC_SELF_SUFFICIENT:-0} == 1 )); then + echo "Running with a self-sufficient bundle" exit 0 +else + echo "Not running in a self-sufficient bundle" + exit 1 fi -exit 1 \ No newline at end of file diff --git a/systemd/crc-user-mode-networking.sh b/systemd/crc-user-mode-networking.sh new file mode 100644 index 000000000..109603e5e --- /dev/null +++ b/systemd/crc-user-mode-networking.sh @@ -0,0 +1,65 @@ +#!/bin/bash + +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace + +source /etc/sysconfig/crc-env || echo "WARNING: crc-env not found" + +EXIT_ERROR=77 + +target="${1:-}" +if [[ "$target" == user || -z "$target" ]]; then + # searching for user mode, return 0 if user + EXIT_USER_MODE=0 + EXIT_NOT_USER_MODE=1 +elif [[ "$target" == system ]]; then + # searching for system mode, return 0 if system + EXIT_NOT_USER_MODE=0 + EXIT_USER_MODE=1 +else + echo "ERROR: 
invalid target '$target'. Should be 'user' (default) or 'system'. Got '$target'." >&2 + exit "$EXIT_ERROR" +fi + + +if /usr/local/bin/crc-self-sufficient-env.sh; then + echo "Running a self-sufficient bundle. Not user-mode networking." + if [[ "${CRC_NETWORK_MODE_USER:-}" ]]; then + echo "WARNING: Ignoring CRC_NETWORK_MODE_USER='$CRC_NETWORK_MODE_USER' in the self-sufficient bundle." + fi + + exit "$EXIT_NOT_USER_MODE" +fi + +# no value --> error +if [[ -z "${CRC_NETWORK_MODE_USER:-}" ]]; then + echo "ERROR: CRC_NETWORK_MODE_USER not set. Assuming user networking." >&2 + exit "$EXIT_ERROR" +fi + +# value not in [0, 1] --> error +if [[ ! "${CRC_NETWORK_MODE_USER}" =~ ^[01]$ ]]; then + echo "ERROR: unknown network mode: CRC_NETWORK_MODE_USER=${CRC_NETWORK_MODE_USER} (expected 0 or 1)" >&2 + exit "$EXIT_ERROR" +fi + +# value == 0 --> not user-mode +if (( CRC_NETWORK_MODE_USER == 0 )); then + echo "network-mode 'system' detected" + exit "$EXIT_NOT_USER_MODE" +fi + +# value == 1 --> user-mode +if (( CRC_NETWORK_MODE_USER == 1 )); then + echo "network-mode 'user' detected" + exit "$EXIT_USER_MODE" +fi + +# anything else --> error (can't be reached) +echo "ERROR: unknown network mode: CRC_NETWORK_MODE_USER=$CRC_NETWORK_MODE_USER." >&2 +echo "Assuming user networking." >&2 +echo "SHOULD NOT BE REACHED." 
>&2 + +exit "$EXIT_ERROR" From 49b490f0ef0de6774ecc13618bac0521b09df50b Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:23:18 +0200 Subject: [PATCH 05/44] tools.sh: improve the bash syntax of the `generate_htpasswd_file` function Minor improvements of the `generate_htpasswd_file` function: - don't use `local var=$(command)` as this avoids the `set -e` safety net (if `command` fails, the failure is ignored by Bash) - consistent use of `$auth_file_dir` instead of `$1` - better comments to explain why the two `htpasswd` calls don't have the same arguments --- tools.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools.sh b/tools.sh index 3b3d98a56..478d9ad0e 100755 --- a/tools.sh +++ b/tools.sh @@ -221,8 +221,10 @@ function generate_htpasswd_file { local pass_file=$2 ( set +x # use a subshell to avoid leaking the password - local random_password=$(cat $1/auth/kubeadmin-password) - ${HTPASSWD} -c -B -i "${pass_file}" developer <<<"developer" - ${HTPASSWD} -B -i "${pass_file}" kubeadmin <<<"${random_password}" + + local random_password + random_password=$(cat "$auth_file_dir/auth/kubeadmin-password") + "${HTPASSWD}" -c -B -i "${pass_file}" developer <<< "developer" # use -c to create the file + "${HTPASSWD}" -B -i "${pass_file}" kubeadmin <<< "${random_password}" # append to the existing password file ) } From c92595125fafb26dde65e56e500958be2b51f2dd Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:27:02 +0200 Subject: [PATCH 06/44] crc-systemd-common.sh: improve the bash syntax, rename into wait_for_resource_or_die * Rename the function into `wait_for_resource_or_die` to make it clear that the function exits if the wait fails * Disable `set -x` during the wait, to reduce the journal verbosity * Check that the `$resource` argument isn't missing * Use of Bash arithmetic syntax to make the code more readable * Explicit of `for (())` and `(( retry == max_retry ))` checks to easily read the execution flow 
* Clear error messages --- systemd/crc-systemd-common.sh | 50 +++++++++++++++++++++++++++++------ 1 file changed, 42 insertions(+), 8 deletions(-) diff --git a/systemd/crc-systemd-common.sh b/systemd/crc-systemd-common.sh index bd68169ed..c384eb39d 100644 --- a/systemd/crc-systemd-common.sh +++ b/systemd/crc-systemd-common.sh @@ -1,15 +1,49 @@ # $1 is the resource to check # $2 is an optional maximum retry count; default 20 -function wait_for_resource() { - local retry=0 +function wait_for_resource_or_die() { + local resource=${1:-} local max_retry=${2:-20} local wait_sec=${3:-5} - until oc get "$1" > /dev/null 2>&1 - do - [[ "$retry" -ge "$max_retry" ]] && exit 1 - sleep $wait_sec - ((retry++)) + + local xtrace_was_disabled=0 start_time=$SECONDS + # Check if xtrace is currently DISABLED. If so, set a flag. + [[ $- == *x* ]] || xtrace_was_disabled=1 + set +x # disable xtrace to reduce the verbosity of this function + + if [[ -z "$resource" ]]; then + echo "ERROR: expected a K8s resource as first parameter ..." + echo "ERROR: wait_for_resource_or_die RESOURCE [max_retry=20] [wait_sec=5]" + exit 1 # this is wait_for_resource_or_die, so die ... + fi + + # Loop from 1 up to max_retry + for (( retry=1; retry<=max_retry; retry++ )); do + # Try the command. If it succeeds, exit the loop. + if oc get $resource > /dev/null 2>&1; then + local end_time + end_time=$SECONDS + + local duration=$((end_time - start_time)) + echo "Resource '$resource' found after $retry tries ($duration seconds)." + + if (( ! xtrace_was_disabled )); then + set -x # reenable xtrace + fi + + return 0 + fi + + # If it's the last attempt, log a failure message before exiting + if (( retry == max_retry )); then + echo "Error: Timed out waiting for resource '$resource' after ${max_retry} attempts x ${wait_sec} seconds." >&2 + exit 1 # this is wait_for_resource_or_die, so die ... + fi + + # Wait before the next attempt + echo "Attempt ${retry}/${max_retry} didn't succeed." 
+ echo "Waiting $wait_sec seconds for '$resource'." + sleep "$wait_sec" done - return 0 + # unreachable } From 46d736a55a68a8332e3f026976b3dc809f388d35 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:26:16 +0200 Subject: [PATCH 07/44] crc-cluster-status: improve the bash syntax * Clarification of the login test/retry logic. * Simple time tracking for a quick glance assessment of the wait duration * Better logging and constant definition --- systemd/crc-cluster-status.sh | 46 +++++++++++++++++++++++++++-------- 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/systemd/crc-cluster-status.sh b/systemd/crc-cluster-status.sh index a62586438..2529779ad 100644 --- a/systemd/crc-cluster-status.sh +++ b/systemd/crc-cluster-status.sh @@ -7,6 +7,8 @@ set -o errtrace set -x export KUBECONFIG=/opt/kubeconfig +MAXIMUM_LOGIN_RETRY=10 +RETRY_DELAY=5 if [ ! -f /opt/crc/pass_kubeadmin ]; then echo "kubeadmin password file not found" @@ -15,25 +17,49 @@ fi rm -rf /tmp/.crc-cluster-ready +SECONDS=0 if ! oc adm wait-for-stable-cluster --minimum-stable-period=1m --timeout=10m; then exit 1 fi +echo "Cluster took $SECONDS seconds to stabilize." -echo "Logging into OpenShift with kubeadmin user to update $KUBECONFIG" -COUNTER=1 -MAXIMUM_LOGIN_RETRY=10 +echo "Logging into OpenShift with kubeadmin user to update the KUBECONFIG" + +try_login() { + ( # use a `(set +x)` subshell to avoid leaking the password + set +x + set +e # don't abort on error in this subshell + oc login --insecure-skip-tls-verify=true \ + -u kubeadmin \ + -p "$(cat /opt/crc/pass_kubeadmin)" \ + https://api.crc.testing:6443 > /dev/null 2>&1 + ) + local success="$?" + if [[ "$success" == 0 ]]; then + echo "Login succeeded" + else + echo "Login did not complete ..." 
+ fi -# use a `(set +x)` subshell to avoid leaking the password -until (set +x ; oc login --insecure-skip-tls-verify=true -u kubeadmin -p "$(cat /opt/crc/pass_kubeadmin)" https://api.crc.testing:6443 > /dev/null 2>&1); do - if [ "$COUNTER" -ge "$MAXIMUM_LOGIN_RETRY" ]; then - echo "Unable to login to the cluster..., authentication failed." + return "$success" +} + +for ((counter=1; counter<=MAXIMUM_LOGIN_RETRY; counter++)); do + echo "Login attempt $counter/$MAXIMUM_LOGIN_RETRY…" + if try_login; then + break + fi + if (( counter == MAXIMUM_LOGIN_RETRY )); then + echo "Unable to login to the cluster after $counter attempts; authentication failed." exit 1 fi - echo "Logging into OpenShift with updated credentials try $COUNTER, hang on...." - sleep 5 - ((COUNTER++)) + sleep "$RETRY_DELAY" done # need to set a marker to let `crc` know the cluster is ready touch /tmp/.crc-cluster-ready + +echo "All done after $SECONDS seconds " + +exit 0 From e422f01b7e1c36d9b999f1aa76a16338227e0ac6 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:28:28 +0200 Subject: [PATCH 08/44] crc-pullsecret.sh: syntax and reliability improvements * More resilient checks of the pull secrets file * More secure handling of the pull secrets: * don't pass them via the command-line, but via stdin * use `jq` to enforce that the secrets are properly inserted in the patch JSON object --- systemd/crc-pullsecret.sh | 44 +++++++++++++++++++++++++++++---------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/systemd/crc-pullsecret.sh b/systemd/crc-pullsecret.sh index 895a5480a..d1665357b 100644 --- a/systemd/crc-pullsecret.sh +++ b/systemd/crc-pullsecret.sh @@ -9,24 +9,46 @@ set -x source /usr/local/bin/crc-systemd-common.sh export KUBECONFIG="/opt/kubeconfig" -wait_for_resource secret +PULL_SECRETS_FILE="/opt/crc/pull-secret" -set +x # disable the logging to avoid leaking the pull secrets +wait_for_resource_or_die secret -# check if existing pull-secret is valid if not add 
the one from /opt/crc/pull-secret -existingPsB64=$(oc get secret pull-secret -n openshift-config -o jsonpath="{['data']['\.dockerconfigjson']}") -existingPs=$(echo "${existingPsB64}" | base64 -d) +# The pull secret data is piped through stdin and not exposed in command arguments, +# so `set -x` is safe to keep # check if the .auths field is there -if echo "${existingPs}" | jq -e 'has("auths")' >/dev/null 2>&1; then - echo "Cluster already has the pull secrets, nothing to do" +if oc get secret pull-secret \ + -n openshift-config \ + -o jsonpath="{['data']['\.dockerconfigjson']}" \ + | base64 -d \ + | jq -e 'has("auths")' >/dev/null 2>&1 +then + echo "Cluster already has some pull secrets, nothing to do." exit 0 fi -echo "Cluster doesn't have the pull secrets. Setting them from /opt/crc/pull-secret ..." -pullSecretB64=$(base64 -w0 < /opt/crc/pull-secret) +echo "Cluster doesn't have the pull secrets. Setting them from $PULL_SECRETS_FILE ..." + +if [[ ! -r "$PULL_SECRETS_FILE" ]]; +then + echo "ERROR: $PULL_SECRETS_FILE is missing or unreadable" 1>&2 + exit 1 +fi + +if ! 
jq -e 'has("auths")' < "$PULL_SECRETS_FILE" >/dev/null; +then + echo "ERROR: pull-secrets file doesn't have the required '.auths' field" + exit 1 +fi + # Create the JSON patch in memory and pipe it to the oc command -printf '{"data":{".dockerconfigjson": "%s"}}' "${pullSecretB64}" | \ - oc patch secret pull-secret -n openshift-config --type merge --patch-file=/dev/stdin +base64 -w0 < "$PULL_SECRETS_FILE" | \ + jq -R '{"data": {".dockerconfigjson": .}}' | \ + oc patch secret pull-secret \ + -n openshift-config \ + --type merge \ + --patch-file=/dev/stdin + +echo "All done" exit 0 From d1a85efcde913a45296d12883a34dc70406f896c Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:30:03 +0200 Subject: [PATCH 09/44] crc-routes-controller.service: add a condition on user-mode networking Add a SystemD primitive to enforce that the `crc-routes-controller` is only deployed when user-mode networking has been enabled --- systemd/crc-routes-controller.service | 1 + systemd/crc-routes-controller.sh | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/systemd/crc-routes-controller.service b/systemd/crc-routes-controller.service index a1f3c53c4..65f6e1973 100644 --- a/systemd/crc-routes-controller.service +++ b/systemd/crc-routes-controller.service @@ -9,6 +9,7 @@ StartLimitBurst=10 Type=oneshot Restart=on-failure RestartSec=40 +ExecCondition=/usr/local/bin/crc-user-mode-networking.sh ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/crc-routes-controller.sh diff --git a/systemd/crc-routes-controller.sh b/systemd/crc-routes-controller.sh index 7aa2c3316..3fc3fb919 100644 --- a/systemd/crc-routes-controller.sh +++ b/systemd/crc-routes-controller.sh @@ -2,10 +2,6 @@ set -x -if [[ ${CRC_NETWORK_MODE_USER} -eq 0 ]]; then - echo -n "network-mode 'system' detected: skipping routes-controller pod deployment" - exit 0 -fi source /usr/local/bin/crc-systemd-common.sh export KUBECONFIG=/opt/kubeconfig From 
d798723cfaaa3e028f0661688534c54b97f1cdcf Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:31:21 +0200 Subject: [PATCH 10/44] crc-routes-controller.sh: minor syntax improvements * Make the script more resilient by failing on any error. * Better use of script constants * Switch to `wait_for_resource_or_die` --- systemd/crc-routes-controller.sh | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/systemd/crc-routes-controller.sh b/systemd/crc-routes-controller.sh index 3fc3fb919..64d3f6f42 100644 --- a/systemd/crc-routes-controller.sh +++ b/systemd/crc-routes-controller.sh @@ -1,12 +1,22 @@ #!/bin/bash +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace set -x +ROUTE_CONTROLLER=/opt/crc/routes-controller.yaml source /usr/local/bin/crc-systemd-common.sh + export KUBECONFIG=/opt/kubeconfig -wait_for_resource pods +wait_for_resource_or_die pods +wait_for_resource_or_die deployments + +oc apply -f "$ROUTE_CONTROLLER" -oc apply -f /opt/crc/routes-controller.yaml +echo "All done." 
+exit 0 From e2f9291c8ee90610fc33e8d9fbf5a6f5e915e2f6 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:32:25 +0200 Subject: [PATCH 11/44] crc-wait-apiserver-up.sh: minor syntax improvements * Make the script more resilient by failing on any error * Make more verbose * Use `wait_for_resource_or_die` * Switch the retry-delay from 4 tries, 60s delay to 60 tries, 4 seconds delay * this makes the script detect earlier when the APIServer becomes available --- systemd/crc-wait-apiserver-up.sh | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/systemd/crc-wait-apiserver-up.sh b/systemd/crc-wait-apiserver-up.sh index 28299a5d4..e18494a01 100644 --- a/systemd/crc-wait-apiserver-up.sh +++ b/systemd/crc-wait-apiserver-up.sh @@ -1,9 +1,19 @@ #!/bin/bash -set -x +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace source /usr/local/bin/crc-systemd-common.sh export KUBECONFIG=/opt/kubeconfig +echo "Waiting for the node resource to be available ..." # $1 resource, $2 retry count, $3 wait time -wait_for_resource node 4 60 +wait_for_resource_or_die node 60 4 + +echo "node resource available, APIServer is ready." 
+ +echo "All done" + +exit 0 From 7016cf4f35a5ffbd9e8c407ebc6401ca46e62e99 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:32:50 +0200 Subject: [PATCH 12/44] dnsmasq.sh.template: minor syntax improvements * Make more resilient by failing with any error * Use bash arithmetic syntax, more readable * Describe that `APPS_DOMAIN` is a template variable and not an environment variable --- systemd/dnsmasq.sh.template | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/systemd/dnsmasq.sh.template b/systemd/dnsmasq.sh.template index f0168fd94..7942fc961 100644 --- a/systemd/dnsmasq.sh.template +++ b/systemd/dnsmasq.sh.template @@ -1,12 +1,33 @@ #!/bin/bash +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace set -x -if [[ ${CRC_NETWORK_MODE_USER} -eq 1 ]]; then +source /etc/sysconfig/crc-env || echo "WARNING: crc-env not found" + + +if (( ${CRC_NETWORK_MODE_USER:-0} == 1 )); then echo -n "network-mode 'user' detected: skipping dnsmasq configuration" exit 0 fi +# The value of APPS_DOMAIN is set by the +# createdisk-library.sh::copy_systemd_units script during the template +# instantiation. 
So in the end system, the test below should be a +# tautologie (ie, always true if correctly set up) + +# disable this to properly reach the error block (cannot use ${var:-} +# here because of the envsubst instantiating the template) +set +o nounset +if [[ -z "${APPS_DOMAIN}" ]]; then + echo "ERROR: APPS_DOMAIN must be defined to use this script" + exit 1 +fi +set -o nounset + hostName=$(hostname) hostIp=$(hostname --all-ip-addresses | awk '{print $1}') From 67ee4c22ba55bee462064d5a45e016e6cfd192dd Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:33:48 +0200 Subject: [PATCH 13/44] ocp-cluster-ca.sh: syntax and reliability improvements * Make more resilient by failing on any error * Better use of script constants * Introduce a cleanup mechanism to remove the temporary cert files * Make more resilient by properly isolating variables (`"$VARIABLE"`) * More readable syntax by removing unnecessary `${VARIABLE}` braces * Make more resilient by using `oc create ... --dry-run | oc apply -f -` * Make more readable by using `jq` to generate the patch JSON * Make more readable by splitting the long commands over multiple lines * Reuse the existing `wait_for_resource_or_die` --- systemd/ocp-cluster-ca.sh | 106 +++++++++++++++++++++++++------------- 1 file changed, 70 insertions(+), 36 deletions(-) diff --git a/systemd/ocp-cluster-ca.sh b/systemd/ocp-cluster-ca.sh index fc82e5ced..19a1ba57d 100644 --- a/systemd/ocp-cluster-ca.sh +++ b/systemd/ocp-cluster-ca.sh @@ -4,20 +4,28 @@ # https://access.redhat.com/solutions/5286371 # https://access.redhat.com/solutions/6054981 +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace set -x +source /etc/sysconfig/crc-env || echo "WARNING: crc-env not found" + source /usr/local/bin/crc-systemd-common.sh export KUBECONFIG="/opt/kubeconfig" -wait_for_resource configmap +wait_for_resource_or_die configmap -external_ip_path=/opt/crc/eip +CRC_EXTERNAL_IP_FILE_PATH=/opt/crc/eip # may or may not be there.
See below ... -if oc get configmap client-ca-custom -n openshift-config; then +if oc get configmap client-ca-custom -n openshift-config 2>/dev/null; then echo "API Server Client CA already rotated..." exit 0 fi +echo "API Server Client CA not rotated. Doing it now ..." + # generate CA CA_FILE_PATH="/tmp/custom-ca.crt" CA_KEY_FILE_PATH="/tmp/custom-ca.key" @@ -28,52 +36,78 @@ CA_SUBJ="/OU=openshift/CN=admin-kubeconfig-signer-custom" CLIENT_SUBJ="/O=system:masters/CN=system:admin" VALIDITY=365 +cleanup() { + rm -f "$CA_FILE_PATH" "$CA_KEY_FILE_PATH" \ + "$CLIENT_CA_FILE_PATH" "$CLIENT_CA_KEY_FILE_PATH" "$CLIENT_CSR_FILE_PATH" + echo "Temp files cleanup complete." +} + +# keep cleanup bound to EXIT; no need to clear ERR early +trap cleanup ERR EXIT + # generate the CA private key -openssl genrsa -out ${CA_KEY_FILE_PATH} 4096 +openssl genrsa -out "$CA_KEY_FILE_PATH" 4096 # Create the CA certificate -openssl req -x509 -new -nodes -key ${CA_KEY_FILE_PATH} -sha256 -days $VALIDITY -out ${CA_FILE_PATH} -subj "${CA_SUBJ}" +openssl req -x509 -new -nodes -key "$CA_KEY_FILE_PATH" -sha256 -days "$VALIDITY" -out "$CA_FILE_PATH" -subj "$CA_SUBJ" # create CSR -openssl req -new -newkey rsa:4096 -nodes -keyout ${CLIENT_CA_KEY_FILE_PATH} -out ${CLIENT_CSR_FILE_PATH} -subj "${CLIENT_SUBJ}" +openssl req -new -newkey rsa:4096 -nodes -keyout "$CLIENT_CA_KEY_FILE_PATH" -out "$CLIENT_CSR_FILE_PATH" -subj "$CLIENT_SUBJ" # sign the CSR with above CA -openssl x509 -extfile <(printf "extendedKeyUsage = clientAuth") -req -in ${CLIENT_CSR_FILE_PATH} -CA ${CA_FILE_PATH} \ - -CAkey ${CA_KEY_FILE_PATH} -CAcreateserial -out ${CLIENT_CA_FILE_PATH} -days $VALIDITY -sha256 - -oc create configmap client-ca-custom -n openshift-config --from-file=ca-bundle.crt=${CA_FILE_PATH} -oc patch apiserver cluster --type=merge -p '{"spec": {"clientCA": {"name": "client-ca-custom"}}}' +openssl x509 -extfile <(printf "extendedKeyUsage = clientAuth") -req -in "$CLIENT_CSR_FILE_PATH" -CA "$CA_FILE_PATH" \ + -CAkey 
"$CA_KEY_FILE_PATH" -CAcreateserial -out "$CLIENT_CA_FILE_PATH" -days "$VALIDITY" -sha256 + +oc create configmap client-ca-custom \ + -n openshift-config \ + --from-file=ca-bundle.crt="$CA_FILE_PATH" \ + --dry-run=client -o yaml \ + | oc apply -f - + +jq -n ' +{ + "spec": { + "clientCA": { + "name": "client-ca-custom" + } + } +}' | oc patch apiserver cluster --type=merge --patch-file=/dev/stdin cluster_name=$(oc config view -o jsonpath='{.clusters[0].name}') -apiserver_url=$(oc config view -o jsonpath='{.clusters[0].cluster.server}') -if [ -f "${external_ip_path}" ]; then - apiserver_url=https://api.$(cat "${external_ip_path}").nip.io:6443 +if [[ -r "$CRC_EXTERNAL_IP_FILE_PATH" ]]; then + external_ip=$(tr -d '\r\n' < "$CRC_EXTERNAL_IP_FILE_PATH") + apiserver_url=https://api.${external_ip}.nip.io:6443 + echo "INFO: CRC external IP file found. Using apiserver_url='$apiserver_url'." +else + apiserver_url=$(oc config view -o jsonpath='{.clusters[0].cluster.server}') + echo "INFO: CRC external IP file does not exist ($CRC_EXTERNAL_IP_FILE_PATH). Using apiserver_url='$apiserver_url'." 
fi -updated_kubeconfig_path=/opt/crc/kubeconfig -rm -rf "${updated_kubeconfig_path}" +export KUBECONFIG=/opt/crc/kubeconfig +rm -rf "$KUBECONFIG" -oc config set-credentials system:admin --client-certificate=${CLIENT_CA_FILE_PATH} --client-key=${CLIENT_CA_KEY_FILE_PATH} \ - --embed-certs --kubeconfig="${updated_kubeconfig_path}" -oc config set-context system:admin --cluster="${cluster_name}" --namespace=default --user=system:admin --kubeconfig="${updated_kubeconfig_path}" -oc config set-cluster "${cluster_name}" --server="${apiserver_url}" --insecure-skip-tls-verify=true --kubeconfig="${updated_kubeconfig_path}" -oc config use-context system:admin --kubeconfig="${updated_kubeconfig_path}" +oc config set-credentials system:admin \ + --client-certificate="$CLIENT_CA_FILE_PATH" \ + --client-key="$CLIENT_CA_KEY_FILE_PATH" \ + --embed-certs -COUNTER=0 -until oc get co --kubeconfig="${updated_kubeconfig_path}"; -do - if [ $COUNTER == 90 ]; then - echo "Unable to access API server using new client certitificate..." - exit 1 - fi - echo "Acess API server with new client cert, try $COUNTER, hang on...." 
- sleep 2 - ((COUNTER++)) -done +oc config set-context system:admin --cluster="$cluster_name" --namespace=default --user=system:admin +oc config set-cluster "$cluster_name" --server="$apiserver_url" --insecure-skip-tls-verify=true +oc config use-context system:admin +wait_for_resource_or_die clusteroperators 90 2 -oc create configmap admin-kubeconfig-client-ca -n openshift-config --from-file=ca-bundle.crt=${CA_FILE_PATH} \ - --dry-run=client -o yaml | oc replace -f - +oc create configmap admin-kubeconfig-client-ca \ + -n openshift-config \ + --from-file=ca-bundle.crt="$CA_FILE_PATH" \ + --dry-run=client -oyaml \ + | oc apply -f- # copy the new kubeconfig to /opt/kubeconfig -rm -rf /opt/kubeconfig +rm -f /opt/kubeconfig cp /opt/crc/kubeconfig /opt/kubeconfig -chmod 0666 /opt/kubeconfig +chmod 0666 /opt/kubeconfig # keep the file readable by everyone in the system, this is safe + +# cleanup will apply here + +echo "All done" + +exit 0 From 350d4b49f9aac2ecb36655ea9e0494db50c777dd Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:39:08 +0200 Subject: [PATCH 14/44] ocp-clusterid.sh: minor syntax improvements * Make more resilient by failing on any error * Generate the patch file with JQ (more readable, less error-prone) --- systemd/ocp-clusterid.sh | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/systemd/ocp-clusterid.sh b/systemd/ocp-clusterid.sh index 686deaa56..e144f0983 100644 --- a/systemd/ocp-clusterid.sh +++ b/systemd/ocp-clusterid.sh @@ -1,11 +1,20 @@ #!/bin/bash -set -x +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace source /usr/local/bin/crc-systemd-common.sh export KUBECONFIG="/opt/kubeconfig" + +wait_for_resource_or_die clusterversion + uuid=$(uuidgen) -wait_for_resource clusterversion +jq -n --arg id "${uuid}" '{spec: {clusterID: $id}}' \ + | oc patch clusterversion version --type merge --patch-file=/dev/stdin + +echo "All done" -oc patch clusterversion version -p 
"{\"spec\":{\"clusterID\":\"${uuid}\"}}" --type merge +exit 0 From 3b7a1bc346cc3e27407758933ed0053216fa5de9 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:44:32 +0200 Subject: [PATCH 15/44] ocp-custom-domain.sh: syntax and reliability improvements * Make more resilient by failing on any error * Stronger verifications on the external-ip file * Make more readable by splitting long lines * Better isolation and cleanup of the temporary cert files * Use of `jq` to set the JSON arguments --- systemd/ocp-custom-domain.sh | 123 +++++++++++++++++++++++++++-------- 1 file changed, 95 insertions(+), 28 deletions(-) diff --git a/systemd/ocp-custom-domain.sh b/systemd/ocp-custom-domain.sh index 47c563ffe..68d8795ee 100644 --- a/systemd/ocp-custom-domain.sh +++ b/systemd/ocp-custom-domain.sh @@ -1,49 +1,116 @@ #!/bin/bash +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace set -x source /usr/local/bin/crc-systemd-common.sh export KUBECONFIG="/opt/kubeconfig" -if [ ! -f /opt/crc/eip ]; then - echo "external ip not found" +CRC_EXTERNAL_IP_FILE_PATH=/opt/crc/eip + +if [[ ! -r "$CRC_EXTERNAL_IP_FILE_PATH" ]]; then + echo "ERROR: CRC external ip file not found ($CRC_EXTERNAL_IP_FILE_PATH)" >&2 exit 1 fi -EIP=$(cat /opt/crc/eip) +EIP=$(tr -d '\r\n' < "$CRC_EXTERNAL_IP_FILE_PATH") + +if [[ -z "$EIP" ]]; then + echo "ERROR: External IP file is empty: $CRC_EXTERNAL_IP_FILE_PATH" >&2 + exit 1 +fi -STEPS_SLEEP_TIME=30 +# Basic IPv4 sanity check; adjust if IPv6 is expected +if [[ ! "$EIP" =~ ^([0-9]{1,3}\.){3}[0-9]{1,3}$ ]]; then + echo "ERROR: Invalid IPv4 address read from $CRC_EXTERNAL_IP_FILE_PATH: '$EIP'" >&2 + exit 1 +fi -wait_for_resource secret +wait_for_resource_or_die secret + +TMP_KEY_FILE=$(mktemp /tmp/nip.key.XXXXX) +TMP_CRT_FILE=$(mktemp /tmp/nip.crt.XXXXX) + +cleanup() { + rm -f "$TMP_KEY_FILE" "$TMP_CRT_FILE" + echo "Temp files cleanup complete." 
+} + +# Cleanup happens automatically via trap on error or at script end +trap cleanup ERR EXIT # create cert and add as secret -openssl req -newkey rsa:2048 -new -nodes -x509 -days 3650 -keyout /tmp/nip.key -out /tmp/nip.crt -subj "/CN=$EIP.nip.io" -addext "subjectAltName=DNS:apps.$EIP.nip.io,DNS:*.apps.$EIP.nip.io,DNS:api.$EIP.nip.io" -oc delete secret nip-secret -n openshift-config || true -oc create secret tls nip-secret --cert=/tmp/nip.crt --key=/tmp/nip.key -n openshift-config -sleep $STEPS_SLEEP_TIME +openssl req -newkey rsa:2048 -new \ + -nodes -x509 -days 3650 \ + -keyout "$TMP_KEY_FILE" -out "$TMP_CRT_FILE" \ + -subj "/CN=$EIP.nip.io" \ + -addext "subjectAltName=DNS:apps.$EIP.nip.io,DNS:*.apps.$EIP.nip.io,DNS:api.$EIP.nip.io" + +oc delete secret nip-secret -n openshift-config --ignore-not-found +oc create secret tls nip-secret \ + --cert="$TMP_CRT_FILE" \ + --key="$TMP_KEY_FILE" \ + -n openshift-config # patch ingress - cat < /tmp/ingress-patch.yaml -spec: - appsDomain: apps.$EIP.nip.io - componentRoutes: - - hostname: console-openshift-console.apps.$EIP.nip.io - name: console - namespace: openshift-console - servingCertKeyPairSecret: - name: nip-secret - - hostname: oauth-openshift.apps.$EIP.nip.io - name: oauth-openshift - namespace: openshift-authentication - servingCertKeyPairSecret: - name: nip-secret -EOF -oc patch ingresses.config.openshift.io cluster --type=merge --patch-file=/tmp/ingress-patch.yaml +wait_for_resource_or_die ingresses.config.openshift.io +jq -n --arg eip "$EIP" ' +{ + "spec": { + "appsDomain": "apps.\($eip).nip.io", + "componentRoutes": [ + { + "hostname": "console-openshift-console.apps.\($eip).nip.io", + "name": "console", + "namespace": "openshift-console", + "servingCertKeyPairSecret": { + "name": "nip-secret" + } + }, + { + "hostname": "oauth-openshift.apps.\($eip).nip.io", + "name": "oauth-openshift", + "namespace": "openshift-authentication", + "servingCertKeyPairSecret": { + "name": "nip-secret" + } + } + ] + } +}' | oc 
patch ingresses.config.openshift.io cluster --type=merge --patch-file=/dev/stdin # patch API server to use new CA secret -oc patch apiserver cluster --type=merge -p '{"spec":{"servingCerts": {"namedCertificates":[{"names":["api.'$EIP'.nip.io"],"servingCertificate": {"name": "nip-secret"}}]}}}' +wait_for_resource_or_die apiserver.config.openshift.io +jq -n --arg eip "$EIP" ' +{ + "spec": { + "servingCerts": { + "namedCertificates": [ + { + "names": [ + "api.\($eip).nip.io" + ], + "servingCertificate": { + "name": "nip-secret" + } + } + ] + } + } +}' | oc patch apiserver cluster --type=merge --patch-file=/dev/stdin # patch image registry route -oc patch -p '{"spec": {"host": "default-route-openshift-image-registry.'$EIP'.nip.io"}}' route default-route -n openshift-image-registry --type=merge +wait_for_resource_or_die route.route.openshift.io +jq -n --arg eip "$EIP" ' +{ + "spec": { + "host": "default-route-openshift-image-registry.\($eip).nip.io" + } +}' | oc patch route default-route -n openshift-image-registry --type=merge --patch-file=/dev/stdin + +echo "All done" -#wait_cluster_become_healthy "authentication|console|etcd|ingress|openshift-apiserver" +exit 0 From 5584205e3249b1c9aae4d682bc0ba8eff451ab0d Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:44:48 +0200 Subject: [PATCH 16/44] ocp-growfs.sh: syntax improvements (will be removed in a follow up commit) --- systemd/ocp-growfs.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/systemd/ocp-growfs.sh b/systemd/ocp-growfs.sh index c637a7c08..b74ae8457 100644 --- a/systemd/ocp-growfs.sh +++ b/systemd/ocp-growfs.sh @@ -1,5 +1,9 @@ #!/bin/bash +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace set -x root_partition=$(/usr/sbin/blkid -t TYPE=xfs -o device) @@ -8,4 +12,9 @@ root_partition=$(/usr/sbin/blkid -t TYPE=xfs -o device) rootFS="/sysroot" mount -o remount,rw "${rootFS}" xfs_growfs "${rootFS}" + #mount -o remount,ro "${rootFS}" + +echo "All done" + +exit 0 From 
8c036ee0bfa130cba517c39be3700a147fdfb744 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:47:20 +0200 Subject: [PATCH 17/44] ocp-mco-sshkey.sh: syntax improvements * Make more resilient by failing on any error * Improve the logging and argument validation * Use `jq` to guarantee that the patch file is valid JSON (will be updated in a follow-up commit to avoid passing the pub key in the CLI) --- systemd/ocp-mco-sshkey.sh | 40 +++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/systemd/ocp-mco-sshkey.sh b/systemd/ocp-mco-sshkey.sh index 0f1d441bd..b412769e7 100644 --- a/systemd/ocp-mco-sshkey.sh +++ b/systemd/ocp-mco-sshkey.sh @@ -1,22 +1,42 @@ #!/bin/bash +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace set -x source /usr/local/bin/crc-systemd-common.sh export KUBECONFIG="/opt/kubeconfig" -pub_key_path="/opt/crc/id_rsa.pub" +CRC_PUB_KEY_PATH="/opt/crc/id_rsa.pub" -if [ ! -f "${pub_key_path}" ]; then - echo "No pubkey file found" +if [[ ! -r "$CRC_PUB_KEY_PATH" ]]; then + echo "ERROR: CRC pubkey file does not exist ($CRC_PUB_KEY_PATH)" exit 1 fi +wait_for_resource_or_die machineconfig/99-master-ssh + echo "Updating the public key resource for machine config operator" -pub_key=$(tr -d '\n\r' < ${pub_key_path}) -wait_for_resource machineconfig -if !
oc patch machineconfig 99-master-ssh -p "{\"spec\": {\"config\": {\"passwd\": {\"users\": [{\"name\": \"core\", \"sshAuthorizedKeys\": [\"${pub_key}\"]}]}}}}" --type merge; -then - echo "failed to update public key to machine config operator" - exit 1 -fi +pub_key=$(cat "$CRC_PUB_KEY_PATH" | tr -d '\n\r') + +jq -n --arg key "${pub_key}" ' +{ + "spec": { + "config": { + "passwd": { + "users": [ + { + "name": "core", + "sshAuthorizedKeys": [ $key ] + } + ] + } + } + } +}' | oc patch machineconfig 99-master-ssh --type merge --patch-file=/dev/stdin + +echo "All done" + +exit 0 From 6dc223396a57dc35b742c54dfbc8afda85ab4a0a Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 1 Oct 2025 16:48:47 +0200 Subject: [PATCH 18/44] ocp-userpasswords.sh: syntax improvements * Better use of script constants * Better validation of the arguments * Better logging File will be further updated to prevent leaking passwords in the journal logs. --- systemd/ocp-userpasswords.sh | 37 +++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/systemd/ocp-userpasswords.sh b/systemd/ocp-userpasswords.sh index f2a6d2a02..88446df84 100644 --- a/systemd/ocp-userpasswords.sh +++ b/systemd/ocp-userpasswords.sh @@ -9,41 +9,48 @@ set -x source /usr/local/bin/crc-systemd-common.sh export KUBECONFIG="/opt/kubeconfig" +CRC_PASS_DEVELOPER_PATH=/opt/crc/pass_developer +CRC_PASS_KUBEADMIN_PATH=/opt/crc/pass_kubeadmin +CRC_HTPASSWD_IMAGE=registry.access.redhat.com/ubi10/httpd-24 + function gen_htpasswd() { if [ -z "${1:-}" ] || [ -z "${2:-}" ]; then - echo "gen_htpasswd needs two arguments: username password" 1>&2 + echo "gen_htpasswd needs two arguments: username password" >&2 return 1 fi - podman run --rm docker.io/xmartlabs/htpasswd "$1" "$2" + podman run --rm "$CRC_HTPASSWD_IMAGE" htpasswd -nb "$1" "$2" } -wait_for_resource secret - -if [ ! -f /opt/crc/pass_developer ]; then - echo "developer password does not exist" +if [[ ! 
-r "$CRC_PASS_DEVELOPER_PATH" ]]; then + echo "ERROR: CRC developer password does not exist ($CRC_PASS_DEVELOPER_PATH)" exit 1 fi -if [ ! -f /opt/crc/pass_kubeadmin ]; then - echo "kubeadmin password does not exist" +if [[ ! -r "$CRC_PASS_KUBEADMIN_PATH" ]]; then + echo "ERROR: CRC kubeadmin password does not exist ($CRC_PASS_KUBEADMIN_PATH)" exit 1 fi -echo "generating the kubeadmin and developer passwords ..." +echo "Pulling $CRC_HTPASSWD_IMAGE ..." +podman pull --quiet "$CRC_HTPASSWD_IMAGE" -set +x # /!\ disable the logging to avoid leaking the passwords +wait_for_resource_or_die secret -dev_pass=$(gen_htpasswd developer "$(cat /opt/crc/pass_developer)") -adm_pass=$(gen_htpasswd kubeadmin "$(cat /opt/crc/pass_kubeadmin)") +echo "Generating the kubeadmin and developer passwords ..." +set +x # disable the logging to avoid leaking the passwords + +dev_pass=$(gen_htpasswd developer "$(cat "$CRC_PASS_DEVELOPER_PATH")") +adm_pass=$(gen_htpasswd kubeadmin "$(cat "$CRC_PASS_KUBEADMIN_PATH")") echo "creating the password secret ..." -# use bash <() to use a temporary fd file -# use sed to remove the empty lines +# use bash "<()" to use a temporary fd file (safer to handle secrets) oc create secret generic htpass-secret \ --from-file=htpasswd=<(printf '%s\n%s\n' "$dev_pass" "$adm_pass") \ -n openshift-config \ --dry-run=client -oyaml \ | oc apply -f- -echo "all done" +echo "All done" + +exit 0 From 987d10f5d0a9ce873a35cb00f437f9ec9f87f723 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 2 Oct 2025 09:41:21 +0200 Subject: [PATCH 19/44] ocp-userpasswords.service: remove unnecessary sleep No need to sleep `5s` here, the SystemD dependencies should enforce the correct ordering. 
--- systemd/ocp-userpasswords.service | 1 - 1 file changed, 1 deletion(-) diff --git a/systemd/ocp-userpasswords.service b/systemd/ocp-userpasswords.service index b47b9fcf1..04be04957 100644 --- a/systemd/ocp-userpasswords.service +++ b/systemd/ocp-userpasswords.service @@ -12,7 +12,6 @@ Type=oneshot Restart=on-failure RestartSec=40 ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh -ExecStartPre=/usr/bin/sleep 5 ExecStart=/usr/local/bin/ocp-userpasswords.sh ExecStartPost=-touch /opt/crc/%n.done From ddc79982b546b042e07bc08d06181deb15d4e81e Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 2 Oct 2025 14:26:46 +0200 Subject: [PATCH 20/44] Define the KUBECONFIG in the systemd service Define the `KUBECONFIG` in the Systemd unit file, so that the CRC scripts don't have to care about it. ``` Environment=KUBECONFIG=/opt/kubeconfig ``` Gives a better separation of concerns. --- systemd/crc-cluster-status.service | 1 + systemd/crc-cluster-status.sh | 1 - systemd/crc-pullsecret.service | 1 + systemd/crc-pullsecret.sh | 1 - systemd/crc-routes-controller.service | 1 + systemd/crc-routes-controller.sh | 2 -- systemd/crc-wait-apiserver-up.service | 1 + systemd/crc-wait-apiserver-up.sh | 1 - systemd/ocp-cluster-ca.service | 1 + systemd/ocp-cluster-ca.sh | 1 - systemd/ocp-clusterid.service | 1 + systemd/ocp-clusterid.sh | 1 - systemd/ocp-custom-domain.service | 1 + systemd/ocp-custom-domain.sh | 1 - systemd/ocp-mco-sshkey.service | 1 + systemd/ocp-mco-sshkey.sh | 1 - systemd/ocp-userpasswords.service | 1 + systemd/ocp-userpasswords.sh | 1 - 18 files changed, 9 insertions(+), 10 deletions(-) diff --git a/systemd/crc-cluster-status.service b/systemd/crc-cluster-status.service index 74c8af9a0..950acdb90 100644 --- a/systemd/crc-cluster-status.service +++ b/systemd/crc-cluster-status.service @@ -12,6 +12,7 @@ StartLimitBurst=10 Type=oneshot Restart=on-failure RestartSec=40 +Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh 
ExecStart=/usr/local/bin/crc-cluster-status.sh RemainAfterExit=true diff --git a/systemd/crc-cluster-status.sh b/systemd/crc-cluster-status.sh index 2529779ad..9b25dece3 100644 --- a/systemd/crc-cluster-status.sh +++ b/systemd/crc-cluster-status.sh @@ -6,7 +6,6 @@ set -o nounset set -o errtrace set -x -export KUBECONFIG=/opt/kubeconfig MAXIMUM_LOGIN_RETRY=10 RETRY_DELAY=5 diff --git a/systemd/crc-pullsecret.service b/systemd/crc-pullsecret.service index a76b36584..0baece570 100644 --- a/systemd/crc-pullsecret.service +++ b/systemd/crc-pullsecret.service @@ -10,6 +10,7 @@ ConditionPathExists=!/opt/crc/%n.done Type=oneshot Restart=on-failure RestartSec=40 +Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/crc-pullsecret.sh ExecStartPost=-touch /opt/crc/%n.done diff --git a/systemd/crc-pullsecret.sh b/systemd/crc-pullsecret.sh index d1665357b..74f62ac3f 100644 --- a/systemd/crc-pullsecret.sh +++ b/systemd/crc-pullsecret.sh @@ -7,7 +7,6 @@ set -o errtrace set -x source /usr/local/bin/crc-systemd-common.sh -export KUBECONFIG="/opt/kubeconfig" PULL_SECRETS_FILE="/opt/crc/pull-secret" diff --git a/systemd/crc-routes-controller.service b/systemd/crc-routes-controller.service index 65f6e1973..869b4ab95 100644 --- a/systemd/crc-routes-controller.service +++ b/systemd/crc-routes-controller.service @@ -9,6 +9,7 @@ StartLimitBurst=10 Type=oneshot Restart=on-failure RestartSec=40 +Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-user-mode-networking.sh ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/crc-routes-controller.sh diff --git a/systemd/crc-routes-controller.sh b/systemd/crc-routes-controller.sh index 64d3f6f42..ee15b968d 100644 --- a/systemd/crc-routes-controller.sh +++ b/systemd/crc-routes-controller.sh @@ -10,8 +10,6 @@ ROUTE_CONTROLLER=/opt/crc/routes-controller.yaml source /usr/local/bin/crc-systemd-common.sh -export 
KUBECONFIG=/opt/kubeconfig - wait_for_resource_or_die pods wait_for_resource_or_die deployments diff --git a/systemd/crc-wait-apiserver-up.service b/systemd/crc-wait-apiserver-up.service index 2a6061917..9ab3e168c 100644 --- a/systemd/crc-wait-apiserver-up.service +++ b/systemd/crc-wait-apiserver-up.service @@ -8,6 +8,7 @@ Before=ocp-delete-mco-leases.service [Service] Type=oneshot Restart=on-failure +Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/crc-wait-apiserver-up.sh diff --git a/systemd/crc-wait-apiserver-up.sh b/systemd/crc-wait-apiserver-up.sh index e18494a01..614c13dbb 100644 --- a/systemd/crc-wait-apiserver-up.sh +++ b/systemd/crc-wait-apiserver-up.sh @@ -6,7 +6,6 @@ set -o nounset set -o errtrace source /usr/local/bin/crc-systemd-common.sh -export KUBECONFIG=/opt/kubeconfig echo "Waiting for the node resource to be available ..." # $1 resource, $2 retry count, $3 wait time diff --git a/systemd/ocp-cluster-ca.service b/systemd/ocp-cluster-ca.service index 8f5a8d2ef..832f64093 100644 --- a/systemd/ocp-cluster-ca.service +++ b/systemd/ocp-cluster-ca.service @@ -9,6 +9,7 @@ ConditionPathExists=!/opt/crc/%n.done Type=oneshot Restart=on-failure RestartSec=40 +Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/ocp-cluster-ca.sh ExecStartPost=-touch /opt/crc/%n.done diff --git a/systemd/ocp-cluster-ca.sh b/systemd/ocp-cluster-ca.sh index 19a1ba57d..01e6f2e12 100644 --- a/systemd/ocp-cluster-ca.sh +++ b/systemd/ocp-cluster-ca.sh @@ -13,7 +13,6 @@ set -x source /etc/sysconfig/crc-env || echo "WARNING: crc-env not found" source /usr/local/bin/crc-systemd-common.sh -export KUBECONFIG="/opt/kubeconfig" wait_for_resource_or_die configmap diff --git a/systemd/ocp-clusterid.service b/systemd/ocp-clusterid.service index f901f8160..2e6ad70f3 100644 --- a/systemd/ocp-clusterid.service +++ b/systemd/ocp-clusterid.service @@ -9,6 +9,7 
@@ StartLimitBurst=10 Type=oneshot Restart=on-failure RestartSec=40 +Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/ocp-clusterid.sh diff --git a/systemd/ocp-clusterid.sh b/systemd/ocp-clusterid.sh index e144f0983..3beee5eba 100644 --- a/systemd/ocp-clusterid.sh +++ b/systemd/ocp-clusterid.sh @@ -6,7 +6,6 @@ set -o nounset set -o errtrace source /usr/local/bin/crc-systemd-common.sh -export KUBECONFIG="/opt/kubeconfig" wait_for_resource_or_die clusterversion diff --git a/systemd/ocp-custom-domain.service b/systemd/ocp-custom-domain.service index 93d644ce0..d48b8f4c1 100644 --- a/systemd/ocp-custom-domain.service +++ b/systemd/ocp-custom-domain.service @@ -10,6 +10,7 @@ ConditionPathExists=!/opt/crc/%n.done Type=oneshot Restart=on-failure RestartSec=40 +Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/ocp-custom-domain.sh ExecStartPost=-touch /opt/crc/%n.done diff --git a/systemd/ocp-custom-domain.sh b/systemd/ocp-custom-domain.sh index 68d8795ee..6b706a3dc 100644 --- a/systemd/ocp-custom-domain.sh +++ b/systemd/ocp-custom-domain.sh @@ -7,7 +7,6 @@ set -o errtrace set -x source /usr/local/bin/crc-systemd-common.sh -export KUBECONFIG="/opt/kubeconfig" CRC_EXTERNAL_IP_FILE_PATH=/opt/crc/eip diff --git a/systemd/ocp-mco-sshkey.service b/systemd/ocp-mco-sshkey.service index 42b4b5587..f2d66c21b 100644 --- a/systemd/ocp-mco-sshkey.service +++ b/systemd/ocp-mco-sshkey.service @@ -9,6 +9,7 @@ StartLimitBurst=10 Type=oneshot Restart=on-failure RestartSec=40 +Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/ocp-mco-sshkey.sh RemainAfterExit=true diff --git a/systemd/ocp-mco-sshkey.sh b/systemd/ocp-mco-sshkey.sh index b412769e7..00a90ed64 100644 --- a/systemd/ocp-mco-sshkey.sh +++ b/systemd/ocp-mco-sshkey.sh @@ -7,7 +7,6 @@ set -o errtrace set -x source 
/usr/local/bin/crc-systemd-common.sh -export KUBECONFIG="/opt/kubeconfig" CRC_PUB_KEY_PATH="/opt/crc/id_rsa.pub" diff --git a/systemd/ocp-userpasswords.service b/systemd/ocp-userpasswords.service index 04be04957..5d3d61ad6 100644 --- a/systemd/ocp-userpasswords.service +++ b/systemd/ocp-userpasswords.service @@ -11,6 +11,7 @@ ConditionPathExists=!/opt/crc/%n.done Type=oneshot Restart=on-failure RestartSec=40 +Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStart=/usr/local/bin/ocp-userpasswords.sh ExecStartPost=-touch /opt/crc/%n.done diff --git a/systemd/ocp-userpasswords.sh b/systemd/ocp-userpasswords.sh index 88446df84..2adeed8be 100644 --- a/systemd/ocp-userpasswords.sh +++ b/systemd/ocp-userpasswords.sh @@ -7,7 +7,6 @@ set -o errtrace set -x source /usr/local/bin/crc-systemd-common.sh -export KUBECONFIG="/opt/kubeconfig" CRC_PASS_DEVELOPER_PATH=/opt/crc/pass_developer CRC_PASS_KUBEADMIN_PATH=/opt/crc/pass_kubeadmin From 3fc87dd61e0cd8637b6cb5d261f13e869ce2558c Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 2 Oct 2025 14:39:39 +0200 Subject: [PATCH 21/44] systemd: add a synchronization on ocp-wait-apiservices-available Introduce the `ocp-wait-apiservices-available.sh` script, which waits for all the `apiservices` to be available. The APIServices are made up of two groups: - the K8s APIs, which are always available (pods, secrets, configmaps, ...) - the OCP APIs, which need OCP Operators and Pods to be ready (routes, projects, ...) This script waits for the second group to finish its initialization.
--- systemd/crc-check-tap.service | 1 + systemd/crc-cluster-status.service | 1 + systemd/ocp-cluster-ca.service | 1 + systemd/ocp-custom-domain.service | 1 + .../ocp-wait-apiservices-available.service | 21 ++++++ systemd/ocp-wait-apiservices-available.sh | 69 +++++++++++++++++++ 6 files changed, 94 insertions(+) create mode 100644 systemd/ocp-wait-apiservices-available.service create mode 100644 systemd/ocp-wait-apiservices-available.sh diff --git a/systemd/crc-check-tap.service b/systemd/crc-check-tap.service index 38ce68057..46c5e3a24 100644 --- a/systemd/crc-check-tap.service +++ b/systemd/crc-check-tap.service @@ -5,6 +5,7 @@ Before=gv-user-network@tap0.service After=local-fs.target After=crc-env-file-exists.service RequiresMountsFor=/etc/NetworkManager/system-connections +Requires=crc-env-file-exists.service [Service] Type=oneshot diff --git a/systemd/crc-cluster-status.service b/systemd/crc-cluster-status.service index 950acdb90..565b04e85 100644 --- a/systemd/crc-cluster-status.service +++ b/systemd/crc-cluster-status.service @@ -5,6 +5,7 @@ After=crc-wait-apiserver-up.service crc-pullsecret.service After=ocp-mco-sshkey.service ocp-cluster-ca.service After=ocp-custom-domain.service ocp-userpasswords.service After=ocp-clusterid.service +After=ocp-wait-apiservices-available.service StartLimitIntervalSec=450 StartLimitBurst=10 diff --git a/systemd/ocp-cluster-ca.service b/systemd/ocp-cluster-ca.service index 832f64093..c36cafbcd 100644 --- a/systemd/ocp-cluster-ca.service +++ b/systemd/ocp-cluster-ca.service @@ -1,6 +1,7 @@ [Unit] Description=CRC Unit setting custom cluster ca After=crc-wait-apiserver-up.service +After=ocp-wait-apiservices-available.service StartLimitIntervalSec=450 StartLimitBurst=10 ConditionPathExists=!/opt/crc/%n.done diff --git a/systemd/ocp-custom-domain.service b/systemd/ocp-custom-domain.service index d48b8f4c1..ab2cc2f72 100644 --- a/systemd/ocp-custom-domain.service +++ b/systemd/ocp-custom-domain.service @@ -2,6 +2,7 @@ 
Description=CRC Unit setting nip.io domain for cluster After=crc-env-file-exists.service After=crc-wait-apiserver-up.service +After=ocp-wait-apiservices-available.service StartLimitIntervalSec=450 StartLimitBurst=10 ConditionPathExists=!/opt/crc/%n.done diff --git a/systemd/ocp-wait-apiservices-available.service b/systemd/ocp-wait-apiservices-available.service new file mode 100644 index 000000000..38627eed1 --- /dev/null +++ b/systemd/ocp-wait-apiservices-available.service @@ -0,0 +1,21 @@ +[Unit] +Description=Wait for all Kubernetes APIServices to be Available + +# This service needs network to talk to the k8s API server +Wants=network-online.target +After=network-online.target +After=crc-wait-apiserver-up.service +StartLimitIntervalSec=450 +StartLimitBurst=10 + +[Service] +Type=oneshot +Restart=on-failure +RestartSec=20 +ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh +ExecStart=/usr/local/bin/ocp-wait-apiservices-available.sh + +Environment=KUBECONFIG=/opt/kubeconfig + +[Install] +WantedBy=crc-custom.target diff --git a/systemd/ocp-wait-apiservices-available.sh b/systemd/ocp-wait-apiservices-available.sh new file mode 100644 index 000000000..1bb89e0a1 --- /dev/null +++ b/systemd/ocp-wait-apiservices-available.sh @@ -0,0 +1,69 @@ +#!/bin/bash + +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace + +echo "➡️ Waiting for all APIServices to become available..." + +SECONDS=0 +MAX_RETRY=60 +WAIT_SEC=5 + +for retry in $(seq 1 "$MAX_RETRY"); do + # This command gets the 'status' of the 'Available' condition for every apiservice. + # It produces a list of "True" and/or "False" strings. We then count how many are "False". + APISERVICE_DATA=$(oc get apiservices -o json 2>/dev/null || true) + if [[ -z "$APISERVICE_DATA" ]]; then + UNAVAILABLE_COUNT=999 + echo "⚠️ Couldn't get the list of apiservices ..." 
+ else + UNAVAILABLE_COUNT=$(jq -r ' + [ .items[] + | select(((.status.conditions // []) + | any(.type=="Available" and .status=="True")) | not) + ] | length + ' <<<"$APISERVICE_DATA") + UNAVAILABLE_COUNT=${UNAVAILABLE_COUNT:-0} + fi + + if [ "$UNAVAILABLE_COUNT" -eq 0 ]; then + echo "✅ All APIServices are now available after $SECONDS seconds." + break + fi + + echo + echo "⏳ Still waiting for $UNAVAILABLE_COUNT APIService(s) to become available. Retrying in $WAIT_SEC seconds." + echo "--------------------------------------------------------------------------------" + echo "Unavailable services and their messages:" + + # Get all apiservices as JSON and pipe to jq for filtering and formatting. + # The '-r' flag outputs raw strings instead of JSON-quoted strings. + if ! oc get apiservices -o json | jq -r ' + .items[] | + . as $item | + ( + $item.status.conditions[]? | + select(.type == "Available" and .status == "False") + ) | + " - \($item.metadata.name): \(.reason) - \(.message)" + ' + then + echo "⚠️ Unable to list unavailable APIServices details (will retry)" >&2 + fi + + echo "--------------------------------------------------------------------------------" + + # If it's the last attempt, log a failure message before exiting + if (( retry == MAX_RETRY )); then + echo "ERROR: Timed out waiting for the api-services to get ready, after $MAX_RETRY attempts x $WAIT_SEC seconds = $SECONDS seconds." >&2 + exit 1 + fi + + sleep "$WAIT_SEC" +done + +echo "🎉 Done." + +exit 0 From f8132ea8a02a9cc4e9a8306236ed405b350fca2e Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 2 Oct 2025 16:17:58 +0200 Subject: [PATCH 22/44] crc-wait-apiserver-up.sh: try more often The `crc-wait-apiserver-up` wait for the K8s APIServer to be up and running. This commit makes the patch try 60 times with 5s delay, instead of 5 times with 60s delay. The script becomes more reactive to the APIServer activation. 
--- systemd/crc-wait-apiserver-up.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/systemd/crc-wait-apiserver-up.sh b/systemd/crc-wait-apiserver-up.sh index 614c13dbb..209592f48 100644 --- a/systemd/crc-wait-apiserver-up.sh +++ b/systemd/crc-wait-apiserver-up.sh @@ -9,7 +9,7 @@ source /usr/local/bin/crc-systemd-common.sh echo "Waiting for the node resource to be available ..." # $1 resource, $2 retry count, $3 wait time -wait_for_resource_or_die node 4 60 +wait_for_resource_or_die node 60 5 echo "node resource available, APIServer is ready." From 7b0c2a689830365755e18b3af497abbb14f3f9d1 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Fri, 3 Oct 2025 10:25:26 +0200 Subject: [PATCH 23/44] systemd: add synchronization on crc-wait-node-ready This commit adds a synchronization point on the `ready` status of the CRC node. Before the node is ready, services can interact with the K8s APIServer, but user (and OCP) services won't start their deployment before the CRC node is ready. This synchronization point prevents other services (like `ocp-wait-apiservices`) from waiting in vain while their targets have not yet started their own deployment.
--- systemd/crc-cluster-status.service | 1 + systemd/crc-wait-node-ready.service | 19 +++++++++ systemd/crc-wait-node-ready.sh | 39 +++++++++++++++++++ .../ocp-wait-apiservices-available.service | 2 + 4 files changed, 61 insertions(+) create mode 100644 systemd/crc-wait-node-ready.service create mode 100644 systemd/crc-wait-node-ready.sh diff --git a/systemd/crc-cluster-status.service b/systemd/crc-cluster-status.service index 565b04e85..4bd12abd2 100644 --- a/systemd/crc-cluster-status.service +++ b/systemd/crc-cluster-status.service @@ -6,6 +6,7 @@ After=ocp-mco-sshkey.service ocp-cluster-ca.service After=ocp-custom-domain.service ocp-userpasswords.service After=ocp-clusterid.service After=ocp-wait-apiservices-available.service +After=crc-wait-node-ready.service StartLimitIntervalSec=450 StartLimitBurst=10 diff --git a/systemd/crc-wait-node-ready.service b/systemd/crc-wait-node-ready.service new file mode 100644 index 000000000..6daf0472d --- /dev/null +++ b/systemd/crc-wait-node-ready.service @@ -0,0 +1,19 @@ +[Unit] +Description=CRC Unit waiting till k8s node is ready +Requires=kubelet.service +After=kubelet.service +After=crc-env-file-exists.service +After=crc-wait-apiserver-up.service +StartLimitIntervalSec=450 +StartLimitBurst=10 + +[Service] +Type=oneshot +Restart=on-failure +RestartSec=10 +Environment=KUBECONFIG=/opt/kubeconfig +ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh +ExecStart=/usr/local/bin/crc-wait-node-ready.sh + +[Install] +WantedBy=crc-custom.target diff --git a/systemd/crc-wait-node-ready.sh b/systemd/crc-wait-node-ready.sh new file mode 100644 index 000000000..0e3d43380 --- /dev/null +++ b/systemd/crc-wait-node-ready.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace + +source /usr/local/bin/crc-systemd-common.sh + +MAX_RETRY=150 +WAIT_SEC=2 +NODE_NAME=node/crc +# Loop from 1 up to max_retry +for retry in $(seq 1 "$MAX_RETRY"); do + node_status=$(oc get "$NODE_NAME" --no-headers | 
awk '{print $2}' || true) + node_status=${node_status:-""} + + # Check if the node status is "Ready" + if [[ $node_status == "Ready" ]]; then + echo "CRC node is ready." + exit 0 + fi + + echo "CRC node is not ready. Status: $node_status" + + # If it's the last attempt, log a failure message before exiting + if (( retry == MAX_RETRY )); then + echo "Error: Timed out waiting for the CRC node to be ready after $MAX_RETRY attempts x $WAIT_SEC seconds." >&2 + exit 1 + fi + + # Wait before the next attempt + echo "Waiting for crc node to be ready ... (Attempt ${retry}/${MAX_RETRY})" + sleep "$WAIT_SEC" +done + +# cannot be reached + +exit 1 diff --git a/systemd/ocp-wait-apiservices-available.service b/systemd/ocp-wait-apiservices-available.service index 38627eed1..a82cde3a7 100644 --- a/systemd/ocp-wait-apiservices-available.service +++ b/systemd/ocp-wait-apiservices-available.service @@ -5,6 +5,8 @@ Description=Wait for all Kubernetes APIServices to be Available Wants=network-online.target After=network-online.target After=crc-wait-apiserver-up.service +After=crc-wait-node-ready.service +Requires=crc-wait-node-ready.service StartLimitIntervalSec=450 StartLimitBurst=10 From e371b9b7f235e1c2401c6ad3a573c90d7f34974a Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 6 Oct 2025 10:58:23 +0200 Subject: [PATCH 24/44] ocp-growfs: remove This script has been broken for a while, will be handled by cloud-init. --- docs/self-sufficient-bundle.md | 1 - systemd/ocp-growfs.service | 12 ------------ systemd/ocp-growfs.sh | 20 -------------------- 3 files changed, 33 deletions(-) delete mode 100644 systemd/ocp-growfs.service delete mode 100644 systemd/ocp-growfs.sh diff --git a/docs/self-sufficient-bundle.md b/docs/self-sufficient-bundle.md index e7016b157..3cb8a221d 100644 --- a/docs/self-sufficient-bundle.md +++ b/docs/self-sufficient-bundle.md @@ -15,7 +15,6 @@ services to do their work. 
| `ocp-cluster-ca.service` | ocp | /opt/crc/custom-ca.crt | none | | `ocp-clusterid.service` | ocp | none | none | | `ocp-custom-domain.service` | ocp | none | none | -| `ocp-growfs.service` | ocp | none | none | | `ocp-userpasswords.service` | ocp | /opt/crc/pass_{kubeadmin, developer} | none | In addition to the above services we have `ocp-cluster-ca.path`, `crc-pullsecret.path` and `ocp-userpasswords.path` that monitors the filesystem paths diff --git a/systemd/ocp-growfs.service b/systemd/ocp-growfs.service deleted file mode 100644 index 0790e4dd7..000000000 --- a/systemd/ocp-growfs.service +++ /dev/null @@ -1,12 +0,0 @@ -[Unit] -Description=CRC Unit to grow the root filesystem -Requires=crc-custom.target -After=crc-env-file-exists.service - -[Service] -Type=oneshot -ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh -ExecStart=/usr/local/bin/ocp-growfs.sh - -[Install] -WantedBy=multi-user.target diff --git a/systemd/ocp-growfs.sh b/systemd/ocp-growfs.sh deleted file mode 100644 index b74ae8457..000000000 --- a/systemd/ocp-growfs.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -set -o pipefail -set -o errexit -set -o nounset -set -o errtrace -set -x - -root_partition=$(/usr/sbin/blkid -t TYPE=xfs -o device) -/usr/bin/growpart "${root_partition%?}" "${root_partition#/dev/???}" - -rootFS="/sysroot" -mount -o remount,rw "${rootFS}" -xfs_growfs "${rootFS}" - -#mount -o remount,ro "${rootFS}" - -echo "All done" - -exit 0 From 9d77c51b97c287970090ceb0567e7883b4be766c Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 6 Oct 2025 16:20:48 +0200 Subject: [PATCH 25/44] crc-aws-fetch-secrets.sh: new script for mapt to fetch the secrets from AWS-CLI This script offloads `mapt` and other AWS deployers from the task of fetching the secrets from AWS IMDS service. 
This script should be included in the `cloud-init` user-data configuration file, with this kind of invocation: ``` /usr/local/bin/crc-aws-fetch-secrets.sh \ "{{ .SSMPullSecretName }}" \ "{{ .SSMKubeAdminPasswordName }}" \ "{{ .SSMDeveloperPasswordName }}" ``` where the parameters specify the names of the three secrets in the AWS SSM Parameter Store. --- systemd/crc-aws-fetch-secrets.sh | 126 +++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 systemd/crc-aws-fetch-secrets.sh diff --git a/systemd/crc-aws-fetch-secrets.sh b/systemd/crc-aws-fetch-secrets.sh new file mode 100644 index 000000000..fdc3dd947 --- /dev/null +++ b/systemd/crc-aws-fetch-secrets.sh @@ -0,0 +1,126 @@ +#!/bin/bash + +set -o pipefail +set -o errexit +set -o nounset +set -o errtrace +set -x + +# set -x is safe, the secrets are passed via stdin + +AWS_CLI_IMG=docker.io/amazon/aws-cli +MIN_CHAR_COUNT=8 # minimum number of chars for the secret to be + # assumed valid + +umask 0077 # 0600 file permission for secrets +install -d -m 0700 /opt/crc # ensure that the target directory exists + +PULL_SECRETS_KEY=${1:-} +KUBEADM_PASS_KEY=${2:-} +DEVELOPER_PASS_KEY=${3:-} + +if [[ -z "$PULL_SECRETS_KEY" || -z "$KUBEADM_PASS_KEY" || -z "$DEVELOPER_PASS_KEY" ]]; then + echo "ERROR: expected to receive 3 parameters: PULL_SECRETS_KEY KUBEADM_PASS_KEY DEVELOPER_PASS_KEY" + exit 1 +fi + +SECONDS=0 +podman pull --quiet "$AWS_CLI_IMG" +echo "Took $SECONDS seconds to pull the $AWS_CLI_IMG" + +wait_imds_available_and_get_region() { + total_timeout_minutes=5 + retry_interval_seconds=5 + + IMDS_TOKEN_COMMAND=( + curl + --connect-timeout 1 + -X PUT + "http://169.254.169.254/latest/api/token" + -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" + -Ssf + ) + success=false + deadline=$(( $(date +%s) + (total_timeout_minutes * 60) )) + while [[ $(date +%s) -lt $deadline ]]; do + # By placing the command in an 'if' condition, we can test its exit code + # without triggering 'set -e'.
The output is still captured. + if TOKEN=$("${IMDS_TOKEN_COMMAND[@]}"); then + # This block only runs if the curl command succeeds (exit code 0) + success=true + echo "Successfully fetched token." >&2 + break # Exit the loop on success + fi + + # This block runs if the curl command fails + echo "Failed to connect. Retrying in $retry_interval_seconds seconds..." >&2 + sleep "$retry_interval_seconds" + done + + if [[ "$success" != "true" ]]; then + echo "ERROR: Could not fetch token after $total_timeout_minutes minutes." >&2 + return 1 + fi + + # Then, use the token to get the region + echo "Fetching the AWS region ..." + curl -Ssf -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/meta-data/placement/region > /tmp/aws-region + echo >> /tmp/aws-region # add EOL at EOF, for consistency + echo "AWS region: $(< /tmp/aws-region)" +} + +( + set +x # disable the xtrace as the token would be leaked + echo "Waiting for the AWS IMDS service to be available ..." + SECONDS=0 + wait_imds_available_and_get_region + echo "Took $SECONDS for the IMDS service to become available." +) + +missing_secrets=0 + +save_secret() { + name=$1 + key=$2 + dest=$3 + + # --log-driver=none avoids that the journal captures the stdout + # logs of podman and leaks the passwords in the journal ... + if ! podman run \ + --name "cloud-init-fetch-$name" \ + --env AWS_REGION="$(< /tmp/aws-region)" \ + --rm \ + "$AWS_CLI_IMG" \ + ssm get-parameter \ + --name "$key" \ + --with-decryption \ + --query "Parameter.Value" \ + --output text \ + > "${dest}.tmp" + then + rm -f "${dest}.tmp" + echo "ERROR: failed to get the '$name' secret ... (fetched from $key)" + ((missing_secrets += 1)) + return + fi + char_count=$(wc -c < "${dest}.tmp") + if (( char_count < MIN_CHAR_COUNT )); then + echo "ERROR: the content of the '$name' secret is too short ... 
(fetched from $key)" + rm -f "${dest}.tmp" + ((missing_secrets += 1)) + return + fi + + mv "${dest}.tmp" "${dest}" # atomic creation of the file +} + +save_secret "pull-secrets" "$PULL_SECRETS_KEY" /opt/crc/pull-secret +save_secret "kubeadmin-pass" "$KUBEADM_PASS_KEY" /opt/crc/pass_kubeadmin +save_secret "developer-pass" "$DEVELOPER_PASS_KEY" /opt/crc/pass_developer + +if (( missing_secrets != 0 )); then + echo "ERROR: failed to fetch $missing_secrets secrets ..." + exit 1 +fi + +exit 0 From b4e1a1b2b511848c1336fc40f95fc227c9d24f71 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 6 Oct 2025 17:06:34 +0200 Subject: [PATCH 26/44] systemd: let systemd enforce that the mandatory secrets files exist This commit moves the definition of the secret file locations from the scripts to the SystemD unit. This way, SystemD can enforce that the files exist before launching the relevant services. --- systemd/crc-pullsecret.service | 3 ++- systemd/crc-pullsecret.sh | 3 ++- systemd/ocp-custom-domain.service | 3 ++- systemd/ocp-custom-domain.sh | 8 +++++++- systemd/ocp-userpasswords.service | 4 +++- systemd/ocp-userpasswords.sh | 12 ++++++++++-- 6 files changed, 26 insertions(+), 7 deletions(-) diff --git a/systemd/crc-pullsecret.service b/systemd/crc-pullsecret.service index 0baece570..8781a0bf1 100644 --- a/systemd/crc-pullsecret.service +++ b/systemd/crc-pullsecret.service @@ -12,7 +12,8 @@ Restart=on-failure RestartSec=40 Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh -ExecStart=/usr/local/bin/crc-pullsecret.sh +ExecStartPre=/usr/bin/test -f /opt/crc/pull-secret +ExecStart=/usr/local/bin/crc-pullsecret.sh /opt/crc/pull-secret ExecStartPost=-touch /opt/crc/%n.done [Install] diff --git a/systemd/crc-pullsecret.sh b/systemd/crc-pullsecret.sh index 74f62ac3f..0b636a67b 100644 --- a/systemd/crc-pullsecret.sh +++ b/systemd/crc-pullsecret.sh @@ -8,7 +8,7 @@ set -x source /usr/local/bin/crc-systemd-common.sh 
-PULL_SECRETS_FILE="/opt/crc/pull-secret" +PULL_SECRETS_FILE="${1:-}" wait_for_resource_or_die secret @@ -28,6 +28,7 @@ fi echo "Cluster doesn't have the pull secrets. Setting them from $PULL_SECRETS_FILE ..." +# enforced by systemd if [[ ! -r "$PULL_SECRETS_FILE" ]]; then echo "ERROR: $PULL_SECRETS_FILE is missing or unreadable" 1>&2 diff --git a/systemd/ocp-custom-domain.service b/systemd/ocp-custom-domain.service index ab2cc2f72..5fb41e38a 100644 --- a/systemd/ocp-custom-domain.service +++ b/systemd/ocp-custom-domain.service @@ -6,6 +6,7 @@ After=ocp-wait-apiservices-available.service StartLimitIntervalSec=450 StartLimitBurst=10 ConditionPathExists=!/opt/crc/%n.done +AssertPathExists=/opt/crc/eip [Service] Type=oneshot @@ -13,7 +14,7 @@ Restart=on-failure RestartSec=40 Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh -ExecStart=/usr/local/bin/ocp-custom-domain.sh +ExecStart=/usr/local/bin/ocp-custom-domain.sh /opt/crc/eip ExecStartPost=-touch /opt/crc/%n.done [Install] diff --git a/systemd/ocp-custom-domain.sh b/systemd/ocp-custom-domain.sh index 6b706a3dc..023df73b5 100644 --- a/systemd/ocp-custom-domain.sh +++ b/systemd/ocp-custom-domain.sh @@ -8,8 +8,14 @@ set -x source /usr/local/bin/crc-systemd-common.sh -CRC_EXTERNAL_IP_FILE_PATH=/opt/crc/eip +CRC_EXTERNAL_IP_FILE_PATH="${1:-}" +if [[ -z "$CRC_EXTERNAL_IP_FILE_PATH" ]]; then + echo "ERROR: expected to receive the external IP file as first argument ..." >&2 + exit 1 +fi + +# enforced by systemd if [[ ! 
-r "$CRC_EXTERNAL_IP_FILE_PATH" ]]; then echo "ERROR: CRC external ip file not found ($CRC_EXTERNAL_IP_FILE_PATH)" >&2 exit 1 diff --git a/systemd/ocp-userpasswords.service b/systemd/ocp-userpasswords.service index 5d3d61ad6..e16bf311b 100644 --- a/systemd/ocp-userpasswords.service +++ b/systemd/ocp-userpasswords.service @@ -13,7 +13,9 @@ Restart=on-failure RestartSec=40 Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh -ExecStart=/usr/local/bin/ocp-userpasswords.sh +ExecStartPre=/usr/bin/test -f /opt/crc/pass_developer +ExecStartPre=/usr/bin/test -f /opt/crc/pass_kubeadmin +ExecStart=/usr/local/bin/ocp-userpasswords.sh /opt/crc/pass_kubeadmin /opt/crc/pass_developer ExecStartPost=-touch /opt/crc/%n.done [Install] diff --git a/systemd/ocp-userpasswords.sh b/systemd/ocp-userpasswords.sh index 2adeed8be..3a80cd853 100644 --- a/systemd/ocp-userpasswords.sh +++ b/systemd/ocp-userpasswords.sh @@ -8,8 +8,14 @@ set -x source /usr/local/bin/crc-systemd-common.sh -CRC_PASS_DEVELOPER_PATH=/opt/crc/pass_developer -CRC_PASS_KUBEADMIN_PATH=/opt/crc/pass_kubeadmin +CRC_PASS_KUBEADMIN_PATH=${1:-} +CRC_PASS_DEVELOPER_PATH=${2:-} + +if [[ -z "$CRC_PASS_KUBEADMIN_PATH" || -z "$CRC_PASS_DEVELOPER_PATH" ]]; then + echo "ERROR: expected to receive the kubeadmin password file as 1st arg and the dev password file as 2nd arg. Got '$CRC_PASS_KUBEADMIN_PATH' and '$CRC_PASS_DEVELOPER_PATH'" + exit 1 +fi + CRC_HTPASSWD_IMAGE=registry.access.redhat.com/ubi10/httpd-24 function gen_htpasswd() { @@ -21,11 +27,13 @@ function gen_htpasswd() { podman run --rm "$CRC_HTPASSWD_IMAGE" htpasswd -nb "$1" "$2" } +# enforced by systemd if [[ ! -r "$CRC_PASS_DEVELOPER_PATH" ]]; then echo "ERROR: CRC developer password does not exist ($CRC_PASS_DEVELOPER_PATH)" exit 1 fi +# enforced by systemd if [[ ! 
-r "$CRC_PASS_KUBEADMIN_PATH" ]]; then echo "ERROR: CRC kubeadmin password does not exist ($CRC_PASS_KUBEADMIN_PATH)" exit 1 From e793cbf626d8f737c03177b0c88edbd5028069c5 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 6 Oct 2025 21:41:21 +0200 Subject: [PATCH 27/44] systemd: prevent podman from leaking passwords in the journal A review of the systemd journal logs of the different services highlighted that the SystemD journal captures information about the Podman containers via a Podman-internal logging mechanism. This commit disables the logging mechanism to the containers handling secrets. --- systemd/crc-aws-fetch-secrets.sh | 1 + systemd/ocp-userpasswords.sh | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/systemd/crc-aws-fetch-secrets.sh b/systemd/crc-aws-fetch-secrets.sh index fdc3dd947..8d0174d43 100644 --- a/systemd/crc-aws-fetch-secrets.sh +++ b/systemd/crc-aws-fetch-secrets.sh @@ -89,6 +89,7 @@ save_secret() { if ! podman run \ --name "cloud-init-fetch-$name" \ --env AWS_REGION="$(< /tmp/aws-region)" \ + --log-driver=none \ --rm \ "$AWS_CLI_IMG" \ ssm get-parameter \ diff --git a/systemd/ocp-userpasswords.sh b/systemd/ocp-userpasswords.sh index 3a80cd853..f3e508430 100644 --- a/systemd/ocp-userpasswords.sh +++ b/systemd/ocp-userpasswords.sh @@ -24,7 +24,9 @@ function gen_htpasswd() { return 1 fi - podman run --rm "$CRC_HTPASSWD_IMAGE" htpasswd -nb "$1" "$2" + # --log-driver=none avoids that the journal captures the stdout + # logs of podman and leaks the passwords in the journal ... 
+ podman run --log-driver=none --rm "$CRC_HTPASSWD_IMAGE" htpasswd -nb "$1" "$2" } # enforced by systemd From f902337097de36b8a7eaae64e7e97f07fe3d127c Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Mon, 6 Oct 2025 21:41:51 +0200 Subject: [PATCH 28/44] systemd: log the wait durations To make the CRC boot timing easy to review at a glance, this commit adds a simple timing measurement, based on Bash's `SECONDS` special variable (which automatically tracks the time elapsed since its `SECONDS=0` reset). For more precise time tracking, refer to the journal timestamps of the services. --- systemd/crc-systemd-common.sh | 3 +++ systemd/crc-wait-apiserver-up.sh | 4 +++- systemd/crc-wait-node-ready.sh | 7 ++++--- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/systemd/crc-systemd-common.sh b/systemd/crc-systemd-common.sh index c384eb39d..583ad22ad 100644 --- a/systemd/crc-systemd-common.sh +++ b/systemd/crc-systemd-common.sh @@ -16,6 +16,9 @@ function wait_for_resource_or_die() { exit 1 # this is wait_for_resource_or_die, so die ... fi + local start_time + start_time=$(date +%s) + # Loop from 1 up to max_retry for (( retry=1; retry<=max_retry; retry++ )); do # Try the command. If it succeeds, exit the loop. diff --git a/systemd/crc-wait-apiserver-up.sh b/systemd/crc-wait-apiserver-up.sh index 209592f48..25bfe8b2e 100644 --- a/systemd/crc-wait-apiserver-up.sh +++ b/systemd/crc-wait-apiserver-up.sh @@ -7,11 +7,13 @@ set -o errtrace source /usr/local/bin/crc-systemd-common.sh +SECONDS=0 + echo "Waiting for the node resource to be available ..." # $1 resource, $2 retry count, $3 wait time wait_for_resource_or_die node 60 5 -echo "node resource available, APIServer is ready." +echo "node resource available, APIServer is ready after $SECONDS seconds."
echo "All done" diff --git a/systemd/crc-wait-node-ready.sh b/systemd/crc-wait-node-ready.sh index 0e3d43380..dd3d59d65 100644 --- a/systemd/crc-wait-node-ready.sh +++ b/systemd/crc-wait-node-ready.sh @@ -7,6 +7,7 @@ set -o errtrace source /usr/local/bin/crc-systemd-common.sh +SECONDS=0 MAX_RETRY=150 WAIT_SEC=2 NODE_NAME=node/crc @@ -17,7 +18,7 @@ for retry in $(seq 1 "$MAX_RETRY"); do # Check if the node status is "Ready" if [[ $node_status == "Ready" ]]; then - echo "CRC node is ready." + echo "CRC node is ready after $SECONDS seconds." exit 0 fi @@ -25,12 +26,12 @@ for retry in $(seq 1 "$MAX_RETRY"); do # If it's the last attempt, log a failure message before exiting if (( retry == MAX_RETRY )); then - echo "Error: Timed out waiting for the CRC node to be ready after $MAX_RETRY attempts x $WAIT_SEC seconds." >&2 + echo "ERROR: Timed out waiting for the CRC node to be ready after $MAX_RETRY attempts x $WAIT_SEC seconds." >&2 exit 1 fi # Wait before the next attempt - echo "Waiting for crc node to be ready ... (Attempt ${retry}/${MAX_RETRY})" + echo "Waiting $WAIT_SEC seconds for crc node to be ready ... (Attempt ${retry}/${MAX_RETRY})" sleep "$WAIT_SEC" done From 48a1a2c9c385053a841f295829af61248f84071b Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 7 Oct 2025 12:03:42 +0200 Subject: [PATCH 29/44] createdisk-library: add the ability to upload `unit-name.service.d` config-overwrite directories SystemD allows overwriting the definition of services by writing new properties in the `unit-name.service.d/override.conf` files. This commit allows the CRC image creation script to properly upload these files and directories to the VM image.
--- createdisk-library.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/createdisk-library.sh b/createdisk-library.sh index 05b08405b..b55e3a419 100755 --- a/createdisk-library.sh +++ b/createdisk-library.sh @@ -410,6 +410,7 @@ function copy_systemd_units() { ${SSH} core@${VM_IP} -- 'mkdir -p /home/core/systemd-units && mkdir -p /home/core/systemd-scripts' ${SCP} systemd/crc-*.service core@${VM_IP}:/home/core/systemd-units/ ${SCP} systemd/crc-*.target core@${VM_IP}:/home/core/systemd-units/ + ${SCP} -r systemd/*.d core@${VM_IP}:/home/core/systemd-units/ ${SCP} systemd/crc-*.sh core@${VM_IP}:/home/core/systemd-scripts/ case "${BUNDLE_TYPE}" in @@ -419,7 +420,7 @@ function copy_systemd_units() { ;; esac - ${SSH} core@${VM_IP} -- 'sudo cp /home/core/systemd-units/* /etc/systemd/system/ && sudo cp /home/core/systemd-scripts/* /usr/local/bin/' + ${SSH} core@${VM_IP} -- 'sudo cp -r /home/core/systemd-units/* /etc/systemd/system/ && sudo cp /home/core/systemd-scripts/* /usr/local/bin/' ${SSH} core@${VM_IP} -- 'ls /home/core/systemd-scripts/ | xargs -t -I % sudo chmod +x /usr/local/bin/%' ${SSH} core@${VM_IP} -- 'sudo restorecon -rv /usr/local/bin' From 408328b2450c24240e8220ace5123186dd4e12f9 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 7 Oct 2025 12:04:02 +0200 Subject: [PATCH 30/44] ovs-configuration.service.d/mute-console.conf: mute the journal logs This override prevents the `ovs-configuration.service` from logging its xtrace execution into the journal and the console. This service is very verbose, and makes the console impossible to follow in real-time. Instead, its output is logged in a `/var/log` file. 
--- systemd/ovs-configuration.service.d/mute-console.conf | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 systemd/ovs-configuration.service.d/mute-console.conf diff --git a/systemd/ovs-configuration.service.d/mute-console.conf b/systemd/ovs-configuration.service.d/mute-console.conf new file mode 100644 index 000000000..749ed6d5d --- /dev/null +++ b/systemd/ovs-configuration.service.d/mute-console.conf @@ -0,0 +1,3 @@ +[Service] +StandardOutput=append:/var/log/ovs-configure.log +StandardError=append:/var/log/ovs-configure.log From cef8ef706b5675a8c2d2e78649bded3688349391 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 7 Oct 2025 14:55:19 +0200 Subject: [PATCH 31/44] crc-pullsecret.service: retry more often This commit reduces the Restart duration of the service. The SystemD dependencies should already avoid any failure of the script. --- systemd/crc-pullsecret.service | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/systemd/crc-pullsecret.service b/systemd/crc-pullsecret.service index 8781a0bf1..bc2203a58 100644 --- a/systemd/crc-pullsecret.service +++ b/systemd/crc-pullsecret.service @@ -3,13 +3,13 @@ Description=CRC Unit for adding pull secret to cluster After=crc-env-file-exists.service After=crc-wait-apiserver-up.service StartLimitIntervalSec=450 -StartLimitBurst=10 +StartLimitBurst=40 ConditionPathExists=!/opt/crc/%n.done [Service] Type=oneshot Restart=on-failure -RestartSec=40 +RestartSec=10 Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ExecStartPre=/usr/bin/test -f /opt/crc/pull-secret From cd2b395abd3a9b969a699c6abee39b4cc24375d7 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 7 Oct 2025 15:28:52 +0200 Subject: [PATCH 32/44] crc-pullsecret.service: only run after cloud-final.service Add a dependency on the `cloud-final.service`, to be sure that the pull-secrets have been pulled when the service starts. 
--- systemd/crc-pullsecret.service | 1 + 1 file changed, 1 insertion(+) diff --git a/systemd/crc-pullsecret.service b/systemd/crc-pullsecret.service index bc2203a58..da313824e 100644 --- a/systemd/crc-pullsecret.service +++ b/systemd/crc-pullsecret.service @@ -2,6 +2,7 @@ Description=CRC Unit for adding pull secret to cluster After=crc-env-file-exists.service After=crc-wait-apiserver-up.service +After=cloud-final.service StartLimitIntervalSec=450 StartLimitBurst=40 ConditionPathExists=!/opt/crc/%n.done From b888671b270b2cbee2c552d633cd05298d3ce350 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 7 Oct 2025 15:29:02 +0200 Subject: [PATCH 33/44] ocp-userpasswords.service: only run after cloud-final.service Add a dependency on the `cloud-final.service` to be sure that the CRC passwords have been fetched before starting. --- systemd/ocp-userpasswords.service | 1 + 1 file changed, 1 insertion(+) diff --git a/systemd/ocp-userpasswords.service b/systemd/ocp-userpasswords.service index e16bf311b..9eda0cc50 100644 --- a/systemd/ocp-userpasswords.service +++ b/systemd/ocp-userpasswords.service @@ -3,6 +3,7 @@ Description=CRC Unit setting the developer and kubeadmin user password Before=ocp-cluster-ca.service After=crc-env-file-exists.service After=crc-wait-apiserver-up.service +After=cloud-final.service StartLimitIntervalSec=450 StartLimitBurst=10 ConditionPathExists=!/opt/crc/%n.done From dd23709dba4e55c073988dc89e0b208ff8b7f689 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 7 Oct 2025 16:18:52 +0200 Subject: [PATCH 34/44] crc-custom.target: reformulate the dependencies Reformulate the dependencies of the `crc-custom.target` to avoid startup deadlocks. Load `crc-custom.target` as a dependency of the `kubelet.service`. 
--- systemd/crc-custom.target | 8 ++++---- systemd/kubelet.service.d/wants-crc-custom.conf | 3 +++ 2 files changed, 7 insertions(+), 4 deletions(-) create mode 100644 systemd/kubelet.service.d/wants-crc-custom.conf diff --git a/systemd/crc-custom.target b/systemd/crc-custom.target index cffb4d5b9..8fa380f48 100644 --- a/systemd/crc-custom.target +++ b/systemd/crc-custom.target @@ -1,5 +1,5 @@ [Unit] -Description=crc custom target -Requires=kubelet-dependencies.target -Requires=crc-env-file-exists.service -After=kubelet-dependencies.target +Description=CRC custom target +Requires=crc-wait-apiserver-up.service +Requires=crc-cluster-status.service +After=crc-wait-apiserver-up.service crc-cluster-status.service diff --git a/systemd/kubelet.service.d/wants-crc-custom.conf b/systemd/kubelet.service.d/wants-crc-custom.conf new file mode 100644 index 000000000..be4b777c2 --- /dev/null +++ b/systemd/kubelet.service.d/wants-crc-custom.conf @@ -0,0 +1,3 @@ +[Unit] +Wants=crc-custom.target +Before=crc-custom.target From 701d5904512b356555108f9e1cd7282c452c2c93 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 7 Oct 2025 16:43:53 +0200 Subject: [PATCH 35/44] ocp-mco-sshkey.service: ensure that the pubkey is there before starting Ensure that the pub key has been fetched before starting the service. 
--- systemd/ocp-mco-sshkey.service | 4 +++- systemd/ocp-mco-sshkey.sh | 8 +++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/systemd/ocp-mco-sshkey.service b/systemd/ocp-mco-sshkey.service index f2d66c21b..5ac8f4c6e 100644 --- a/systemd/ocp-mco-sshkey.service +++ b/systemd/ocp-mco-sshkey.service @@ -2,6 +2,7 @@ Description=CRC Unit patching the MachineConfig to add new ssh key After=crc-env-file-exists.service After=crc-wait-apiserver-up.service +After=cloud-final.service StartLimitIntervalSec=450 StartLimitBurst=10 @@ -11,7 +12,8 @@ Restart=on-failure RestartSec=40 Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh -ExecStart=/usr/local/bin/ocp-mco-sshkey.sh +ExecStartPre=/usr/bin/test -f /opt/crc/id_rsa.pub +ExecStart=/usr/local/bin/ocp-mco-sshkey.sh /opt/crc/id_rsa.pub RemainAfterExit=true [Install] diff --git a/systemd/ocp-mco-sshkey.sh b/systemd/ocp-mco-sshkey.sh index 00a90ed64..49b9c5c6b 100644 --- a/systemd/ocp-mco-sshkey.sh +++ b/systemd/ocp-mco-sshkey.sh @@ -8,8 +8,14 @@ set -x source /usr/local/bin/crc-systemd-common.sh -CRC_PUB_KEY_PATH="/opt/crc/id_rsa.pub" +CRC_PUB_KEY_PATH="${1:-}" +if [[ -z "$CRC_PUB_KEY_PATH" ]]; then + echo "ERROR: expected to receive the path to the pub key file as first argument." + exit 1 +fi + +# enforced by systemd if [[ ! -r "$CRC_PUB_KEY_PATH" ]]; then echo "ERROR: CRC pubkey file does not exist ($CRC_PUB_KEY_PATH)" exit 1 From dd8cfe37323731db90b6843cca7f1103cb45dfea Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 7 Oct 2025 16:43:53 +0200 Subject: [PATCH 36/44] ocp-mco-sshkey.sh: don't expose the pub key to the journal Better use of `jq` to ensure that the public key isn't exposed in the journal logs. Exposing a public key isn't a security leak, but better avoid disclosing it as a good practice. 
--- systemd/ocp-mco-sshkey.sh | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/systemd/ocp-mco-sshkey.sh b/systemd/ocp-mco-sshkey.sh index 49b9c5c6b..31bb5bae8 100644 --- a/systemd/ocp-mco-sshkey.sh +++ b/systemd/ocp-mco-sshkey.sh @@ -24,9 +24,12 @@ fi wait_for_resource_or_die machineconfig/99-master-ssh echo "Updating the public key resource for machine config operator" -pub_key=$(cat "$CRC_PUB_KEY_PATH" | tr -d '\n\r') -jq -n --arg key "${pub_key}" ' +# Use --rawfile to read the key file directly into a jq variable named 'pub_key'. +# The key's content is never exposed as a command-line argument. +# We use jq's rtrimstr function to remove any trailing newlines from the file. + +jq -n --rawfile pub_key "$CRC_PUB_KEY_PATH" ' { "spec": { "config": { @@ -34,7 +37,10 @@ jq -n --arg key "${pub_key}" ' "users": [ { "name": "core", - "sshAuthorizedKeys": [ $key ] + "sshAuthorizedKeys": [ + # Trim trailing newlines and carriage returns from the slurped file content + $pub_key | rtrimstr("\n") | rtrimstr("\r") + ] } ] } From ed4fc5fec3df1066e25fb1c1e0b415107559c515 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 14 Oct 2025 21:48:26 +0200 Subject: [PATCH 37/44] ocp-custom-domain.service: don't use AssertPathExists `AssertPathExists` is checked before the `ExecCondition` shown below is tested. Use an `ExecStartPre` directive instead.
``` ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh ``` --- systemd/ocp-custom-domain.service | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/systemd/ocp-custom-domain.service b/systemd/ocp-custom-domain.service index 5fb41e38a..b879410c6 100644 --- a/systemd/ocp-custom-domain.service +++ b/systemd/ocp-custom-domain.service @@ -6,7 +6,6 @@ After=ocp-wait-apiservices-available.service StartLimitIntervalSec=450 StartLimitBurst=10 ConditionPathExists=!/opt/crc/%n.done -AssertPathExists=/opt/crc/eip [Service] Type=oneshot @@ -14,6 +13,7 @@ Restart=on-failure RestartSec=40 Environment=KUBECONFIG=/opt/kubeconfig ExecCondition=/usr/local/bin/crc-self-sufficient-env.sh +ExecStartPre=/usr/bin/test -f /opt/crc/eip ExecStart=/usr/local/bin/ocp-custom-domain.sh /opt/crc/eip ExecStartPost=-touch /opt/crc/%n.done From 44a96b542f642d927e7c249c91a6f7c172420502 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 15 Oct 2025 09:37:23 +0200 Subject: [PATCH 38/44] systemd: remove the dependency on crc-env-file-exists.service --- systemd/crc-check-tap.service | 2 -- systemd/crc-cluster-status.service | 1 - systemd/crc-dnsmasq.service | 1 - systemd/crc-env-file-exists.service | 20 -------------------- systemd/crc-pullsecret.service | 1 - systemd/crc-routes-controller.service | 1 - systemd/crc-wait-apiserver-up.service | 1 - systemd/crc-wait-node-ready.service | 1 - systemd/ocp-clusterid.service | 1 - systemd/ocp-custom-domain.service | 1 - systemd/ocp-mco-sshkey.service | 1 - systemd/ocp-userpasswords.service | 1 - 12 files changed, 32 deletions(-) delete mode 100644 systemd/crc-env-file-exists.service diff --git a/systemd/crc-check-tap.service b/systemd/crc-check-tap.service index 46c5e3a24..473e77a29 100644 --- a/systemd/crc-check-tap.service +++ b/systemd/crc-check-tap.service @@ -3,9 +3,7 @@ Description=Ensure that tap0 network configuration is disabled when not necessar Before=NetworkManager.service Before=gv-user-network@tap0.service 
After=local-fs.target -After=crc-env-file-exists.service RequiresMountsFor=/etc/NetworkManager/system-connections -Requires=crc-env-file-exists.service [Service] Type=oneshot diff --git a/systemd/crc-cluster-status.service b/systemd/crc-cluster-status.service index 4bd12abd2..fd7f70b19 100644 --- a/systemd/crc-cluster-status.service +++ b/systemd/crc-cluster-status.service @@ -1,6 +1,5 @@ [Unit] Description=CRC Unit checking if cluster is ready -After=crc-env-file-exists.service After=crc-wait-apiserver-up.service crc-pullsecret.service After=ocp-mco-sshkey.service ocp-cluster-ca.service After=ocp-custom-domain.service ocp-userpasswords.service diff --git a/systemd/crc-dnsmasq.service b/systemd/crc-dnsmasq.service index a01107210..a53f2b320 100644 --- a/systemd/crc-dnsmasq.service +++ b/systemd/crc-dnsmasq.service @@ -1,7 +1,6 @@ [Unit] Description=CRC Unit for configuring dnsmasq Wants=ovs-configuration.service -After=crc-env-file-exists.service After=ovs-configuration.service Before=kubelet-dependencies.target StartLimitIntervalSec=30 diff --git a/systemd/crc-env-file-exists.service b/systemd/crc-env-file-exists.service deleted file mode 100644 index 35a9fb344..000000000 --- a/systemd/crc-env-file-exists.service +++ /dev/null @@ -1,20 +0,0 @@ -[Unit] -Description=Wait for /etc/sysconfig/crc-env file to be populated - -[Service] -# This service runs a command once and then exits. -Type=oneshot - -# This is the magic part. It keeps the service in an 'active' state -# after the command exits, so other services can see it succeeded. -RemainAfterExit=yes - -# This is the command that waits for the file. -# It checks every second if the file does not exist ('! -f'). -# Once the file is found, the loop exits, the command succeeds, and the service is 'active'. -ExecStart=/bin/sh -c 'while [ ! -f /etc/sysconfig/crc-env ]; do sleep 1; done' -TimeoutStartSec=300 - -[Install] -# Ensure this service is started during the normal boot process. 
-WantedBy=crc-custom.target diff --git a/systemd/crc-pullsecret.service b/systemd/crc-pullsecret.service index da313824e..4523549f0 100644 --- a/systemd/crc-pullsecret.service +++ b/systemd/crc-pullsecret.service @@ -1,6 +1,5 @@ [Unit] Description=CRC Unit for adding pull secret to cluster -After=crc-env-file-exists.service After=crc-wait-apiserver-up.service After=cloud-final.service StartLimitIntervalSec=450 diff --git a/systemd/crc-routes-controller.service b/systemd/crc-routes-controller.service index 869b4ab95..e73f71100 100644 --- a/systemd/crc-routes-controller.service +++ b/systemd/crc-routes-controller.service @@ -1,6 +1,5 @@ [Unit] Description=CRC Unit starting routes controller -After=crc-env-file-exists.service After=crc-wait-apiserver-up.service StartLimitIntervalSec=450 StartLimitBurst=10 diff --git a/systemd/crc-wait-apiserver-up.service b/systemd/crc-wait-apiserver-up.service index 9ab3e168c..78ee273c9 100644 --- a/systemd/crc-wait-apiserver-up.service +++ b/systemd/crc-wait-apiserver-up.service @@ -2,7 +2,6 @@ Description=CRC Unit waiting till k8s API server is up Requires=kubelet.service After=kubelet.service -After=crc-env-file-exists.service Before=ocp-delete-mco-leases.service [Service] diff --git a/systemd/crc-wait-node-ready.service b/systemd/crc-wait-node-ready.service index 6daf0472d..facefe55c 100644 --- a/systemd/crc-wait-node-ready.service +++ b/systemd/crc-wait-node-ready.service @@ -2,7 +2,6 @@ Description=CRC Unit waiting till k8s node is ready Requires=kubelet.service After=kubelet.service -After=crc-env-file-exists.service After=crc-wait-apiserver-up.service StartLimitIntervalSec=450 StartLimitBurst=10 diff --git a/systemd/ocp-clusterid.service b/systemd/ocp-clusterid.service index 2e6ad70f3..d9909f29c 100644 --- a/systemd/ocp-clusterid.service +++ b/systemd/ocp-clusterid.service @@ -1,6 +1,5 @@ [Unit] Description=CRC Unit setting random cluster ID -After=crc-env-file-exists.service After=crc-wait-apiserver-up.service 
StartLimitIntervalSec=450 StartLimitBurst=10 diff --git a/systemd/ocp-custom-domain.service b/systemd/ocp-custom-domain.service index b879410c6..db19d0fa2 100644 --- a/systemd/ocp-custom-domain.service +++ b/systemd/ocp-custom-domain.service @@ -1,6 +1,5 @@ [Unit] Description=CRC Unit setting nip.io domain for cluster -After=crc-env-file-exists.service After=crc-wait-apiserver-up.service After=ocp-wait-apiservices-available.service StartLimitIntervalSec=450 diff --git a/systemd/ocp-mco-sshkey.service b/systemd/ocp-mco-sshkey.service index 5ac8f4c6e..94ea9c203 100644 --- a/systemd/ocp-mco-sshkey.service +++ b/systemd/ocp-mco-sshkey.service @@ -1,6 +1,5 @@ [Unit] Description=CRC Unit patching the MachineConfig to add new ssh key -After=crc-env-file-exists.service After=crc-wait-apiserver-up.service After=cloud-final.service StartLimitIntervalSec=450 diff --git a/systemd/ocp-userpasswords.service b/systemd/ocp-userpasswords.service index 9eda0cc50..30919f51c 100644 --- a/systemd/ocp-userpasswords.service +++ b/systemd/ocp-userpasswords.service @@ -1,7 +1,6 @@ [Unit] Description=CRC Unit setting the developer and kubeadmin user password Before=ocp-cluster-ca.service -After=crc-env-file-exists.service After=crc-wait-apiserver-up.service After=cloud-final.service StartLimitIntervalSec=450 From f0ca4de541dc9c2ddb5b88c54b28f112aaf1519b Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 15 Oct 2025 09:59:56 +0200 Subject: [PATCH 39/44] systemd: update the self-sufficient/user-mode tests to avoid relying on the env file --- createdisk-library.sh | 1 + systemd/crc-self-sufficient-env.sh | 34 +++++++++++----- systemd/crc-test-vsock.py | 63 +++++++++++++++++++++++++++++ systemd/crc-user-mode-networking.sh | 2 +- 4 files changed, 90 insertions(+), 10 deletions(-) create mode 100644 systemd/crc-test-vsock.py diff --git a/createdisk-library.sh b/createdisk-library.sh index b55e3a419..50ff1d3f9 100755 --- a/createdisk-library.sh +++ b/createdisk-library.sh @@ -412,6 +412,7 @@ 
function copy_systemd_units() { ${SCP} systemd/crc-*.target core@${VM_IP}:/home/core/systemd-units/ ${SCP} -r systemd/*.d core@${VM_IP}:/home/core/systemd-units/ ${SCP} systemd/crc-*.sh core@${VM_IP}:/home/core/systemd-scripts/ + ${SCP} systemd/crc-*.py core@${VM_IP}:/home/core/systemd-scripts/ case "${BUNDLE_TYPE}" in "snc"|"okd") diff --git a/systemd/crc-self-sufficient-env.sh b/systemd/crc-self-sufficient-env.sh index a19f7c7d4..4aa61ad4b 100644 --- a/systemd/crc-self-sufficient-env.sh +++ b/systemd/crc-self-sufficient-env.sh @@ -1,16 +1,32 @@ #!/bin/bash +# set -o errexit disabled to capture the test return code set -o pipefail -set -o errexit set -o nounset set -o errtrace -source /etc/sysconfig/crc-env || echo "WARNING: crc-env not found" +TEST_TIMEOUT=120 +VSOCK_COMM_PORT=1024 -if (( ${CRC_SELF_SUFFICIENT:-0} == 1 )); then - echo "Running with a self-sufficient bundle" - exit 0 -else - echo "Not running in a self-sufficient bundle" - exit 1 -fi +timeout "$TEST_TIMEOUT" python3 /usr/local/bin/crc-test-vsock.py "$VSOCK_COMM_PORT" +returncode=$? 
+ +case "$returncode" in + 124) + echo "ERROR: vsock/${VSOCK_COMM_PORT} test timed out after $TEST_TIMEOUT seconds :/" >&2 + exit 124 + ;; + 1) + echo "vsock/${VSOCK_COMM_PORT} not working, running with a self-sufficient bundle" >&2 + exit 0 + ;; + 0) + echo "vsock/${VSOCK_COMM_PORT} works, not running with a self-sufficient bundle" >&2 + exit 1 + ;; + *) + echo "ERROR: unexpected return code from the vsock test: $returncode" >&2 + exit "$returncode" +esac + +# cannot be reached diff --git a/systemd/crc-test-vsock.py b/systemd/crc-test-vsock.py new file mode 100644 index 000000000..fb93a07ee --- /dev/null +++ b/systemd/crc-test-vsock.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 + +import socket +import sys +import time +import fcntl, struct +import os +import errno +import pathlib + +VSOCK_DEV = pathlib.Path("/dev/vsock") +HOST_CID = 2 # VMADDR_CID_HOST + +def main(): + if len(sys.argv) != 2: + print("ERROR: expected a vsock port number as first argument.") + raise SystemExit(errno.EINVAL) + + port = int(sys.argv[1]) + tries = 60 + while not VSOCK_DEV.exists(): + tries -= 1 + + if not tries: + print(f"ERROR: {VSOCK_DEV} didn't appear ...") + return errno.ENODEV + print(f"Waiting for {VSOCK_DEV} to appear ... 
({tries} tries left)") + time.sleep(1) + + print(f"Looking up the CID in {VSOCK_DEV}...") + with open(VSOCK_DEV, 'rb') as f: + r = fcntl.ioctl(f, socket.IOCTL_VM_SOCKETS_GET_LOCAL_CID, ' ') + cid = struct.unpack('I', r)[0] + print(f'Our vsock CID is {cid}.') + + s = socket.socket(socket.AF_VSOCK, socket.SOCK_STREAM) + + try: + s.connect((HOST_CID, port)) + except OSError as e: + + if e.errno in (errno.ENODEV, errno.ECONNREFUSED, errno.EHOSTUNREACH, errno.ETIMEDOUT, errno.ECONNRESET): + print(f"No remote host on vsock://{HOST_CID}:{port} ({e.strerror})") + s.close() + return 1 + + print(f"Unexpected error connecting vsock://{HOST_CID}:{port}: {e}") + s.close() + return 1 + + msg = b"hello" + s.sendall(msg) + + s.sendall(b"\n") + + s.close() + print(f"A remote host is listening on vsock://{HOST_CID}:{port}") + + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/systemd/crc-user-mode-networking.sh b/systemd/crc-user-mode-networking.sh index 109603e5e..c60b548f5 100644 --- a/systemd/crc-user-mode-networking.sh +++ b/systemd/crc-user-mode-networking.sh @@ -36,7 +36,7 @@ fi # no value --> error if [[ -z "${CRC_NETWORK_MODE_USER:-}" ]]; then echo "ERROR: CRC_NETWORK_MODE_USER not set. Assuming user networking." 
>&2 - exit "$EXIT_ERROR" + exit "$EXIT_USER_MODE" fi # value not in [0, 1] --> error From b4146c7e94a8d4d29fc9285ec375ac27dcebf80e Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 15 Oct 2025 12:31:13 +0200 Subject: [PATCH 40/44] crc-aws-fetch-secrets: try multiple times to get the secrets from the IMDS --- systemd/crc-aws-fetch-secrets.sh | 75 ++++++++++++++++---------------- 1 file changed, 38 insertions(+), 37 deletions(-) diff --git a/systemd/crc-aws-fetch-secrets.sh b/systemd/crc-aws-fetch-secrets.sh index 8d0174d43..e174398fe 100644 --- a/systemd/crc-aws-fetch-secrets.sh +++ b/systemd/crc-aws-fetch-secrets.sh @@ -24,14 +24,37 @@ if [[ -z "$PULL_SECRETS_KEY" || -z "$KUBEADM_PASS_KEY" || -z "$DEVELOPER_PASS_KE exit 1 fi +DELAY=5 +TOTAL_PERIOD=$(( 3*60 )) +ATTEMPTS=$(( TOTAL_PERIOD / DELAY)) +function retry_compact() { + for i in $(seq 1 $ATTEMPTS); do + # If the command succeeds (returns 0), exit the function with success. + if "$@"; then + echo "'$*' succeeded after $i attempts " + return 0 + fi + echo "'$*' still failing after $i/$ATTEMPTS attempts ..." + sleep "$DELAY" + done + echo "'$*' didn't succeed after $i attempt ..." + # If the loop finishes, the command never succeeded. + return 1 +} + +cleanup() { + rm -f /tmp/aws-region /opt/crc/pull-secret.tmp /opt/crc/pass_kubeadmin.tmp /opt/crc/pass_developer.tmp + echo "Temp files cleanup complete." 
+} + +# Cleanup happens automatically via trap on error or at script end +trap cleanup ERR EXIT + SECONDS=0 podman pull --quiet "$AWS_CLI_IMG" echo "Took $SECONDS seconds to pull the $AWS_CLI_IMG" -wait_imds_available_and_get_region() { - total_timeout_minutes=5 - retry_interval_seconds=5 - +check_imds_available_and_get_region() { IMDS_TOKEN_COMMAND=( curl --connect-timeout 1 @@ -40,25 +63,9 @@ wait_imds_available_and_get_region() { -H "X-aws-ec2-metadata-token-ttl-seconds: 21600" -Ssf ) - success=false - deadline=$(( $(date +%s) + (total_timeout_minutes * 60) )) - while [[ $(date +%s) -lt $deadline ]]; do - # By placing the command in an 'if' condition, we can test its exit code - # without triggering 'set -e'. The output is still captured. - if TOKEN=$("${IMDS_TOKEN_COMMAND[@]}"); then - # This block only runs if the curl command succeeds (exit code 0) - success=true - echo "Successfully fetched token." >&2 - break # Exit the loop on success - fi - # This block runs if the curl command fails - echo "Failed to connect. Retrying in $retry_interval_seconds seconds..." >&2 - sleep "$retry_interval_seconds" - done - - if [[ "$success" != "true" ]]; then - echo "ERROR: Could not fetch token after $total_timeout_minutes minutes." >&2 + if ! TOKEN=$("${IMDS_TOKEN_COMMAND[@]}"); then + echo "Couldn't fetch the token..." >&2 return 1 fi @@ -73,12 +80,10 @@ wait_imds_available_and_get_region() { set +x # disable the xtrace as the token would be leaked echo "Waiting for the AWS IMDS service to be available ..." SECONDS=0 - wait_imds_available_and_get_region + retry_compact check_imds_available_and_get_region echo "Took $SECONDS for the IMDS service to become available." ) -missing_secrets=0 - save_secret() { name=$1 key=$2 @@ -101,27 +106,23 @@ save_secret() { then rm -f "${dest}.tmp" echo "ERROR: failed to get the '$name' secret ... 
(fetched from $key)" - ((missing_secrets += 1)) - return + return 1 fi char_count=$(wc -c < "${dest}.tmp") if (( char_count < MIN_CHAR_COUNT )); then echo "ERROR: the content of the '$name' secret is too short ... (fetched from $key)" rm -f "${dest}.tmp" - ((missing_secrets += 1)) - return + return 1 fi mv "${dest}.tmp" "${dest}" # atomic creation of the file -} -save_secret "pull-secrets" "$PULL_SECRETS_KEY" /opt/crc/pull-secret -save_secret "kubeadmin-pass" "$KUBEADM_PASS_KEY" /opt/crc/pass_kubeadmin -save_secret "developer-pass" "$DEVELOPER_PASS_KEY" /opt/crc/pass_developer + return 0 +} -if (( missing_secrets != 0 )); then - echo "ERROR: failed to fetch $missing_secrets secrets ..." - exit 1 -fi +# execution will abort if 'retry_compact' fails. +retry_compact save_secret "pull-secrets" "$PULL_SECRETS_KEY" /opt/crc/pull-secret +retry_compact save_secret "kubeadmin-pass" "$KUBEADM_PASS_KEY" /opt/crc/pass_kubeadmin +retry_compact save_secret "developer-pass" "$DEVELOPER_PASS_KEY" /opt/crc/pass_developer exit 0 From 51412e2f3d5aafe19ed3c75d961e28a3888365ce Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 16 Oct 2025 10:45:09 +0200 Subject: [PATCH 41/44] systemd: crc-needs-tap.sh: skip self-sufficient/user-mode networking test Not working before the network is established --- systemd/crc-needs-tap.sh | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/systemd/crc-needs-tap.sh b/systemd/crc-needs-tap.sh index f12796bde..d05db66b0 100644 --- a/systemd/crc-needs-tap.sh +++ b/systemd/crc-needs-tap.sh @@ -29,16 +29,17 @@ fi virt="$(systemd-detect-virt || true)" -if [[ -z "$virt" ]]; then - echo "ERROR: systemd couldn't detect the virtualization :/" >&2 - exit "$EXIT_ERROR" -fi - -if [[ "${virt}" == apple ]] ; then +case "${virt}" in + apple) echo "Running with vfkit ($virt) virtualization. Don't need tap0." exit "$EXIT_DONT_NEED_TAP" -fi - -echo "Running with '$virt' virtualization. Need tap0." 
- -exit "$EXIT_NEED_TAP" + ;; + none) + echo "Bare metal detected. Don't need tap0." + exit "$EXIT_DONT_NEED_TAP" + ;; + *) + echo "Running with '$virt' virtualization. Need tap0." + exit "$EXIT_NEED_TAP" + ;; +esac From 25fca0bd99858691f6bfe43658425f37c21339a9 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Fri, 17 Oct 2025 11:40:40 +0200 Subject: [PATCH 42/44] crc-self-sufficient-env: better handling of missing /dev/vsock --- systemd/crc-self-sufficient-env.sh | 4 ++++ systemd/crc-test-vsock.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/systemd/crc-self-sufficient-env.sh b/systemd/crc-self-sufficient-env.sh index 4aa61ad4b..d783a56ca 100644 --- a/systemd/crc-self-sufficient-env.sh +++ b/systemd/crc-self-sufficient-env.sh @@ -12,6 +12,10 @@ timeout "$TEST_TIMEOUT" python3 /usr/local/bin/crc-test-vsock.py "$VSOCK_COMM_PO returncode=$? case "$returncode" in + 19) # ENODEV + echo "vsock device doesn't exist, not running self-sufficient bundle" >&2 + exit 1 + ;; 124) echo "ERROR: vsock/${VSOCK_COMM_PORT} test timed out after $TEST_TIMEOUT seconds :/" >&2 exit 124 diff --git a/systemd/crc-test-vsock.py b/systemd/crc-test-vsock.py index fb93a07ee..f8ae0a6b3 100644 --- a/systemd/crc-test-vsock.py +++ b/systemd/crc-test-vsock.py @@ -17,7 +17,7 @@ def main(): raise SystemExit(errno.EINVAL) port = int(sys.argv[1]) - tries = 60 + tries = 5 while not VSOCK_DEV.exists(): tries -= 1 From 6b3866a3fec465f554d23ebcd9c2235b982c9f80 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Fri, 17 Oct 2025 12:14:44 +0200 Subject: [PATCH 43/44] systemd: crc-needs-tap: only use the virt type to detect if TAP is needed --- systemd/crc-needs-tap.sh | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/systemd/crc-needs-tap.sh b/systemd/crc-needs-tap.sh index d05db66b0..ebe1b2c11 100644 --- a/systemd/crc-needs-tap.sh +++ b/systemd/crc-needs-tap.sh @@ -12,21 +12,6 @@ EXIT_NEED_TAP=0 EXIT_DONT_NEED_TAP=77 EXIT_ERROR=1 -if 
/usr/local/bin/crc-self-sufficient-env.sh; then - echo "Running a self-sufficient bundle. Don't need tap0" - exit "$EXIT_DONT_NEED_TAP" -fi - -if /usr/local/bin/crc-user-mode-networking.sh system; then - echo "Running with CRC and system-mode networking. Don't need tap0. (Fairly rare case.)" - exit "$EXIT_DONT_NEED_TAP" -fi - -# running with CRC (not a self-sufficient bundle) -# running with user-mode networking -# --> vfkit doesn't need tap0 -# --> other platforms do need it - virt="$(systemd-detect-virt || true)" case "${virt}" in From cafd465153e4458553e80def9d708574a925f6c6 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Fri, 17 Oct 2025 12:15:15 +0200 Subject: [PATCH 44/44] systemd: crc-self-sufficient-env: prefer CRC_SELF_SUFFICIENT if available --- systemd/crc-self-sufficient-env.sh | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/systemd/crc-self-sufficient-env.sh b/systemd/crc-self-sufficient-env.sh index d783a56ca..4bb4cc1d8 100644 --- a/systemd/crc-self-sufficient-env.sh +++ b/systemd/crc-self-sufficient-env.sh @@ -1,15 +1,34 @@ #!/bin/bash -# set -o errexit disabled to capture the test return code set -o pipefail set -o nounset set -o errtrace +source /etc/sysconfig/crc-env || echo "WARNING: crc-env not found" + +if [[ "${CRC_SELF_SUFFICIENT:-}" ]]; then + echo "Found CRC_SELF_SUFFICIENT=$CRC_SELF_SUFFICIENT" + + if [[ ! "${CRC_SELF_SUFFICIENT}" =~ ^[01]$ ]]; then + echo "ERROR: CRC_SELF_SUFFICIENT should be 0 or 1 ..." >&2 + exit 1 + fi + + if [[ "$CRC_SELF_SUFFICIENT" == 1 ]]; then + exit 0 + else + exit 1 + fi +fi + TEST_TIMEOUT=120 VSOCK_COMM_PORT=1024 +set +o errexit +# set -o errexit disabled to capture the test return code timeout "$TEST_TIMEOUT" python3 /usr/local/bin/crc-test-vsock.py "$VSOCK_COMM_PORT" returncode=$? +set -o errexit case "$returncode" in 19) # ENODEV