From f7efa6c4e335725bd289c4ed49ae163b198654ce Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 4 Nov 2025 11:20:39 +0000 Subject: [PATCH 01/16] Add protected environment checks hook --- environments/site/hooks/pre.yml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 environments/site/hooks/pre.yml diff --git a/environments/site/hooks/pre.yml b/environments/site/hooks/pre.yml new file mode 100644 index 000000000..cd25e9217 --- /dev/null +++ b/environments/site/hooks/pre.yml @@ -0,0 +1,20 @@ +--- + +- hosts: localhost + gather_facts: no + become: no + tasks: + - name: Confirm continuing if using production environment + ansible.builtin.pause: + prompt: | + ************************************* + * WARNING: PROTECTED ENVIRONMENT! * + ************************************* + + Current environment: {{ appliances_environment_name }} + Do you really want to continue (yes/no)? + register: env_confirm_safe + when: + - appliances_environment_name in protected_environments + - not (prd_continue | default(false) | bool) + failed_when: not (env_confirm_safe.user_input | bool) \ No newline at end of file From ca475780c660cdd19e6e840d5165e9859c21ca3f Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 4 Nov 2025 11:23:16 +0000 Subject: [PATCH 02/16] populate protected_environments list --- environments/site/hooks/pre.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/environments/site/hooks/pre.yml b/environments/site/hooks/pre.yml index cd25e9217..cc0a6c6a9 100644 --- a/environments/site/hooks/pre.yml +++ b/environments/site/hooks/pre.yml @@ -3,6 +3,9 @@ - hosts: localhost gather_facts: no become: no + vars: + protected_environments: + - prd tasks: - name: Confirm continuing if using production environment ansible.builtin.pause: From ceaba175873f02c595e235c1693a788be5663f7b Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 4 Nov 2025 16:24:48 +0000 Subject: [PATCH 03/16] unlock instances before rebuild-via-slurm --- ansible/adhoc/lock_unlock_instances.yml | 10 ++++++++++ ansible/adhoc/rebuild-via-slurm.yml | 10 ++++++++++ ansible/site.yml | 6 ++++++ 3 files changed, 26 insertions(+) create mode 100644 ansible/adhoc/lock_unlock_instances.yml diff --git a/ansible/adhoc/lock_unlock_instances.yml b/ansible/adhoc/lock_unlock_instances.yml new file mode 100644 index 000000000..81ec547ab --- /dev/null +++ b/ansible/adhoc/lock_unlock_instances.yml @@ -0,0 +1,10 @@ +--- + +- hosts: "{{ target_hosts | default('all') }}" + gather_facts: no + become: no + tasks: + - name: Lock/Unlock instances + openstack.cloud.server_action: + action: "{{ server_action | default('lock') }}" + server: "{{ inventory_hostname }}" \ No newline at end of file diff --git a/ansible/adhoc/rebuild-via-slurm.yml b/ansible/adhoc/rebuild-via-slurm.yml index 33cbe5cc7..bf26e0323 100644 --- a/ansible/adhoc/rebuild-via-slurm.yml +++ b/ansible/adhoc/rebuild-via-slurm.yml @@ -8,6 +8,16 @@ # See docs/slurm-controlled-rebuild.md. 
+- hosts: localhost + gather_facts: false + vars: + server_action: unlock + target_hosts: compute + tasks: + - name: Unlock compute instances for rebuild + ansible.builtin.include_playbook: + file: adhoc/lock_unlock_instances.yml + - hosts: login run_once: true gather_facts: false diff --git a/ansible/site.yml b/ansible/site.yml index 79b71e10a..fa229800a 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -1,4 +1,10 @@ --- +- name: Lock all instances + vars: + server_action: lock + target_hosts: all + ansible.builtin.import_playbook: adhoc/lock_unlock_instances.yml + - name: Run pre.yml hook vars: # hostvars not available here, so have to recalculate environment root: From 36a10e776faec49ec9974120dbb1b3de97994a11 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 7 Nov 2025 09:40:08 +0000 Subject: [PATCH 04/16] fix rebuild unlocking --- ansible/adhoc/lock_unlock_instances.yml | 3 ++- ansible/adhoc/rebuild-via-slurm.yml | 8 ++------ ansible/safe-env.yml | 22 ++++++++++++++++++++++ ansible/site.yml | 3 +++ environments/site/hooks/pre.yml | 22 ---------------------- 5 files changed, 29 insertions(+), 29 deletions(-) create mode 100644 ansible/safe-env.yml diff --git a/ansible/adhoc/lock_unlock_instances.yml b/ansible/adhoc/lock_unlock_instances.yml index 81ec547ab..db9464cae 100644 --- a/ansible/adhoc/lock_unlock_instances.yml +++ b/ansible/adhoc/lock_unlock_instances.yml @@ -7,4 +7,5 @@ - name: Lock/Unlock instances openstack.cloud.server_action: action: "{{ server_action | default('lock') }}" - server: "{{ inventory_hostname }}" \ No newline at end of file + server: "{{ inventory_hostname }}" + delegate_to: localhost \ No newline at end of file diff --git a/ansible/adhoc/rebuild-via-slurm.yml b/ansible/adhoc/rebuild-via-slurm.yml index bf26e0323..fca4258a8 100644 --- a/ansible/adhoc/rebuild-via-slurm.yml +++ b/ansible/adhoc/rebuild-via-slurm.yml @@ -8,15 +8,11 @@ # See docs/slurm-controlled-rebuild.md. -- hosts: localhost - gather_facts: false +- name: Unlock compute instances for rebuild vars: server_action: unlock target_hosts: compute - tasks: - - name: Unlock compute instances for rebuild - ansible.builtin.include_playbook: - file: adhoc/lock_unlock_instances.yml + ansible.builtin.import_playbook: lock_unlock_instances.yml - hosts: login run_once: true diff --git a/ansible/safe-env.yml b/ansible/safe-env.yml new file mode 100644 index 000000000..8479a298b --- /dev/null +++ b/ansible/safe-env.yml @@ -0,0 +1,22 @@ +--- +- hosts: localhost + gather_facts: no + become: no + vars: + protected_environments: + - prd + tasks: + - name: Confirm continuing if using production environment + ansible.builtin.pause: + prompt: | + ************************************* + * WARNING: PROTECTED ENVIRONMENT! * + ************************************* + + Current environment: {{ appliances_environment_name }} + Do you really want to continue (yes/no)? 
+ register: env_confirm_safe + when: + - appliances_environment_name in protected_environments + - not (prd_continue | default(false) | bool) + failed_when: not (env_confirm_safe.user_input | bool) \ No newline at end of file diff --git a/ansible/site.yml b/ansible/site.yml index fa229800a..4cafa71c2 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -1,4 +1,7 @@ --- + +- ansible.builtin.import_playbook: safe-env.yml + - name: Lock all instances vars: server_action: lock diff --git a/environments/site/hooks/pre.yml b/environments/site/hooks/pre.yml index cc0a6c6a9..ed97d539c 100644 --- a/environments/site/hooks/pre.yml +++ b/environments/site/hooks/pre.yml @@ -1,23 +1 @@ --- - -- hosts: localhost - gather_facts: no - become: no - vars: - protected_environments: - - prd - tasks: - - name: Confirm continuing if using production environment - ansible.builtin.pause: - prompt: | - ************************************* - * WARNING: PROTECTED ENVIRONMENT! * - ************************************* - - Current environment: {{ appliances_environment_name }} - Do you really want to continue (yes/no)? - register: env_confirm_safe - when: - - appliances_environment_name in protected_environments - - not (prd_continue | default(false) | bool) - failed_when: not (env_confirm_safe.user_input | bool) \ No newline at end of file From 675d3ba294eafcdeda0783d163783ab341778684 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 7 Nov 2025 09:42:13 +0000 Subject: [PATCH 05/16] remove site hook --- environments/site/hooks/pre.yml | 1 - 1 file changed, 1 deletion(-) delete mode 100644 environments/site/hooks/pre.yml diff --git a/environments/site/hooks/pre.yml b/environments/site/hooks/pre.yml deleted file mode 100644 index ed97d539c..000000000 --- a/environments/site/hooks/pre.yml +++ /dev/null @@ -1 +0,0 @@ ---- From 0a4988d3e08537e868464237ffdcc869e3256fae Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Tue, 11 Nov 2025 19:30:30 +0000 Subject: [PATCH 06/16] define protected envs in common vars, improve lock_unlock_instances --- ansible/adhoc/lock_unlock_instances.yml | 4 ++-- ansible/adhoc/rebuild-via-slurm.yml | 5 ++--- ansible/safe-env.yml | 7 ++----- ansible/site.yml | 7 ++----- environments/common/inventory/group_vars/all/defaults.yml | 2 ++ 5 files changed, 10 insertions(+), 15 deletions(-) diff --git a/ansible/adhoc/lock_unlock_instances.yml b/ansible/adhoc/lock_unlock_instances.yml index db9464cae..80e3404a4 100644 --- a/ansible/adhoc/lock_unlock_instances.yml +++ b/ansible/adhoc/lock_unlock_instances.yml @@ -1,11 +1,11 @@ --- -- hosts: "{{ target_hosts | default('all') }}" +- hosts: cluster gather_facts: no become: no tasks: - name: Lock/Unlock instances openstack.cloud.server_action: - action: "{{ server_action | default('lock') }}" + action: "{{ appliances_server_action | default('lock') }}" server: "{{ inventory_hostname }}" delegate_to: localhost \ No newline at end of file diff --git a/ansible/adhoc/rebuild-via-slurm.yml b/ansible/adhoc/rebuild-via-slurm.yml index fca4258a8..fbe96c700 100644 --- a/ansible/adhoc/rebuild-via-slurm.yml +++ b/ansible/adhoc/rebuild-via-slurm.yml @@ -10,9 +10,8 @@ - name: Unlock compute instances for rebuild vars: - server_action: unlock - target_hosts: compute - ansible.builtin.import_playbook: lock_unlock_instances.yml + appliances_server_action: unlock + ansible.builtin.command: ansible-playbook --limit compute adhoc/lock_unlock_instances.yml - hosts: login run_once: true diff --git a/ansible/safe-env.yml b/ansible/safe-env.yml index 8479a298b..7aab7c8da 
100644 --- a/ansible/safe-env.yml +++ b/ansible/safe-env.yml @@ -2,9 +2,6 @@ - hosts: localhost gather_facts: no become: no - vars: - protected_environments: - - prd tasks: - name: Confirm continuing if using production environment ansible.builtin.pause: @@ -17,6 +14,6 @@ Do you really want to continue (yes/no)? register: env_confirm_safe when: - - appliances_environment_name in protected_environments - - not (prd_continue | default(false) | bool) + - appliances_environment_name in appliances_protected_environments + - not (appliances_protected_environment_autoapprove | default(false) | bool) failed_when: not (env_confirm_safe.user_input | bool) \ No newline at end of file diff --git a/ansible/site.yml b/ansible/site.yml index 4cafa71c2..191aa3cd6 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -2,11 +2,8 @@ - ansible.builtin.import_playbook: safe-env.yml -- name: Lock all instances - vars: - server_action: lock - target_hosts: all - ansible.builtin.import_playbook: adhoc/lock_unlock_instances.yml +- name: Lock cluster instances + ansible.builtin.command: ansible-playbook adhoc/lock_unlock_instances.yml - name: Run pre.yml hook vars: diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index e9852afd6..6cc02ff59 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -4,6 +4,8 @@ ansible_user: rocky appliances_repository_root: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}" appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" appliances_environment_name: "{{ appliances_environment_root | basename | regex_replace('\\W+', '') }}" # [a-zA-Z0-9_] only +appliances_protected_environments: + - prd appliances_cockpit_state: absent # RHEL cockpit installed but not enabled in genericcloud images; appliance defaults to removing it # appliances_state_dir: # define an absolute path here to use for persistent state: NB: This is defined as /var/lib/state in inventory by the default Terraform appliances_mode: configure From 2576d250b23a39f469a19a8c61848141a752f552 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 12 Nov 2025 11:36:05 +0000 Subject: [PATCH 07/16] fix locking instances play --- ansible/adhoc/lock_unlock_instances.yml | 4 ++-- ansible/adhoc/rebuild-via-slurm.yml | 10 ++++++---- ansible/site.yml | 8 ++++++-- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/ansible/adhoc/lock_unlock_instances.yml b/ansible/adhoc/lock_unlock_instances.yml index 80e3404a4..6aafce1e3 100644 --- a/ansible/adhoc/lock_unlock_instances.yml +++ b/ansible/adhoc/lock_unlock_instances.yml @@ -1,8 +1,8 @@ --- - hosts: cluster - gather_facts: no - become: no + gather_facts: false + become: false tasks: - name: Lock/Unlock instances openstack.cloud.server_action: diff --git a/ansible/adhoc/rebuild-via-slurm.yml b/ansible/adhoc/rebuild-via-slurm.yml index fbe96c700..4f1fc5fe9 100644 --- a/ansible/adhoc/rebuild-via-slurm.yml +++ b/ansible/adhoc/rebuild-via-slurm.yml @@ -8,10 +8,12 @@ # See docs/slurm-controlled-rebuild.md. 
-- name: Unlock compute instances for rebuild - vars: - appliances_server_action: unlock - ansible.builtin.command: ansible-playbook --limit compute adhoc/lock_unlock_instances.yml +- hosts: localhost + gather_facts: false + tasks: + - name: Unlock compute nodes to ready rebuild + ansible.builtin.command: + cmd: ansible-playbook --limit compute adhoc/lock_unlock_instances.yml -e "appliances_server_action=unlock" - hosts: login run_once: true diff --git a/ansible/site.yml b/ansible/site.yml index 191aa3cd6..748933a50 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -2,8 +2,12 @@ - ansible.builtin.import_playbook: safe-env.yml -- name: Lock cluster instances - ansible.builtin.command: ansible-playbook adhoc/lock_unlock_instances.yml +- hosts: localhost + gather_facts: false + tasks: + - name: Lock all cluster instances + ansible.builtin.command: + cmd: ansible-playbook adhoc/lock_unlock_instances.yml - name: Run pre.yml hook vars: From e2d2a933137e635538f028d65cba0669e4b25d8a Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 12 Nov 2025 11:37:45 +0000 Subject: [PATCH 08/16] linting fix --- ansible/safe-env.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/safe-env.yml b/ansible/safe-env.yml index 7aab7c8da..81a688709 100644 --- a/ansible/safe-env.yml +++ b/ansible/safe-env.yml @@ -1,7 +1,7 @@ --- - hosts: localhost - gather_facts: no - become: no + gather_facts: false + become: false tasks: - name: Confirm continuing if using production environment ansible.builtin.pause: From f8424099f2c66d703165c712de3751ae48e61334 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 12 Nov 2025 14:28:18 +0000 Subject: [PATCH 09/16] linter error --- ansible/adhoc/lock_unlock_instances.yml | 4 ++-- ansible/adhoc/rebuild-via-slurm.yml | 11 +++++------ ansible/safe-env.yml | 2 +- ansible/site.yml | 8 ++------ 4 files changed, 10 insertions(+), 15 deletions(-) diff --git a/ansible/adhoc/lock_unlock_instances.yml b/ansible/adhoc/lock_unlock_instances.yml index 6aafce1e3..64886cd71 100644 --- a/ansible/adhoc/lock_unlock_instances.yml +++ b/ansible/adhoc/lock_unlock_instances.yml @@ -1,6 +1,6 @@ --- -- hosts: cluster +- hosts: "{{ target_hosts | default('cluster') }}" gather_facts: false become: false tasks: @@ -8,4 +8,4 @@ openstack.cloud.server_action: action: "{{ appliances_server_action | default('lock') }}" server: "{{ inventory_hostname }}" - delegate_to: localhost \ No newline at end of file + delegate_to: localhost diff --git a/ansible/adhoc/rebuild-via-slurm.yml b/ansible/adhoc/rebuild-via-slurm.yml index 4f1fc5fe9..5f22d7764 100644 --- a/ansible/adhoc/rebuild-via-slurm.yml +++ b/ansible/adhoc/rebuild-via-slurm.yml @@ -8,12 +8,11 @@ # See docs/slurm-controlled-rebuild.md. 
-- hosts: localhost - gather_facts: false - tasks: - - name: Unlock compute nodes to ready rebuild - ansible.builtin.command: - cmd: ansible-playbook --limit compute adhoc/lock_unlock_instances.yml -e "appliances_server_action=unlock" +- name: Unlock compute instances for rebuild + vars: + appliances_server_action: unlock + target_hosts: compute + ansible.builtin.import_playbook: adhoc/lock_unlock_instances.yml - hosts: login run_once: true diff --git a/ansible/safe-env.yml b/ansible/safe-env.yml index 81a688709..b32b5d86e 100644 --- a/ansible/safe-env.yml +++ b/ansible/safe-env.yml @@ -16,4 +16,4 @@ when: - appliances_environment_name in appliances_protected_environments - not (appliances_protected_environment_autoapprove | default(false) | bool) - failed_when: not (env_confirm_safe.user_input | bool) \ No newline at end of file + failed_when: not (env_confirm_safe.user_input | bool) diff --git a/ansible/site.yml b/ansible/site.yml index 748933a50..8adc8cf13 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -2,12 +2,8 @@ - ansible.builtin.import_playbook: safe-env.yml -- hosts: localhost - gather_facts: false - tasks: - - name: Lock all cluster instances - ansible.builtin.command: - cmd: ansible-playbook adhoc/lock_unlock_instances.yml +- name: Lock cluster instances + ansible.builtin.import_playbook: adhoc/lock_unlock_instances.yml - name: Run pre.yml hook vars: From 0fa67e5a21b6544aff955d65dbd800002be7f1fb Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 12 Nov 2025 14:48:09 +0000 Subject: [PATCH 10/16] lock playbook path fix --- ansible/adhoc/rebuild-via-slurm.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/adhoc/rebuild-via-slurm.yml b/ansible/adhoc/rebuild-via-slurm.yml index 5f22d7764..f00062423 100644 --- a/ansible/adhoc/rebuild-via-slurm.yml +++ b/ansible/adhoc/rebuild-via-slurm.yml @@ -12,7 +12,7 @@ vars: appliances_server_action: unlock target_hosts: compute - ansible.builtin.import_playbook: adhoc/lock_unlock_instances.yml + ansible.builtin.import_playbook: lock_unlock_instances.yml - hosts: login run_once: true From af27191a355685b08c571b80544578b6514ac2ae Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 12 Nov 2025 18:33:34 +0000 Subject: [PATCH 11/16] document locking/unlocking instances --- .github/workflows/stackhpc.yml | 1 + docs/experimental/compute-init.md | 2 +- docs/experimental/slurm-controlled-rebuild.md | 10 ++++++---- docs/sequence.md | 1 + 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index 60c05389e..cf987d5bf 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -154,6 +154,7 @@ jobs: run: | . venv/bin/activate . environments/.stackhpc/activate + ansible-playbook --limit login,control ansible/adhoc/lock_unlock_instances.yml -e "appliances_server_action=unlock" cd "$STACKHPC_TF_DIR" tofu init tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" diff --git a/docs/experimental/compute-init.md b/docs/experimental/compute-init.md index dfad27bcf..e0f548aff 100644 --- a/docs/experimental/compute-init.md +++ b/docs/experimental/compute-init.md @@ -22,7 +22,7 @@ login and control nodes. The process follows 1. Compute nodes are reimaged: ```shell -ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml +ansible-playbook -v ansible/adhoc/rebuild-via-slurm.yml ``` 2. 
Ansible-init runs against newly reimaged compute nodes diff --git a/docs/experimental/slurm-controlled-rebuild.md b/docs/experimental/slurm-controlled-rebuild.md index fc654d354..4c8caefbb 100644 --- a/docs/experimental/slurm-controlled-rebuild.md +++ b/docs/experimental/slurm-controlled-rebuild.md @@ -12,14 +12,16 @@ In summary, the way this functionality works is as follows: 1. The image references(s) are manually updated in the OpenTofu configuration in the normal way. +2. `lock_unlock_instances.yml --limit control,login -e "appliances_server_action=unlock"` + is run to unlock the control and login nodes for reimaging. 2. `tofu apply` is run which rebuilds the login and control nodes to the new image(s). The new image reference for compute nodes is ignored, but is written into the hosts inventory file (and is therefore available as an Ansible hostvar). -3. The `site.yml` playbook is run which reconfigures the cluster as normal. At - this point the cluster is functional, but using a new image for the login - and control nodes and the old image for the compute nodes. This playbook - also: +3. The `site.yml` playbook is run which locks the instances again and reconfigures + the cluster as normal. At this point the cluster is functional, but using a new + image for the login and control nodes and the old image for the compute nodes. + This playbook also: - Writes cluster configuration to the control node, using the [compute_init](../../ansible/roles/compute_init/README.md) role. - Configures an application credential and helper programs on the control diff --git a/docs/sequence.md b/docs/sequence.md index 6f3b77922..96a2333f2 100644 --- a/docs/sequence.md +++ b/docs/sequence.md @@ -100,6 +100,7 @@ sequenceDiagram participant cloud as Cloud participant nodes as Cluster Instances note over ansible: Update OpenTofu cluster_image variable [1] + ansible->>cloud: Unlock control and and login nodes rect rgb(204, 232, 250) note over ansible: $ tofu apply .... ansible<<->>cloud: Check login/compute current vs desired images From 5192f60dd7081ac98bf2935599146ac1ac05114e Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 12 Nov 2025 18:45:27 +0000 Subject: [PATCH 12/16] linting docs --- docs/experimental/slurm-controlled-rebuild.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/experimental/slurm-controlled-rebuild.md b/docs/experimental/slurm-controlled-rebuild.md index 4c8caefbb..3d16fabf5 100644 --- a/docs/experimental/slurm-controlled-rebuild.md +++ b/docs/experimental/slurm-controlled-rebuild.md @@ -12,7 +12,9 @@ In summary, the way this functionality works is as follows: 1. The image references(s) are manually updated in the OpenTofu configuration in the normal way. -2. `lock_unlock_instances.yml --limit control,login -e "appliances_server_action=unlock"` +2. ``` ansible-playbook lock_unlock_instances.yml + --limit control,login -e "appliances_server_action=unlock" + ``` is run to unlock the control and login nodes for reimaging. 2. `tofu apply` is run which rebuilds the login and control nodes to the new image(s). 
The new image reference for compute nodes is ignored, but is From 6119c227aa7fb0ea123b34d1fec5b8ae581ba78d Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 12 Nov 2025 18:46:37 +0000 Subject: [PATCH 13/16] fix --- docs/experimental/slurm-controlled-rebuild.md | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/docs/experimental/slurm-controlled-rebuild.md b/docs/experimental/slurm-controlled-rebuild.md index 3d16fabf5..1f924edf0 100644 --- a/docs/experimental/slurm-controlled-rebuild.md +++ b/docs/experimental/slurm-controlled-rebuild.md @@ -12,15 +12,13 @@ In summary, the way this functionality works is as follows: 1. The image references(s) are manually updated in the OpenTofu configuration in the normal way. -2. ``` ansible-playbook lock_unlock_instances.yml - --limit control,login -e "appliances_server_action=unlock" - ``` +2. `ansible-playbook lock_unlock_instances.yml --limit control,login -e "appliances_server_action=unlock"` is run to unlock the control and login nodes for reimaging. -2. `tofu apply` is run which rebuilds the login and control nodes to the new +3. `tofu apply` is run which rebuilds the login and control nodes to the new image(s). The new image reference for compute nodes is ignored, but is written into the hosts inventory file (and is therefore available as an Ansible hostvar). -3. The `site.yml` playbook is run which locks the instances again and reconfigures +4. The `site.yml` playbook is run which locks the instances again and reconfigures the cluster as normal. At this point the cluster is functional, but using a new image for the login and control nodes and the old image for the compute nodes. This playbook also: @@ -28,22 +26,22 @@ In summary, the way this functionality works is as follows: [compute_init](../../ansible/roles/compute_init/README.md) role. - Configures an application credential and helper programs on the control node, using the [rebuild](../../ansible/roles/rebuild/README.md) role. -4. An admin submits Slurm jobs, one for each node, to a special "rebuild" +5. An admin submits Slurm jobs, one for each node, to a special "rebuild" partition using an Ansible playbook. Because this partition has higher priority than the partitions normal users can use, these rebuild jobs become the next job in the queue for every node (although any jobs currently running will complete as normal). -5. Because these rebuild jobs have the `--reboot` flag set, before launching them +6. Because these rebuild jobs have the `--reboot` flag set, before launching them the Slurm control node runs a [RebootProgram](https://slurm.schedmd.com/slurm.conf.html#OPT_RebootProgram) which compares the current image for the node to the one in the cluster configuration, and if it does not match, uses OpenStack to rebuild the node to the desired (updated) image. TODO: Describe the logic if they DO match -6. After a rebuild, the compute node runs various Ansible tasks during boot, +7. After a rebuild, the compute node runs various Ansible tasks during boot, controlled by the [compute_init](../../ansible/roles/compute_init/README.md) role, to fully configure the node again. It retrieves the required cluster configuration information from the control node via an NFS mount. -7. Once the `slurmd` daemon starts on a compute node, the slurm controller +8. Once the `slurmd` daemon starts on a compute node, the slurm controller registers the node as having finished rebooting. It then launches the actual job, which does not do anything. 
From 7b832c3a90741dfcd0822d46bc09592e31a2f1c2 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Wed, 12 Nov 2025 18:52:01 +0000 Subject: [PATCH 14/16] improve docs style --- docs/experimental/slurm-controlled-rebuild.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/experimental/slurm-controlled-rebuild.md b/docs/experimental/slurm-controlled-rebuild.md index 1f924edf0..d2ee4df95 100644 --- a/docs/experimental/slurm-controlled-rebuild.md +++ b/docs/experimental/slurm-controlled-rebuild.md @@ -12,8 +12,8 @@ In summary, the way this functionality works is as follows: 1. The image references(s) are manually updated in the OpenTofu configuration in the normal way. -2. `ansible-playbook lock_unlock_instances.yml --limit control,login -e "appliances_server_action=unlock"` - is run to unlock the control and login nodes for reimaging. +2. The `lock_unlock_instances.yml` playbook is run against control and login with + `unlock` to allow for reimaging the nodes. 3. `tofu apply` is run which rebuilds the login and control nodes to the new image(s). The new image reference for compute nodes is ignored, but is written into the hosts inventory file (and is therefore available as an From 5bb6d98f3c9f9c3626236cf280b0de1500b4eb84 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 13 Nov 2025 14:02:05 +0000 Subject: [PATCH 15/16] Review changes / unlock instances before CI cleanup --- .github/workflows/stackhpc.yml | 3 ++- README.md | 4 ++- ansible/adhoc/lock-unlock-instances.yml | 27 +++++++++++++++++++ ansible/adhoc/lock_unlock_instances.yml | 11 -------- ansible/adhoc/rebuild-via-slurm.yml | 6 ++--- ansible/adhoc/rebuild.yml | 4 +++ ansible/site.yml | 2 +- docs/experimental/slurm-controlled-rebuild.md | 12 ++++----- docs/operations.md | 2 ++ .../inventory/group_vars/all/defaults.yml | 2 +- 10 files changed, 49 insertions(+), 24 deletions(-) create mode 100644 ansible/adhoc/lock-unlock-instances.yml delete mode 100644 ansible/adhoc/lock_unlock_instances.yml diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index cf987d5bf..dc981d892 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -154,7 +154,7 @@ jobs: run: | . venv/bin/activate . environments/.stackhpc/activate - ansible-playbook --limit login,control ansible/adhoc/lock_unlock_instances.yml -e "appliances_server_action=unlock" + ansible-playbook --limit login,control ansible/adhoc/lock-unlock-instances.yml -e "lock_unlock_action=unlock" cd "$STACKHPC_TF_DIR" tofu init tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" @@ -238,6 +238,7 @@ jobs: run: | . venv/bin/activate . environments/.stackhpc/activate + ansible-playbook ansible/adhoc/lock-unlock-instances.yml -e "lock_unlock_action=unlock" cd "$STACKHPC_TF_DIR" tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" || echo "tofu failed in $STACKHPC_TF_DIR" if: ${{ success() || cancelled() }} diff --git a/README.md b/README.md index 8acd424fa..093d70a08 100644 --- a/README.md +++ b/README.md @@ -141,8 +141,10 @@ To configure the appliance, ensure the venv and the environment are [activated]( ```shell ansible-playbook ansible/site.yml ``` +To prevent the cluster instances from being changed or `tofu destroy`ed, this playbook begins by locking the OpenStack instances. 
Any subsequent desired changes to the OpenTofu state require +running an unlocking playbook as detailed in the adhoc command section of [docs/operations.md](docs/operations.md) -Once it completes you can log in to the cluster using: +Once `site.yml` completes you can log in to the cluster using: ```shell ssh rocky@$login_ip diff --git a/ansible/adhoc/lock-unlock-instances.yml b/ansible/adhoc/lock-unlock-instances.yml new file mode 100644 index 000000000..72194b4d5 --- /dev/null +++ b/ansible/adhoc/lock-unlock-instances.yml @@ -0,0 +1,27 @@ +--- +# Lock or unlock cluster instances + +# Used for site.yml / rebuild-via-slurm.yml +# Run required for rebuild.yml / tofu destroy / changes to tofu state etc. + +# Examples: + +# ansible-playbook --limit login,control ansible/adhoc/lock-unlock-instances.yml -e "lock_unlock_action=unlock" + +# ansansible-playbook ansible/adhoc/lock-unlock-instances.yml -e "lock_unlock_action=unlock" -e "lock_unlock_hosts=compute" + +# - name: Unlock compute instances +# vars: +# lock_unlock_action: unlock +# lock_unlock_hosts: compute +# ansible.builtin.import_playbook: lock-unlock-instances.yml + +- hosts: "{{ lock_unlock_hosts | default('cluster') }}" + gather_facts: false + become: false + tasks: + - name: Lock/Unlock instances + openstack.cloud.server_action: + action: "{{ lock_unlock_action | default('lock') }}" + server: "{{ inventory_hostname }}" + delegate_to: localhost diff --git a/ansible/adhoc/lock_unlock_instances.yml b/ansible/adhoc/lock_unlock_instances.yml deleted file mode 100644 index 64886cd71..000000000 --- a/ansible/adhoc/lock_unlock_instances.yml +++ /dev/null @@ -1,11 +0,0 @@ ---- - -- hosts: "{{ target_hosts | default('cluster') }}" - gather_facts: false - become: false - tasks: - - name: Lock/Unlock instances - openstack.cloud.server_action: - action: "{{ appliances_server_action | default('lock') }}" - server: "{{ inventory_hostname }}" - delegate_to: localhost diff --git a/ansible/adhoc/rebuild-via-slurm.yml b/ansible/adhoc/rebuild-via-slurm.yml index f00062423..8597521fe 100644 --- a/ansible/adhoc/rebuild-via-slurm.yml +++ b/ansible/adhoc/rebuild-via-slurm.yml @@ -10,9 +10,9 @@ - name: Unlock compute instances for rebuild vars: - appliances_server_action: unlock - target_hosts: compute - ansible.builtin.import_playbook: lock_unlock_instances.yml + lock_unlock_action: unlock + lock_unlock_hosts: compute + ansible.builtin.import_playbook: lock-unlock-instances.yml - hosts: login run_once: true diff --git a/ansible/adhoc/rebuild.yml b/ansible/adhoc/rebuild.yml index b6033e43c..1db17d26b 100644 --- a/ansible/adhoc/rebuild.yml +++ b/ansible/adhoc/rebuild.yml @@ -5,6 +5,10 @@ # Use --limit to control which hosts to rebuild (either specific hosts or the _ groups defining partitions). # Optionally, supply `-e rebuild_image=` to define a specific image, otherwise the current image is reused. # +# After running site.yml, all instances are locked, so to run the rebuild.yml, the unlock playbook must be run: +# ansible-playbook ansible/adhoc/lock-unlock-instances.yml -e "lock_unlock_action=unlock" +# Similarly to rebuild, --limit can be used to control which hosts to unlock. +# # NOTE: If a hostvar `instance_id` is defined this is used to select hosts. # Otherwise the hostname is used and this must be unique, which may not be the case e.g. if using identically-named staging and production hosts. 
# diff --git a/ansible/site.yml b/ansible/site.yml index 8adc8cf13..5d61be819 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -3,7 +3,7 @@ - ansible.builtin.import_playbook: safe-env.yml - name: Lock cluster instances - ansible.builtin.import_playbook: adhoc/lock_unlock_instances.yml + ansible.builtin.import_playbook: adhoc/lock-unlock-instances.yml - name: Run pre.yml hook vars: diff --git a/docs/experimental/slurm-controlled-rebuild.md b/docs/experimental/slurm-controlled-rebuild.md index d2ee4df95..6aab761ce 100644 --- a/docs/experimental/slurm-controlled-rebuild.md +++ b/docs/experimental/slurm-controlled-rebuild.md @@ -12,8 +12,8 @@ In summary, the way this functionality works is as follows: 1. The image references(s) are manually updated in the OpenTofu configuration in the normal way. -2. The `lock_unlock_instances.yml` playbook is run against control and login with - `unlock` to allow for reimaging the nodes. +2. The adhoc playbook `lock-unlock-instances.yml` is run limited to control and login + nodes, with `lock_unlock_action=unlock` to allow the nodes to be rebuilt. 3. `tofu apply` is run which rebuilds the login and control nodes to the new image(s). The new image reference for compute nodes is ignored, but is written into the hosts inventory file (and is therefore available as an @@ -27,10 +27,10 @@ In summary, the way this functionality works is as follows: - Configures an application credential and helper programs on the control node, using the [rebuild](../../ansible/roles/rebuild/README.md) role. 5. An admin submits Slurm jobs, one for each node, to a special "rebuild" - partition using an Ansible playbook. Because this partition has higher - priority than the partitions normal users can use, these rebuild jobs become - the next job in the queue for every node (although any jobs currently - running will complete as normal). + partition using the adhoc playbook `rebuild-via-slurm.yml`. Because this partition + has higher priority than the partitions normal users can use, these rebuild jobs + become the next job in the queue for every node (although any jobs currently running + will complete as normal). 6. Because these rebuild jobs have the `--reboot` flag set, before launching them the Slurm control node runs a [RebootProgram](https://slurm.schedmd.com/slurm.conf.html#OPT_RebootProgram) which compares the current image for the node to the one in the cluster diff --git a/docs/operations.md b/docs/operations.md index 525a3e01c..5b70fc46d 100644 --- a/docs/operations.md +++ b/docs/operations.md @@ -212,7 +212,9 @@ ansible-playbook ansible/adhoc/$PLAYBOOK Currently they include the following (see each playbook for links to documentation): - `hpctests.yml`: MPI-based cluster tests for latency, bandwidth and floating point performance. +- `lock-unlock-instances.yml`: Lock cluster instances for preventing tofu changes, or unlock to allow changes. - `rebuild.yml`: Rebuild nodes with existing or new images (NB: this is intended for development not for re-imaging nodes on an in-production cluster). +Requires `lock-unlock-instances.yml` be run first. - `restart-slurm.yml`: Restart all Slurm daemons in the correct order. - `update-packages.yml`: Update specified packages on cluster nodes (NB: not recommended for routine use). 
diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index 6cc02ff59..622390bc1 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -5,7 +5,7 @@ appliances_repository_root: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}" appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" appliances_environment_name: "{{ appliances_environment_root | basename | regex_replace('\\W+', '') }}" # [a-zA-Z0-9_] only appliances_protected_environments: - - prd + - production appliances_cockpit_state: absent # RHEL cockpit installed but not enabled in genericcloud images; appliance defaults to removing it # appliances_state_dir: # define an absolute path here to use for persistent state: NB: This is defined as /var/lib/state in inventory by the default Terraform appliances_mode: configure From f003db80fecd549b585485acb4eafe4862068e06 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Thu, 13 Nov 2025 14:18:32 +0000 Subject: [PATCH 16/16] prettier linting --- README.md | 4 ++-- docs/operations.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 093d70a08..27d9ef0a9 100644 --- a/README.md +++ b/README.md @@ -141,8 +141,8 @@ To configure the appliance, ensure the venv and the environment are [activated]( ```shell ansible-playbook ansible/site.yml ``` -To prevent the cluster instances from being changed or `tofu destroy`ed, this playbook begins by locking the OpenStack instances. Any subsequent desired changes to the OpenTofu state require -running an unlocking playbook as detailed in the adhoc command section of [docs/operations.md](docs/operations.md) + +To prevent the cluster instances from being changed or `tofu destroy` running, this playbook begins by locking the OpenStack instances. Any subsequent desired changes to the OpenTofu state require running an unlocking playbook as detailed in the adhoc command section of [docs/operations.md](docs/operations.md). Once `site.yml` completes you can log in to the cluster using: diff --git a/docs/operations.md b/docs/operations.md index 5b70fc46d..0ae41d9a7 100644 --- a/docs/operations.md +++ b/docs/operations.md @@ -214,7 +214,7 @@ Currently they include the following (see each playbook for links to documentati - `hpctests.yml`: MPI-based cluster tests for latency, bandwidth and floating point performance. - `lock-unlock-instances.yml`: Lock cluster instances for preventing tofu changes, or unlock to allow changes. - `rebuild.yml`: Rebuild nodes with existing or new images (NB: this is intended for development not for re-imaging nodes on an in-production cluster). -Requires `lock-unlock-instances.yml` be run first. + Requires `lock-unlock-instances.yml` be run first. - `restart-slurm.yml`: Restart all Slurm daemons in the correct order. - `update-packages.yml`: Update specified packages on cluster nodes (NB: not recommended for routine use).
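
---

For reference, a minimal sketch of the operator workflow this series ends up with, assuming the venv and environment are activated as described in the README and using only the invocations documented in the patches above (nothing here is prescriptive; it simply strings the documented commands together):

# unlock control and login nodes so `tofu apply` can rebuild them to new images
ansible-playbook --limit login,control ansible/adhoc/lock-unlock-instances.yml -e "lock_unlock_action=unlock"
tofu apply

# site.yml re-locks every host in the `cluster` group (lock_unlock_action defaults to `lock`,
# lock_unlock_hosts to `cluster`); for environments listed in appliances_protected_environments
# it first prompts for confirmation unless appliances_protected_environment_autoapprove=true
ansible-playbook ansible/site.yml

# compute nodes are unlocked automatically by the import at the top of rebuild-via-slurm.yml
ansible-playbook ansible/adhoc/rebuild-via-slurm.yml

# before `tofu destroy` or adhoc/rebuild.yml, unlock instances again (optionally with --limit)
ansible-playbook ansible/adhoc/lock-unlock-instances.yml -e "lock_unlock_action=unlock"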