1 change: 1 addition & 0 deletions .github/workflows/stackhpc.yml
@@ -154,6 +154,7 @@ jobs:
run: |
. venv/bin/activate
. environments/.stackhpc/activate
ansible-playbook --limit login,control ansible/adhoc/lock_unlock_instances.yml -e "appliances_server_action=unlock"
cd "$STACKHPC_TF_DIR"
tofu init
tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
11 changes: 11 additions & 0 deletions ansible/adhoc/lock_unlock_instances.yml
@@ -0,0 +1,11 @@
---

- hosts: "{{ target_hosts | default('cluster') }}"
gather_facts: false
become: false
tasks:
- name: Lock/Unlock instances
openstack.cloud.server_action:
action: "{{ appliances_server_action | default('lock') }}"
server: "{{ inventory_hostname }}"
delegate_to: localhost
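
A usage sketch for this playbook (the `--limit` form matches the CI workflow change
above; `lock` is the default action and `cluster` the default host target):

```shell
# Unlock just the login and control nodes, e.g. before running `tofu apply`
ansible-playbook --limit login,control ansible/adhoc/lock_unlock_instances.yml \
  -e "appliances_server_action=unlock"

# Re-lock every instance in the cluster (the defaults)
ansible-playbook ansible/adhoc/lock_unlock_instances.yml
```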
6 changes: 6 additions & 0 deletions ansible/adhoc/rebuild-via-slurm.yml
@@ -8,6 +8,12 @@

# See docs/slurm-controlled-rebuild.md.

- name: Unlock compute instances for rebuild
vars:
appliances_server_action: unlock
target_hosts: compute
ansible.builtin.import_playbook: lock_unlock_instances.yml

- hosts: login
run_once: true
gather_facts: false
19 changes: 19 additions & 0 deletions ansible/safe-env.yml
@@ -0,0 +1,19 @@
---
- hosts: localhost
gather_facts: false
become: false
tasks:
- name: Confirm continuing if using production environment
ansible.builtin.pause:
prompt: |
*************************************
* WARNING: PROTECTED ENVIRONMENT! *
*************************************

Current environment: {{ appliances_environment_name }}
Do you really want to continue (yes/no)?
register: env_confirm_safe
when:
- appliances_environment_name in appliances_protected_environments
- not (appliances_protected_environment_autoapprove | default(false) | bool)
failed_when: not (env_confirm_safe.user_input | bool)
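
For unattended runs, a sketch like the following could skip the confirmation by using
the `appliances_protected_environment_autoapprove` variable checked above (passing it
via `-e` is just one option; it could equally be set in group_vars or CI variables):

```shell
# Interactive run against a protected environment: type "yes" at the prompt to continue
ansible-playbook ansible/site.yml

# Unattended run (e.g. CI): skip the confirmation prompt entirely
ansible-playbook ansible/site.yml -e appliances_protected_environment_autoapprove=true
```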
6 changes: 6 additions & 0 deletions ansible/site.yml
@@ -1,4 +1,10 @@
---

- ansible.builtin.import_playbook: safe-env.yml

- name: Lock cluster instances
ansible.builtin.import_playbook: adhoc/lock_unlock_instances.yml

- name: Run pre.yml hook
vars:
# hostvars not available here, so have to recalculate environment root:
2 changes: 1 addition & 1 deletion docs/experimental/compute-init.md
@@ -22,7 +22,7 @@ login and control nodes. The process follows
1. Compute nodes are reimaged:

```shell
ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml
ansible-playbook -v ansible/adhoc/rebuild-via-slurm.yml
```

2. Ansible-init runs against newly reimaged compute nodes
20 changes: 11 additions & 9 deletions docs/experimental/slurm-controlled-rebuild.md
@@ -12,34 +12,36 @@ In summary, the way this functionality works is as follows:

1. The image reference(s) are manually updated in the OpenTofu configuration
in the normal way.
2. `tofu apply` is run which rebuilds the login and control nodes to the new
2. The `lock_unlock_instances.yml` playbook is run against the control and login
nodes with `unlock` so that they can be reimaged.
3. `tofu apply` is run which rebuilds the login and control nodes to the new
image(s). The new image reference for compute nodes is ignored, but is
written into the hosts inventory file (and is therefore available as an
Ansible hostvar).
3. The `site.yml` playbook is run which reconfigures the cluster as normal. At
this point the cluster is functional, but using a new image for the login
and control nodes and the old image for the compute nodes. This playbook
also:
4. The `site.yml` playbook is run which locks the instances again and reconfigures
the cluster as normal. At this point the cluster is functional, but using a new
image for the login and control nodes and the old image for the compute nodes.
This playbook also:
- Writes cluster configuration to the control node, using the
[compute_init](../../ansible/roles/compute_init/README.md) role.
- Configures an application credential and helper programs on the control
node, using the [rebuild](../../ansible/roles/rebuild/README.md) role.
4. An admin submits Slurm jobs, one for each node, to a special "rebuild"
5. An admin submits Slurm jobs, one for each node, to a special "rebuild"
partition using an Ansible playbook (a command sketch follows this list). Because this partition has higher
priority than the partitions normal users can use, these rebuild jobs become
the next job in the queue for every node (although any jobs currently
running will complete as normal).
5. Because these rebuild jobs have the `--reboot` flag set, before launching them
6. Because these rebuild jobs have the `--reboot` flag set, before launching them
the Slurm control node runs a [RebootProgram](https://slurm.schedmd.com/slurm.conf.html#OPT_RebootProgram)
which compares the current image for the node to the one in the cluster
configuration, and if it does not match, uses OpenStack to rebuild the
node to the desired (updated) image.
TODO: Describe the logic if they DO match
6. After a rebuild, the compute node runs various Ansible tasks during boot,
7. After a rebuild, the compute node runs various Ansible tasks during boot,
controlled by the [compute_init](../../ansible/roles/compute_init/README.md)
role, to fully configure the node again. It retrieves the required cluster
configuration information from the control node via an NFS mount.
7. Once the `slurmd` daemon starts on a compute node, the slurm controller
8. Once the `slurmd` daemon starts on a compute node, the slurm controller
registers the node as having finished rebooting. It then launches the actual
job, which does not do anything.
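
A minimal command sketch of the overall sequence above, assuming the Ansible commands
are run from the repository root with the environment activated and the `tofu` commands
from the environment's OpenTofu directory (the var-file name is illustrative only):

```shell
# Step 2: unlock the control and login nodes so OpenTofu can rebuild them
ansible-playbook --limit login,control ansible/adhoc/lock_unlock_instances.yml \
  -e "appliances_server_action=unlock"

# Step 3: rebuild the control/login nodes to the new image(s) (from the OpenTofu directory)
tofu init
tofu apply -var-file="mysite.tfvars"

# Step 4: re-lock the instances and reconfigure the cluster
ansible-playbook ansible/site.yml

# Step 5: submit rebuild jobs so Slurm reimages the compute nodes
ansible-playbook -v ansible/adhoc/rebuild-via-slurm.yml
```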

1 change: 1 addition & 0 deletions docs/sequence.md
@@ -100,6 +100,7 @@ sequenceDiagram
participant cloud as Cloud
participant nodes as Cluster Instances
note over ansible: Update OpenTofu cluster_image variable [1]
ansible->>cloud: Unlock control and login nodes
rect rgb(204, 232, 250)
note over ansible: $ tofu apply ....
ansible<<->>cloud: Check login/compute current vs desired images
2 changes: 2 additions & 0 deletions environments/common/inventory/group_vars/all/defaults.yml
@@ -4,6 +4,8 @@ ansible_user: rocky
appliances_repository_root: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}"
appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
appliances_environment_name: "{{ appliances_environment_root | basename | regex_replace('\\W+', '') }}" # [a-zA-Z0-9_] only
appliances_protected_environments:
- prd
appliances_cockpit_state: absent # RHEL cockpit installed but not enabled in genericcloud images; appliance defaults to removing it
# appliances_state_dir: # define an absolute path here to use for persistent state: NB: This is defined as /var/lib/state in inventory by the default Terraform
appliances_mode: configure