diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml
index 60c05389e..dc981d892 100644
--- a/.github/workflows/stackhpc.yml
+++ b/.github/workflows/stackhpc.yml
@@ -154,6 +154,7 @@ jobs:
       run: |
         . venv/bin/activate
         . environments/.stackhpc/activate
+        ansible-playbook --limit login,control ansible/adhoc/lock-unlock-instances.yml -e "lock_unlock_action=unlock"
         cd "$STACKHPC_TF_DIR"
         tofu init
         tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
@@ -237,6 +238,7 @@ jobs:
       run: |
         . venv/bin/activate
         . environments/.stackhpc/activate
+        ansible-playbook ansible/adhoc/lock-unlock-instances.yml -e "lock_unlock_action=unlock"
         cd "$STACKHPC_TF_DIR"
         tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" || echo "tofu failed in $STACKHPC_TF_DIR"
       if: ${{ success() || cancelled() }}
diff --git a/README.md b/README.md
index 8acd424fa..27d9ef0a9 100644
--- a/README.md
+++ b/README.md
@@ -142,7 +142,9 @@ To configure the appliance, ensure the venv and the environment are [activated](
 ansible-playbook ansible/site.yml
 ```

-Once it completes you can log in to the cluster using:
+To protect the cluster instances from accidental changes or `tofu destroy`, this playbook begins by locking the OpenStack instances. Any subsequent changes to the OpenTofu state require first running an unlock playbook, as detailed in the adhoc commands section of [docs/operations.md](docs/operations.md).
+
+Once `site.yml` completes you can log in to the cluster using:

 ```shell
 ssh rocky@$login_ip
diff --git a/ansible/adhoc/lock-unlock-instances.yml b/ansible/adhoc/lock-unlock-instances.yml
new file mode 100644
index 000000000..72194b4d5
--- /dev/null
+++ b/ansible/adhoc/lock-unlock-instances.yml
@@ -0,0 +1,27 @@
+---
+# Lock or unlock cluster instances
+
+# Used by site.yml / rebuild-via-slurm.yml
+# Running with unlock is required before rebuild.yml / tofu destroy / changes to tofu state etc.
+
+# Examples:
+
+# ansible-playbook --limit login,control ansible/adhoc/lock-unlock-instances.yml -e "lock_unlock_action=unlock"
+
+# ansible-playbook ansible/adhoc/lock-unlock-instances.yml -e "lock_unlock_action=unlock" -e "lock_unlock_hosts=compute"
+
+# - name: Unlock compute instances
+#   vars:
+#     lock_unlock_action: unlock
+#     lock_unlock_hosts: compute
+#   ansible.builtin.import_playbook: lock-unlock-instances.yml
+
+- hosts: "{{ lock_unlock_hosts | default('cluster') }}"
+  gather_facts: false
+  become: false
+  tasks:
+    - name: Lock/Unlock instances
+      openstack.cloud.server_action:
+        action: "{{ lock_unlock_action | default('lock') }}"
+        server: "{{ inventory_hostname }}"
+      delegate_to: localhost
diff --git a/ansible/adhoc/rebuild-via-slurm.yml b/ansible/adhoc/rebuild-via-slurm.yml
index 33cbe5cc7..8597521fe 100644
--- a/ansible/adhoc/rebuild-via-slurm.yml
+++ b/ansible/adhoc/rebuild-via-slurm.yml
@@ -8,6 +8,12 @@

 # See docs/slurm-controlled-rebuild.md.

+- name: Unlock compute instances for rebuild
+  vars:
+    lock_unlock_action: unlock
+    lock_unlock_hosts: compute
+  ansible.builtin.import_playbook: lock-unlock-instances.yml
+
 - hosts: login
   run_once: true
   gather_facts: false
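As an aside (not part of this diff), the `openstack.cloud.server_action` task above maps onto the standard OpenStack CLI lock/unlock operations, so the lock state of a single instance can also be inspected or cleared by hand. A minimal sketch, assuming `clouds.yaml`/`OS_CLOUD` authentication is already configured; the instance name is hypothetical and the `locked` field is only shown on reasonably recent compute APIs:

```shell
openstack server show mycluster-login-0 -c locked -f value   # inspect current lock state
openstack server unlock mycluster-login-0                    # clear the lock manually
openstack server lock mycluster-login-0                      # re-lock afterwards
```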
diff --git a/ansible/adhoc/rebuild.yml b/ansible/adhoc/rebuild.yml
index b6033e43c..1db17d26b 100644
--- a/ansible/adhoc/rebuild.yml
+++ b/ansible/adhoc/rebuild.yml
@@ -5,6 +5,10 @@
 # Use --limit to control which hosts to rebuild (either specific hosts or the _ groups defining partitions).
 # Optionally, supply `-e rebuild_image=` to define a specific image, otherwise the current image is reused.
 #
+# After running site.yml, all instances are locked, so before running rebuild.yml the unlock playbook must be run:
+# ansible-playbook ansible/adhoc/lock-unlock-instances.yml -e "lock_unlock_action=unlock"
+# As with rebuild, --limit can be used to control which hosts to unlock.
+#
 # NOTE: If a hostvar `instance_id` is defined this is used to select hosts.
 # Otherwise the hostname is used and this must be unique, which may not be the case e.g. if using identically-named staging and production hosts.
 #
diff --git a/ansible/safe-env.yml b/ansible/safe-env.yml
new file mode 100644
index 000000000..b32b5d86e
--- /dev/null
+++ b/ansible/safe-env.yml
@@ -0,0 +1,19 @@
+---
+- hosts: localhost
+  gather_facts: false
+  become: false
+  tasks:
+    - name: Confirm continuing if using a protected environment
+      ansible.builtin.pause:
+        prompt: |
+          *************************************
+          *  WARNING: PROTECTED ENVIRONMENT!  *
+          *************************************
+
+          Current environment: {{ appliances_environment_name }}
+          Do you really want to continue (yes/no)?
+      register: env_confirm_safe
+      when:
+        - appliances_environment_name in appliances_protected_environments
+        - not (appliances_protected_environment_autoapprove | default(false) | bool)
+      failed_when: not (env_confirm_safe.user_input | bool)
diff --git a/ansible/site.yml b/ansible/site.yml
index 79b71e10a..5d61be819 100644
--- a/ansible/site.yml
+++ b/ansible/site.yml
@@ -1,4 +1,10 @@
 ---
+
+- ansible.builtin.import_playbook: safe-env.yml
+
+- name: Lock cluster instances
+  ansible.builtin.import_playbook: adhoc/lock-unlock-instances.yml
+
 - name: Run pre.yml hook
   vars:
     # hostvars not available here, so have to recalculate environment root:
diff --git a/docs/experimental/compute-init.md b/docs/experimental/compute-init.md
index dfad27bcf..e0f548aff 100644
--- a/docs/experimental/compute-init.md
+++ b/docs/experimental/compute-init.md
@@ -22,7 +22,7 @@ login and control nodes. The process follows
 1. Compute nodes are reimaged:

 ```shell
-ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml
+ansible-playbook -v ansible/adhoc/rebuild-via-slurm.yml
 ```

 2. Ansible-init runs against newly reimaged compute nodes
diff --git a/docs/experimental/slurm-controlled-rebuild.md b/docs/experimental/slurm-controlled-rebuild.md
index fc654d354..6aab761ce 100644
--- a/docs/experimental/slurm-controlled-rebuild.md
+++ b/docs/experimental/slurm-controlled-rebuild.md
@@ -12,34 +12,36 @@ In summary, the way this functionality works is as follows:
 1. The image reference(s) are manually updated in the OpenTofu configuration
    in the normal way.
-2. `tofu apply` is run which rebuilds the login and control nodes to the new
+2. The adhoc playbook `lock-unlock-instances.yml` is run, limited to the control and
+   login nodes, with `lock_unlock_action=unlock` to allow the nodes to be rebuilt.
+3. `tofu apply` is run which rebuilds the login and control nodes to the new
    image(s). The new image reference for compute nodes is ignored, but is
    written into the hosts inventory file (and is therefore available as an
    Ansible hostvar).
-3. The `site.yml` playbook is run which reconfigures the cluster as normal. At
-   this point the cluster is functional, but using a new image for the login
-   and control nodes and the old image for the compute nodes. This playbook
-   also:
+4. The `site.yml` playbook is run which locks the instances again and reconfigures
+   the cluster as normal. At this point the cluster is functional, but using a new
+   image for the login and control nodes and the old image for the compute nodes.
+   This playbook also:
    - Writes cluster configuration to the control node, using the
      [compute_init](../../ansible/roles/compute_init/README.md) role.
    - Configures an application credential and helper programs on the control
      node, using the [rebuild](../../ansible/roles/rebuild/README.md) role.
-4. An admin submits Slurm jobs, one for each node, to a special "rebuild"
-   partition using an Ansible playbook. Because this partition has higher
-   priority than the partitions normal users can use, these rebuild jobs become
-   the next job in the queue for every node (although any jobs currently
-   running will complete as normal).
-5. Because these rebuild jobs have the `--reboot` flag set, before launching them
+5. An admin submits Slurm jobs, one for each node, to a special "rebuild"
+   partition using the adhoc playbook `rebuild-via-slurm.yml`. Because this partition
+   has higher priority than the partitions normal users can use, these rebuild jobs
+   become the next job in the queue for every node (although any jobs currently running
+   will complete as normal).
+6. Because these rebuild jobs have the `--reboot` flag set, before launching them
    the Slurm control node runs a
    [RebootProgram](https://slurm.schedmd.com/slurm.conf.html#OPT_RebootProgram)
    which compares the current image for the node to the one in the cluster
    configuration, and if it does not match, uses OpenStack to rebuild the
    node to the desired (updated) image. TODO: Describe the logic if they DO match
-6. After a rebuild, the compute node runs various Ansible tasks during boot,
+7. After a rebuild, the compute node runs various Ansible tasks during boot,
    controlled by the [compute_init](../../ansible/roles/compute_init/README.md)
    role, to fully configure the node again. It retrieves the required cluster
    configuration information from the control node via an NFS mount.
-7. Once the `slurmd` daemon starts on a compute node, the slurm controller
+8. Once the `slurmd` daemon starts on a compute node, the slurm controller
    registers the node as having finished rebooting. It then launches the actual
    job, which does not do anything.
diff --git a/docs/operations.md b/docs/operations.md
index 525a3e01c..0ae41d9a7 100644
--- a/docs/operations.md
+++ b/docs/operations.md
@@ -212,7 +212,9 @@ ansible-playbook ansible/adhoc/$PLAYBOOK
 Currently they include the following (see each playbook for links to documentation):

 - `hpctests.yml`: MPI-based cluster tests for latency, bandwidth and floating point performance.
+- `lock-unlock-instances.yml`: Lock cluster instances to prevent tofu changes, or unlock them to allow changes.
 - `rebuild.yml`: Rebuild nodes with existing or new images (NB: this is intended for development not for re-imaging nodes on an in-production cluster).
+  Requires `lock-unlock-instances.yml` to be run first.
 - `restart-slurm.yml`: Restart all Slurm daemons in the correct order.
 - `update-packages.yml`: Update specified packages on cluster nodes (NB: not recommended for routine use).
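Taken together, the documentation changes above amount to the following end-to-end image-update sequence. This is an illustrative sketch only: the OpenTofu directory shown is hypothetical and any tfvars are site-specific.

```shell
# Sketch of the rebuild workflow described in docs/experimental/slurm-controlled-rebuild.md
ansible-playbook --limit login,control ansible/adhoc/lock-unlock-instances.yml -e "lock_unlock_action=unlock"
tofu -chdir="environments/$SITE_ENV/tofu" apply        # rebuilds login/control to the new image
ansible-playbook ansible/site.yml                      # re-locks instances and reconfigures the cluster
ansible-playbook ansible/adhoc/rebuild-via-slurm.yml   # submits Slurm rebuild jobs for compute nodes
```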
diff --git a/docs/sequence.md b/docs/sequence.md
index 6f3b77922..96a2333f2 100644
--- a/docs/sequence.md
+++ b/docs/sequence.md
@@ -100,6 +100,7 @@ sequenceDiagram
     participant cloud as Cloud
     participant nodes as Cluster Instances
     note over ansible: Update OpenTofu cluster_image variable [1]
+    ansible->>cloud: Unlock control and login nodes
     rect rgb(204, 232, 250)
         note over ansible: $ tofu apply ....
         ansible<<->>cloud: Check login/compute current vs desired images
diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml
index e9852afd6..622390bc1 100644
--- a/environments/common/inventory/group_vars/all/defaults.yml
+++ b/environments/common/inventory/group_vars/all/defaults.yml
@@ -4,6 +4,8 @@ ansible_user: rocky
 appliances_repository_root: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}"
 appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}"
 appliances_environment_name: "{{ appliances_environment_root | basename | regex_replace('\\W+', '') }}" # [a-zA-Z0-9_] only
+appliances_protected_environments:
+  - production
 appliances_cockpit_state: absent # RHEL cockpit installed but not enabled in genericcloud images; appliance defaults to removing it
 # appliances_state_dir: # define an absolute path here to use for persistent state: NB: This is defined as /var/lib/state in inventory by the default Terraform
 appliances_mode: configure
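Finally, a note on the new protected-environment guard: it is driven entirely by the two variables referenced in `safe-env.yml` and defaulted in `defaults.yml` above. A minimal sketch of how they might be used follows; the `staging` environment name is purely illustrative, and a list override would normally live in an environment's group_vars rather than on the command line.

```shell
# Treat an additional environment as protected for this run ('staging' is hypothetical):
ansible-playbook ansible/site.yml -e '{"appliances_protected_environments": ["production", "staging"]}'
# Skip the interactive confirmation on a protected environment, e.g. for unattended/CI runs:
ansible-playbook ansible/site.yml -e appliances_protected_environment_autoapprove=true
```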