From ba9699267449fba58cd9c04c451759a914fd7144 Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Thu, 28 Aug 2025 13:55:03 +0100 Subject: [PATCH 01/50] Delete environments/.caas/ansible.cfg (#766) --- environments/.caas/ansible.cfg | 19 ------------------- 1 file changed, 19 deletions(-) delete mode 100644 environments/.caas/ansible.cfg diff --git a/environments/.caas/ansible.cfg b/environments/.caas/ansible.cfg deleted file mode 100644 index 922f086..0000000 --- a/environments/.caas/ansible.cfg +++ /dev/null @@ -1,19 +0,0 @@ -[defaults] -any_errors_fatal = True -stdout_callback = debug -stderr_callback = debug -gathering = smart -forks = 30 -host_key_checking = False -inventory = ../common/inventory,inventory -collections_path = ../../ansible/collections -roles_path = ../../ansible/roles -filter_plugins = ../../ansible/filter_plugins - -[ssh_connection] -ssh_args = -o ControlMaster=auto ControlPath=~/.ssh/%r@%h-%p -o ControlPersist=240s -o PreferredAuthentications=publickey -o UserKnownHostsFile=/dev/null -pipelining = True - -[inventory] -# Fail when any inventory source cannot be parsed. -any_unparsed_is_failed = True From 73f614abb51e5b277a9611759491aceae61afae3 Mon Sep 17 00:00:00 2001 From: Matt Crees Date: Fri, 29 Aug 2025 15:06:57 +0100 Subject: [PATCH 02/50] Add filesystems docs (#710) * Add filesystems docs * Apply suggestions from code review Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> * Update Ceph instructions for Manila integrations * Update overview * Update docs/filesystems.md Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> * Update image build instructions for Manila --------- Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- docs/filesystems.md | 88 +++++++++++++++++++ .../inventory/group_vars/all/manila.yml | 6 +- .../group_vars/all/os-manila-mount.yml | 3 - 3 files changed, 93 insertions(+), 4 deletions(-) create mode 100644 docs/filesystems.md delete mode 100644 environments/common/inventory/group_vars/all/os-manila-mount.yml diff --git a/docs/filesystems.md b/docs/filesystems.md new file mode 100644 index 0000000..5509aef --- /dev/null +++ b/docs/filesystems.md @@ -0,0 +1,88 @@ +# Overview + +The Slurm appliance supports multiple ways of configuring shared filesystems, including: + +- Configuring the control node as an NFS server. (Default) + +- CephFS via Manila + +- Lustre + +# Manila + +The Slurm appliance supports mounting shared filesystems using [CephFS](https://docs.ceph.com/en/latest/cephfs/) via [OpenStack Manila](https://docs.openstack.org/manila/latest/). This section explains: + +- How to create the shares in OpenStack Manila. + +- How to configure the Slurm Appliance to mount these Manila shares. + +- How to switch to a Manila share for a shared home directory. + +## Creating shares in OpenStack + +The Slurm appliance requires that the Manila shares already exist on the system. Follow the instructions below to do this. + +If this is the first time Manila is being used on the system, a CephFS share type will need to be created. You will need admin credentials to do this. + + ```bash + openstack share type create cephfs-type false --extra-specs storage_protocol=CEPHFS vendor_name=Ceph + ``` + +Once this exists, create a share using credentials for the Slurm project. An access rule also needs to be created, where the `access_to` argument (`openstack share access create `) is a user that will be created in Ceph. 
This needs to be globally unique in Ceph, so needs to be different for each OpenStack project. Ideally, this share should include your environment name. In this example, the name is "production". + + ```bash + openstack share create CephFS 300 --description 'Scratch dir for Slurm prod' --name slurm-production-scratch --share-type cephfs-type --wait + openstack share access create slurm-production-scratch cephx slurm-production + ``` + +## Configuring the Slurm Appliance for Manila + +To mount shares onto hosts in a group, add them to the `manila` group. + + ```ini + # environments/site/inventory/groups: + [manila:children]: + login + compute + ``` + +If you are running a different version of Ceph from the defaults in the [os-manila-mount role](https://github.com/stackhpc/ansible-role-os-manila-mount/blob/master/defaults/main.yml), you will need to update the package version by setting: + + ```yaml + # environments/site/inventory/group_vars/manila.yml: + os_manila_mount_ceph_version: "18.2.4" + ``` + +A [site-specific image](image-build.md) should be built which includes this package; add ``manila`` to the Packer ``inventory_groups`` variable. + +Define the list of shares to be mounted, and the paths to mount them to. The example below parameterises the share name using the environment name. See the [stackhpc.os-manila-mount role](https://github.com/stackhpc/ansible-role-os-manila-mount) for further configuration options. + + ```yaml + # environments/site/inventory/group_vars/manila.yml: + os_manila_mount_shares: + - share_name: "slurm-{{ appliances_environment_name }}-scratch" + mount_path: /scratch + ``` + +### Shared home directory + +By default, the Slurm appliance configures the control node as an NFS server and exports a directory which is mounted on the other cluster nodes as `/home`. When using Manila + CephFS for the home directory instead, this will need to be disabled. To do this, set the tf var `home_volume_provisioning` to `None`. + +Some `basic_users_homedir_*` parameters need overriding as the provided defaults are only satisfactory for the default root-squashed NFS share: + + ```yaml + # environments/site/inventory/group_vars/all/basic_users.yml: + basic_users_homedir_server: "{{ groups['login'] | first }}" # if not mounting /home on control node + basic_users_homedir_server_path: /home + ``` + +Finally, add the home directory to the list of shares (the share should be already created in OpenStack). 
+ + ```yaml + # environments/site/inventory/group_vars/all/manila.yml: + os_manila_mount_shares: + - share_name: "slurm-{{ appliances_environment_name }}-scratch" + mount_path: /scratch + - share_name: "slurm-{{ appliances_environment_name }}-home" + mount_path: /home + ``` diff --git a/environments/common/inventory/group_vars/all/manila.yml b/environments/common/inventory/group_vars/all/manila.yml index baccd44..cb015f9 100644 --- a/environments/common/inventory/group_vars/all/manila.yml +++ b/environments/common/inventory/group_vars/all/manila.yml @@ -10,4 +10,8 @@ os_manila_mount_shares: [] # mount_group: # mount_mode: -# os_manila_mount_ceph_version: nautilus # role default for RockyLinux 8 +# os_manila_mount_ceph_version: + +# Empty repo lists from stackhpc.ansible-role-os-manila-mount role defaults, as these repofiles are +# now generated by dnf_repos to allow injecting Ark creds: +os_manila_mount_ceph_rpm_repos: [] diff --git a/environments/common/inventory/group_vars/all/os-manila-mount.yml b/environments/common/inventory/group_vars/all/os-manila-mount.yml deleted file mode 100644 index 6b25d62..0000000 --- a/environments/common/inventory/group_vars/all/os-manila-mount.yml +++ /dev/null @@ -1,3 +0,0 @@ -# Empty repo lists from stackhpc.ansible-role-os-manila-mount role defaults, as these repofiles are -# now generated by dnf_repos to allow injecting Ark creds: -os_manila_mount_ceph_rpm_repos: [] From cb4ca3c69d6627115ede3b0c9ae52c2a9eb3c00d Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Fri, 29 Aug 2025 15:28:07 +0100 Subject: [PATCH 03/50] CaaS pre-hook fix for galaxy requirements validation (#767) * pre-hook to copy requirements.yml.last * remove mention of CI in comments --- environments/.caas/hooks/pre.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/environments/.caas/hooks/pre.yml b/environments/.caas/hooks/pre.yml index 8c99e59..8924dca 100644 --- a/environments/.caas/hooks/pre.yml +++ b/environments/.caas/hooks/pre.yml @@ -63,3 +63,14 @@ - dnf_repos loop: "{{ groups['cluster'] }}" when: dnf_repos_enabled | default(false) | bool + +# Workaround for setup-env.sh not running in CaaS environment, so: +# https://github.com/stackhpc/ansible-slurm-appliance/blob/ba9699267449fba58cd9c04c451759a914fd7144/ansible/validate.yml#L16 +# doesn't break CaaS platforms +- hosts: localhost + gather_facts: no + tasks: + - name: Prepare requirements.yml.last for galaxy validation + copy: + src: "{{ appliances_repository_root }}/requirements.yml" + dest: "{{ appliances_repository_root }}/requirements.yml.last" From 21ef880bf9116fd88754cc7a5e380a4dcc41b2d4 Mon Sep 17 00:00:00 2001 From: Matt Crees Date: Fri, 29 Aug 2025 17:03:03 +0100 Subject: [PATCH 04/50] Production end to end deployment docs (#678) * First draft of production end-to-end docs * Ubuntu Jammy is also supported * Add TODOs * Accomplish TODOs * Mention networks docs * NFS * Clarify image * Formatting changes * Apply suggestions from code review Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> * Suggestions from code review * Update docs/production.md Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> * Add git remote instructions * Update cookiecutter info * Link filesystems docs * Move tofu into define and deploy infra section * Reorganise configuration * Move tofu note --------- Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- docs/production.md | 499 
+++++++++++++++++++++++++++++++++------------ 1 file changed, 370 insertions(+), 129 deletions(-) diff --git a/docs/production.md b/docs/production.md index bcf4925..8808a56 100644 --- a/docs/production.md +++ b/docs/production.md @@ -1,158 +1,399 @@ # Production Deployments -This page contains some brief notes about differences between the default/demo -configuration (as described in the main [README.md](../README.md)) and -production-ready deployments. - -- Get it agreed up front what the cluster names will be. Changing this later - requires instance deletion/recreation. - -- At least two environments should be created using cookiecutter, which will derive from the `site` base environment: - - `production`: production environment - - `staging`: staging environment - - A `dev` environment should also be created if considered required, or this - can be left until later. - - In general only the `inventory/groups` file in the `site` environment is needed - - it can be modified as required to - enable features for all environments at the site. - -- To avoid divergence of configuration all possible overrides for group/role -vars should be placed in `environments/site/inventory/group_vars/all/*.yml` -unless the value really is environment-specific (e.g. DNS names for -`openondemand_servername`). - -- Where possible hooks should also be placed in `environments/site/hooks/` -and referenced from the `site` and `production` environments, e.g.: - - ```yaml - # environments/production/hooks/pre.yml: - - name: Import parent hook - import_playbook: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/../site/hooks/pre.yml" - ``` - -- When setting OpenTofu configurations: - - - Environment-specific variables (`cluster_name`) should be hardcoded - as arguments into the cluster module block at `environments/$ENV/tofu/main.tf`. - - Environment-independent variables (e.g. maybe `cluster_net` if the - same is used for staging and production) should be set as *defaults* - in `environments/site/tofu/variables.tf`, and then don't need to - be passed in to the module. +This page will guide you on how to create production-ready deployments. While +you can start right away with this guide, you may find it useful to try with a +demo deployment first, as described in the [main README](../README.md). + +## Prerequisites + +Before starting ensure that: + + - You have root access on the deploy host. + + - You can create instances from the [latest Slurm appliance + image](https://github.com/stackhpc/ansible-slurm-appliance/releases), + which already contains the required packages. This is built and tested in + StackHPC's CI. + + - You have an SSH keypair defined in OpenStack, with the private key + available on the deploy host. + + - Created instances have access to internet (note proxies can be setup + through the appliance if necessary). + + - Created instances have accurate/synchronised time (for VM instances this is + usually provided by the hypervisor; if not or for bare metal instances it + may be necessary to configure a time service via the appliance). + + - Three security groups are present: ``default`` allowing intra-cluster + communication, ``SSH`` allowing external access via SSH and ``HTTPS`` + allowing access for Open OnDemand. + + - Usually, you'll want to deploy the Slurm Appliance into its own dedicated + project. 
It's recommended that your OpenStack credentials are defined in a + [clouds.yaml](https://docs.openstack.org/python-openstackclient/latest/configuration/index.html#clouds-yaml) + file in a default location with the default cloud name of `openstack`. + +### Setup deploy host + +The following operating systems are supported for the deploy host: + + - Rocky Linux 9 + + - Rocky Linux 8 + +These instructions assume the deployment host is running Rocky Linux 8: + +```bash +sudo yum install -y git python38 +git clone https://github.com/stackhpc/ansible-slurm-appliance +cd ansible-slurm-appliance +git checkout ${latest-release-tag} +./dev/setup-env.sh +``` + +You will also need to install +[OpenTofu](https://opentofu.org/docs/intro/install/rpm/). + +## Version control + +A production deployment should be set up under version control, so you should +create a fork of this repo. + +First make an empty Git repository using your service of choice (e.g. GitHub or +GitLab), then execute the following commands to turn the new empty repository +into a copy of the ansible-slurm-appliance repository. + + ```bash + git clone https://github.com/stackhpc/ansible-slurm-appliance.git + cd ansible-slurm-appliance + ``` + +Maintain the existing origin remote as upstream, and create a new origin remote +for the repository location. + + ```bash + git remote rename origin upstream + git remote add origin git@/ansible-slurm-appliance.git + ``` + +You should use the [latest tagged +release](https://github.com/stackhpc/ansible-slurm-appliance/releases). v1.161 +has been used as an example here, make sure to change this. Do not use the +default main branch, as this may have features that are still works in +progress. + + ```bash + git checkout v1.161 + git checkout -b site/main + git push -u origin site/main + ``` + +## Environment setup + +Get it agreed up front what the cluster names will be. Changing this later +requires instance deletion/recreation. + +### Environments structure + +At least two environments should be created using cookiecutter, which will +derive from the `site` base environment: + - `production`: production environment + - `staging`: staging environment + +A `dev` environment should also be created if considered required, or this can +be left until later. + +In general only the `inventory/groups` file in the `site` environment is +needed; it can be modified as required to enable features for all environments +at the site. + +To ensure the `staging` environment provides a good test of the `production` +environment, wherever possible group/role vars should be placed in +`environments/site/inventory/group_vars/all/*.yml` unless the value really is +environment-specific (e.g. DNS names for `openondemand_servername`). + +Where possible hooks should also be placed in `environments/site/hooks/` +and referenced from the `production` and `staging` environments, e.g.: + + ```yaml + # environments/production/hooks/pre.yml: + - name: Import parent hook + import_playbook: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/../site/hooks/pre.yml" + ``` + +OpenTofu configurations are defined in the `site` environment and referenced +as a module by the site-specific cookie-cutter generated configurations. This +will have been generated for you already under +``environments/$ENV/tofu/main.tf``. + +### Cookiecutter instructions + +- Run the following from the repository root to activate the venv: + + ```bash + . 
venv/bin/activate + ``` + +- Use the `cookiecutter` template to create a new environment to hold your + configuration: + + ```bash + cd environments + cookiecutter ../cookiecutter + ``` + + and follow the prompts to complete the environment name and description. + + **NB:** In subsequent sections this new environment is referred to as `$ENV`. + +- Go back to the root folder and activate the new environment: + + ```bash + cd .. + . environments/$ENV/activate + ``` + + And generate secrets for it: + + ```bash + ansible-playbook ansible/adhoc/generate-passwords.yml + ``` + +## Define and deploy infrastructure + +Create an OpenTofu variables file to define the required infrastructure, e.g.: + + ``` + # environments/$ENV/tofu/terraform.tfvars + cluster_name = "mycluster" + cluster_networks = [ + { + network = "some_network" # * + subnet = "some_subnet" # * + } + ] + key_pair = "my_key" # * + control_node_flavor = "some_flavor_name" + login = { + # Arbitrary group name for these login nodes + interactive = { + nodes: ["login-0"] + flavor: "login_flavor_name" # * + } + } + cluster_image_id = "rocky_linux_9_image_uuid" + compute = { + # Group name used for compute node partition definition + general = { + nodes: ["compute-0", "compute-1"] + flavor: "compute_flavor_name" # * + } + } + ``` + +Variables marked `*` refer to OpenStack resources which must already exist. + +The above is a minimal configuration - for all variables and descriptions see +`environments/site/tofu/variables.tf`. + +Note that: + + - Environment-specific variables (`cluster_name`) should be hardcoded into + the cluster module block. + + - Environment-independent variables (e.g. maybe `cluster_net` if the same + is used for staging and production) should be set as *defaults* in + `environments/site/tofu/variables.tf`, and then don't need to be passed + in to the module. + +The cluster image used should match the release which you are deploying with. +Published images are described in the release notes +[here](https://github.com/stackhpc/ansible-slurm-appliance/releases). + +By default, the site OpenTofu configuration provisions two volumes and attaches +them to the control node: + - "$cluster_name-home" for NFS-shared home directories + - "$cluster_name-state" for monitoring and Slurm data +The volumes mean this data is persisted when the control node is rebuilt. +However if the cluster is destroyed with `tofu destroy` then the volumes will +also be deleted. This is undesirable for production environments and usually +also for staging environments. Therefore the volumes should be manually +created, e.g. via the CLI: + + ``` + openstack volume create --size 200 mycluster-home # size in GB + openstack volume create --size 100 mycluster-state + ``` + +and OpenTofu configured to use those volumes instead of managing them itself by +setting: + + ``` + home_volume_provisioning = "attach" + state_volume_provisioning = "attach" + ``` + +either for a specific environment within the cluster module block in +`environments/$ENV/tofu/main.tf`, or as the site default by changing the +default in `environments/site/tofu/variables.tf`. + +For a development environment allowing OpenTofu to manage the volumes using the +default value of `"manage"` for those varibles is usually appropriate, as it +allows for multiple clusters to be created with this environment. + +If no home volume at all is required because the home directories are provided +by a parallel filesystem (e.g. 
Manila) set + + ``` + home_volume_provisioning = "none" + ``` + +In this case the NFS share for home directories is automatically disabled. + +**NB:** To apply "attach" options to existing clusters, first remove the +volume(s) from the tofu state, e.g.: + + ``` + tofu state list # find the volume(s) + tofu state rm 'module.cluster.openstack_blockstorage_volume_v3.state[0]' + ``` + +This leaves the volume itself intact, but means OpenTofu "forgets" it. Then set +the "attach" options and run `tofu apply` again - this should show there are no +changes planned. + +A production deployment may have a more complex networking requirements than +just a simple network. See the [networks docs](networks.md) for details. + +If floating IPs are required for login nodes, create these in OpenStack and add +the IPs into the OpenTofu `login` definition. + +Consider enabling topology aware scheduling. This is currently only supported +if your cluster does not include any baremetal nodes. This can be enabled by: + 1. Creating Availability Zones in your OpenStack project for each physical + rack + 2. Setting the `availability_zone` fields of compute groups in your OpenTofu + configuration + 3. Adding the `compute` group as a child of `topology` in + `environments/$ENV/inventory/groups` + 4. (Optional) If you are aware of the physical topology of switches above the + rack-level, override `topology_above_rack_topology` in your groups vars + (see [topology docs](../ansible/roles/topology/README.md) for more detail) + +Consider whether mapping of baremetal nodes to ironic nodes is required. See +[PR 485](https://github.com/stackhpc/ansible-slurm-appliance/pull/485). + +To deploy this infrastructure, ensure the venv and the environment are +[activated](#cookiecutter-instructions) and run: + + ```bash + export OS_CLOUD=openstack + cd environments/$ENV/tofu/ + tofu init + tofu apply + ``` + +and follow the prompts. Note the OS_CLOUD environment variable assumes that +OpenStack credentials are defined using a +[clouds.yaml](https://docs.openstack.org/python-openstackclient/latest/configuration/index.html#clouds-yaml) +file in a default location with the default cloud name of `openstack`. + +By default, OpenTofu (and Terraform) +[limits](https://opentofu.org/docs/cli/commands/apply/#apply-options) the +number of concurrent operations to 10. This means that for example only 10 +ports or 10 instances can be deployed at once. This should be raised by +modifying `environments/$ENV/activate` to add a line like: + + ```bash + export TF_CLI_ARGS_apply="-parallelism=25" + ``` + +The value chosen should be the highest value demonstrated during testing. Note +that any time spent blocked due to this parallelism limit does not count +against the (un-overridable) internal OpenTofu timeout of 30 minutes + +## Configure appliance + +### Production configuration to consider - Vault-encrypt secrets. Running the `generate-passwords.yml` playbook creates a secrets file at `environments/$ENV/inventory/group_vars/all/secrets.yml`. - To ensure staging environments are a good model for production this should - generally be moved into the `site` environment. It should be encrypted - using [Ansible vault](https://docs.ansible.com/ansible/latest/user_guide/vault.html) - and then committed to the repository. + These should be created for each environment, and then be encrypted using + [Ansible vault](https://docs.ansible.com/ansible/latest/user_guide/vault.html) + and committed to the repository. 
- Ensure created instances have accurate/synchronised time. For VM instances this is usually provided by the hypervisor, but if not (or for bare metal - instances) it may be necessary to configure or proxy `chronyd` via an - environment hook. - -- By default, the site OpenTofu configuration provisions two - volumes and attaches them to the control node: - - "$cluster_name-home" for NFS-shared home directories - - "$cluster_name-state" for monitoring and Slurm data - The volumes mean this data is persisted when the control node is rebuilt. - However if the cluster is destroyed with `tofu destroy` then the volumes will - also be deleted. This is undesirable for production environments and usually - also for staging environments. Therefore the volumes should be manually - created, e.g. via the CLI: - - openstack volume create --size 200 mycluster-home # size in GB - openstack volume create --size 100 mycluster-state - - and OpenTofu configured to use those volumes instead of managing them itself - by setting: - - home_volume_provisioning = "attach" - state_volume_provisioning = "attach" - - either for a specific environment within the cluster module block in - `environments/$ENV/tofu/main.tf`, or as the site default by changing the - default in `environments/site/tofu/variables.tf`. - - For a development environment allowing OpenTofu to manage the volumes using - the default value of `"manage"` for those varibles is usually appropriate, as - it allows for multiple clusters to be created with this environment. - - If no home volume at all is required because the home directories are provided - by a parallel filesystem (e.g. manila) set - - home_volume_provisioning = "none" - - In this case the NFS share for home directories is automatically disabled. - - **NB:** To apply "attach" options to existing clusters, first remove the - volume(s) from the tofu state, e.g.: - - tofu state list # find the volume(s) - tofu state rm 'module.cluster.openstack_blockstorage_volume_v3.state[0]' - - This leaves the volume itself intact, but means OpenTofu "forgets" it. Then - set the "attach" options and run `tofu apply` again - this should show there - are no changes planned. + instances) it may be necessary to [configure chrony](./chrony.md). - Consider whether Prometheus storage configuration is required. By default: - A 200GB state volume is provisioned (but see above) - - The common environment [sets](../environments/common/inventory/group_vars/all/prometheus.yml) - a maximum retention of 100 GB and 31 days + - The common environment + [sets](../environments/common/inventory/group_vars/all/prometheus.yml) a + maximum retention of 100 GB and 31 days. These may or may not be appropriate depending on the number of nodes, the scrape interval, and other uses of the state volume (primarily the `slurmctld` - state and the `slurmdbd` database). See [docs/monitoring-and-logging](./monitoring-and-logging.md) - for more options. + state and the `slurmdbd` database). See + [docs/monitoring-and-logging](./monitoring-and-logging.md) for more options. - Configure Open OnDemand - see [specific documentation](openondemand.md) which notes specific variables required. -- Remove the `demo_user` user from `environments/$ENV/inventory/group_vars/all/basic_users.yml`. - Replace the `hpctests_user` in `environments/$ENV/inventory/group_vars/all/hpctests.yml` with - an appropriately configured user. +- Remove the `demo_user` user from + `environments/$ENV/inventory/group_vars/all/basic_users.yml`. 
Replace the + `hpctests_user` in `environments/$ENV/inventory/group_vars/all/hpctests.yml` + with an appropriately configured user. -- Consider whether having (read-only) access to Grafana without login is OK. If not, remove `grafana_auth_anonymous` in `environments/$ENV/inventory/group_vars/all/grafana.yml` +- Consider whether having (read-only) access to Grafana without login is OK. If + not, remove `grafana_auth_anonymous` in + `environments/$ENV/inventory/group_vars/all/grafana.yml` -- If floating IPs are required for login nodes, create these in OpenStack and add the IPs into - the OpenTofu `login` definition. +- See the [hpctests docs](../ansible/roles/hpctests/README.md) for advice on + raising `hpctests_hpl_mem_frac` during tests. -- Consider enabling topology aware scheduling. This is currently only supported if your cluster does not include any baremetal nodes. This can be enabled by: - 1. Creating Availability Zones in your OpenStack project for each physical rack - 2. Setting the `availability_zone` fields of compute groups in your OpenTofu configuration - 3. Adding the `compute` group as a child of `topology` in `environments/$ENV/inventory/groups` - 4. (Optional) If you are aware of the physical topology of switches above the rack-level, override `topology_above_rack_topology` in your groups vars - (see [topology docs](../ansible/roles/topology/README.md) for more detail) +- By default, OpenStack Nova + [limits](https://docs.openstack.org/nova/latest/configuration/config.html#DEFAULT.max_concurrent_builds) + the number of concurrent instance builds to 10. This is per Nova controller, + so 10x virtual machines per hypervisor. For baremetal nodes it is 10 per + cloud if the OpenStack version is earlier than Caracel, else this limit can + be raised using + [shards](https://specs.openstack.org/openstack/nova-specs/specs/2024.1/implemented/ironic-shards.html). + In general it should be possible to raise this value to 50-100 if the cloud + is properly tuned, again, demonstrated through testing. -- Consider whether mapping of baremetal nodes to ironic nodes is required. See - [PR 485](https://github.com/stackhpc/ansible-slurm-appliance/pull/485). +- Enable alertmanager if Slack is available - see + [docs/alerting.md](./alerting.md). -- Note [PR 473](https://github.com/stackhpc/ansible-slurm-appliance/pull/473) - may help identify any site-specific configuration. +- Enable node health checks - see + [ansible/roles/nhc/README.md](../ansible/roles/nhc/README.md). -- See the [hpctests docs](../ansible/roles/hpctests/README.md) for advice on - raising `hpctests_hpl_mem_frac` during tests. +- By default, the appliance uses a built-in NFS share backed by an OpenStack + volume for the cluster home directories. You may find that you want to change + this. The following alternatives are supported: -- By default, OpenTofu (and Terraform) [limits](https://opentofu.org/docs/cli/commands/apply/#apply-options) - the number of concurrent operations to 10. This means that for example only - 10 ports or 10 instances can be deployed at once. This should be raised by - modifying `environments/$ENV/activate` to add a line like: + - [CephFS via OpenStack Manila](./filesystems.md) + - [Lustre](../roles/lustre/README.md) - export TF_CLI_ARGS_apply="-parallelism=25" +- For some features, such as installing [DOCA-OFED](../roles/doca/README.md) or + [CUDA](../roles/cuda/README.md), you will need to build a custom image. It is + recommended that you build this on top of the latest existing openhpc image. 
+ See the [image-build docs](image-build.md) for details. - The value chosen should be the highest value demonstrated during testing. - Note that any time spent blocked due to this parallelism limit does not count - against the (un-overridable) internal OpenTofu timeout of 30 minutes +### Applying configuration -- By default, OpenStack Nova also [limits](https://docs.openstack.org/nova/latest/configuration/config.html#DEFAULT.max_concurrent_builds) - the number of concurrent instance builds to 10. This is per Nova controller, - so 10x virtual machines per hypervisor. For baremetal nodes it is 10 per cloud - if the OpenStack version is earlier than Caracel, else this limit can be - raised using [shards](https://specs.openstack.org/openstack/nova-specs/specs/2024.1/implemented/ironic-shards.html). - In general it should be possible to raise this value to 50-100 if the cloud - is properly tuned, again, demonstrated through testing. +To configure the appliance, ensure the venv and the environment are +[activated](#create-a-new-environment) and run: + + ```bash + ansible-playbook ansible/site.yml + ``` + +Once it completes you can log in to the cluster using: -- Enable alertmanager if Slack is available - see [docs/alerting.md](./alerting.md). + ```bash + ./dev/ansible-ssh login + ``` -- Enable node health checks - see [ansible/roles/nhc/README.md](../ansible/roles/nhc/README.md). +For further information, including additional configuration guides and +operations instructions, see the [docs](README.md) directory. From cbf990a8118c882af9429b3edfa290387ac45d28 Mon Sep 17 00:00:00 2001 From: Matt Crees Date: Thu, 4 Sep 2025 10:07:42 +0100 Subject: [PATCH 05/50] Fix inventory parsing of cookiecutter env (#768) Without any top-level inventory file, Ansible will fail with: ``` ERROR! 
Completely failed to parse inventory source /home/ubuntu/ansible-slurm-appliance/environments/$ENV/inventory ``` --- cookiecutter/{{cookiecutter.environment}}/inventory/hosts | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 cookiecutter/{{cookiecutter.environment}}/inventory/hosts diff --git a/cookiecutter/{{cookiecutter.environment}}/inventory/hosts b/cookiecutter/{{cookiecutter.environment}}/inventory/hosts new file mode 100644 index 0000000..e69de29 From 275da838fa3727cfd745d733573d04527df70198 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Thu, 4 Sep 2025 13:47:32 +0100 Subject: [PATCH 06/50] Refactor Pulp repo definitions and add more Pulp documentation (#760) * WIP: refactor repos definitions * add more repos and cope with CRB/PowerTools oddness * add epel * use pulp_server as a group * add epel default * wip: get pulp sync working * fixed sync * autodetect latest in adhoc script, refactored timestamps to allow gated ohpc repos, fixed pulp site * fixed distributions + ohpc repos * updated timestamps script + bumped rocky 9 timestamps * removed pulp_repo_name fields * updated docs, added gpg checks, simplified filters * Added pulp systemd file + removed unused vars * added READMEs + updated variable names * disabled gpg checks for dnf_repos * typo * fixed disable repos task * bump images * remove dnf_repos extra index/key and make epel/openhpc special-cases simpler * clarify pulp distro selection * fixup sync vars * fixup grafana vars * revert latest timestamp changes for extra key level * review suggestions Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> * updated README * docs tweaks * regularised group names * updated operations guide for functionality requiring additional installs * review changes from docs Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> * renamed timestamps.yml to dnf_repos_timestamps.yml --------- Co-authored-by: Steve Brasier Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- ansible/adhoc/deploy-pulp.yml | 17 +-- ansible/adhoc/sync-pulp.yml | 4 +- ansible/ci/update_timestamps.yml | 6 +- ansible/fatimage.yml | 2 +- ansible/filter_plugins/utils.py | 6 - ansible/library/latest_timestamps.py | 5 +- ansible/roles/dnf_repos/README.md | 40 ++++++ ansible/roles/dnf_repos/defaults/main.yml | 52 +------- .../roles/dnf_repos/tasks/disable_repos.yml | 25 ++-- ansible/roles/dnf_repos/tasks/set_repos.yml | 43 +++++-- ansible/roles/pulp_site/README.md | 36 ++++++ ansible/roles/pulp_site/defaults/main.yml | 47 +++---- ansible/roles/pulp_site/files/pulp.service | 12 ++ .../filter_plugins/pulp-list-filters.py | 68 +++++++--- ansible/roles/pulp_site/tasks/install.yml | 22 +++- ansible/roles/pulp_site/tasks/sync.yml | 22 ++-- ansible/roles/pulp_site/templates/cli.toml.j2 | 1 - docs/environments.md | 5 +- docs/experimental/pulp.md | 42 ++++++- docs/operations.md | 20 ++- .../tofu/cluster_image.auto.tfvars.json | 4 +- .../common/files/grafana/grafana.repo.j2 | 2 +- .../group_vars/all/dnf_repo_timestamps.yml | 116 ++++++++++++++++++ .../inventory/group_vars/all/dnf_repos.yml | 8 ++ .../inventory/group_vars/all/timestamps.yml | 88 ------------- environments/common/inventory/groups | 11 +- environments/site/inventory/groups | 9 ++ 27 files changed, 445 insertions(+), 268 deletions(-) create mode 100644 ansible/roles/dnf_repos/README.md create mode 100644 ansible/roles/pulp_site/README.md create mode 100644 
ansible/roles/pulp_site/files/pulp.service create mode 100644 environments/common/inventory/group_vars/all/dnf_repo_timestamps.yml create mode 100644 environments/common/inventory/group_vars/all/dnf_repos.yml delete mode 100644 environments/common/inventory/group_vars/all/timestamps.yml diff --git a/ansible/adhoc/deploy-pulp.yml b/ansible/adhoc/deploy-pulp.yml index 2858d03..f7bafc3 100644 --- a/ansible/adhoc/deploy-pulp.yml +++ b/ansible/adhoc/deploy-pulp.yml @@ -1,15 +1,6 @@ -# Usage: ansible-playbook ansible/adhoc/deploy-pulp.yml -e "pulp_server=" - -- name: Add temporary pulp server host - hosts: localhost - tasks: - - ansible.builtin.add_host: - name: "{{ pulp_server }}" - group: "_pulp_host" - -- name: Install pulp on server and add to config +- name: Install pulp on server become: yes - hosts: _pulp_host + hosts: pulp_server tasks: - name: Install pulp ansible.builtin.include_role: @@ -22,5 +13,5 @@ debug: msg: | Server configured, override 'appliances_pulp_url' with - appliances_pulp_url: "http://{{ pulp_server }}:{{ pulp_site_port }}" - in your environments + appliances_pulp_url: "http://{{ hostvars[groups['pulp_server'] | first].ansible_host }}:{{ pulp_site_port }}" + (or the correct IP if multi-homed) in your environments diff --git a/ansible/adhoc/sync-pulp.yml b/ansible/adhoc/sync-pulp.yml index b2cd9a8..373f3ab 100644 --- a/ansible/adhoc/sync-pulp.yml +++ b/ansible/adhoc/sync-pulp.yml @@ -6,5 +6,5 @@ vars: pulp_site_target_arch: "x86_64" pulp_site_target_distribution: "rocky" - pulp_site_target_distribution_version: "9.5" - pulp_site_target_distribution_version_major: "9" + # default distribution to *latest* specified for baseos repo: + pulp_site_target_distribution_version: "{{ dnf_repos_repos['baseos'].keys() | map('float') | sort | last }}" diff --git a/ansible/ci/update_timestamps.yml b/ansible/ci/update_timestamps.yml index e9a455a..8db4757 100644 --- a/ansible/ci/update_timestamps.yml +++ b/ansible/ci/update_timestamps.yml @@ -2,15 +2,15 @@ tasks: - name: Get latest timestamps from sources latest_timestamps: - repos_dict: "{{ appliances_pulp_repos }}" + repos_dict: "{{ dnf_repos_default }}" content_url: "https://ark.stackhpc.com/pulp/content" register: _result - name: Overwrite repo timestamps with latest ansible.builtin.copy: - dest: "{{ appliances_repository_root }}/environments/common/inventory/group_vars/all/timestamps.yml" + dest: "{{ appliances_repository_root }}/environments/common/inventory/group_vars/all/dnf_repo_timestamps.yml" content: "{{ repo_template | to_nice_yaml(indent=2) }}" backup: true vars: repo_template: - appliances_pulp_repos: "{{ _result.timestamps }}" + dnf_repos_default: "{{ _result.timestamps }}" diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 839c8dc..46a99bc 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -18,7 +18,7 @@ when: hook_path | exists - name: Sync pulp repos with upstream - hosts: pulp + hosts: pulp_site tasks: - ansible.builtin.include_role: name: pulp_site diff --git a/ansible/filter_plugins/utils.py b/ansible/filter_plugins/utils.py index b5b92ed..42b7107 100644 --- a/ansible/filter_plugins/utils.py +++ b/ansible/filter_plugins/utils.py @@ -61,11 +61,6 @@ def to_ood_regex(items): r = ['(%s)' % v for v in r] return '|'.join(r) -def appliances_repo_to_subpath(repo_entry): - """ Take an element from appliances_pulp_repos and convert it to a pulp path. 
This assumes that the remote and local pulp structures are the same - """ - return repo_entry['path'] + '/' + repo_entry['timestamp'] - class FilterModule(object): ''' Ansible core jinja2 filters ''' @@ -81,5 +76,4 @@ def filters(self): 'exists': exists, 'warn': self.warn, 'to_ood_regex': to_ood_regex, - 'appliances_repo_to_subpath': appliances_repo_to_subpath } diff --git a/ansible/library/latest_timestamps.py b/ansible/library/latest_timestamps.py index 6407ef0..0de3883 100644 --- a/ansible/library/latest_timestamps.py +++ b/ansible/library/latest_timestamps.py @@ -56,13 +56,12 @@ def run_module(): for version in timestamps[repo]: html_txt = requests.get( - url= module.params['content_url'] + '/' + timestamps[repo][version]['path'] + url= module.params['content_url'] + '/' + timestamps[repo][version]['pulp_path'] ).text timestamp_link_list = BeautifulSoup(html_txt,features="html.parser").body.find('pre').find_all() # getting raw list of timestamps from html timestamp_link_list = map(lambda x: x.string,timestamp_link_list) # stripping xml tags latest_timestamp = list(timestamp_link_list)[-1][:-1] # last timestamp in list with trailing / removed - timestamps[repo][version]['timestamp'] = latest_timestamp - + timestamps[repo][version]['pulp_timestamp'] = latest_timestamp result['timestamps'] = dict(sorted(timestamps.items())) module.exit_json(**result) diff --git a/ansible/roles/dnf_repos/README.md b/ansible/roles/dnf_repos/README.md new file mode 100644 index 0000000..ff22c79 --- /dev/null +++ b/ansible/roles/dnf_repos/README.md @@ -0,0 +1,40 @@ +dnf_repos +========= + +Modifies repo definitions for repofiles in `/etc/yum.repos.d` to point to snapshots in StackHPC's Ark Pulp server or mirrors of them +on a local Pulp server. + +Requirements +------------ + +Requires Ark credentials if using StackHPC's upstream Ark server. + +Role Variables +-------------- + +Variables in this role are also required by `pulp_site` so set in +`environments/common/inventory/groups_vars/all/dnf_repos.yml`. See that file for detailed default values. + +- `dnf_repos_repos`: Dict of dicts containing information to construct URLs for Ark snapshots from the target Pulp server for each Rocky version. For example: + ``` + dnf_repos_repos: + appstream: # ansible.builtin.yum_repository:name + '8.10': # ansible_distribution_version or ansible_distribution_major_version + repo_file: Rocky-AppStream # yum_repository: file + # repo_name: # optional, override yum_repository:name + pulp_path: rocky/8.10/AppStream/x86_64/os # The subpath of the the upstream Ark server's content endpoint URL for the repo's snapshots, see https://ark.stackhpc.com/pulp/content/ + pulp_timestamp: 20250614T013846 + # pulp_content_url: # optional, dnf_repos_pulp_content_url + '9.6': + ... + ``` +- `dnf_repos_default`: Appliance default repos to use Ark snapshots for. Following same format as `dnf_repos_repos`. + See for appliance default repo list `environments/common/inventory/group_vars/all/dnf_repo_timestamps.yml`. +- `dnf_repos_extra`: Additional repos to use Ark snapshots for. Follows same format as + `dnf_repos_repos`. Defaults to `{}` +- `dnf_repos_pulp_content_url`: Optional str. Content URL of Pulp server to use Ark snapshots from. + Defaults to `{{ appliances_pulp_url }}/pulp/content` +- `dnf_repos_username`: Optional str. Username for Ark. Should be set if using upstream StackHPC Ark + Pulp server, but omitted if using local Pulp server (see `ansible/roles/pulp_site`) +- `dnf_repos_password`: Optional str. Password for Ark. 
Should be set if using upstream StackHPC Ark + Pulp server, but omitted if using local Pulp server (see `ansible/roles/pulp_site`) diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml index 9302eff..fe3c44e 100644 --- a/ansible/roles/dnf_repos/defaults/main.yml +++ b/ansible/roles/dnf_repos/defaults/main.yml @@ -1,54 +1,4 @@ +dnf_repos_repos: {} # see environments/common/inventory/group_vars/all/{dnf_repos,timestamps}.yml dnf_repos_pulp_content_url: "{{ appliances_pulp_url }}/pulp/content" dnf_repos_username: "{{ omit }}" dnf_repos_password: "{{ omit }}" - -dnf_repos_filenames: - '8': - baseos: 'Rocky-BaseOS' - appstream: 'Rocky-AppStream' - crb: 'Rocky-PowerTools' - extras: 'Rocky-Extras' - grafana: 'grafana' - '9': - baseos: 'rocky' - appstream: 'rocky' - crb: 'rocky' - extras: 'rocky-extras' - grafana: 'grafana' - -dnf_repos_version_filenames: "{{ dnf_repos_filenames[ansible_distribution_major_version] }}" - -# epel installed separately -dnf_repos_default_repolist: -- file: "{{ dnf_repos_version_filenames.baseos }}" - name: baseos - base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.baseos[ansible_distribution_version] | appliances_repo_to_subpath }}" -- file: "{{ dnf_repos_version_filenames.appstream }}" - name: appstream - base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.appstream[ansible_distribution_version] | appliances_repo_to_subpath }}" -- file: "{{ dnf_repos_version_filenames.crb }}" - name: "{{ 'powertools' if ansible_distribution_major_version == '8' else 'crb' }}" - base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.crb[ansible_distribution_version] | appliances_repo_to_subpath }}" -- file: "{{ dnf_repos_version_filenames.extras }}" - name: extras - base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.extras[ansible_distribution_version] | appliances_repo_to_subpath }}" -- file: ceph - name: Ceph - base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.ceph[ansible_distribution_major_version] | appliances_repo_to_subpath }}" -- file: "{{ dnf_repos_version_filenames.grafana }}" - name: grafana - base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.grafana[ansible_distribution_major_version] | appliances_repo_to_subpath }}" - -dnf_repos_openhpc_repolist: -- name: OpenHPC - file: OpenHPC - base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.openhpc_base[ansible_distribution_major_version] | appliances_repo_to_subpath }}" -- name: OpenHPC-updates - file: OpenHPC - base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.openhpc_updates[ansible_distribution_major_version] | appliances_repo_to_subpath }}" - -dnf_repos_extra_repolist: [] -dnf_repos_repolist: "{{ dnf_repos_default_repolist + (dnf_repos_openhpc_repolist if (openhpc_install_type | default('ohpc')) == 'ohpc' else []) + dnf_repos_extra_repolist }}" - -dnf_repos_epel_baseurl: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.epel[ansible_distribution_major_version] | appliances_repo_to_subpath }}" -dnf_repos_epel_description: "epel" diff --git a/ansible/roles/dnf_repos/tasks/disable_repos.yml b/ansible/roles/dnf_repos/tasks/disable_repos.yml index 9f8abe6..4db073b 100644 --- a/ansible/roles/dnf_repos/tasks/disable_repos.yml +++ b/ansible/roles/dnf_repos/tasks/disable_repos.yml @@ -1,21 +1,20 @@ --- - name: Remove password and disable Pulp repos ansible.builtin.yum_repository: - file: "{{ item.file }}" - name: "{{ item.name }}" - baseurl: 
"{{ item.base_url }}" - description: "{{ item.name }}" + file: "{{ repo_values.repo_file }}" + name: "{{ repo_name }}" + baseurl: "{{ repo_content_url }}/{{ repo_values.pulp_path }}/{{ repo_values.pulp_timestamp }}" + description: "{{ repo_name }}" enabled: false - loop: "{{ dnf_repos_repolist }}" - -- name: Remove password and disable EPEL repo - ansible.builtin.yum_repository: - name: epel - file: epel - description: "{{ dnf_repos_epel_description }}" - baseurl: "{{ dnf_repos_epel_baseurl }}" gpgcheck: false - enabled: false + loop: "{{ dnf_repos_repos | dict2items }}" + loop_control: + label: "{{ repo_name }}[{{ repo_os }}]: {{ repo_values }}" + vars: + repo_os: "{{ ansible_distribution_version if ansible_distribution_version in item.value else ansible_distribution_major_version }}" + repo_values: "{{ item.value[repo_os] }}" + repo_name: "{{ repo_values.repo_name | default(item.key) }}" + repo_content_url: "{{ repo_values.pulp_content_url | default(dnf_repos_pulp_content_url) }}" - name: Get all repo files ansible.builtin.find: diff --git a/ansible/roles/dnf_repos/tasks/set_repos.yml b/ansible/roles/dnf_repos/tasks/set_repos.yml index c9fcb0c..2db4de9 100644 --- a/ansible/roles/dnf_repos/tasks/set_repos.yml +++ b/ansible/roles/dnf_repos/tasks/set_repos.yml @@ -1,27 +1,44 @@ --- -- name: Replace system repos with Pulp repos +- name: Replace non-epel repos with Pulp repos ansible.builtin.yum_repository: - file: "{{ item.file }}" - name: "{{ item.name }}" - baseurl: "{{ item.base_url }}" - description: "{{ item.name }}" + file: "{{ repo_values.repo_file }}" + name: "{{ repo_name }}" + baseurl: "{{ repo_content_url }}/{{ repo_values.pulp_path }}/{{ repo_values.pulp_timestamp }}" + description: "{{ repo_name }}" username: "{{ dnf_repos_username }}" password: "{{ dnf_repos_password }}" gpgcheck: false - loop: "{{ dnf_repos_repolist }}" + loop: "{{ dnf_repos_repos | dict2items }}" + loop_control: + label: "{{ repo_name }}[{{ repo_os }}]: {{ repo_values }}" + when: repo_name != 'epel' + vars: + repo_os: "{{ ansible_distribution_version if ansible_distribution_version in item.value else ansible_distribution_major_version }}" + repo_values: "{{ item.value[repo_os] }}" + repo_name: "{{ repo_values.repo_name | default(item.key) }}" + repo_content_url: "{{ repo_values.pulp_content_url | default(dnf_repos_pulp_content_url) }}" - name: Install epel-release - # done so that roles installing epel via epel-release don't over-write our changes to the epel repo + # So roles installing epel via epel-release don't overwrite changes to the epel repo below ansible.builtin.dnf: name: epel-release -- name: Use Pulp EPEL repo +- name: Replace epel repo with Pulp repo ansible.builtin.yum_repository: - name: epel - file: epel - description: "{{ dnf_repos_epel_description }}" - gpgcheck: false - baseurl: "{{ dnf_repos_epel_baseurl }}" + file: "{{ repo_values.repo_file }}" + name: "{{ repo_name }}" + baseurl: "{{ repo_content_url }}/{{ repo_values.pulp_path }}/{{ repo_values.pulp_timestamp }}" + description: "{{ repo_name }}" username: "{{ dnf_repos_username }}" password: "{{ dnf_repos_password }}" + gpgcheck: false + loop: "{{ dnf_repos_repos | dict2items }}" + loop_control: + label: "{{ repo_name }}[{{ repo_os }}]: {{ repo_values }}" + when: repo_name == 'epel' + vars: + repo_os: "{{ ansible_distribution_version if ansible_distribution_version in item.value else ansible_distribution_major_version }}" + repo_values: "{{ item.value[repo_os] }}" + repo_name: "{{ repo_values.repo_name | default(item.key) }}" + 
repo_content_url: "{{ repo_values.pulp_content_url | default(dnf_repos_pulp_content_url) }}" diff --git a/ansible/roles/pulp_site/README.md b/ansible/roles/pulp_site/README.md new file mode 100644 index 0000000..3af801c --- /dev/null +++ b/ansible/roles/pulp_site/README.md @@ -0,0 +1,36 @@ +pulp_site +========= + +Contains playbooks to deploy a Pulp server and sync its content with repo snapshots in +StackHPC's Ark Pulp server + +Requirements +------------ + +Requires Ark credentials. The VM you are deploying Pulp on must allow ingress on `pulp_site_port` +and not be externally accessible (as the Pulp server's content is unauthenticated). Rocky Linux 9 has been +tested as the target VM for deploying Pulp. + +Role Variables +-------------- + +- `pulp_site_url`: Required str. The base url from which Pulp content will be hosted. Defaults to `{{ appliances_pulp_url }}`. + Value to set for ``appliances_pulp_url` will be generated and output by the deploy.yml playbook. +- `pulp_site_port`: Optional str. Port to serve Pulp server on. Defaults to `8080`. +- `pulp_site_username`: Optional str. Admin username for the Pulp server. Defaults to `admin`. +- `pulp_site_password`: Required str. Admin password for the Pulp server. Defaults to `{{ vault_pulp_admin_password }}`. +- `pulp_site_upstream_username`: Required str. Username for accessing content from the upstream Ark Pulp server. +- `pulp_site_upstream_password`: Required str. Password for upstream Ark Pulp server. +- `pulp_site_upstream_content_url`: Optional str. Content URL of upstream Ark Pulp. Defaults to `https://ark.stackhpc.com/pulp/content`. +- `pulp_site_install_dir`: Optional str. Directory on Pulp host to install config and persistent state to be mounted into Pulp container. Defaults to `/home/rocky/pulp`. +- `pulp_site_target_facts`: Optional str. The `ansible_facts` of a host which will be pulling from your Pulp server, allowing the role to auto-discover the necessary repos to pull. + defaults to `{{ hostvars[groups['pulp'][0]]['ansible_facts'] }}`. +- `pulp_site_target_distribution_version`: Optional str. The Rocky Linux minor release to sync repos from Ark for. Defaults to `{{ pulp_site_target_facts['distribution_version'] }}`. +- `pulp_site_rpm_repo_defaults`: Optional dict. Contains key value pairs for fields which are common to all repo definition in `pulp_site_rpm_repos`. Includes values for `remote_username`, + `remote_password` and `policy` by default. +- `pulp_site_rpm_repos`: Optional list of dicts. List of repo definitions in format required by the `stackhpc.pulp.pulp_repository`. Defaults to modified versions of repos defined in + `dnf_repos_all`. +- `pulp_site_rpm_publications`: Optional list of dicts. List of repo definitions in format required by the `stackhpc.pulp.pulp_publication`. Defaults to list of publications for repos defined in + `dnf_repos_all`. +- `pulp_site_rpm_distributions`: Optional list of dicts. List of repo definitions in format required by the `stackhpc.pulp.pulp_distribution`. Defaults to list of distributions for repos defined in + `dnf_repos_all`. 
diff --git a/ansible/roles/pulp_site/defaults/main.yml b/ansible/roles/pulp_site/defaults/main.yml index d30d1bd..3d2bce7 100644 --- a/ansible/roles/pulp_site/defaults/main.yml +++ b/ansible/roles/pulp_site/defaults/main.yml @@ -2,43 +2,30 @@ pulp_site_url: "{{ appliances_pulp_url }}" pulp_site_port: 8080 pulp_site_username: admin # shouldn't be changed pulp_site_password: "{{ vault_pulp_admin_password }}" +# See environments/common/inventory/groups_vars/all/pulp.yml +# pulp_site_upstream_username: +# pulp_site_upstream_password: pulp_site_upstream_content_url: https://ark.stackhpc.com/pulp/content -pulp_site_default_upstream_suffix: "{{ pulp_site_target_arch }}/os" -pulp_site_validate_certs: false pulp_site_install_dir: '/home/rocky/pulp' -pulp_site_selinux_suffix: "{{ ':Z' if ansible_selinux.status == 'enabled' else '' }}" +_pulp_site_selinux_suffix: "{{ ':Z' if ansible_selinux.status == 'enabled' else '' }}" pulp_site_target_facts: "{{ hostvars[groups['pulp'][0]]['ansible_facts'] }}" -pulp_site_target_distribution_version: "{{ pulp_site_target_facts['distribution_version'] }}" -pulp_site_target_distribution_version_major: "{{ pulp_site_target_facts['distribution_major_version'] }}" - -pulp_site_rpm_info: -- name: "baseos-{{ pulp_site_target_distribution_version }}-{{ appliances_pulp_repos.baseos[pulp_site_target_distribution_version].timestamp }}" - subpath: "{{ appliances_pulp_repos.baseos[pulp_site_target_distribution_version] | appliances_repo_to_subpath }}" -- name: "appstream-{{ pulp_site_target_distribution_version }}-{{ appliances_pulp_repos.appstream[pulp_site_target_distribution_version].timestamp }}" - subpath: "{{ appliances_pulp_repos.appstream[pulp_site_target_distribution_version] | appliances_repo_to_subpath }}" -- name: "crb-{{ pulp_site_target_distribution_version }}-{{ appliances_pulp_repos.crb[pulp_site_target_distribution_version].timestamp }}" - subpath: "{{ appliances_pulp_repos.crb[pulp_site_target_distribution_version] | appliances_repo_to_subpath }}" -- name: "extras-{{ pulp_site_target_distribution_version }}-{{ appliances_pulp_repos.extras[pulp_site_target_distribution_version].timestamp }}" - subpath: "{{ appliances_pulp_repos.extras[pulp_site_target_distribution_version] | appliances_repo_to_subpath }}" -- name: "epel-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.epel[pulp_site_target_distribution_version_major].timestamp }}" - subpath: "{{ appliances_pulp_repos.epel[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" -- name: "ohpc-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.openhpc_base[pulp_site_target_distribution_version_major].timestamp }}" - subpath: "{{ appliances_pulp_repos.openhpc_base[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" -- name: "ohpc-updates-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.openhpc_updates[pulp_site_target_distribution_version_major].timestamp }}" - subpath: "{{ appliances_pulp_repos.openhpc_updates[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" -- name: "ceph-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.ceph[pulp_site_target_distribution_version_major].timestamp }}" - subpath: "{{ appliances_pulp_repos.ceph[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" -- name: "grafana-{{ pulp_site_target_distribution_version_major }}-{{ 
appliances_pulp_repos.grafana.timestamp[pulp_site_target_distribution_version_major].timestamp }} - subpath: "{{ appliances_pulp_repos.grafana[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" +pulp_site_target_distribution_version: "{{ pulp_site_target_facts['distribution_version'] }}" # TODO: how to set automatically? pulp_site_rpm_repo_defaults: remote_username: "{{ pulp_site_upstream_username }}" remote_password: "{{ pulp_site_upstream_password }}" policy: on_demand - state: present -_pulp_site_rpm_info_all: "{{ pulp_site_rpm_info | map('combine', pulp_site_rpm_repo_defaults) }}" +_pulp_site_rpm_info: | + {{ + dnf_repos_repos | + select_repos(pulp_site_target_distribution_version) + }} +pulp_site_rpm_repos: | + {{ + _pulp_site_rpm_info | + to_rpm_repos(pulp_site_upstream_content_url, pulp_site_rpm_repo_defaults) + }} -pulp_site_rpm_repos: "{{ _pulp_site_rpm_info_all | to_rpm_repos(pulp_site_upstream_content_url) }}" -pulp_site_rpm_publications: "{{ _pulp_site_rpm_info_all | to_rpm_pubs }}" -pulp_site_rpm_distributions: "{{ _pulp_site_rpm_info_all | to_rpm_distros }}" +pulp_site_rpm_publications: "{{ _pulp_site_rpm_info | to_rpm_pubs }}" +pulp_site_rpm_distributions: "{{ _pulp_site_rpm_info | to_rpm_distros }}" diff --git a/ansible/roles/pulp_site/files/pulp.service b/ansible/roles/pulp_site/files/pulp.service new file mode 100644 index 0000000..464961d --- /dev/null +++ b/ansible/roles/pulp_site/files/pulp.service @@ -0,0 +1,12 @@ +# Adapted from https://grimoire.carcano.ch/blog/installing-pulp3-as-a-container/ +[Unit] +Description=Pulp +Wants=syslog.service + +[Service] +Restart=always +ExecStart=/usr/bin/podman start -a pulp +ExecStop=/usr/bin/podman stop -t 15 pulp + +[Install] +WantedBy=multi-user.target diff --git a/ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py b/ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py index 50e9126..41e995c 100644 --- a/ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py +++ b/ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py @@ -3,29 +3,61 @@ def filters(self): return { 'to_rpm_repos': self.to_rpm_repos, 'to_rpm_pubs': self.to_rpm_pubs, - 'to_rpm_distros': self.to_rpm_distros + 'to_rpm_distros': self.to_rpm_distros, + 'select_repos': self.select_repos, } - - def to_rpm_repos(self, list, pulp_url): - repo_list = map(lambda x: { - 'name': x['name'], - 'url': pulp_url+'/'+x['subpath'], - 'remote_username': x['remote_username'], - 'remote_password': x['remote_password'], - 'policy': x['policy'], - 'state': x['state'] }, list) - return repo_list + def select_repos(self, dnf_repos, target_distro_ver): + """ Filter dnf_repos to only those for a relevant distribution version (M.m or M). Returns a list of dicts. 
+ Also adds pulp_repo_name field to give the repository a unique name in Pulp to be referenced by subsequent + filters + """ + + target_distro_ver_major = target_distro_ver.split('.')[0] + + rpm_repos = [] + for repokey in dnf_repos: + # select either the matching major.minor or major version: + if target_distro_ver in dnf_repos[repokey]: + selected_ver = target_distro_ver + elif target_distro_ver_major in dnf_repos[repokey]: + selected_ver = target_distro_ver_major + else: + raise ValueError(f'No key matching {target_distro_ver_major} or {target_distro_ver} found in f{repokey}') + repo_data = dnf_repos[repokey][selected_ver] + repo_data['pulp_repo_name'] = f"{repokey}-{selected_ver}-{dnf_repos[repokey][selected_ver]['pulp_timestamp']}" + rpm_repos.append(repo_data) + return rpm_repos + + def to_rpm_repos(self, rpm_info, content_url, repo_defaults): + """ Filter repo object list given by select_repos into dict required by the pulp_repository_rpm_repos variable + from stackhpc.pulp.pulp_repository role + """ + rpm_repos = [] + for repo_data in rpm_info: + rpm_data = repo_defaults.copy() # NB: this changes behaviour vs before, so now defaults can correctly be overriden + rpm_data['name'] = repo_data['pulp_repo_name'] + rpm_data['url'] = '/'.join([content_url, repo_data['pulp_path'], repo_data['pulp_timestamp']]) + rpm_data['state'] = 'present' + rpm_repos.append(rpm_data) + return rpm_repos + def to_rpm_pubs(self, list): + """ Filter repo object list given by select_repos into dict required by the pulp_publication_rpm variable + from stackhpc.pulp.pulp_publication role + """ pub_list = map(lambda x: { - 'repository': x['name'], - 'state': x['state'] }, list) + 'repository': x['pulp_repo_name'], + 'state': 'present' }, list) return pub_list def to_rpm_distros(self, list): + """ Filter repo object list given by select_repos into dict required by the pulp_distirubtion_rpm variable + from stackhpc.pulp.pulp_distribution role + """ distro_list = map(lambda x: { - 'name': x['name'], - 'repository': x['name'], - 'base_path': x['subpath'], - 'state': x['state'] }, list) - return distro_list \ No newline at end of file + 'name': x['pulp_repo_name'], + 'repository': x['pulp_repo_name'], + 'base_path': '/'.join([x['pulp_path'],x['pulp_timestamp']]), + 'state': 'present' }, list) + return distro_list diff --git a/ansible/roles/pulp_site/tasks/install.yml b/ansible/roles/pulp_site/tasks/install.yml index 39b4fcd..75b0f66 100644 --- a/ansible/roles/pulp_site/tasks/install.yml +++ b/ansible/roles/pulp_site/tasks/install.yml @@ -26,13 +26,27 @@ publish: - "{{ pulp_site_port }}:80" volume: - - "{{ pulp_site_install_dir }}/settings:/etc/pulp{{ pulp_site_selinux_suffix }}" - - "{{ pulp_site_install_dir }}/pulp_storage:/var/lib/pulp{{ pulp_site_selinux_suffix }}" - - "{{ pulp_site_install_dir }}/pgsql:/var/lib/pgsql{{ pulp_site_selinux_suffix }}" - - "{{ pulp_site_install_dir }}/containers:/var/lib/containers{{ pulp_site_selinux_suffix }}" + - "{{ pulp_site_install_dir }}/settings:/etc/pulp{{ _pulp_site_selinux_suffix }}" + - "{{ pulp_site_install_dir }}/pulp_storage:/var/lib/pulp{{ _pulp_site_selinux_suffix }}" + - "{{ pulp_site_install_dir }}/pgsql:/var/lib/pgsql{{ _pulp_site_selinux_suffix }}" + - "{{ pulp_site_install_dir }}/containers:/var/lib/containers{{ _pulp_site_selinux_suffix }}" device: /dev/fuse image: docker.io/pulp/pulp:3.68.1 + state: present +- name: Create systemd file + copy: + src: pulp.service + dest: /etc/systemd/system/pulp.service + register: _pulp_service + +- name: Start Pulp service + 
systemd: + name: pulp + state: "{{ 'started' if _pulp_service.changed else 'restarted' }}" + daemon_reload: "{{ _pulp_service.changed }}" + enabled: true + - name: Reset admin password once container has initialised no_log: true ansible.builtin.shell: diff --git a/ansible/roles/pulp_site/tasks/sync.yml b/ansible/roles/pulp_site/tasks/sync.yml index 5ef2bc5..9a2a932 100644 --- a/ansible/roles/pulp_site/tasks/sync.yml +++ b/ansible/roles/pulp_site/tasks/sync.yml @@ -3,17 +3,7 @@ - ansible.builtin.assert: that: pulp_site_upstream_password != '' quiet: true - fail_msg: "Upstream password not set. Either set env var ARK_PASSWORD or override pulp_site_upstream_password." - -- name: Wait for Pulp server - pulp.squeezer.status: - pulp_url: "{{ pulp_site_url }}" - username: "{{ pulp_site_username }}" - password: "{{ pulp_site_password }}" - register: _pulp_status - until: _pulp_status.failed == false - retries: 30 - delay: 20 + fail_msg: "Upstream password not set. Ensure `pulp_site_upstream_username` and `pulp_site_upstream_password` are overriden to your Ark credentials." - name: Ensure Pulp CLI config directory exists ansible.builtin.file: @@ -27,6 +17,16 @@ dest: ~/.config/pulp/cli.toml mode: '0644' +- name: Wait for Pulp server + pulp.squeezer.status: + pulp_url: "{{ pulp_site_url }}" + username: "{{ pulp_site_username }}" + password: "{{ pulp_site_password }}" + register: _pulp_status + until: _pulp_status.failed == false + retries: 30 + delay: 20 + - block: - name: Ensure squeezer cache exists ansible.builtin.file: diff --git a/ansible/roles/pulp_site/templates/cli.toml.j2 b/ansible/roles/pulp_site/templates/cli.toml.j2 index 0686790..c67dcf3 100644 --- a/ansible/roles/pulp_site/templates/cli.toml.j2 +++ b/ansible/roles/pulp_site/templates/cli.toml.j2 @@ -4,7 +4,6 @@ username = "{{ pulp_site_username }}" password = "{{ pulp_site_password }}" api_root = "/pulp/" domain = "default" -headers = [] cert = "" key = "" verify_ssl = true diff --git a/docs/environments.md b/docs/environments.md index d1c4923..183b775 100644 --- a/docs/environments.md +++ b/docs/environments.md @@ -14,7 +14,10 @@ All environments load the inventory from the `common` environment first, with th The ansible inventory for the environment is in `environments//inventory/`. It should generally contain: - A `hosts` file. This defines the hosts in the appliance. Generally it should be templated out by the deployment automation so it is also a convenient place to define variables which depend on the deployed hosts such as connection variables, IP addresses, ssh proxy arguments etc. -- A `groups` file defining ansible groups, which essentially controls which features of the appliance are enabled and where they are deployed. This repository generally follows a convention where functionality is defined using ansible roles applied to a group of the same name, e.g. `openhpc` or `grafana`. The meaning and use of each group is described in comments in `environments/common/inventory/groups`. As the groups defined there for the common environment are empty, functionality is disabled by default and must be enabled in a specific environment's `groups` file. Two template examples are provided in `environments/commmon/layouts/` demonstrating a minimal appliance with only the Slurm cluster itself, and an appliance with all functionality. +- A `groups` file defining ansible groups, which essentially controls which features of the appliance are enabled and where they are deployed. 
This repository generally follows a convention where functionality is defined using ansible roles applied to a group +of the same name, e.g. `openhpc` or `grafana`. The meaning and use of each group is described in comments in `environments/common/inventory/groups`. As the groups defined there for the common environment are empty, functionality is disabled by default and must be +enabled in a specific environment's `groups` file. The `site` environment contains an ini file at `environments/site/inventory/groups` which enables groups for default appliance functionality across all environments. Additional groups should generally also be +enabled in this file to avoid divergence between staging and production environments. Note that enabling some groups may require a site-specific image build and Ark credentials (see [operations guide](operations.md)). - Optionally, group variable files in `group_vars//overrides.yml`, where the group names match the functional groups described above. These can be used to override the default configuration for each functionality, which are defined in `environments/common/inventory/group_vars/all/.yml` (the use of `all` here is due to ansible's precedence rules). Although most of the inventory uses the group convention described above there are a few special cases: diff --git a/docs/experimental/pulp.md b/docs/experimental/pulp.md index c6b437d..582eec9 100644 --- a/docs/experimental/pulp.md +++ b/docs/experimental/pulp.md @@ -5,13 +5,47 @@ In order to ensure reproducible builds, the appliance can build images using rep ## Deploying/configuring Pulp Server ### Deploying a Pulp server -A playbook is provided to install and configure a Pulp server on a given host. Admin credentials for this server are automatically generated through the `ansible/adhoc/generate-passwords.yml` playbook. This can be run with -`ansible-playbook ansible/adhoc/deploy-pulp.yml -e "pulp_server="` -where `target_host` is any resolvable host. This will print a Pulp URL which can be copied to your environments as appropriate. Ensure that the server is accessible on the specified port. Note access to this server's content isn't authenticated so assumes the server is deployed behind a secure network. +A playbook is provided to install and configure a Pulp server on a given host. Admin credentials for this server are automatically generated through the `ansible/adhoc/generate-passwords.yml` playbook. To use this, create an inventory file +defining a group `pulp_server` containing a single host, which requires at least 2 vCPUs and 4GB RAM. The group should be defined in your `site` environment's inventory so that a single Pulp server is shared between all environments and +the same snapshots are tested in staging and production. +Deploying and syncing Pulp has been tested on an RL9 host. The hostvar `ansible_host` should be defined, giving the IP address Ansible should use for ssh. For example, you can create an ini file at `environments/site/inventory/pulp` with the contents: + +``` +[pulp_server] +pulp_host ansible_host= +``` + +> [!WARNING] +> The inventory hostname cannot conflict with group names i.e can't be called `pulp_site` or `pulp_server`. + +Once complete, it will print a message giving a value to set for `appliances_pulp_url` (see example config below), assuming the `ansible_host` address is also the address the cluster +should use to reach the Pulp server. 
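+
+As a minimal sketch of the deployment step itself (assuming the pre-existing `ansible/adhoc/deploy-pulp.yml` adhoc playbook remains the entry point, and that it now picks up its target from the `pulp_server` inventory group rather than a `pulp_server` extra variable):
+
+```
+ansible-playbook ansible/adhoc/deploy-pulp.yml
+```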
+ +Note access to this server's content isn't authenticated so this assumes the `pulp_server` host is not externally reachable. ### Using an existing Pulp server An existing Pulp server can be used to host Ark repos by overriding `pulp_site_password` and `appliances_pulp_url` in the target environment. Note that this assumes the same configuration as the appliance deployed Pulp i.e no content authentication. ## Syncing Pulp content with Ark -If the `pulp` group is added to the Packer build groups, the local Pulp server will be synced with Ark on build. You must authenticate with Ark by overriding `pulp_site_upstream_username` and `pulp_site_upstream_password` with your vault encrypted Ark dev credentials. `dnf_repos_username` and `dnf_repos_password` must remain unset to access content from the local Pulp. Content can also be synced by running `ansible/adhoc/sync-pulp.yml`. By default this syncs repositories for Rocky 9.5 with x86_64 architecture, but can be overridden by setting extra variables for `pulp_site_target_arch`, `pulp_site_target_distribution`, `pulp_site_target_distribution_version` and `pulp_site_target_distribution_version_major`. +If the `pulp_site` group is added to the Packer build groups, the local Pulp server will be synced with Ark on build. You must authenticate with Ark by overriding `pulp_site_upstream_username` and `pulp_site_upstream_password` with your vault encrypted Ark dev credentials. `dnf_repos_username` and `dnf_repos_password` must remain unset to access content from the local Pulp. + +Content can also be synced by running `ansible/adhoc/sync-pulp.yml`. By default this syncs repositories for the latest version of Rocky supported by the appliance but this can be overridden by setting extra variables for `pulp_site_target_arch`, `pulp_site_target_distribution` and `pulp_site_target_distribution_version`. + +## Example config in site variables + +``` +# environments/site/inventory/group_vars/all/pulp_site.yml: +appliances_pulp_url: "http://:8080" +pulp_site_upstream_username: +pulp_site_upstream_password: +``` + +## Installing packages from Pulp at runtime +By default, system repos are overwritten to point at Pulp repos during [image builds,](../image-build.md) so using a site Pulp server will require a new fatimage. If you instead wish to install packages at runtime, +you will need to add all host groups on which you will be installing packages to the `dnf_repos` group in `environments/site/inventory/groups` e.g: + +``` +[dnf_repos:children] +cluster +``` diff --git a/docs/operations.md b/docs/operations.md index 4f7bc5c..4c5c640 100644 --- a/docs/operations.md +++ b/docs/operations.md @@ -9,7 +9,7 @@ All subsequent sections assume that: - Appropriate OpenStack credentials are available. - Any non-appliance controlled infrastructure is available (e.g. networks, volumes, etc.). - `$ENV` is your current, activated environment, as defined by e.g. `environments/production/`. -- `$SITE_ENV` is the base site-specific environment, as defined by e.g. `environments/mysite/`. +- `$SITE_ENV` is the base site-specific environment, as defined by `environments/site/`. - A string `some/path/to/file.yml:myvar` defines a path relative to the repository root and an Ansible variable in that file. - Configuration is generally common to all environments at a site, i.e. is made in `environments/$SITE_ENV` not `environments/$ENV`. 
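(Relating to the `ansible/adhoc/sync-pulp.yml` adhoc playbook described in the `docs/experimental/pulp.md` changes above, a hedged example invocation is sketched below; the extra-variable names are taken from that document, while the architecture and distribution values shown are purely illustrative.)

```
ansible-playbook ansible/adhoc/sync-pulp.yml \
  -e pulp_site_target_arch=x86_64 \
  -e pulp_site_target_distribution=rocky \
  -e pulp_site_target_distribution_version=9.6
```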
@@ -62,6 +62,24 @@ This is a usually a two-step process: Deploying the additional nodes and applying these changes requires rerunning both OpenTofu and the Ansible site.yml playbook - follow [Deploying a Cluster](#Deploying-a-Cluster). +# Enabling additional functionality +Roles in the appliance which are disabled by default can be enabled by adding the appropriate groups as children of the role's corresponding group in `environments/site/inventory/groups`. For example, +to install a Squid proxy on nodes in the login group, you would modify the `squid` group definition in `environments/site/inventory/groups` to: + +``` +[squid:children] +# Hosts to run squid proxy +login +``` + +Note that many non-default roles include package installations from repositories which the appliance overwrites to point at snapshotted mirrors on a Pulp server (by default StackHPC's Ark server), which are +disabled during runtime to prevent Ark credentials from being leaked. To enable this functionality, you must therefore either: + +- Create a site-specific fatimage (see [image build docs](image-build.md)) with the appropriate group added to the `inventory_groups` Packer variables. +- If you instead wish roles to perform their installations during runtime, deploy a site Pulp server and sync it with with mirrors of the snapshots from the upstream Ark server (see [Pulp docs](experimental/pulp.md)). + +In both cases, Ark credentials will be required. + # Adding Additional Packages By default, the following utility packages are installed during the StackHPC image build: - htop diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index af14839..88cdb42 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250808-1727-faa44755", - "RL9": "openhpc-RL9-250808-1727-faa44755" + "RL8": "openhpc-RL8-250820-0800-767addd8", + "RL9": "openhpc-RL9-250820-0800-767addd8" } } diff --git a/environments/common/files/grafana/grafana.repo.j2 b/environments/common/files/grafana/grafana.repo.j2 index 8f1aef5..6ce2581 100644 --- a/environments/common/files/grafana/grafana.repo.j2 +++ b/environments/common/files/grafana/grafana.repo.j2 @@ -1,6 +1,6 @@ {{ ansible_managed | comment }} [grafana] -baseurl = {{ appliances_pulp_url }}/pulp/content/{{ appliances_pulp_repos.grafana[ansible_distribution_major_version] | appliances_repo_to_subpath }} +baseurl = {{ appliances_pulp_url }}/pulp/content/{{ dnf_repos_repos['grafana'][ansible_distribution_major_version]['pulp_path'] }}/{{ dnf_repos_repos['grafana'][ansible_distribution_major_version]['pulp_timestamp'] }} enabled = 0 name = grafana async = 1 diff --git a/environments/common/inventory/group_vars/all/dnf_repo_timestamps.yml b/environments/common/inventory/group_vars/all/dnf_repo_timestamps.yml new file mode 100644 index 0000000..d2df041 --- /dev/null +++ b/environments/common/inventory/group_vars/all/dnf_repo_timestamps.yml @@ -0,0 +1,116 @@ +dnf_repos_default: + Ceph: + '8': + pulp_path: centos/8-stream/storage/x86_64/ceph-quincy + pulp_timestamp: 20231104T015751 + repo_file: ceph + '9': + pulp_path: centos/9-stream/storage/x86_64/ceph-reef + pulp_timestamp: 20250617T023108 + repo_file: ceph + appstream: + '8.10': + pulp_path: rocky/8.10/AppStream/x86_64/os + pulp_timestamp: 20250614T013846 + repo_file: Rocky-AppStream + '9.4': + pulp_path: rocky/9.4/AppStream/x86_64/os + 
pulp_timestamp: 20241112T003151 + repo_file: rocky + '9.5': + pulp_path: rocky/9.5/AppStream/x86_64/os + pulp_timestamp: 20250514T014704 + repo_file: rocky + '9.6': + pulp_path: rocky/9.6/AppStream/x86_64/os + pulp_timestamp: 20250816T020215 + repo_file: rocky + baseos: + '8.10': + pulp_path: rocky/8.10/BaseOS/x86_64/os + pulp_timestamp: 20250614T013846 + repo_file: Rocky-BaseOS + '9.4': + pulp_path: rocky/9.4/BaseOS/x86_64/os + pulp_timestamp: 20241115T011711 + repo_file: rocky + '9.5': + pulp_path: rocky/9.5/BaseOS/x86_64/os + pulp_timestamp: 20250513T031844 + repo_file: rocky + '9.6': + pulp_path: rocky/9.6/BaseOS/x86_64/os + pulp_timestamp: 20250815T050653 + repo_file: rocky + crb: + '8.10': + pulp_path: rocky/8.10/PowerTools/x86_64/os + pulp_timestamp: 20250614T013846 + repo_file: Rocky-PowerTools + repo_name: powertools + '9.4': + pulp_path: rocky/9.4/CRB/x86_64/os + pulp_timestamp: 20241115T003133 + repo_file: rocky + '9.5': + pulp_path: rocky/9.5/CRB/x86_64/os + pulp_timestamp: 20250514T014704 + repo_file: rocky + '9.6': + pulp_path: rocky/9.6/CRB/x86_64/os + pulp_timestamp: 20250815T034418 + repo_file: rocky + extras: + '8.10': + pulp_path: rocky/8.10/extras/x86_64/os + pulp_timestamp: 20250510T032327 + repo_file: Rocky-Extras + '9.4': + pulp_path: rocky/9.4/extras/x86_64/os + pulp_timestamp: 20241118T002802 + repo_file: rocky-extras + '9.5': + pulp_path: rocky/9.5/extras/x86_64/os + pulp_timestamp: 20250506T032818 + repo_file: rocky-extras + '9.6': + pulp_path: rocky/9.6/extras/x86_64/os + pulp_timestamp: 20250726T040613 + repo_file: rocky-extras + grafana: + '8': + pulp_path: grafana/oss/rpm + pulp_timestamp: 20250730T011314 + repo_file: grafana + timestamp: 20250615T005738 + '9': + pulp_path: grafana/oss/rpm + pulp_timestamp: 20250730T011314 + repo_file: grafana + epel: + '8': + pulp_path: epel/8/Everything/x86_64 + pulp_timestamp: 20250615T234151 + repo_file: epel + '9': + pulp_path: epel/9/Everything/x86_64 + pulp_timestamp: 20250817T000753 + repo_file: epel + OpenHPC: + '8': + pulp_path: OpenHPC/2/EL_8 + pulp_timestamp: 20241218T154614 + repo_file: OpenHPC + '9': + pulp_path: OpenHPC/3/EL_9 + pulp_timestamp: 20241218T154614 + repo_file: OpenHPC + OpenHPC-updates: + '8': + pulp_path: OpenHPC/2/updates/EL_8 + pulp_timestamp: 20250512T003315 + repo_file: OpenHPC + '9': + pulp_path: OpenHPC/3/updates/EL_9 + pulp_timestamp: 20250510T003301 + repo_file: OpenHPC diff --git a/environments/common/inventory/group_vars/all/dnf_repos.yml b/environments/common/inventory/group_vars/all/dnf_repos.yml new file mode 100644 index 0000000..e7a8ace --- /dev/null +++ b/environments/common/inventory/group_vars/all/dnf_repos.yml @@ -0,0 +1,8 @@ +# dnf_repos_default: see role ansible/roles/dnf_repos/README.md for format and dnf_repo_timestamps.yml for default definition + +# override this in environments/site/inventory/group_vars/dnf_repos.yml to add repos: +dnf_repos_extra: {} + +# indirection to skip openhpc repos if using alternative slurm: +dnf_repos_skip: "{{ [] if ((openhpc_install_type | default('ohpc') == 'ohpc')) else ['OpenHPC', 'OpenHPC-updates'] }}" +dnf_repos_repos: "{{ dnf_repos_default | combine(dnf_repos_extra) | dict2items | rejectattr('key', 'in', dnf_repos_skip) | items2dict }}" diff --git a/environments/common/inventory/group_vars/all/timestamps.yml b/environments/common/inventory/group_vars/all/timestamps.yml deleted file mode 100644 index 455c260..0000000 --- a/environments/common/inventory/group_vars/all/timestamps.yml +++ /dev/null @@ -1,88 +0,0 @@ -appliances_pulp_repos: - 
appstream: - '8.10': - path: rocky/8.10/AppStream/x86_64/os - timestamp: 20250614T013846 - '9.4': - path: rocky/9.4/AppStream/x86_64/os - timestamp: 20241112T003151 - '9.5': - path: rocky/9.5/AppStream/x86_64/os - timestamp: 20250514T014704 - '9.6': - path: rocky/9.6/AppStream/x86_64/os - timestamp: 20250726T040613 - baseos: - '8.10': - path: rocky/8.10/BaseOS/x86_64/os - timestamp: 20250614T013846 - '9.4': - path: rocky/9.4/BaseOS/x86_64/os - timestamp: 20241115T011711 - '9.5': - path: rocky/9.5/BaseOS/x86_64/os - timestamp: 20250513T031844 - '9.6': - path: rocky/9.6/BaseOS/x86_64/os - timestamp: 20250726T052250 - ceph: - '8': - path: centos/8-stream/storage/x86_64/ceph-quincy - timestamp: 20231104T015751 - '9': - path: centos/9-stream/storage/x86_64/ceph-reef - timestamp: 20250617T023108 - crb: - '8.10': - path: rocky/8.10/PowerTools/x86_64/os - timestamp: 20250614T013846 - '9.4': - path: rocky/9.4/CRB/x86_64/os - timestamp: 20241115T003133 - '9.5': - path: rocky/9.5/CRB/x86_64/os - timestamp: 20250514T014704 - '9.6': - path: rocky/9.6/CRB/x86_64/os - timestamp: 20250726T040613 - epel: - '8': - path: epel/8/Everything/x86_64 - timestamp: 20250615T234151 - '9': - path: epel/9/Everything/x86_64 - timestamp: 20250729T235750 - extras: - '8.10': - path: rocky/8.10/extras/x86_64/os - timestamp: 20250510T032327 - '9.4': - path: rocky/9.4/extras/x86_64/os - timestamp: 20241118T002802 - '9.5': - path: rocky/9.5/extras/x86_64/os - timestamp: 20250506T032818 - '9.6': - path: rocky/9.6/extras/x86_64/os - timestamp: 20250726T040613 - grafana: - '8': - path: grafana/oss/rpm - timestamp: 20250615T005738 - '9': - path: grafana/oss/rpm - timestamp: 20250730T011314 - openhpc_base: - '8': - path: OpenHPC/2/EL_8 - timestamp: 20241218T154614 - '9': - path: OpenHPC/3/EL_9 - timestamp: 20241218T154614 - openhpc_updates: - '8': - path: OpenHPC/2/updates/EL_8 - timestamp: 20250512T003315 - '9': - path: OpenHPC/3/updates/EL_9 - timestamp: 20250510T003301 diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 57b6441..6926355 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -197,8 +197,8 @@ k3s_agent builder extra_packages -[pulp] -# Add builder to this group to enable automatically syncing of pulp during image build +[pulp_site] +# Add builder to this group to automatically sync pulp during image build [cacerts] # Hosts to configure CA certificates and trusts on @@ -211,3 +211,10 @@ extra_packages [nhc] # Hosts to configure for node health checks - either entire 'compute' group or empty + +[pulp_server] +# Host to deploy a Pulp server on and sync with mirrors of upstream Ark repositories. Should be a group containing a single VM provisioned +# separately from the appliance. e.g +# pulp_host ansible_host= +# Note the host name can't conflict with group names i.e can't be called `pulp` or `pulp_server` + diff --git a/environments/site/inventory/groups b/environments/site/inventory/groups index 9df61dc..b78197d 100644 --- a/environments/site/inventory/groups +++ b/environments/site/inventory/groups @@ -157,3 +157,12 @@ compute # Should be set to `compute` if enabled # Note that this feature currently assumes all compute nodes are VMs, enabling # when the cluster contains baremetal compute nodes may lead to unexpected scheduling behaviour + +[pulp_site] +# Add builder to this group to automatically sync pulp during image build + +[pulp_server] +# Host to deploy a Pulp server on and sync with mirrors of upstream Ark repositories. 
Should be a group containing a single VM provisioned +# separately from the appliance. e.g +# pulp_host ansible_host= +# Note inventory host name cannot conflict with group names i.e can't be called `pulp` or `pulp_server`. From 2984292dca33d4fa2b76af289c41cc7254e17fe9 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 5 Sep 2025 09:18:26 +0000 Subject: [PATCH 07/50] temp fix: add alertmanager passwd to persist_openhpc_secrets template --- .../roles/persist_openhpc_secrets/templates/openhpc_secrets.fact | 1 + 1 file changed, 1 insertion(+) diff --git a/ansible/roles/persist_openhpc_secrets/templates/openhpc_secrets.fact b/ansible/roles/persist_openhpc_secrets/templates/openhpc_secrets.fact index 9d6de37..ca1742c 100644 --- a/ansible/roles/persist_openhpc_secrets/templates/openhpc_secrets.fact +++ b/ansible/roles/persist_openhpc_secrets/templates/openhpc_secrets.fact @@ -6,4 +6,5 @@ "vault_mysql_root_password": "{{ lookup('password', '/dev/null') }}", "vault_mysql_slurm_password": "{{ lookup('password', '/dev/null') }}", "vault_openhpc_mungekey": "{{ lookup('pipe', 'dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64') | regex_replace('\s+', '') }}" + "vault_alertmanager_admin_password": "{{ lookup('password', '/dev/null') }}" } From a3be3c9474977f6f818d5edcac15725e9c385fb3 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 5 Sep 2025 12:10:13 +0000 Subject: [PATCH 08/50] missing ',' --- .../persist_openhpc_secrets/templates/openhpc_secrets.fact | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/persist_openhpc_secrets/templates/openhpc_secrets.fact b/ansible/roles/persist_openhpc_secrets/templates/openhpc_secrets.fact index ca1742c..e049951 100644 --- a/ansible/roles/persist_openhpc_secrets/templates/openhpc_secrets.fact +++ b/ansible/roles/persist_openhpc_secrets/templates/openhpc_secrets.fact @@ -5,6 +5,6 @@ "vault_elasticsearch_kibana_password": "{{ lookup('password', '/dev/null') }}", "vault_mysql_root_password": "{{ lookup('password', '/dev/null') }}", "vault_mysql_slurm_password": "{{ lookup('password', '/dev/null') }}", - "vault_openhpc_mungekey": "{{ lookup('pipe', 'dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64') | regex_replace('\s+', '') }}" + "vault_openhpc_mungekey": "{{ lookup('pipe', 'dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64') | regex_replace('\s+', '') }}", "vault_alertmanager_admin_password": "{{ lookup('password', '/dev/null') }}" } From 32e983803598386609fa2cb7f286663d2ae311dc Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Fri, 5 Sep 2025 12:52:14 +0000 Subject: [PATCH 09/50] alertmanager admin passwd group_var --- environments/.caas/inventory/group_vars/all/cluster.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environments/.caas/inventory/group_vars/all/cluster.yml b/environments/.caas/inventory/group_vars/all/cluster.yml index b06314c..14633c8 100644 --- a/environments/.caas/inventory/group_vars/all/cluster.yml +++ b/environments/.caas/inventory/group_vars/all/cluster.yml @@ -11,6 +11,7 @@ vault_elasticsearch_kibana_password: "{{ hostvars[groups['control'][0]].ansible_ vault_mysql_root_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_mysql_root_password }}" vault_mysql_slurm_password: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_mysql_slurm_password }}" vault_openhpc_mungekey: "{{ hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_openhpc_mungekey }}" +vault_alertmanager_admin_password: "{{ 
hostvars[groups['control'][0]].ansible_local.openhpc_secrets.vault_alertmanager_admin_password }}" # Override this to cope with the case where the podman group just doesn't exist appliances_local_users_podman_enable: "{{ groups.get('podman', []) | length > 0 }}" From 6e05021c310cbabc3f082b0661dca4c77b6b85de Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Fri, 5 Sep 2025 15:49:35 +0100 Subject: [PATCH 10/50] fix incorrect use of partition in nodegroup variable definitions (#771) --- environments/site/tofu/node_group/variables.tf | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/environments/site/tofu/node_group/variables.tf b/environments/site/tofu/node_group/variables.tf index 35c1b6b..4ef3407 100644 --- a/environments/site/tofu/node_group/variables.tf +++ b/environments/site/tofu/node_group/variables.tf @@ -1,11 +1,11 @@ variable "nodes" { type = list(string) - description = "list of node names for partition" + description = "List of node names for node group" } variable "flavor" { type = string - description = "Name of flavor for partition" + description = "Name of flavor for node group" } variable "cluster_name" { @@ -24,7 +24,7 @@ variable "key_pair" { variable "image_id" { type = string - description = "ID of image for the partition" + description = "ID of image for the node group" } variable "environment_root" { From 109f58497141adc031608fe68930699cc6c24c98 Mon Sep 17 00:00:00 2001 From: bertiethorpe Date: Mon, 8 Sep 2025 13:47:48 +0000 Subject: [PATCH 11/50] make caas persist secrets idempotent --- .../roles/persist_openhpc_secrets/tasks/main.yml | 10 +++++++--- .../templates/openhpc_secrets.fact | 16 ++++++++-------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/ansible/roles/persist_openhpc_secrets/tasks/main.yml b/ansible/roles/persist_openhpc_secrets/tasks/main.yml index 6ae9bcd..e0f5865 100644 --- a/ansible/roles/persist_openhpc_secrets/tasks/main.yml +++ b/ansible/roles/persist_openhpc_secrets/tasks/main.yml @@ -14,14 +14,18 @@ loop: - "{{ appliances_state_dir }}/ansible.facts.d" - "/etc/ansible/facts.d" - + +- name: Load existing OpenHPC secrets if present + ansible.builtin.setup: + filter: ansible_local + when: openhpc_secrets_stat.stat.exists + - name: Write OpenHPC secrets template: src: openhpc_secrets.fact dest: "{{ appliances_state_dir }}/ansible.facts.d/openhpc_secrets.fact" owner: root mode: 0600 - when: "not openhpc_secrets_stat.stat.exists" - name: Symlink persistent facts to facts_path file: @@ -30,6 +34,6 @@ dest: /etc/ansible/facts.d/openhpc_secrets.fact owner: root -- name: Read facts +- name: Refresh facts to pick up any new secrets ansible.builtin.setup: filter: ansible_local diff --git a/ansible/roles/persist_openhpc_secrets/templates/openhpc_secrets.fact b/ansible/roles/persist_openhpc_secrets/templates/openhpc_secrets.fact index e049951..5c6c5e6 100644 --- a/ansible/roles/persist_openhpc_secrets/templates/openhpc_secrets.fact +++ b/ansible/roles/persist_openhpc_secrets/templates/openhpc_secrets.fact @@ -1,10 +1,10 @@ { - "vault_azimuth_user_password": "{{ lookup('password', '/dev/null') }}", - "vault_grafana_admin_password": "{{ lookup('password', '/dev/null') }}", - "vault_elasticsearch_admin_password": "{{ lookup('password', '/dev/null') }}", - "vault_elasticsearch_kibana_password": "{{ lookup('password', '/dev/null') }}", - "vault_mysql_root_password": "{{ lookup('password', '/dev/null') }}", - "vault_mysql_slurm_password": "{{ lookup('password', '/dev/null') }}", - 
"vault_openhpc_mungekey": "{{ lookup('pipe', 'dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64') | regex_replace('\s+', '') }}", - "vault_alertmanager_admin_password": "{{ lookup('password', '/dev/null') }}" + "vault_azimuth_user_password": "{{ ansible_local.openhpc_secrets.vault_azimuth_user_password | default(lookup('password', '/dev/null')) }}", + "vault_grafana_admin_password": "{{ ansible_local.openhpc_secrets.vault_grafana_admin_password | default(lookup('password', '/dev/null')) }}", + "vault_elasticsearch_admin_password": "{{ ansible_local.openhpc_secrets.vault_elasticsearch_admin_password | default(lookup('password', '/dev/null')) }}", + "vault_elasticsearch_kibana_password": "{{ ansible_local.openhpc_secrets.vault_elasticsearch_kibana_password | default(lookup('password', '/dev/null')) }}", + "vault_mysql_root_password": "{{ ansible_local.openhpc_secrets.vault_mysql_root_password | default(lookup('password', '/dev/null')) }}", + "vault_mysql_slurm_password": "{{ ansible_local.openhpc_secrets.vault_mysql_slurm_password | default(lookup('password', '/dev/null')) }}", + "vault_openhpc_mungekey": "{{ ansible_local.openhpc_secrets.vault_openhpc_mungekey | default(lookup('pipe', 'dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64') | regex_replace('\\s+', '')) }}", + "vault_alertmanager_admin_password": "{{ ansible_local.openhpc_secrets.vault_alertmanager_admin_password | default(lookup('password', '/dev/null')) }}" } From 60d531d15ba5fd3bd79450ce33edad7f6ebb3c23 Mon Sep 17 00:00:00 2001 From: Pierre Riteau Date: Tue, 9 Sep 2025 10:38:50 +0200 Subject: [PATCH 12/50] Bump Pulp snapshots for RL 9.6 (#772) * Reorder repositories alphabetically * Bump Pulp snapshots for RL 9.6 * Bump CI image (RL9 only) --- .../tofu/cluster_image.auto.tfvars.json | 2 +- .../group_vars/all/dnf_repo_timestamps.yml | 62 +++++++++---------- 2 files changed, 32 insertions(+), 32 deletions(-) diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index 88cdb42..6b294d1 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { "RL8": "openhpc-RL8-250820-0800-767addd8", - "RL9": "openhpc-RL9-250820-0800-767addd8" + "RL9": "openhpc-RL9-250908-2047-d90ebd0e" } } diff --git a/environments/common/inventory/group_vars/all/dnf_repo_timestamps.yml b/environments/common/inventory/group_vars/all/dnf_repo_timestamps.yml index d2df041..c80a85a 100644 --- a/environments/common/inventory/group_vars/all/dnf_repo_timestamps.yml +++ b/environments/common/inventory/group_vars/all/dnf_repo_timestamps.yml @@ -8,6 +8,24 @@ dnf_repos_default: pulp_path: centos/9-stream/storage/x86_64/ceph-reef pulp_timestamp: 20250617T023108 repo_file: ceph + OpenHPC: + '8': + pulp_path: OpenHPC/2/EL_8 + pulp_timestamp: 20241218T154614 + repo_file: OpenHPC + '9': + pulp_path: OpenHPC/3/EL_9 + pulp_timestamp: 20241218T154614 + repo_file: OpenHPC + OpenHPC-updates: + '8': + pulp_path: OpenHPC/2/updates/EL_8 + pulp_timestamp: 20250512T003315 + repo_file: OpenHPC + '9': + pulp_path: OpenHPC/3/updates/EL_9 + pulp_timestamp: 20250510T003301 + repo_file: OpenHPC appstream: '8.10': pulp_path: rocky/8.10/AppStream/x86_64/os @@ -23,7 +41,7 @@ dnf_repos_default: repo_file: rocky '9.6': pulp_path: rocky/9.6/AppStream/x86_64/os - pulp_timestamp: 20250816T020215 + pulp_timestamp: 20250902T060015 repo_file: rocky baseos: '8.10': @@ -40,7 +58,7 @@ 
dnf_repos_default: repo_file: rocky '9.6': pulp_path: rocky/9.6/BaseOS/x86_64/os - pulp_timestamp: 20250815T050653 + pulp_timestamp: 20250902T094855 repo_file: rocky crb: '8.10': @@ -58,8 +76,17 @@ dnf_repos_default: repo_file: rocky '9.6': pulp_path: rocky/9.6/CRB/x86_64/os - pulp_timestamp: 20250815T034418 + pulp_timestamp: 20250902T060015 repo_file: rocky + epel: + '8': + pulp_path: epel/8/Everything/x86_64 + pulp_timestamp: 20250615T234151 + repo_file: epel + '9': + pulp_path: epel/9/Everything/x86_64 + pulp_timestamp: 20250908T001730 + repo_file: epel extras: '8.10': pulp_path: rocky/8.10/extras/x86_64/os @@ -85,32 +112,5 @@ dnf_repos_default: timestamp: 20250615T005738 '9': pulp_path: grafana/oss/rpm - pulp_timestamp: 20250730T011314 + pulp_timestamp: 20250906T025340 repo_file: grafana - epel: - '8': - pulp_path: epel/8/Everything/x86_64 - pulp_timestamp: 20250615T234151 - repo_file: epel - '9': - pulp_path: epel/9/Everything/x86_64 - pulp_timestamp: 20250817T000753 - repo_file: epel - OpenHPC: - '8': - pulp_path: OpenHPC/2/EL_8 - pulp_timestamp: 20241218T154614 - repo_file: OpenHPC - '9': - pulp_path: OpenHPC/3/EL_9 - pulp_timestamp: 20241218T154614 - repo_file: OpenHPC - OpenHPC-updates: - '8': - pulp_path: OpenHPC/2/updates/EL_8 - pulp_timestamp: 20250512T003315 - repo_file: OpenHPC - '9': - pulp_path: OpenHPC/3/updates/EL_9 - pulp_timestamp: 20250510T003301 - repo_file: OpenHPC From c94d134930a25f5c21467ff61e904caadaae7c6d Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Tue, 9 Sep 2025 11:34:23 +0100 Subject: [PATCH 13/50] add support for setting server group (#773) --- environments/site/tofu/additional.tf | 4 +++- environments/site/tofu/compute.tf | 4 +++- environments/site/tofu/control.tf | 7 +++++++ environments/site/tofu/login.tf | 4 +++- environments/site/tofu/node_group/nodes.tf | 14 ++++++++++++++ environments/site/tofu/node_group/variables.tf | 5 +++++ environments/site/tofu/variables.tf | 8 ++++++++ 7 files changed, 43 insertions(+), 3 deletions(-) diff --git a/environments/site/tofu/additional.tf b/environments/site/tofu/additional.tf index 863e160..872f957 100644 --- a/environments/site/tofu/additional.tf +++ b/environments/site/tofu/additional.tf @@ -35,6 +35,7 @@ module "additional" { security_group_ids = lookup(each.value, "security_group_ids", [for o in data.openstack_networking_secgroup_v2.nonlogin: o.id]) additional_cloud_config = lookup(each.value, "additional_cloud_config", var.additional_cloud_config) additional_cloud_config_vars = lookup(each.value, "additional_cloud_config_vars", var.additional_cloud_config_vars) + server_group_id = lookup(each.value, "server_group_id", null) # can't be set for additional nodes compute_init_enable = [] @@ -68,6 +69,7 @@ module "additional" { "nodename_template", "security_group_ids", "additional_cloud_config", - "additional_cloud_config_vars" + "additional_cloud_config_vars", + "server_group_id" ] } diff --git a/environments/site/tofu/compute.tf b/environments/site/tofu/compute.tf index 9187f66..35d62c6 100644 --- a/environments/site/tofu/compute.tf +++ b/environments/site/tofu/compute.tf @@ -34,6 +34,7 @@ module "compute" { match_ironic_node = lookup(each.value, "match_ironic_node", null) availability_zone = lookup(each.value, "availability_zone", null) ip_addresses = lookup(each.value, "ip_addresses", null) + server_group_id = lookup(each.value, "server_group_id", null) # computed # not using openstack_compute_instance_v2.control.access_ip_v4 to avoid @@ -63,7 +64,8 @@ module 
"compute" { "gateway_ip", "nodename_template", "additional_cloud_config", - "additional_cloud_config_vars" + "additional_cloud_config_vars", + "server_group_id" ] } diff --git a/environments/site/tofu/control.tf b/environments/site/tofu/control.tf index 722e89d..19a41ae 100644 --- a/environments/site/tofu/control.tf +++ b/environments/site/tofu/control.tf @@ -72,6 +72,13 @@ resource "openstack_compute_instance_v2" "control" { } } + dynamic "scheduler_hints" { + for_each = var.control_server_group_id != null ? [true] : [] + content { + group = var.control_server_group_id + } + } + metadata = { environment_root = var.environment_root access_ip = openstack_networking_port_v2.control[var.cluster_networks[0].network].all_fixed_ips[0] diff --git a/environments/site/tofu/login.tf b/environments/site/tofu/login.tf index 7a5b3f8..5ecc033 100644 --- a/environments/site/tofu/login.tf +++ b/environments/site/tofu/login.tf @@ -34,6 +34,7 @@ module "login" { match_ironic_node = lookup(each.value, "match_ironic_node", null) availability_zone = lookup(each.value, "availability_zone", null) ip_addresses = lookup(each.value, "ip_addresses", null) + server_group_id = lookup(each.value, "server_group_id", null) # can't be set for login compute_init_enable = [] @@ -68,7 +69,8 @@ module "login" { "nodename_template", "additional_cloud_config", "additional_cloud_config_vars", - "security_group_ids" + "security_group_ids", + "server_group_id" ] } diff --git a/environments/site/tofu/node_group/nodes.tf b/environments/site/tofu/node_group/nodes.tf index 7c3fe21..45cd449 100644 --- a/environments/site/tofu/node_group/nodes.tf +++ b/environments/site/tofu/node_group/nodes.tf @@ -103,6 +103,13 @@ resource "openstack_compute_instance_v2" "compute_fixed_image" { } } + dynamic "scheduler_hints" { + for_each = var.server_group_id != null ? [true] : [] + content { + group = var.server_group_id + } + } + metadata = merge( { environment_root = var.environment_root @@ -164,6 +171,13 @@ resource "openstack_compute_instance_v2" "compute" { } } + dynamic "scheduler_hints" { + for_each = var.server_group_id != null ? 
[true] : [] + content { + group = var.server_group_id + } + } + metadata = merge( { environment_root = var.environment_root diff --git a/environments/site/tofu/node_group/variables.tf b/environments/site/tofu/node_group/variables.tf index 4ef3407..0a129ab 100644 --- a/environments/site/tofu/node_group/variables.tf +++ b/environments/site/tofu/node_group/variables.tf @@ -208,3 +208,8 @@ variable "additional_cloud_config_vars" { default = {} nullable = false } + +variable "server_group_id" { + type = string + default = null +} diff --git a/environments/site/tofu/variables.tf b/environments/site/tofu/variables.tf index f0451b3..af0b112 100644 --- a/environments/site/tofu/variables.tf +++ b/environments/site/tofu/variables.tf @@ -84,6 +84,7 @@ variable "login" { if match_ironic_node is true, defered to OpenStack otherwise gateway_ip: Address to add default route via nodename_template: Overrides variable cluster_nodename_template + server_group_id: String ID of server group to use for scheduler hint EOF type = any @@ -129,6 +130,7 @@ variable "compute" { if match_ironic_node is true, defered to OpenStack otherwise gateway_ip: Address to add default route via nodename_template: Overrides variable cluster_nodename_template + server_group_id: String ID of server group to use for scheduler hint Nodes are added to the following inventory groups: - $group_name @@ -340,3 +342,9 @@ variable "additional_cloud_config_vars" { type = map(any) default = {} } + +variable "control_server_group_id" { + description = "ID of server group to use for control node scheduler hint" + type = string + default = null +} From 82897d4f55a18f978efdd257e5c7d1aec6ab0d49 Mon Sep 17 00:00:00 2001 From: Pierre Riteau Date: Tue, 9 Sep 2025 13:30:48 +0200 Subject: [PATCH 14/50] Bump CUDA to 13.0.1 and NVIDIA driver to 580.82.07 --- ansible/roles/cuda/defaults/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/roles/cuda/defaults/main.yml b/ansible/roles/cuda/defaults/main.yml index 0f5ad9a..e4e785b 100644 --- a/ansible/roles/cuda/defaults/main.yml +++ b/ansible/roles/cuda/defaults/main.yml @@ -1,7 +1,7 @@ cuda_repo_url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel{{ ansible_distribution_major_version }}/{{ ansible_architecture }}/cuda-rhel{{ ansible_distribution_major_version }}.repo" cuda_nvidia_driver_stream: '580-open' -cuda_nvidia_driver_pkg: "nvidia-open-3:580.65.06-1.el{{ ansible_distribution_major_version }}" -cuda_package_version: '13.0.0-1' +cuda_nvidia_driver_pkg: "nvidia-open-3:580.82.07-1.el{{ ansible_distribution_major_version }}" +cuda_package_version: '13.0.1-1' cuda_version_short: "{{ (cuda_package_version | split('.'))[0:2] | join('.') }}" # major.minor cuda_packages: - "cuda-toolkit-{{ cuda_package_version }}" From b42c2f8fd7ee2d5d794bdb43d54b2b97502877d0 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Thu, 11 Sep 2025 11:29:19 +0100 Subject: [PATCH 15/50] Add validation for tofu-templated vars (#775) * add validation for tofu-templated vars * update error message iaw review --- ansible/validate.yml | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/ansible/validate.yml b/ansible/validate.yml index 3341275..034f469 100644 --- a/ansible/validate.yml +++ b/ansible/validate.yml @@ -43,6 +43,37 @@ # below produced by dev/setup-env.sh - gives empty list if file is missing: _requirements_installed: "{{ ((lookup('file', _requirements_path + '.last', errors='ignore') or '{}') | from_yaml 
).values() | flatten }}" +- name: Validate OpenTofu templated inventory is appropriate + # This "documents" the assumptions that Ansible makes about the + # OpenTofu-provided inventory + hosts: localhost + gather_facts: false + tags: + - validate + - opentofu + tasks: + - name: Check templated groups + assert: + that: + - item in groups + - groups[item] | length > 0 + fail_msg: > + Expected inventory group '{{ item }}' is missing or empty: + - Check OpenTofu inventory template is up to date + - Check OpenTofu configuration defines 'login' and 'compute' variables properly + loop: + - control + - compute + - login + - name: Check templated 'all' vars + assert: + that: + - openhpc_cluster_name is defined + - cluster_domain_suffix is defined + - cluster_home_volume is defined + - cluster_compute_groups is defined + fail_msg: "One or more expected variables are missing: is OpenTofu inventory template up to date?" + - name: Ensure control node is in inventory hosts: all gather_facts: false From 919a7e2c6bc29e6d31885a20050e0d0e268281c0 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Thu, 11 Sep 2025 17:30:08 +0100 Subject: [PATCH 16/50] Fix error message for state volume provisioning (#780) --- environments/site/tofu/variables.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/site/tofu/variables.tf b/environments/site/tofu/variables.tf index af0b112..3402c3a 100644 --- a/environments/site/tofu/variables.tf +++ b/environments/site/tofu/variables.tf @@ -203,7 +203,7 @@ variable "state_volume_provisioning" { validation { condition = contains(["manage", "attach"], var.state_volume_provisioning) error_message = <<-EOT - home_volume_provisioning must be "manage" or "attach" + state_volume_provisioning must be "manage" or "attach" EOT } } From c12ec99bcc1335ae5ecf31a5914fd4dbd21b167f Mon Sep 17 00:00:00 2001 From: Max Norton Date: Thu, 18 Sep 2025 12:16:06 +0100 Subject: [PATCH 17/50] Enable linting (#732) * Add Github Actions for running code linters * Fix linting issues. The super-linter.env currently has the following additions that are to be addressed in the future: VALIDATE_GITHUB_ACTIONS=false VALIDATE_SHELL_SHFMT=false VALIDATE_YAML=false Most of the linting for the above has been addressed with just a single issue remaining that blocks the linter from being enabled. 
* Update GH workflow so linting always runs befor any other jobs * Update GH workflow so linting always runs befor any other jobs * Fix linting issues on the merge of origin/main * Fix linting issues on the merge of origin/main * Use the head ref for workflow concurrency * Output the path filter result of the workflow * Tweak github action used to detect changed paths on push/pull request * Tweak github action used to detect changed paths on push/pull request * Tweak github action used to detect changed paths on push/pull request * Tweak github action used to detect changed paths on push/pull request * Tweak github action used to detect changed paths on push/pull request * Tweak github action used to detect changed paths on push/pull request * Tweak github action used to detect changed paths on push/pull request * Tweak github action used to detect changed paths on push/pull request * Tweak github action used to detect changed paths on push/pull request * Tweak github action used to detect changed paths on push/pull request * Tweak github action used to detect changed paths on push/pull request * Tweak github action used to detect changed paths on push/pull request * Tweak github action used to detect changed paths on push/pull request * Tweak github action used to detect changed paths on push/pull request * Tweak github action used to detect changed paths on push/pull request * Tweak github action used to detect changed paths on push/pull request * Tweak github action used to detect changed paths on push/pull request * Tweak github action used to detect changed paths on push/pull request * Tweak github action used to detect changed paths on push/pull request * Tweak github action used to detect changed paths on push/pull request * Troubleshooting: ansible.builtin.user * Troubleshooting: debugging temporarily added * Shift pylint invalid-name linting behond python bang line * Temporarily disable the ansible galaxy requirements validation * Reverting changes made to ansible.builtin.user and ansible.builtin.group where the name parameter was added. Reverting to ansible.builtin.group: becasue args aren't an expected label: groupadd: '{'name': 'grafana', 'gid': 979}' is not a valid group name * Arguments are dicts not labels * Preserve file permissions on .ssh directory contents * Wherever we use become_user set become: true, keeps the linter happy and maintains functionality * Fix linting on merge of origin/main * Fix linting on merge of origin/main * Update cluster image - using fatimage built from ci/linting branch * Add comments to workflow files detailing the CI workflow and enable these workflows * Fix workflow execution: 1. change trivvy to trivy 2. extra, stackhpc, and trivyscan workflows should trigger on workflow_call and workflow_dispatch * Fix linting issues from merge of origin/main * Exclude 'ansible/roles/compute_init/files/compute-init.yml' from ansible lint. The parser can't load the 'tasks/tuned.yml' ansible so fails with: load-failure[filenotfounderror]: [Errno 2] No such file or directory: 'ansible-slurm-appliance/tasks/main.yml' tasks/main.yml:1 This failure can't be skipped beause it's the output of the parser that's fed to the linter where such exceptions are made. 
* Temporarily disable Rocky 8 to speed up testing and reduce CI resources Temporarily disable ansible-lint: Run ansible/ansible-lint@v25.4.0 Run if [[ -n "" ]]; then Run action_ref="${GH_ACTION_REF_INPUT:-${GITHUB_ACTION_REF:-main}}" Using ansible-lint ref: main Run reqs_file=$(git rev-parse --show-toplevel)/.git/ansible-lint-requirements.txt --2025-09-09 14:51:58-- https://raw.githubusercontent.com/ansible/ansible-lint/main/.config/requirements-lock.txt Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ... Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected. HTTP request sent, awaiting response... 404 Not Found 2025-09-09 14:51:58 ERROR 404: Not Found. * Fix some bad ansible-lint line-length markup * Fix ansible-lint markup for line-length * Bump CI image - FOR RL9 ONLY TO CONSERVE CI RESOURCES * Revert ansible.builtin.command to ansible.builtin.shell due to missed comment "need login shell for module command" and mask ansible-lint error * Disable extra-build.yml workflow which has previously passed so we can focus on the stackhpc.yml workflow * Disable concurrency to see if this is killing stackhpc.yml * Remove concurrency from extr.yml, stackhpc.yml, and trivyscan.yml as they're all being triggered from main.yml which has its own concurrency check - the trivscan concurrency was also killing stackhpc * Enable ansible-lint * Enable triggering of all workflows from the main CI workflow * Bump CI image - FOR RL9 ONLY TO CONSERVE CI RESOURCES * Fix bad ansible-lint markup affecting the bang line * Reduce workflow CI resources whilst fixing test deploy and reimage workflow * Bump CI image - FOR RL9 ONLY TO CONSERVE CI RESOURCES * Enable Rocky Linux 8 - disabled to speed up testing * Enable all CI workflows * Bump CI image - FOR RL9 ONLY TO CONSERVE CI RESOURCES * Remove empty line between ansible "when" and "block" added by ansible-lint --fix, it's not required by the linter. * Enable check for ansible galaxy requirements * Revert the ansible collections path to ansible/collections so we don't inadvertently break any existing checkouts. 
Direct ansible-lint to use .ansible/collections so downloads are excluded from linting by our .ansible-lint.yml * Bump CI image --- .ansible-lint.yml | 24 + .checkov.yaml | 4 + .editorconfig | 8 + .github/bin/create-merge-branch.sh | 10 +- .github/bin/get-s3-image.sh | 14 +- .github/linters/.checkov.yaml | 1 + .github/linters/.python-lint | 1 + .github/linters/.shellcheckrc | 1 + .github/linters/.yamllint.yml | 1 + .github/linters/actionlint.yml | 1 + .github/workflows/extra.yml | 47 +- .github/workflows/fatimage.yml | 19 +- .github/workflows/lint.yml | 49 ++ .github/workflows/main.yml | 149 +++++ .github/workflows/nightly-cleanup.yml | 16 +- .github/workflows/nightlybuild.yml | 21 +- .github/workflows/release-image.yml | 9 +- .github/workflows/s3-image-sync.yml | 18 +- .github/workflows/stackhpc.yml | 70 +-- .github/workflows/trivyscan.yml | 29 +- .github/workflows/upgrade-check.yml.sample | 7 + .../workflows/upload-release-image.yml.sample | 6 + .gitignore | 1 + .python-lint | 6 + .shellcheckrc | 7 + .yamllint.yml | 24 + README.md | 130 +++-- actionlint.yml | 1 + ansible/adhoc/backup-keytabs.yml | 7 +- ansible/adhoc/cudatests.yml | 7 +- ansible/adhoc/deploy-pulp.yml | 26 +- ansible/adhoc/generate-passwords.yml | 5 +- ansible/adhoc/hpctests.yml | 3 +- ansible/adhoc/rebuild-via-slurm.yml | 5 +- ansible/adhoc/rebuild.yml | 17 +- ansible/adhoc/restart-slurm.yml | 19 +- ansible/adhoc/sync-pulp.yml | 1 + ansible/adhoc/update-packages.yml | 10 +- ansible/bootstrap.yml | 210 +++---- ansible/ci/check_eessi.yml | 22 +- ansible/ci/check_grafana.yml | 10 +- ansible/ci/check_sacct_hpctests.yml | 5 +- ansible/ci/check_slurm.yml | 5 +- ansible/ci/delete_images.yml | 14 +- ansible/ci/get_image_ids.yml | 5 +- .../ci/library/grafana_elasticsearch_query.py | 97 ++-- ansible/ci/output_vars.yml | 5 +- ansible/ci/retrieve_inventory.yml | 13 +- ansible/ci/update_timestamps.yml | 3 +- ansible/cleanup.yml | 36 +- ansible/extras.yml | 51 +- ansible/fatimage.yml | 152 ++--- ansible/filesystems.yml | 11 +- ansible/filter_plugins/utils.py | 77 +-- ansible/final.yml | 12 +- ansible/iam.yml | 27 +- ansible/library/latest_timestamps.py | 87 +-- ansible/library/user_namespace_facts.py | 68 ++- ansible/monitoring.yml | 30 +- ansible/noop.yml | 1 - ansible/portal.yml | 35 +- ansible/roles/alertmanager/README.md | 37 +- ansible/roles/alertmanager/defaults/main.yml | 18 +- ansible/roles/alertmanager/handlers/main.yml | 3 +- .../roles/alertmanager/tasks/configure.yml | 8 +- ansible/roles/alertmanager/tasks/install.yml | 5 +- ansible/roles/basic_users/README.md | 65 ++- ansible/roles/basic_users/defaults/main.yml | 5 +- .../basic_users/filter_plugins/filter_keys.py | 29 +- .../library/terminate_user_sessions.py | 68 ++- ansible/roles/basic_users/tasks/main.yml | 25 +- ansible/roles/block_devices/README.md | 26 +- ansible/roles/block_devices/defaults/main.yml | 6 +- .../block_devices/library/block_devices.py | 31 +- ansible/roles/block_devices/tasks/main.yml | 27 +- ansible/roles/cacerts/defaults/main.yml | 3 +- ansible/roles/cacerts/tasks/configure.yml | 9 +- ansible/roles/cacerts/tasks/export.yml | 5 +- ansible/roles/cacerts/tasks/main.yml | 3 +- ansible/roles/cluster_infra/defaults/main.yml | 1 + ansible/roles/cluster_infra/tasks/main.yml | 35 +- ansible/roles/compute_init/README.md | 230 ++++---- .../roles/compute_init/files/compute-init.yml | 115 ++-- ansible/roles/compute_init/tasks/export.yml | 31 +- ansible/roles/compute_init/tasks/install.yml | 25 +- ansible/roles/cuda/defaults/main.yml | 2 + 
ansible/roles/cuda/tasks/facts.yml | 2 +- ansible/roles/cuda/tasks/install.yml | 19 +- ansible/roles/cuda/tasks/runtime.yml | 3 +- ansible/roles/cuda/tasks/samples.yml | 13 +- ansible/roles/dnf_repos/README.md | 40 +- ansible/roles/dnf_repos/defaults/main.yml | 1 + .../roles/dnf_repos/tasks/disable_repos.yml | 2 +- ansible/roles/doca/defaults/main.yml | 4 +- .../roles/doca/tasks/install-kernel-devel.yml | 13 +- ansible/roles/doca/tasks/install.yml | 18 +- ansible/roles/doca/tasks/main.yml | 3 +- ansible/roles/eessi/README.md | 19 +- ansible/roles/eessi/defaults/main.yaml | 1 - ansible/roles/eessi/tasks/configure.yml | 5 +- ansible/roles/eessi/tasks/install.yml | 19 +- ansible/roles/eessi/tasks/main.yml | 4 +- ansible/roles/etc_hosts/README.md | 5 +- ansible/roles/etc_hosts/defaults/main.yml | 3 +- ansible/roles/etc_hosts/tasks/main.yml | 7 +- ansible/roles/fail2ban/README.md | 22 +- ansible/roles/fail2ban/handlers/main.yml | 3 +- ansible/roles/fail2ban/meta/main.yml | 10 +- ansible/roles/fail2ban/tasks/configure.yml | 9 +- ansible/roles/fail2ban/tasks/install.yml | 4 +- ansible/roles/fail2ban/tasks/main.yml | 4 +- ansible/roles/filebeat/defaults/main.yml | 2 +- ansible/roles/filebeat/handlers/main.yml | 7 +- ansible/roles/filebeat/tasks/install.yml | 10 +- ansible/roles/filebeat/tasks/main.yml | 5 +- ansible/roles/filebeat/tasks/runtime.yml | 24 +- ansible/roles/filebeat/tasks/validate.yml | 4 +- ansible/roles/firewalld/README.md | 42 +- ansible/roles/firewalld/defaults/main.yml | 3 +- ansible/roles/firewalld/handlers/main.yml | 2 +- ansible/roles/firewalld/meta/main.yml | 11 +- ansible/roles/firewalld/tasks/install.yml | 3 +- ansible/roles/firewalld/tasks/main.yml | 4 +- ansible/roles/firewalld/tasks/runtime.yml | 6 +- ansible/roles/freeipa/README.md | 33 +- ansible/roles/freeipa/defaults/main.yml | 9 +- ansible/roles/freeipa/tasks/addhost.yml | 7 +- .../roles/freeipa/tasks/backup-keytabs.yml | 6 +- .../roles/freeipa/tasks/client-install.yml | 4 +- ansible/roles/freeipa/tasks/enrol.yml | 19 +- ansible/roles/freeipa/tasks/server.yml | 40 +- ansible/roles/freeipa/tasks/users.yml | 10 +- ansible/roles/freeipa/tasks/validate.yml | 17 +- ansible/roles/gateway/README.md | 2 + ansible/roles/gateway/files/gateway-init.yml | 29 +- ansible/roles/gateway/tasks/main.yml | 5 +- .../files/openhpc-slurm.json | 2 +- .../roles/grafana-dashboards/tasks/main.yml | 25 +- ansible/roles/hpctests/README.md | 46 +- ansible/roles/hpctests/defaults/main.yml | 22 +- .../roles/hpctests/files/.clang-format-ignore | 1 + ansible/roles/hpctests/files/CPPLINT.cfg | 1 + .../roles/hpctests/files/plot_imb_pingpong.py | 111 ++-- ansible/roles/hpctests/library/hpl_pq.py | 40 +- .../roles/hpctests/library/plot_nxnlatbw.py | 185 +++++-- .../hpctests/library/read_imb_pingpong.py | 44 +- .../roles/hpctests/library/slurm_node_info.py | 47 +- ansible/roles/hpctests/meta/main.yml | 4 +- ansible/roles/hpctests/tasks/build-hpl.yml | 48 +- ansible/roles/hpctests/tasks/hpl-solo.yml | 60 +- ansible/roles/hpctests/tasks/main.yml | 25 +- ansible/roles/hpctests/tasks/pingmatrix.yml | 44 +- ansible/roles/hpctests/tasks/pingpong.yml | 37 +- ansible/roles/hpctests/tasks/setup.yml | 18 +- ansible/roles/hpctests/tasks/source-hpl.yml | 5 +- .../roles/hpctests/templates/hpl-build.sh.j2 | 0 .../roles/hpctests/templates/hpl-solo.sh.j2 | 0 .../roles/hpctests/templates/pingmatrix.sh.j2 | 0 .../roles/hpctests/templates/pingpong.sh.j2 | 0 ansible/roles/k3s/README.md | 10 +- ansible/roles/k3s/defaults/main.yml | 3 +- 
ansible/roles/k3s/tasks/agent-runtime.yml | 11 +- ansible/roles/k3s/tasks/install.yml | 93 ++-- ansible/roles/k3s/tasks/server-runtime.yml | 22 +- .../k3s/templates/k3s-agent.service.env.j2 | 6 +- .../roles/k3s/templates/k3s.service.env.j2 | 2 +- ansible/roles/k9s/tasks/main.yml | 19 +- ansible/roles/lustre/README.md | 15 +- ansible/roles/lustre/defaults/main.yml | 5 +- ansible/roles/lustre/tasks/configure.yml | 12 +- ansible/roles/lustre/tasks/install.yml | 18 +- ansible/roles/lustre/tasks/validate.yml | 9 +- ansible/roles/mysql/README.md | 28 +- ansible/roles/mysql/defaults/main.yml | 5 +- ansible/roles/mysql/tasks/configure.yml | 39 +- ansible/roles/mysql/tasks/install.yml | 9 +- ansible/roles/mysql/tasks/main.yml | 5 +- ansible/roles/nhc/README.md | 12 +- ansible/roles/nhc/tasks/export.yml | 1 + ansible/roles/nhc/tasks/main.yml | 2 +- ansible/roles/ofed/README.md | 7 +- ansible/roles/ofed/defaults/main.yml | 4 +- ansible/roles/ofed/tasks/install.yml | 36 +- ansible/roles/ofed/tasks/main.yml | 3 +- ansible/roles/openondemand/README.md | 42 +- ansible/roles/openondemand/defaults/main.yml | 34 +- .../files/missing_home_directory.html | 99 ++-- .../openondemand/tasks/codeserver_compute.yml | 9 +- .../openondemand/tasks/config_changes.yml | 3 +- ansible/roles/openondemand/tasks/exporter.yml | 5 +- .../openondemand/tasks/jupyter_compute.yml | 14 +- ansible/roles/openondemand/tasks/main.yml | 51 +- ansible/roles/openondemand/tasks/pam_auth.yml | 14 +- .../openondemand/tasks/rstudio_compute.yml | 7 +- ansible/roles/openondemand/tasks/validate.yml | 3 +- .../roles/openondemand/tasks/vnc_compute.yml | 30 +- ansible/roles/opensearch/defaults/main.yml | 4 +- ansible/roles/opensearch/handlers/main.yml | 3 +- .../roles/opensearch/tasks/archive_data.yml | 4 +- ansible/roles/opensearch/tasks/certs.yml | 3 +- ansible/roles/opensearch/tasks/install.yml | 15 +- .../opensearch/tasks/migrate-opendistro.yml | 3 +- ansible/roles/opensearch/tasks/runtime.yml | 44 +- ansible/roles/passwords/defaults/main.yml | 3 + ansible/roles/passwords/tasks/main.yml | 4 +- ansible/roles/passwords/tasks/validate.yml | 3 +- .../roles/persist_hostkeys/defaults/main.yml | 1 + ansible/roles/persist_hostkeys/tasks/main.yml | 66 +-- .../persist_openhpc_secrets/tasks/main.yml | 14 +- ansible/roles/podman/defaults/main.yml | 1 + ansible/roles/podman/tasks/configure.yml | 17 +- ansible/roles/podman/tasks/install.yml | 4 +- ansible/roles/podman/tasks/main.yml | 4 +- ansible/roles/proxy/defaults/main.yml | 1 + ansible/roles/proxy/tasks/main.yml | 23 +- ansible/roles/pulp_site/README.md | 33 +- .../filter_plugins/pulp-list-filters.py | 85 +-- ansible/roles/pulp_site/tasks/install.yml | 36 +- ansible/roles/pulp_site/tasks/sync.yml | 41 +- ansible/roles/rebuild/README.md | 11 +- ansible/roles/rebuild/defaults/main.yml | 6 +- ansible/roles/rebuild/tasks/configure.yml | 4 +- ansible/roles/rebuild/tasks/install.yml | 2 +- ansible/roles/rebuild/tasks/main.yml | 4 +- ansible/roles/rebuild/tasks/rebuild.yml | 6 +- .../roles/rebuild/tasks/rebuild_partition.yml | 7 +- ansible/roles/resolv_conf/README.md | 2 + ansible/roles/resolv_conf/defaults/main.yml | 1 + ansible/roles/resolv_conf/tasks/main.yml | 3 +- ansible/roles/slurm_exporter/README.md | 41 +- .../roles/slurm_exporter/defaults/main.yml | 4 +- .../roles/slurm_exporter/handlers/main.yml | 2 +- .../roles/slurm_exporter/tasks/configure.yml | 2 +- .../roles/slurm_exporter/tasks/install.yml | 12 +- ansible/roles/slurm_exporter/tasks/main.yml | 4 +- ansible/roles/slurm_recompile/README.md | 16 
+- ansible/roles/slurm_recompile/tasks/main.yml | 16 +- ansible/roles/slurm_stats/README.md | 24 +- ansible/roles/slurm_stats/tasks/configure.yml | 8 +- ansible/roles/slurm_stats/tasks/install.yml | 2 +- ansible/roles/slurm_stats/tasks/main.yml | 4 +- ansible/roles/slurm_tools/README.md | 8 +- ansible/roles/slurm_tools/tasks/main.yml | 32 +- ansible/roles/squid/README.md | 2 +- ansible/roles/squid/defaults/main.yml | 5 +- ansible/roles/squid/handlers/main.yml | 3 +- ansible/roles/squid/tasks/configure.yml | 10 +- ansible/roles/squid/tasks/install.yml | 3 +- ansible/roles/squid/tasks/main.yml | 5 +- ansible/roles/sshd/defaults/main.yml | 1 + ansible/roles/sshd/handlers/main.yml | 3 +- ansible/roles/sshd/tasks/configure.yml | 11 +- ansible/roles/sshd/tasks/export.yml | 3 +- ansible/roles/sshd/tasks/main.yml | 3 +- ansible/roles/sssd/README.md | 1 - ansible/roles/sssd/defaults/main.yml | 1 + ansible/roles/sssd/handlers/main.yml | 3 +- ansible/roles/sssd/tasks/configure.yml | 20 +- ansible/roles/sssd/tasks/export.yml | 5 +- ansible/roles/sssd/tasks/install.yml | 7 +- ansible/roles/sssd/tasks/main.yml | 5 +- ansible/roles/systemd/README.md | 19 +- ansible/roles/systemd/defaults/main.yml | 3 +- ansible/roles/systemd/tasks/main.yml | 13 +- ansible/roles/topology/README.md | 34 +- ansible/roles/topology/defaults/main.yml | 1 - ansible/roles/topology/library/map_hosts.py | 35 +- ansible/roles/topology/tasks/main.yml | 2 +- ansible/roles/tuned/README.md | 7 +- ansible/roles/tuned/defaults/main.yml | 2 +- ansible/roles/tuned/tasks/configure.yml | 2 +- ansible/roles/tuned/tasks/install.yml | 3 +- ansible/roles/tuned/tasks/main.yml | 4 +- ansible/roles/zenith_proxy/defaults/main.yml | 8 +- .../files/podman-pod-infra-attach.sh | 2 +- ansible/roles/zenith_proxy/tasks/main.yml | 42 +- ansible/site.yml | 27 +- ansible/slurm.yml | 37 +- ansible/validate.yml | 40 +- cookiecutter/cookiecutter.json | 4 +- .../{{cookiecutter.environment}}/README.md | 2 +- .../inventory/group_vars/all/basic_users.yml | 1 + .../inventory/group_vars/all/hpctests.yml | 1 + .../{{cookiecutter.environment}}/tofu/main.tf | 32 +- dev/ansible-ssh | 24 +- dev/delete-cluster.py | 47 +- dev/extract_logs.py | 74 ++- dev/image-share.sh | 12 +- dev/output_manifest.py | 25 +- dev/setup-env.sh | 42 +- docs/README.md | 8 +- docs/adding-functionality.md | 3 +- docs/alerting.md | 73 +-- docs/chrony.md | 3 +- docs/ci.md | 3 +- docs/environments.md | 15 +- docs/experimental/compute-init.md | 12 +- docs/experimental/isolated-clusters.md | 144 ++--- docs/experimental/pulp.md | 15 +- docs/experimental/slurm-controlled-rebuild.md | 272 ++++----- docs/filesystems.md | 84 +-- docs/image-build.md | 76 +-- docs/k3s.README.md | 8 +- docs/mig.md | 55 +- docs/monitoring-and-logging.md | 67 ++- docs/networks.md | 9 +- docs/openondemand.md | 32 +- docs/operations.md | 122 +++-- docs/persistent-state.md | 2 + docs/production.md | 269 ++++----- docs/sequence.md | 11 +- docs/site/README.md | 3 +- docs/upgrades.md | 97 ++-- environments/.caas/README.md | 11 +- environments/.caas/hooks/post.yml | 16 +- environments/.caas/hooks/pre.yml | 38 +- .../inventory/group_vars/all/basic_users.yml | 1 + .../inventory/group_vars/all/cluster.yml | 1 + .../inventory/group_vars/all/grafana.yml | 1 + .../inventory/group_vars/all/hpctests.yml | 3 +- .../.caas/inventory/group_vars/all/manila.yml | 5 +- .../.caas/inventory/group_vars/all/nfs.yml | 5 +- .../inventory/group_vars/all/openhpc.yml | 1 + .../inventory/group_vars/all/openondemand.yml | 1 - 
.../.caas/inventory/group_vars/all/zenith.yml | 1 + .../.caas/inventory/group_vars/openstack.yml | 1 + .../ui-meta/slurm-infra-fast-volume-type.yml | 13 +- .../.caas/ui-meta/slurm-infra-manila-home.yml | 12 +- environments/.caas/ui-meta/slurm-infra.yml | 12 +- .../.stackhpc/hooks/post-bootstrap.yml | 8 +- environments/.stackhpc/hooks/pre.yml | 9 +- .../inventory/group_vars/all/basic_users.yml | 3 + .../inventory/group_vars/all/bastion.yml | 1 + .../inventory/group_vars/all/freeipa.yml | 1 + .../inventory/group_vars/all/hpctests.yml | 1 + .../inventory/group_vars/all/manila.yml | 1 + .../inventory/group_vars/all/openhpc.yml | 1 + .../inventory/group_vars/all/openondemand.yml | 7 +- .../inventory/group_vars/all/podman.yml | 1 + .../inventory/group_vars/all/tuned.yml | 1 + .../inventory/group_vars/builder.yml | 3 +- .../tofu/cluster_image.auto.tfvars.json | 8 +- environments/.stackhpc/tofu/main.tf | 103 ++-- environments/README.md | 14 +- .../common/files/filebeat/filebeat.yml | 1 + .../inventory/group_vars/all/alertmanager.yml | 4 +- .../inventory/group_vars/all/ansible_init.yml | 1 + .../inventory/group_vars/all/basic_users.yml | 3 +- .../inventory/group_vars/all/defaults.yml | 107 ++-- .../inventory/group_vars/all/filebeat.yml | 2 +- .../inventory/group_vars/all/firewalld.yml | 5 +- .../group_vars/all/freeipa_server.yml | 1 + .../inventory/group_vars/all/grafana.yml | 12 +- .../common/inventory/group_vars/all/k3s.yml | 1 + .../inventory/group_vars/all/manila.yml | 1 + .../common/inventory/group_vars/all/mysql.yml | 2 +- .../common/inventory/group_vars/all/nfs.yml | 12 +- .../inventory/group_vars/all/openhpc.yml | 13 +- .../inventory/group_vars/all/openondemand.yml | 48 +- .../inventory/group_vars/all/podman.yml | 1 + .../inventory/group_vars/all/prometheus.yml | 67 +-- .../common/inventory/group_vars/all/proxy.yml | 1 + .../common/inventory/group_vars/all/pulp.yml | 1 + .../group_vars/all/slurm_exporter.yml | 5 +- .../common/inventory/group_vars/all/squid.yml | 1 + .../common/inventory/group_vars/all/sshd.yaml | 1 + .../inventory/group_vars/all/systemd.yml | 1 + .../inventory/group_vars/all/update.yml | 7 +- .../site/inventory/group_vars/all/grafana.yml | 3 +- .../group_vars/all/vault_alertmanager.yml | 2 +- environments/site/tofu/additional.tf | 46 +- environments/site/tofu/baremetal-node-list.py | 34 +- environments/site/tofu/compute.tf | 50 +- environments/site/tofu/control.tf | 56 +- environments/site/tofu/data.tf | 3 +- environments/site/tofu/inventory.tf | 27 +- environments/site/tofu/login.tf | 50 +- environments/site/tofu/main.tf | 2 +- environments/site/tofu/network.tf | 8 +- environments/site/tofu/node_group/main.tf | 2 +- environments/site/tofu/node_group/network.tf | 4 +- environments/site/tofu/node_group/nodes.tf | 116 ++-- .../site/tofu/node_group/variables.tf | 193 +++---- .../site/tofu/read-inventory-secrets.py | 49 +- environments/site/tofu/variables.tf | 517 +++++++++--------- environments/site/tofu/volumes.tf | 60 +- packer/openhpc_extravars.yml | 3 +- requirements.yml | 1 - super-linter.env | 27 + 389 files changed, 5025 insertions(+), 4013 deletions(-) create mode 100644 .ansible-lint.yml create mode 100644 .checkov.yaml create mode 100644 .editorconfig mode change 100644 => 100755 .github/bin/create-merge-branch.sh mode change 100644 => 100755 .github/bin/get-s3-image.sh create mode 120000 .github/linters/.checkov.yaml create mode 120000 .github/linters/.python-lint create mode 120000 .github/linters/.shellcheckrc create mode 120000 .github/linters/.yamllint.yml 
create mode 120000 .github/linters/actionlint.yml create mode 100644 .github/workflows/lint.yml create mode 100644 .github/workflows/main.yml create mode 100644 .python-lint create mode 100644 .shellcheckrc create mode 100644 .yamllint.yml create mode 100644 actionlint.yml create mode 100644 ansible/roles/hpctests/files/.clang-format-ignore create mode 100644 ansible/roles/hpctests/files/CPPLINT.cfg mode change 100644 => 100755 ansible/roles/hpctests/templates/hpl-build.sh.j2 mode change 100644 => 100755 ansible/roles/hpctests/templates/hpl-solo.sh.j2 mode change 100644 => 100755 ansible/roles/hpctests/templates/pingmatrix.sh.j2 mode change 100644 => 100755 ansible/roles/hpctests/templates/pingpong.sh.j2 mode change 100644 => 100755 ansible/roles/zenith_proxy/files/podman-pod-infra-attach.sh create mode 100644 super-linter.env diff --git a/.ansible-lint.yml b/.ansible-lint.yml new file mode 100644 index 0000000..97d3b68 --- /dev/null +++ b/.ansible-lint.yml @@ -0,0 +1,24 @@ +--- +skip_list: + - role-name + # Unresolved issues with parsing jinja in multiline strings + # https://github.com/ansible/ansible-lint/issues/3935 + - jinja[spacing] + - galaxy[no-changelog] + - meta-runtime[unsupported-version] + +warn_list: + - name[missing] + - name[play] + - var-naming + +exclude_paths: + - actionlint.yml + - .ansible/ + - .github/ + # Rule 'syntax-check' is unskippable, you cannot use it in 'skip_list' or 'warn_list'. + # It breaks the parser which takes place before the linter, the only option is to exclude the file. + - ansible/roles/filebeat/tasks/runtime.yml + - environments/common/files/filebeat/filebeat.yml + # Rule 'load-failure[filenotfounderror]' is also unskippable + - ansible/roles/compute_init/files/compute-init.yml diff --git a/.checkov.yaml b/.checkov.yaml new file mode 100644 index 0000000..ef0fb8b --- /dev/null +++ b/.checkov.yaml @@ -0,0 +1,4 @@ +--- +skip-check: + # Requires all blocks to have rescue: - not considered appropriate + - CKV2_ANSIBLE_3 diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..ab1e657 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,8 @@ +# The is primarily used to alter the behaviour of linters executed by super-linter. +# See https://editorconfig.org/ + +# shfmt will default to indenting shell scripts with tabs, +# define the indent as 2 spaces +[{.github/bin,dev}/*.sh] +indent_style = space +indent_size = 2 diff --git a/.github/bin/create-merge-branch.sh b/.github/bin/create-merge-branch.sh old mode 100644 new mode 100755 index d76fe45..af1684d --- a/.github/bin/create-merge-branch.sh +++ b/.github/bin/create-merge-branch.sh @@ -44,7 +44,7 @@ if git show-branch "remotes/origin/$BRANCH_NAME" >/dev/null 2>&1; then fi echo "[INFO] Merging release tag - $RELEASE_TAG" -git merge --strategy recursive -X theirs --no-commit $RELEASE_TAG +git merge --strategy recursive -X theirs --no-commit "$RELEASE_TAG" # Check if the merge resulted in any changes being staged if [ -n "$(git status --short)" ]; then @@ -54,7 +54,7 @@ if [ -n "$(git status --short)" ]; then # NOTE(scott): The GitHub create-pull-request action does # the commiting for us, so we only need to make branches # and commits if running outside of GitHub actions. - if [ ! $GITHUB_ACTIONS ]; then + if [ ! "$GITHUB_ACTIONS" ]; then echo "[INFO] Checking out temporary branch '$BRANCH_NAME'..." 
git checkout -b "$BRANCH_NAME" @@ -74,8 +74,8 @@ if [ -n "$(git status --short)" ]; then # Write a file containing the branch name and tag # for automatic PR or MR creation that follows - echo "BRANCH_NAME=\"$BRANCH_NAME\"" > .mergeenv - echo "RELEASE_TAG=\"$RELEASE_TAG\"" >> .mergeenv + echo "BRANCH_NAME=\"$BRANCH_NAME\"" >.mergeenv + echo "RELEASE_TAG=\"$RELEASE_TAG\"" >>.mergeenv else echo "[INFO] Merge resulted in no changes" -fi \ No newline at end of file +fi diff --git a/.github/bin/get-s3-image.sh b/.github/bin/get-s3-image.sh old mode 100644 new mode 100755 index 98b9131..dc0c816 --- a/.github/bin/get-s3-image.sh +++ b/.github/bin/get-s3-image.sh @@ -13,14 +13,14 @@ echo "Checking if image $image_name exists in OpenStack" image_exists=$(openstack image list --name "$image_name" -f value -c Name) if [ -n "$image_exists" ]; then - echo "Image $image_name already exists in OpenStack." + echo "Image $image_name already exists in OpenStack." else - echo "Image $image_name not found in OpenStack. Getting it from S3." + echo "Image $image_name not found in OpenStack. Getting it from S3." - wget https://leafcloud.store/swift/v1/AUTH_f39848421b2747148400ad8eeae8d536/$bucket_name/$image_name --progress=dot:giga + wget "https://leafcloud.store/swift/v1/AUTH_f39848421b2747148400ad8eeae8d536/$bucket_name/$image_name" --progress=dot:giga - echo "Uploading image $image_name to OpenStack..." - openstack image create --file $image_name --disk-format qcow2 $image_name --progress + echo "Uploading image $image_name to OpenStack..." + openstack image create --file "$image_name" --disk-format qcow2 "$image_name" --progress - echo "Image $image_name has been uploaded to OpenStack." -fi \ No newline at end of file + echo "Image $image_name has been uploaded to OpenStack." +fi diff --git a/.github/linters/.checkov.yaml b/.github/linters/.checkov.yaml new file mode 120000 index 0000000..2cc8ad8 --- /dev/null +++ b/.github/linters/.checkov.yaml @@ -0,0 +1 @@ +../../.checkov.yaml \ No newline at end of file diff --git a/.github/linters/.python-lint b/.github/linters/.python-lint new file mode 120000 index 0000000..d0b7471 --- /dev/null +++ b/.github/linters/.python-lint @@ -0,0 +1 @@ +../../.python-lint \ No newline at end of file diff --git a/.github/linters/.shellcheckrc b/.github/linters/.shellcheckrc new file mode 120000 index 0000000..3f34501 --- /dev/null +++ b/.github/linters/.shellcheckrc @@ -0,0 +1 @@ +../../.shellcheckrc \ No newline at end of file diff --git a/.github/linters/.yamllint.yml b/.github/linters/.yamllint.yml new file mode 120000 index 0000000..54a3654 --- /dev/null +++ b/.github/linters/.yamllint.yml @@ -0,0 +1 @@ +../../.yamllint.yml \ No newline at end of file diff --git a/.github/linters/actionlint.yml b/.github/linters/actionlint.yml new file mode 120000 index 0000000..766b4e9 --- /dev/null +++ b/.github/linters/actionlint.yml @@ -0,0 +1 @@ +../../actionlint.yml \ No newline at end of file diff --git a/.github/workflows/extra.yml b/.github/workflows/extra.yml index f18e380..1941064 100644 --- a/.github/workflows/extra.yml +++ b/.github/workflows/extra.yml @@ -1,30 +1,23 @@ +--- + +# Test building extra images on OpenStack. +# This workflow can run standalone or as part of the main CI workflow. +# See the workflow file 'main.yml' for how this is CI triggered. 
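+# For reference, a sketch of the call site defined in main.yml later in this
+# patch (the 'if:' gating on changed files is omitted here for brevity):
+#
+#   extra:
+#     needs: files_changed
+#     uses: ./.github/workflows/extra.yml
+#     secrets: inherit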
+ name: Test extra build on: + workflow_call: workflow_dispatch: - push: - branches: - - main - paths: - - 'environments/.stackhpc/tofu/cluster_image.auto.tfvars.json' - - 'ansible/roles/doca/**' - - 'ansible/roles/cuda/**' - - 'ansible/roles/slurm_recompile/**' # runs on cuda group - - 'ansible/roles/lustre/**' - - '.github/workflows/extra.yml' - pull_request: - paths: - - 'environments/.stackhpc/tofu/cluster_image.auto.tfvars.json' - - 'ansible/roles/doca/**' - - 'ansible/roles/cuda/**' - - 'ansible/roles/lustre/**' - - '.github/workflows/extra.yml' + +permissions: + contents: read + packages: write + # To report GitHub Actions status checks + statuses: write jobs: doca: name: extra-build - concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS - cancel-in-progress: true runs-on: ubuntu-22.04 strategy: fail-fast: false # allow other matrix jobs to continue even if one fails @@ -46,7 +39,7 @@ jobs: PACKER_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Load current fat images into GITHUB_ENV # see https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#example-of-a-multiline-string @@ -60,7 +53,7 @@ jobs: - name: Record settings run: | echo CI_CLOUD: ${{ env.CI_CLOUD }} - echo FAT_IMAGES: ${FAT_IMAGES} + echo "FAT_IMAGES: ${FAT_IMAGES}" - name: Setup ssh run: | @@ -99,7 +92,7 @@ jobs: PACKER_LOG=1 packer build \ -on-error=${{ vars.PACKER_ON_ERROR }} \ - -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ + -var-file="$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl" \ -var "source_image_name=${{ fromJSON(env.FAT_IMAGES)['cluster_image'][matrix.build.source_image_name_key] }}" \ -var "image_name=${{ matrix.build.image_name }}" \ -var "inventory_groups=${{ matrix.build.inventory_groups }}" \ @@ -111,14 +104,14 @@ jobs: run: | . venv/bin/activate IMAGE_ID=$(jq --raw-output '.builds[-1].artifact_id' packer/packer-manifest.json) - while ! openstack image show -f value -c name $IMAGE_ID; do + while ! openstack image show -f value -c name "$IMAGE_ID"; do sleep 5 done - IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID) + IMAGE_NAME=$(openstack image show -f value -c name "$IMAGE_ID") echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT" echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT" - echo $IMAGE_ID > image-id.txt - echo $IMAGE_NAME > image-name.txt + echo "$IMAGE_ID" > image-id.txt + echo "$IMAGE_NAME" > image-name.txt - name: Make image usable for further builds run: | diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 51ea29a..407bd44 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -1,6 +1,7 @@ name: Build fat image on: workflow_dispatch: + # checkov:skip=CKV_GHA_7: "The build output cannot be affected by user parameters other than the build entry point and the top-level source location. GitHub Actions workflow_dispatch inputs MUST be empty. 
" inputs: ci_cloud: description: 'Select the CI_CLOUD' @@ -16,6 +17,12 @@ on: required: true default: true +permissions: + contents: read + packages: write + # To report GitHub Actions status checks + statuses: write + jobs: openstack: name: openstack-imagebuild @@ -42,7 +49,7 @@ jobs: PACKER_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Record settings for CI cloud run: | @@ -85,7 +92,7 @@ jobs: PACKER_LOG=1 packer build \ -on-error=${{ github.event.inputs.cleanup_on_failure && 'cleanup' || 'abort' }} \ - -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ + -var-file="$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl" \ -var "source_image_name=${{ matrix.build.source_image_name }}" \ -var "image_name=${{ matrix.build.image_name }}" \ -var "inventory_groups=${{ matrix.build.inventory_groups }}" \ @@ -96,14 +103,14 @@ jobs: run: | . venv/bin/activate IMAGE_ID=$(jq --raw-output '.builds[-1].artifact_id' packer/packer-manifest.json) - while ! openstack image show -f value -c name $IMAGE_ID; do + while ! openstack image show -f value -c name "$IMAGE_ID"; do sleep 5 done - IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID) + IMAGE_NAME=$(openstack image show -f value -c name "$IMAGE_ID") echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT" echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT" - echo $IMAGE_ID > image-id.txt - echo $IMAGE_NAME > image-name.txt + echo "$IMAGE_ID" > image-id.txt + echo "$IMAGE_NAME" > image-name.txt - name: Make image usable for further builds run: | diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000..d824577 --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,49 @@ +--- +name: Lint + +on: # yamllint disable-line rule:truthy + workflow_call: + +permissions: + contents: read + packages: read + # To report GitHub Actions status checks + statuses: write + +jobs: + lint: + name: Lint + runs-on: ubuntu-latest + permissions: + contents: read + packages: read + # To report GitHub Actions status checks + statuses: write + + steps: + - uses: actions/checkout@v4 + with: + # super-linter needs the full git history to get the + # list of files that changed across commits + fetch-depth: 0 + submodules: true + + - name: Run ansible-lint + uses: ansible/ansible-lint@v25.4.0 + env: + ANSIBLE_COLLECTIONS_PATH: .ansible/collections + + - name: Load super-linter configuration + # Use grep inverse matching to exclude eventual comments in the .env file + # because the GitHub Actions command to set environment variables doesn't + # support comments. + # yamllint disable-line rule:line-length + # Ref: https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#setting-an-environment-variable + run: grep -v '^#' super-linter.env >> "$GITHUB_ENV" + if: always() + + - name: Run super-linter + uses: super-linter/super-linter@v7.3.0 + if: always() + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml new file mode 100644 index 0000000..5e2ccc7 --- /dev/null +++ b/.github/workflows/main.yml @@ -0,0 +1,149 @@ +--- + +# This file governs the main CI workflow. +# It's the only workflow triggered on push and pull requests, +# it manages the CI workflow as follows: +# 1. Lint the code aborting the workflow if there are linting errors. +# 2. Determine which files have changed and set job outputs accordingly. 
+# 3. Conditionally run the other workflows based on the changed files: +# * stackhpc.yml +# * extra.yml +# * trivyscan.yml + +name: Test on push and pull request + +permissions: + actions: write + contents: read + packages: write + # To report GitHub Actions status checks + statuses: write + id-token: write + +on: + push: + branches: + - main + pull_request: + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref }} + cancel-in-progress: true + +jobs: + lint: + name: Lint + uses: ./.github/workflows/lint.yml + + files_changed: + name: Determine files changed + needs: lint + runs-on: ubuntu-latest + # Map a step output to a job output, this allows other jobs to be gated on the filter results + outputs: + # The 'stackhpc' output will be 'true' if either of the two stackhpc filters below matched + stackhpc: ${{ toJson(fromJson(steps.filter_on_every.outputs.stackhpc) || fromJson(steps.filter_on_some.outputs.stackhpc)) }} + extra_on_push: ${{ steps.filter_on_some.outputs.extra_on_push }} + extra_on_pull_request: ${{ steps.filter_on_some.outputs.extra_on_pull_request }} + trivyscan: ${{ steps.filter_on_some.outputs.trivyscan }} + steps: + - name: Checkout + uses: actions/checkout@v4 + + # NOTE: We're detecting the changed files within a job so that we can gate execution of other jobs. + # We use dorny/paths-filter which doesn't work like the conventional 'paths' and 'paths_exclude', + # we can't do the following: + # paths: + # - '**' + # - '!dev/**' + # - 'dev/setup-env.sh' + # + # Which would include all files whilst removing all "dev/" files except "dev/setup-env.sh". + # We have to use two filters: + # * first filter includes all changed files and removes "dev/" files + # * second filter explicitly adds 'dev/setup-env.sh' + # We use the logical OR of the filters outputs to gate jobs. + + - name: Paths matching on every filter rule + # For safety use the commit of dorny/paths-filter@v3 + uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 + id: filter_on_every + with: + # Filter changed files, 'every' means the file is matched only if it matches all filter rules. + # NOTE: currently seeing: Warning: Unexpected input(s) 'predicate-quantifier', valid inputs are.. + # this can be ignored, filtering works as expected. + predicate-quantifier: 'every' + list-files: 'json' + filters: | + stackhpc: + - '**' + - '!dev/**' + - '!**/*.md' + - '!.gitignore' + - '!.github/workflows/**' + + - name: Paths matching on any filter rule + # For safety use the commit of dorny/paths-filter@v3 + uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 + id: filter_on_some + with: + # Filter changed files, 'some' means the file is matched if any one of the filter rules match. + # NOTE: currently seeing: Warning: Unexpected input(s) 'predicate-quantifier', valid inputs are.. + # this can be ignored, filtering works as expected. 
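+        # The outputs of this filter and 'filter_on_every' are OR-combined into
+        # the 'stackhpc' job output declared above, i.e.:
+        #   stackhpc: ${{ toJson(fromJson(steps.filter_on_every.outputs.stackhpc) || fromJson(steps.filter_on_some.outputs.stackhpc)) }}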
+ predicate-quantifier: 'some' + list-files: 'json' + filters: | + stackhpc: + - 'dev/setup-env.sh' + - '.github/workflows/stackhpc.yml' + extra_on_push: + - 'environments/.stackhpc/tofu/cluster_image.auto.tfvars.json' + - 'ansible/roles/doca/**' + - 'ansible/roles/cuda/**' + - 'ansible/roles/slurm_recompile/**' # runs on cuda group + - 'ansible/roles/lustre/**' + - '.github/workflows/extra.yml' + extra_on_pull_request: + - 'environments/.stackhpc/tofu/cluster_image.auto.tfvars.json' + - 'ansible/roles/doca/**' + - 'ansible/roles/cuda/**' + - 'ansible/roles/lustre/**' + - '.github/workflows/extra.yml' + trivyscan: + - 'environments/.stackhpc/tofu/cluster_image.auto.tfvars.json' + + - name: Paths matched output + # NOTE: This is a debug step, it shows what files were matched by the filters. + # It's useful because dorny/paths-filter doesn't work like the conventional 'paths' and 'paths_exclude' + run: > + echo '{ "stackhpc_every_files": ${{ steps.filter_on_every.outputs.stackhpc_files }} }' | jq -r '.'; + echo '{ "stackhpc_some_files": ${{ steps.filter_on_some.outputs.stackhpc_files }} }' | jq -r '.'; + echo '{ "extra_on_push_files": ${{ steps.filter_on_some.outputs.extra_on_push_files }} }' | jq -r '.'; + echo '{ "extra_on_pull_request_files": ${{ steps.filter_on_some.outputs.extra_on_pull_request_files }} }' | jq -r '.'; + echo '{ "trivyscan_files": ${{ steps.filter_on_some.outputs.trivyscan_files }} }' | jq -r '.' + + stackhpc: + name: Test deployment and reimage on OpenStack + needs: files_changed + if: | + needs.files_changed.outputs.stackhpc == 'true' + uses: ./.github/workflows/stackhpc.yml + secrets: inherit + + extra: + name: Test extra build + needs: files_changed + if: | + github.event_name != 'pull_request' && needs.files_changed.outputs.extra_on_push == 'true' || + github.event_name == 'pull_request' && needs.files_changed.outputs.extra_on_pull_request == 'true' + uses: ./.github/workflows/extra.yml + secrets: inherit + + trivyscan: + name: Trivy scan image for vulnerabilities + needs: files_changed + if: | + github.event_name == 'pull_request' && + needs.files_changed.outputs.trivyscan == 'true' + uses: ./.github/workflows/trivyscan.yml + secrets: inherit diff --git a/.github/workflows/nightly-cleanup.yml b/.github/workflows/nightly-cleanup.yml index 897d357..5bec96d 100644 --- a/.github/workflows/nightly-cleanup.yml +++ b/.github/workflows/nightly-cleanup.yml @@ -4,6 +4,12 @@ on: schedule: - cron: '0 21 * * *' # Run at 9PM - image sync runs at midnight +permissions: + contents: read + packages: write + # To report GitHub Actions status checks + statuses: write + jobs: ci_cleanup: name: ci-cleanup @@ -20,7 +26,7 @@ jobs: OS_CLOUD: openstack CI_CLOUD: ${{ matrix.cloud }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Record which cloud CI is running on run: | @@ -31,7 +37,7 @@ jobs: python3 -m venv venv . 
venv/bin/activate pip install -U pip - pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt) + pip install "$(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt)" shell: bash - name: Write clouds.yaml @@ -52,7 +58,7 @@ jobs: # Flatten multiline value so can be passed as env var CI_CLUSTERS_FORMATTED=$(echo "$CI_CLUSTERS" | tr '\n' ' ' | sed 's/ $//') echo "DEBUG: Formatted CI clusters: $CI_CLUSTERS_FORMATTED" - echo "ci_clusters=$CI_CLUSTERS_FORMATTED" >> $GITHUB_ENV + echo "ci_clusters=$CI_CLUSTERS_FORMATTED" >> "$GITHUB_ENV" fi shell: bash @@ -69,7 +75,7 @@ jobs: echo "Processing cluster: $cluster_prefix" # Get all servers with the matching name for control node - CONTROL_SERVERS=$(openstack server list --name ${cluster_prefix}-control --format json) + CONTROL_SERVERS=$(openstack server list --name "${cluster_prefix}-control" --format json) # Get unique server names to avoid duplicate cleanup UNIQUE_NAMES=$(echo "$CONTROL_SERVERS" | jq -r '.[].Name' | sort | uniq) @@ -86,7 +92,7 @@ jobs: fi echo "Deleting cluster $cluster_prefix (server $server)..." - ./dev/delete-cluster.py $cluster_prefix --force + ./dev/delete-cluster.py "$cluster_prefix" --force done done shell: bash diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml index ea4b242..21e9d64 100644 --- a/.github/workflows/nightlybuild.yml +++ b/.github/workflows/nightlybuild.yml @@ -1,6 +1,7 @@ name: Build nightly image on: workflow_dispatch: + # checkov:skip=CKV_GHA_7: "The build output cannot be affected by user parameters other than the build entry point and the top-level source location. GitHub Actions workflow_dispatch inputs MUST be empty. " inputs: ci_cloud: description: 'Select the CI_CLOUD' @@ -13,6 +14,12 @@ on: # schedule: # - cron: '0 0 * * *' # Run at midnight on default branch +permissions: + contents: read + packages: write + # To report GitHub Actions status checks + statuses: write + jobs: openstack: name: openstack-imagebuild @@ -39,7 +46,7 @@ jobs: PACKER_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Record settings for CI cloud run: | @@ -81,8 +88,8 @@ jobs: packer init . PACKER_LOG=1 packer build \ - -on-error=${{ vars.PACKER_ON_ERROR }} \ - -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \ + -on-error="${{ vars.PACKER_ON_ERROR }}" \ + -var-file="$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl" \ -var "source_image_name=${{ matrix.build.source_image_name }}" \ -var "image_name=${{ matrix.build.image_name }}" \ -var "image_name_version=" \ @@ -94,10 +101,10 @@ jobs: run: | . venv/bin/activate IMAGE_ID=$(jq --raw-output '.builds[-1].artifact_id' packer/packer-manifest.json) - while ! openstack image show -f value -c name $IMAGE_ID; do + while ! openstack image show -f value -c name "$IMAGE_ID"; do sleep 5 done - IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID) + IMAGE_NAME=$(openstack image show -f value -c name "$IMAGE_ID") echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT" echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT" @@ -142,7 +149,7 @@ jobs: SOURCE_CLOUD: ${{ github.event.inputs.ci_cloud || vars.CI_CLOUD }} TARGET_CLOUD: ${{ matrix.target_cloud }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Record settings for CI cloud run: | @@ -154,7 +161,7 @@ jobs: python3 -m venv venv . 
venv/bin/activate pip install -U pip - pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt) + pip install "$(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt)" - name: Write clouds.yaml run: | diff --git a/.github/workflows/release-image.yml b/.github/workflows/release-image.yml index 8fcddf5..1ee545c 100644 --- a/.github/workflows/release-image.yml +++ b/.github/workflows/release-image.yml @@ -6,6 +6,13 @@ on: - published # should work for both pre-releases and releases env: IMAGE_PATH: environments/.stackhpc/tofu/cluster_image.auto.tfvars.json + +permissions: + contents: read + packages: write + # To report GitHub Actions status checks + statuses: write + jobs: ci-image-release: name: ci-image-release @@ -18,7 +25,7 @@ jobs: - RL8 - RL9 steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Write s3cmd configuration run: echo "${{ secrets.LEAFCLOUD_S3_CFG }}" > ~/.s3cfg diff --git a/.github/workflows/s3-image-sync.yml b/.github/workflows/s3-image-sync.yml index 990125f..43adf50 100644 --- a/.github/workflows/s3-image-sync.yml +++ b/.github/workflows/s3-image-sync.yml @@ -10,6 +10,12 @@ env: S3_BUCKET: openhpc-images-prerelease IMAGE_PATH: environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +permissions: + contents: read + packages: write + # To report GitHub Actions status checks + statuses: write + jobs: s3_cleanup: runs-on: ubuntu-22.04 @@ -17,7 +23,7 @@ jobs: strategy: fail-fast: false steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Write s3cmd configuration run: | @@ -50,7 +56,7 @@ jobs: outputs: ci_cloud: ${{ steps.ci.outputs.CI_CLOUD }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Record which cloud CI is running on id: ci @@ -62,7 +68,7 @@ jobs: python3 -m venv venv . venv/bin/activate pip install -U pip - pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt) + pip install "$(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt)" shell: bash - name: Write clouds.yaml @@ -138,7 +144,7 @@ jobs: OS_CLOUD: openstack CI_CLOUD: ${{ matrix.cloud }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Record which cloud CI is running on run: | @@ -149,7 +155,7 @@ jobs: python3 -m venv venv . venv/bin/activate pip install -U pip - pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt) + pip install "$(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt)" shell: bash - name: Write clouds.yaml @@ -175,7 +181,7 @@ jobs: image_hanging=$(openstack image list --name ${{ env.TARGET_IMAGE }} -f value -c ID -c Status | grep -v ' active$' | awk '{print $1}') if [ -n "$image_hanging" ]; then echo "Cleaning up OpenStack image with ID: $image_hanging" - openstack image delete $image_hanging + openstack image delete "$image_hanging" else echo "No image ID found, skipping cleanup." fi diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml index da4933b..cb4e865 100644 --- a/.github/workflows/stackhpc.yml +++ b/.github/workflows/stackhpc.yml @@ -1,33 +1,23 @@ +--- + +# Test deployment and reimage on OpenStack. +# This workflow can run standalone or as part of the main CI workflow. +# See the workflow file 'main.yml' for how this is CI triggered. 
name: Test deployment and reimage on OpenStack on: + workflow_call: workflow_dispatch: - push: - branches: - - main - paths: - - '**' - - '!dev/**' - - 'dev/setup-env.sh' - - '!**.md' - - '!.gitignore' - - '!.github/workflows/' - - '.github/workflows/stackhpc' - pull_request: - paths: - - '**' - - '!dev/**' - - 'dev/setup-env.sh' - - '!**.md' - - '!.gitignore' - - '!.github/workflows/' - - '.github/workflows/stackhpc' + +permissions: + contents: read + packages: write + # To report GitHub Actions status checks + statuses: write + jobs: openstack: name: openstack-ci - concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os_version }} # to branch/PR + OS - cancel-in-progress: true runs-on: ubuntu-22.04 strategy: fail-fast: false # allow other matrix jobs to continue even if one fails @@ -46,7 +36,7 @@ jobs: - name: Find the latest release run: | - echo LATEST_RELEASE_TAG=$(curl -s https://api.github.com/repos/stackhpc/ansible-slurm-appliance/releases/latest | jq -r .tag_name) >> "$GITHUB_ENV" + echo "LATEST_RELEASE_TAG=$(curl -s https://api.github.com/repos/stackhpc/ansible-slurm-appliance/releases/latest | jq -r .tag_name)" >> "$GITHUB_ENV" - name: Checkout latest release uses: actions/checkout@v4 @@ -59,19 +49,19 @@ jobs: run: | # Iterate over the labels labels=$(echo '${{ toJSON(github.event.pull_request.labels) }}' | jq -r '.[].name') - echo $labels + echo "$labels" for label in $labels; do if [[ $label == CI_CLOUD=* ]]; then # Extract the value after 'CI_CLOUD=' CI_CLOUD_OVERRIDE=${label#CI_CLOUD=} - echo "CI_CLOUD=${CI_CLOUD_OVERRIDE}" >> $GITHUB_ENV + echo "CI_CLOUD=${CI_CLOUD_OVERRIDE}" >> "$GITHUB_ENV" fi done - name: Record debug info run: | - echo LATEST_RELEASE_TAG: $LATEST_RELEASE_TAG - echo CI_CLOUD: $CI_CLOUD + echo "LATEST_RELEASE_TAG: $LATEST_RELEASE_TAG" + echo "CI_CLOUD: $CI_CLOUD" - name: Setup ssh run: | @@ -107,7 +97,7 @@ jobs: run: | . venv/bin/activate . environments/.stackhpc/activate - echo vault_demo_user_password: "$DEMO_USER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml + echo "vault_demo_user_password: $DEMO_USER_PASSWORD" > "$APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml" env: DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} @@ -116,14 +106,14 @@ jobs: run: | . venv/bin/activate . environments/.stackhpc/activate - cd $STACKHPC_TF_DIR + cd "$STACKHPC_TF_DIR" tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" - name: Delete infrastructure if provisioning failed run: | . venv/bin/activate . environments/.stackhpc/activate - cd $STACKHPC_TF_DIR + cd "$STACKHPC_TF_DIR" tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" if: failure() && steps.provision_servers.outcome == 'failure' @@ -159,7 +149,7 @@ jobs: run: | . venv/bin/activate . 
environments/.stackhpc/activate - cd $STACKHPC_TF_DIR + cd "$STACKHPC_TF_DIR" tofu init tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" @@ -205,14 +195,14 @@ jobs: # load ansible variables into shell: ansible-playbook ansible/ci/output_vars.yml \ -e output_vars_hosts=openondemand \ - -e output_vars_path=$APPLIANCES_ENVIRONMENT_ROOT/vars.txt \ + -e output_vars_path="$APPLIANCES_ENVIRONMENT_ROOT/vars.txt" \ -e output_vars_items=bastion_ip,bastion_user,openondemand_servername - source $APPLIANCES_ENVIRONMENT_ROOT/vars.txt + source "$APPLIANCES_ENVIRONMENT_ROOT/vars.txt" # setup ssh proxying: sudo apt-get --yes install proxychains echo proxychains installed - ssh -v -fN -D 9050 ${bastion_user}@${bastion_ip} + ssh -v -fN -D 9050 "${bastion_user}@${bastion_ip}" echo port 9050 forwarded # check OOD server returns 200: @@ -222,9 +212,9 @@ jobs: --server-response \ --no-check-certificate \ --http-user=demo_user \ - --http-password=${DEMO_USER_PASSWORD} https://${openondemand_servername} \ + --http-password="${DEMO_USER_PASSWORD}" "https://${openondemand_servername}" \ 2>&1) - (echo $statuscode | grep "200 OK") || (echo $statuscode && exit 1) + (echo "$statuscode" | grep "200 OK") || (echo "$statuscode" && exit 1) env: DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }} @@ -234,14 +224,14 @@ jobs: . environments/.stackhpc/activate if [ -n "$SNAPSHOT" ] then - echo Deleting $SNAPSHOT - openstack volume snapshot delete $SNAPSHOT + echo "Deleting $SNAPSHOT" + openstack volume snapshot delete "$SNAPSHOT" fi - name: Delete infrastructure run: | . venv/bin/activate . environments/.stackhpc/activate - cd $STACKHPC_TF_DIR + cd "$STACKHPC_TF_DIR" tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" || echo "tofu failed in $STACKHPC_TF_DIR" if: ${{ success() || cancelled() }} diff --git a/.github/workflows/trivyscan.yml b/.github/workflows/trivyscan.yml index fe049e6..8cfc8e4 100644 --- a/.github/workflows/trivyscan.yml +++ b/.github/workflows/trivyscan.yml @@ -1,17 +1,22 @@ +--- + +# Scan the built image for vulnerabilities using Trivy. +# This workflow can run standalone or as part of the main CI workflow. +# See the workflow file 'main.yml' for how this is CI triggered. + name: Trivy scan image for vulnerabilities on: + workflow_call: workflow_dispatch: - pull_request: - branches: - - main - paths: - - 'environments/.stackhpc/tofu/cluster_image.auto.tfvars.json' + +permissions: + contents: read + packages: write + # To report GitHub Actions status checks + statuses: write jobs: scan: - concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build }} # to branch/PR + build - cancel-in-progress: true runs-on: ubuntu-latest strategy: fail-fast: false @@ -23,19 +28,19 @@ jobs: CI_CLOUD: ${{ vars.CI_CLOUD }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Override CI_CLOUD if PR label is present if: ${{ github.event_name == 'pull_request' }} run: | # Iterate over the labels labels=$(echo '${{ toJSON(github.event.pull_request.labels) }}' | jq -r '.[].name') - echo $labels + echo "$labels" for label in $labels; do if [[ $label == CI_CLOUD=* ]]; then # Extract the value after 'CI_CLOUD=' CI_CLOUD_OVERRIDE=${label#CI_CLOUD=} - echo "CI_CLOUD=${CI_CLOUD_OVERRIDE}" >> $GITHUB_ENV + echo "CI_CLOUD=${CI_CLOUD_OVERRIDE}" >> "$GITHUB_ENV" fi done @@ -60,7 +65,7 @@ jobs: python3 -m venv venv . 
venv/bin/activate pip install -U pip - pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt) + pip install "$(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt)" shell: bash - name: Write clouds.yaml diff --git a/.github/workflows/upgrade-check.yml.sample b/.github/workflows/upgrade-check.yml.sample index 39efcd8..eabe973 100644 --- a/.github/workflows/upgrade-check.yml.sample +++ b/.github/workflows/upgrade-check.yml.sample @@ -28,6 +28,13 @@ on: schedule: - cron: "0 9 * * *" workflow_dispatch: + +permissions: + contents: read + packages: write + # To report GitHub Actions status checks + statuses: write + jobs: check_for_update: runs-on: ubuntu-22.04 diff --git a/.github/workflows/upload-release-image.yml.sample b/.github/workflows/upload-release-image.yml.sample index d1f9305..fd7635a 100644 --- a/.github/workflows/upload-release-image.yml.sample +++ b/.github/workflows/upload-release-image.yml.sample @@ -29,6 +29,12 @@ on: - openhpc-images # - openhpc-images-prerelease +permissions: + contents: read + packages: write + # To report GitHub Actions status checks + statuses: write + jobs: image_upload: runs-on: ubuntu-22.04 diff --git a/.gitignore b/.gitignore index d5b752d..6dfeb97 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ venv packer/openhpc2 .vscode requirements.yml.last +.ansible diff --git a/.python-lint b/.python-lint new file mode 100644 index 0000000..7fe8d51 --- /dev/null +++ b/.python-lint @@ -0,0 +1,6 @@ +[MESSAGES CONTROL] + +# There seems to be an issue with the check +# https://github.com/pylint-dev/pylint/issues/214 +disable= + duplicate-code, diff --git a/.shellcheckrc b/.shellcheckrc new file mode 100644 index 0000000..454b8ef --- /dev/null +++ b/.shellcheckrc @@ -0,0 +1,7 @@ +# Configuration file for shellcheck +# https://github.com/koalaman/shellcheck/blob/master/shellcheck.1.md#rc-files + +# Unable to exclude *.sh.j2 files and the ansible parentheses upset shellcheck a lot. +# Lines can be address individually with # shellcheck disable=SCxxxx but this gets quite prolific. +# Disabling globally as we have more sh.j2 files than .sh +disable=SC1009,SC1054,SC1064,SC1065,SC1072,SC1073,SC1083 diff --git a/.yamllint.yml b/.yamllint.yml new file mode 100644 index 0000000..3220260 --- /dev/null +++ b/.yamllint.yml @@ -0,0 +1,24 @@ +--- +extends: default + +rules: + brackets: + forbid: non-empty + comments: + # https://github.com/prettier/prettier/issues/6780 + min-spaces-from-content: 1 + # https://github.com/adrienverge/yamllint/issues/384 + comments-indentation: false + document-start: disable + # 160 chars was the default used by old E204 rule, but + # you can easily change it or disable in your .yamllint file. + line-length: + max: 160 + # We are adding an extra space inside braces as that's how prettier does it + # and we are trying not to fight other linters. 
+ braces: + min-spaces-inside: 0 # yamllint defaults to 0 + max-spaces-inside: 1 # yamllint defaults to 0 + octal-values: + forbid-implicit-octal: true # yamllint defaults to false + forbid-explicit-octal: true # yamllint defaults to false diff --git a/README.md b/README.md index f8503a4..8acd424 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,9 @@ -[![Test deployment and image build on OpenStack](https://github.com/stackhpc/ansible-slurm-appliance/actions/workflows/stackhpc.yml/badge.svg)](https://github.com/stackhpc/ansible-slurm-appliance/actions/workflows/stackhpc.yml) - # StackHPC Slurm Appliance +[![Test deployment and image build on OpenStack](https://github.com/stackhpc/ansible-slurm-appliance/actions/workflows/stackhpc.yml/badge.svg)](https://github.com/stackhpc/ansible-slurm-appliance/actions/workflows/stackhpc.yml) + This repository contains playbooks and configuration to define a Slurm-based HPC environment. This includes: + - [Rocky Linux](https://rockylinux.org/)-based hosts. - [OpenTofu](https://opentofu.org/) configurations to define the cluster's infrastructure-as-code. - Packages for Slurm and MPI software stacks from [OpenHPC](https://openhpc.community/). @@ -22,18 +23,20 @@ While it is tested on OpenStack it should work on any cloud with appropriate Ope ## Demonstration Deployment The default configuration in this repository may be used to create a cluster to explore use of the appliance. It provides: + - Persistent state backed by an OpenStack volume. - NFS-based shared file system backed by another OpenStack volume. It requires an OpenStack cloud, and an Ansible "deploy host" with access to that cloud. Before starting ensure that: + - You have root access on the deploy host. - You can create instances from the [latest Slurm appliance image](https://github.com/stackhpc/ansible-slurm-appliance/releases), which already contains the required packages. This is built and tested in StackHPC's CI. - You have an SSH keypair defined in OpenStack, with the private key available on the deploy host. - Created instances have access to internet (note proxies can be setup through the appliance if necessary). - Created instances have accurate/synchronised time (for VM instances this is usually provided by the hypervisor; if not or for bare metal instances it may be necessary to configure a time service via the appliance). -- Three security groups are present: ``default`` allowing intra-cluster communication, ``SSH`` allowing external access via SSH and ``HTTPS`` allowing access for Open OnDemand. +- Three security groups are present: `default` allowing intra-cluster communication, `SSH` allowing external access via SSH and `HTTPS` allowing access for Open OnDemand. ### Setup deploy host @@ -44,11 +47,13 @@ The following operating systems are supported for the deploy host: These instructions assume the deployment host is running Rocky Linux 8: - sudo yum install -y git python38 - git clone https://github.com/stackhpc/ansible-slurm-appliance - cd ansible-slurm-appliance - git checkout ${latest-release-tag} - ./dev/setup-env.sh +```shell +sudo yum install -y git python38 +git clone https://github.com/stackhpc/ansible-slurm-appliance +cd ansible-slurm-appliance +git checkout ${latest-release-tag} +./dev/setup-env.sh +``` You will also need to install [OpenTofu](https://opentofu.org/docs/intro/install/rpm/). @@ -56,12 +61,16 @@ You will also need to install [OpenTofu](https://opentofu.org/docs/intro/install Run the following from the repository root to activate the venv: - . 
venv/bin/activate +```shell +. venv/bin/activate +``` Use the `cookiecutter` template to create a new environment to hold your configuration: - cd environments - cookiecutter ../cookiecutter +```shell +cd environments +cookiecutter ../cookiecutter +``` and follow the prompts to complete the environment name and description. @@ -69,52 +78,59 @@ and follow the prompts to complete the environment name and description. Go back to the root folder and activate the new environment: - cd .. - . environments/$ENV/activate +```shell +cd .. +. environments/$ENV/activate +``` And generate secrets for it: - ansible-playbook ansible/adhoc/generate-passwords.yml +```shell +ansible-playbook ansible/adhoc/generate-passwords.yml +``` ### Define and deploy infrastructure Create an OpenTofu variables file to define the required infrastructure, e.g.: - # environments/$ENV/tofu/terraform.tfvars: - - cluster_name = "mycluster" - cluster_networks = [ - { - network = "some_network" # * - subnet = "some_subnet" # * - } - ] - key_pair = "my_key" # * - control_node_flavor = "some_flavor_name" - login = { - # Arbitrary group name for these login nodes - interactive = { - nodes: ["login-0"] - flavor: "login_flavor_name" # * - } +```text +# environments/$ENV/tofu/terraform.tfvars: +cluster_name = "mycluster" +cluster_networks = [ + { + network = "some_network" # * + subnet = "some_subnet" # * + } +] +key_pair = "my_key" # * +control_node_flavor = "some_flavor_name" +login = { + # Arbitrary group name for these login nodes + interactive = { + nodes: ["login-0"] + flavor: "login_flavor_name" # * } - cluster_image_id = "rocky_linux_9_image_uuid" - compute = { - # Group name used for compute node partition definition - general = { - nodes: ["compute-0", "compute-1"] - flavor: "compute_flavor_name" # * - } +} +cluster_image_id = "rocky_linux_9_image_uuid" +compute = { + # Group name used for compute node partition definition + general = { + nodes: ["compute-0", "compute-1"] + flavor: "compute_flavor_name" # * } +} +``` Variables marked `*` refer to OpenStack resources which must already exist. The above is a minimal configuration - for all variables and descriptions see `environments/$ENV/tofu/variables.tf`. To deploy this infrastructure, ensure the venv and the environment are [activated](#create-a-new-environment) and run: - export OS_CLOUD=openstack - cd environments/$ENV/tofu/ - tofu init - tofu apply +```shell +export OS_CLOUD=openstack +cd environments/$ENV/tofu/ +tofu init +tofu apply +``` and follow the prompts. Note the OS_CLOUD environment variable assumes that OpenStack credentials are defined using a [clouds.yaml](https://docs.openstack.org/python-openstackclient/latest/configuration/index.html#clouds-yaml) file in a default location with the default cloud name of `openstack`. @@ -122,11 +138,15 @@ and follow the prompts. Note the OS_CLOUD environment variable assumes that Open To configure the appliance, ensure the venv and the environment are [activated](#create-a-new-environment) and run: - ansible-playbook ansible/site.yml +```shell +ansible-playbook ansible/site.yml +``` Once it completes you can log in to the cluster using: - ssh rocky@$login_ip +```shell +ssh rocky@$login_ip +``` where the IP of the login node is given in `environments/$ENV/inventory/hosts.yml` @@ -134,7 +154,27 @@ where the IP of the login node is given in `environments/$ENV/inventory/hosts.ym - `environments/`: See [docs/environments.md](docs/environments.md). - `ansible/`: Contains the ansible playbooks to configure the infrastructure. 
-- `packer/`: Contains automation to use Packer to build machine images for an environment - see the README in this directory for further information. +- `packer/`: Contains automation to use Packer to build machine images for an environment - see the readme in this directory for further information. - `dev/`: Contains development tools. For further information see the [docs](docs/) directory. + +## Developing locally + +To run the GitHub Actions linters locally, use: + +```shell +docker run --rm \ + -e RUN_LOCAL=true \ + --env-file "super-linter.env" \ + -v "$(pwd)":/tmp/lint \ + ghcr.io/super-linter/super-linter:v7.3.0 +``` + +```shell +ANSIBLE_COLLECTIONS_PATH=.ansible/collections \ + ansible-lint -c .ansible-lint.yml +``` + +Specifying `ANSIBLE_COLLECTIONS_PATH` ensures `ansible-lint` downloads collections and roles under the `.ansible` directory, separating them from our own roles under the `ansible` directory. +We exclude these downloaded files from linting by listing `.ansible` under `exclude_paths` in `.ansible-lint.yml`. diff --git a/actionlint.yml b/actionlint.yml new file mode 100644 index 0000000..ed97d53 --- /dev/null +++ b/actionlint.yml @@ -0,0 +1 @@ +--- diff --git a/ansible/adhoc/backup-keytabs.yml b/ansible/adhoc/backup-keytabs.yml index 5566e48..a88daf7 100644 --- a/ansible/adhoc/backup-keytabs.yml +++ b/ansible/adhoc/backup-keytabs.yml @@ -1,11 +1,12 @@ +--- # Use ONE of the following tags on this playbook: # - retrieve: copies keytabs out of the state volume to the environment # - deploy: copies keytabs from the environment to the state volume - hosts: freeipa_client - become: yes - gather_facts: no + become: true + gather_facts: false tasks: - - import_role: + - ansible.builtin.import_role: name: freeipa tasks_from: backup-keytabs.yml diff --git a/ansible/adhoc/cudatests.yml b/ansible/adhoc/cudatests.yml index 59af856..f571f8a 100644 --- a/ansible/adhoc/cudatests.yml +++ b/ansible/adhoc/cudatests.yml @@ -1,8 +1,9 @@ +--- - hosts: cuda - become: yes - gather_facts: yes + become: true + gather_facts: true tags: cuda_samples tasks: - - import_role: + - ansible.builtin.import_role: name: cuda tasks_from: samples.yml diff --git a/ansible/adhoc/deploy-pulp.yml b/ansible/adhoc/deploy-pulp.yml index f7bafc3..11158cb 100644 --- a/ansible/adhoc/deploy-pulp.yml +++ b/ansible/adhoc/deploy-pulp.yml @@ -1,17 +1,19 @@ +--- + - name: Install pulp on server - become: yes + become: true hosts: pulp_server tasks: - - name: Install pulp - ansible.builtin.include_role: - name: pulp_site - tasks_from: install.yml - public: true + - name: Install pulp + ansible.builtin.include_role: + name: pulp_site + tasks_from: install.yml + public: true - - name: Print Pulp endpoint - become: no - debug: - msg: | - Server configured, override 'appliances_pulp_url' with + - name: Print Pulp endpoint + become: false + ansible.builtin.debug: + msg: | + Server configured, override 'appliances_pulp_url' with appliances_pulp_url: "http://{{ hostvars[groups['pulp_server'] | first].ansible_host }}:{{ pulp_site_port }}" - (or the correct IP if multi-homed) in your environments + (or the correct IP if multi-homed) in your environments diff --git a/ansible/adhoc/generate-passwords.yml b/ansible/adhoc/generate-passwords.yml index 89c08f0..f9354f2 100644 --- a/ansible/adhoc/generate-passwords.yml +++ b/ansible/adhoc/generate-passwords.yml @@ -1,9 +1,8 @@ --- - - name: Generate passwords.yml hosts: localhost gather_facts: false tasks: - name: Include password generation role - include_role: - name: passwords \ No 
newline at end of file + ansible.builtin.include_role: + name: passwords diff --git a/ansible/adhoc/hpctests.yml b/ansible/adhoc/hpctests.yml index 6e733d3..5747e7c 100644 --- a/ansible/adhoc/hpctests.yml +++ b/ansible/adhoc/hpctests.yml @@ -3,10 +3,9 @@ # Relies on installed packages in appliance defaults - see openhpc variables. --- - - hosts: hpctests[0] # TODO: might want to make which node is used selectable? become: false gather_facts: false tasks: - - import_role: + - ansible.builtin.import_role: name: hpctests diff --git a/ansible/adhoc/rebuild-via-slurm.yml b/ansible/adhoc/rebuild-via-slurm.yml index 4f7b5a5..33cbe5c 100644 --- a/ansible/adhoc/rebuild-via-slurm.yml +++ b/ansible/adhoc/rebuild-via-slurm.yml @@ -1,3 +1,4 @@ +--- # Rebuild compute nodes via slurm. # Nodes will be rebuilt if `image_id` in inventory is different to the # currently-provisioned image. Otherwise they are rebooted. @@ -9,9 +10,9 @@ - hosts: login run_once: true - gather_facts: no + gather_facts: false tasks: - name: Run slurm-controlled rebuild - import_role: + ansible.builtin.import_role: name: rebuild tasks_from: rebuild.yml diff --git a/ansible/adhoc/rebuild.yml b/ansible/adhoc/rebuild.yml index 9e7a3a7..b6033e4 100644 --- a/ansible/adhoc/rebuild.yml +++ b/ansible/adhoc/rebuild.yml @@ -1,21 +1,24 @@ +--- # Rebuild hosts with a specified image from OpenStack. -# +# # Use ansible's -v output to see output. # Use --limit to control which hosts to rebuild (either specific hosts or the _ groups defining partitions). # Optionally, supply `-e rebuild_image=` to define a specific image, otherwise the current image is reused. # -# NOTE: If a hostvar `instance_id` is defined this is used to select hosts. Otherwise the hostname is used and this must be unique, which may not be the case e.g. if using identically-named staging and production hosts. +# NOTE: If a hostvar `instance_id` is defined this is used to select hosts. +# Otherwise the hostname is used and this must be unique, which may not be the case e.g. if using identically-named staging and production hosts. # # Example: # ansible-playbook -v --limit ohpc_compute ansible/adhoc/rebuild.yml -e rebuild_image=openhpc_v2.3 - hosts: cluster - become: no - gather_facts: no + become: false + gather_facts: false tasks: - - command: "openstack server rebuild {{ instance_id | default(inventory_hostname) }}{% if rebuild_image is defined %} --image {{ rebuild_image }}{% endif %}" + # yamllint disable-line rule:line-length + - ansible.builtin.command: "openstack server rebuild {{ instance_id | default(inventory_hostname) }}{% if rebuild_image is defined %} --image {{ rebuild_image }}{% endif %}" delegate_to: localhost - - wait_for_connection: + changed_when: false + - ansible.builtin.wait_for_connection: delay: 60 timeout: 600 - diff --git a/ansible/adhoc/restart-slurm.yml b/ansible/adhoc/restart-slurm.yml index 41b9dcb..de837f5 100644 --- a/ansible/adhoc/restart-slurm.yml +++ b/ansible/adhoc/restart-slurm.yml @@ -1,3 +1,4 @@ +--- # Restart all slurm daemons e.g. after changing configuration. Note that: # - `scontrol reconfigure` will handle most reconfiguration - see https://slurm.schedmd.com/scontrol.html#OPT_reconfigure # for which options need a restart @@ -5,25 +6,25 @@ # restart daemons as required. 
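# Example usage (a sketch; assumes the environment is already activated):
#   ansible-playbook ansible/adhoc/restart-slurm.yml
# Note `scontrol reconfigure` is sufficient for most configuration changes and
# avoids a full daemon restart - see the link above.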
- hosts: compute,login - become: yes - gather_facts: no + become: true + gather_facts: false tasks: - - service: + - ansible.builtin.service: name: slurmd state: stopped - hosts: control - become: yes - gather_facts: no + become: true + gather_facts: false tasks: - - service: + - ansible.builtin.service: name: slurmctld state: restarted - hosts: compute,login - become: yes - gather_facts: no + become: true + gather_facts: false tasks: - - service: + - ansible.builtin.service: name: slurmd state: started diff --git a/ansible/adhoc/sync-pulp.yml b/ansible/adhoc/sync-pulp.yml index 373f3ab..a3b07ae 100644 --- a/ansible/adhoc/sync-pulp.yml +++ b/ansible/adhoc/sync-pulp.yml @@ -1,3 +1,4 @@ +--- - hosts: localhost tasks: - ansible.builtin.include_role: diff --git a/ansible/adhoc/update-packages.yml b/ansible/adhoc/update-packages.yml index ae970ba..929b0da 100644 --- a/ansible/adhoc/update-packages.yml +++ b/ansible/adhoc/update-packages.yml @@ -1,18 +1,20 @@ +--- - hosts: update - become: yes + become: true gather_facts: false tasks: - name: Update selected packages - yum: + ansible.builtin.dnf: name: "{{ update_name }}" state: "{{ update_state }}" exclude: "{{ update_exclude }}" disablerepo: "{{ update_disablerepo }}" register: updates - name: Log updated packages - copy: + ansible.builtin.copy: content: "{{ updates.results | join('\n') }}" dest: "{{ update_log_path }}" + mode: "0644" delegate_to: localhost - - debug: + - ansible.builtin.debug: msg: "{{ updates.results | length }} changes to packages - see {{ update_log_path }} for details" diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 50d0246..21f9303 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -1,16 +1,15 @@ --- - - hosts: cluster gather_facts: false - become: yes + become: true tasks: - name: Check if ansible-init is installed - stat: + ansible.builtin.stat: path: /etc/systemd/system/ansible-init.service register: _stat_ansible_init_unitfile - + - name: Wait for ansible-init to finish - wait_for: + ansible.builtin.wait_for: path: /var/lib/ansible-init.done timeout: "{{ ansible_init_wait }}" # seconds when: _stat_ansible_init_unitfile.stat.exists @@ -21,7 +20,7 @@ tags: - deprecated tasks: - - fail: + - ansible.builtin.fail: msg: | Variables prefixed secrets_openhpc_* are deprecated - run: $ ansible-playbook ansible/adhoc/generate-passwords.yml @@ -29,34 +28,34 @@ when: "'secrets_openhpc_' in (hostvars[inventory_hostname] | join)" - hosts: resolv_conf - become: yes + become: true gather_facts: false tags: resolv_conf tasks: - - import_role: + - ansible.builtin.import_role: name: resolv_conf - hosts: etc_hosts gather_facts: false tags: etc_hosts - become: yes + become: true tasks: - - import_role: + - ansible.builtin.import_role: name: etc_hosts - hosts: proxy gather_facts: false tags: proxy - become: yes + become: true tasks: - - import_role: + - ansible.builtin.import_role: name: proxy - hosts: chrony tags: chrony - become: yes + become: true tasks: - - import_role: + - ansible.builtin.import_role: name: mrlesmithjr.chrony # skip install tasks as might not have network yet tasks_from: config_chrony.yml @@ -67,53 +66,53 @@ - hosts: cluster gather_facts: false - become: yes + become: true tasks: - name: Fix incorrect permissions on /etc in Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 # breaks munge - file: + ansible.builtin.file: path: /etc state: directory owner: root group: root mode: u=rwx,go=rx # has g=rwx - name: Prevent ssh hanging if shared home is unavailable - lineinfile: + 
ansible.builtin.lineinfile: path: /etc/profile search_string: HOSTNAME=$(/usr/bin/hostnamectl --transient 2>/dev/null) || \ state: absent - name: Add system user groups - ansible.builtin.group: "{{ item.group }}" + ansible.builtin.group: "{{ item.group }}" # noqa: args[module] loop: "{{ appliances_local_users }}" when: - item.enable | default(true) | bool - "'group' in item" - become_method: "sudo" + become_method: ansible.builtin.sudo # Need to change working directory otherwise we try to switch back to non-existent directory. - become_flags: '-i' + become_flags: "-i" - name: Add system users - ansible.builtin.user: "{{ item.user }}" + ansible.builtin.user: "{{ item.user }}" # noqa: args[module] loop: "{{ appliances_local_users }}" when: item.enable | default(true) | bool - become_method: "sudo" + become_method: ansible.builtin.sudo # Need to change working directory otherwise we try to switch back to non-existent directory. - become_flags: '-i' + become_flags: "-i" - name: Reset ssh connection to allow user changes to affect ansible_user - meta: reset_connection - become: no + ansible.builtin.meta: reset_connection + become: false - hosts: systemd - become: yes + become: true gather_facts: false tags: systemd tasks: - name: Make systemd unit modifications - import_role: + ansible.builtin.import_role: name: systemd - hosts: selinux gather_facts: false - become: yes + become: true tags: - selinux tasks: @@ -125,37 +124,37 @@ - hosts: sshd tags: sshd - gather_facts: no - become: yes + gather_facts: false + become: true tasks: - name: Configure sshd - import_role: + ansible.builtin.import_role: name: sshd - hosts: dnf_repos - become: yes + become: true tags: dnf_repos tasks: - - name: Check that creds won't be leaked to users - ansible.builtin.assert: - that: dnf_repos_password is undefined - fail_msg: Passwords should not be templated into repofiles during configure, unset 'dnf_repos_password' - when: - - appliances_mode == 'configure' - - not (dnf_repos_allow_insecure_creds | default(false)) # useful for development + - name: Check that creds won't be leaked to users + ansible.builtin.assert: + that: dnf_repos_password is undefined + fail_msg: Passwords should not be templated into repofiles during configure, unset 'dnf_repos_password' + when: + - appliances_mode == 'configure' + - not (dnf_repos_allow_insecure_creds | default(false)) # useful for development - hosts: cacerts tags: cacerts gather_facts: false tasks: - name: Install custom cacerts - import_role: + ansible.builtin.import_role: name: cacerts - hosts: squid tags: squid - gather_facts: yes - become: yes + gather_facts: true + become: true tasks: # - Installing squid requires working dnf repos # - Configuring dnf_repos itself requires working dnf repos to install epel @@ -166,27 +165,27 @@ tasks_from: set_repos.yml when: "'dnf_repos' in group_names" - name: Configure squid proxy - import_role: + ansible.builtin.import_role: name: squid - hosts: dnf_repos tags: dnf_repos - gather_facts: yes - become: yes + gather_facts: true + become: true tasks: - - name: Replace system repos with pulp repos - ansible.builtin.include_role: - name: dnf_repos - tasks_from: set_repos.yml + - name: Replace system repos with pulp repos + ansible.builtin.include_role: + name: dnf_repos + tasks_from: set_repos.yml # --- tasks after here require general access to package repos --- - hosts: tuned tags: tuned - gather_facts: yes - become: yes + gather_facts: true + become: true tasks: - name: Install and configure tuneD - include_role: + 
ansible.builtin.include_role: name: tuned tasks_from: "{{ 'configure.yml' if appliances_mode == 'configure' else 'main.yml' }}" @@ -195,39 +194,39 @@ tags: - freeipa - freeipa_server - gather_facts: yes - become: yes + gather_facts: true + become: true tasks: - name: Install FreeIPA server - import_role: + ansible.builtin.import_role: name: freeipa tasks_from: server.yml - hosts: cluster gather_facts: false - become: yes + become: true tags: cockpit tasks: - - name: Remove RHEL cockpit - command: dnf -y remove cockpit-ws # N.B. using ansible dnf module is very slow + - name: Remove RHEL cockpit # noqa: no-changed-when + ansible.builtin.command: dnf -y remove cockpit-ws register: dnf_remove_output - ignore_errors: true # Avoid failing if a lock or other error happens + ignore_errors: true # Avoid failing if a lock or other error happens - hosts: firewalld gather_facts: false - become: yes + become: true tags: firewalld tasks: - - include_role: + - ansible.builtin.include_role: name: firewalld tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'main.yml' }}" - hosts: fail2ban gather_facts: false - become: yes + become: true tags: fail2ban tasks: - - include_role: + - ansible.builtin.include_role: name: fail2ban tasks_from: "{{ 'configure.yml' if appliances_mode == 'configure' else 'main.yml' }}" @@ -236,91 +235,92 @@ hosts: podman tags: podman tasks: - - include_role: + - ansible.builtin.include_role: name: podman tasks_from: "{{ 'configure.yml' if appliances_mode == 'configure' else 'main.yml' }}" - hosts: update gather_facts: false - become: yes + become: true tags: - update tasks: - - block: - - name: Update selected packages - yum: - name: "{{ update_name }}" - state: "{{ update_state }}" - exclude: "{{ update_exclude }}" - disablerepo: "{{ update_disablerepo }}" - async: "{{ 30 * 60 }}" # wait for up to 30 minutes - poll: 15 # check every 15 seconds - register: updates - - name: Ensure update log directory on localhost exists - file: - path: "{{ update_log_path | dirname }}" - state: directory - become: false - delegate_to: localhost - run_once: true - - name: Log updated packages - copy: - content: "{{ updates.results | join('\n') }}" - dest: "{{ update_log_path }}" - delegate_to: localhost - become: no - - debug: - msg: "{{ updates.results | length }} changes to packages - see {{ update_log_path }} for details" - when: "update_enable | default('false') | bool" - + - when: "update_enable | default('false') | bool" + block: + - name: Update selected packages + ansible.builtin.dnf: + name: "{{ update_name }}" + state: "{{ update_state }}" + exclude: "{{ update_exclude }}" + disablerepo: "{{ update_disablerepo }}" + async: "{{ 30 * 60 }}" # wait for up to 30 minutes + poll: 15 # check every 15 seconds + register: updates + - name: Ensure update log directory on localhost exists + ansible.builtin.file: + path: "{{ update_log_path | dirname }}" + state: directory + mode: "0755" + become: false + delegate_to: localhost + run_once: true # noqa: run-once[task] + - name: Log updated packages + ansible.builtin.copy: + content: "{{ updates.results | join('\n') }}" + dest: "{{ update_log_path }}" + mode: "0644" + delegate_to: localhost + become: false + - ansible.builtin.debug: + msg: "{{ updates.results | length }} changes to packages - see {{ update_log_path }} for details" - hosts: - selinux - update gather_facts: false - become: yes + become: true tags: - reboot - selinux - update tasks: - name: Check for pending reboot from package updates - command: + 
ansible.builtin.command: cmd: dnf needs-restarting -r register: update_reboot_required failed_when: "update_reboot_required.rc not in [0, 1]" changed_when: false - name: Reboot to cover SELinux state change or package upgrades - reboot: + ansible.builtin.reboot: post_reboot_delay: 30 when: (sestatus['reboot_required'] | default(false)) or (update_reboot_required.rc == 1) - name: Wait for hosts to be reachable - wait_for_connection: + ansible.builtin.wait_for_connection: sleep: 15 - name: Clear facts - meta: clear_facts + ansible.builtin.meta: clear_facts - name: Update facts - setup: + ansible.builtin.setup: - hosts: ofed - gather_facts: yes - become: yes + gather_facts: true + become: true tags: ofed tasks: - - include_role: + - ansible.builtin.include_role: name: ofed - hosts: ansible_init - gather_facts: yes - become: yes + gather_facts: true + become: true tags: linux_ansible_init tasks: - name: Install ansible-init - include_role: + ansible.builtin.include_role: name: azimuth_cloud.image_utils.linux_ansible_init when: "appliances_mode == 'build'" - hosts: k3s:&builder - become: yes + become: true tags: k3s tasks: - name: Install k3s diff --git a/ansible/ci/check_eessi.yml b/ansible/ci/check_eessi.yml index 280f865..a72bd91 100644 --- a/ansible/ci/check_eessi.yml +++ b/ansible/ci/check_eessi.yml @@ -5,20 +5,21 @@ eessi_test_rootdir: /home/eessi_test tasks: - name: Create test root directory - file: + ansible.builtin.file: path: "{{ eessi_test_rootdir }}" state: directory owner: "{{ ansible_user }}" group: "{{ ansible_user }}" + mode: "0755" become: true - - - name: Clone eessi-demo repo + + - name: Clone eessi-demo repo # noqa: latest[git] ansible.builtin.git: repo: "https://github.com/eessi/eessi-demo.git" dest: "{{ eessi_test_rootdir }}/eessi-demo" - name: Create batch script - copy: + ansible.builtin.copy: dest: "{{ eessi_test_rootdir }}/eessi-demo/TensorFlow/tensorflow.sh" content: | #!/usr/bin/env bash @@ -26,25 +27,26 @@ #SBATCH --error=%x.out source /cvmfs/pilot.eessi-hpc.org/latest/init/bash srun ./run.sh + mode: "0644" - - name: Run test job - ansible.builtin.shell: + - name: Run test job # noqa: no-changed-when + ansible.builtin.command: cmd: sbatch --wait tensorflow.sh chdir: "{{ eessi_test_rootdir }}/eessi-demo/TensorFlow" register: job_output - name: Retrieve job output - slurp: + ansible.builtin.slurp: src: "{{ eessi_test_rootdir }}/eessi-demo/TensorFlow/tensorflow.sh.out" register: _tensorflow_out no_log: true # as its base64 encoded so useless - name: Show job output - debug: + ansible.builtin.debug: msg: "{{ _tensorflow_out.content | b64decode }}" - name: Fail if job output contains error - fail: + ansible.builtin.fail: # Note: Job prints live progress bar to terminal, so use regex filter to remove this from stdout - msg: "Test job using EESSI modules failed. Job output was: {{ job_output.stdout | regex_replace('\b', '') }}" + msg: "Test job using EESSI modules failed. Job output was: {{ job_output.stdout | regex_replace('\b', '') }}" when: '"Epoch 5/5" not in _tensorflow_out.content | b64decode' diff --git a/ansible/ci/check_grafana.yml b/ansible/ci/check_grafana.yml index 36fb78b..0764b65 100644 --- a/ansible/ci/check_grafana.yml +++ b/ansible/ci/check_grafana.yml @@ -1,15 +1,16 @@ +--- # Checks Slurm jobs from hpctests are shown in Grafana. # Can't actually check the dashboard programatically so this queries the datasource used by the dashboard instead. 
- hosts: control # so proxying etc is irrelevant - gather_facts: no - become: no + gather_facts: false + become: false tasks: - name: Wait for slurm-stats file to exist (run by cron) ansible.builtin.wait_for: path: /var/log/slurm-stats/finished_jobs.json timeout: 315 # slurm stats cron job runs every 5 mins - + - name: Query grafana for expected hpctests jobs grafana_elasticsearch_query: grafana_url: http://{{ grafana_api_address }}:{{ grafana_port }} @@ -23,4 +24,5 @@ delay: 5 vars: _found_jobs: "{{ _slurm_stats_jobs.docs | map(attribute='JobName', default='(json error in slurmstats data)') }}" - _expected_jobs: ['pingpong.sh'] + _expected_jobs: + - "pingpong.sh" diff --git a/ansible/ci/check_sacct_hpctests.yml b/ansible/ci/check_sacct_hpctests.yml index 1ebbf21..3628609 100644 --- a/ansible/ci/check_sacct_hpctests.yml +++ b/ansible/ci/check_sacct_hpctests.yml @@ -1,3 +1,4 @@ +--- - hosts: control gather_facts: false become: true @@ -7,13 +8,13 @@ 1,pingpong.sh,COMPLETED tasks: - name: Get info for ended jobs - shell: + ansible.builtin.command: cmd: sacct --format=jobid,jobname,state --allocations --parsable2 --delimiter=, --starttime=now-1days --endtime=now # by default start/end time is midnight/now which is not robust changed_when: false register: sacct - name: Check info for ended jobs - assert: + ansible.builtin.assert: that: sacct_stdout_expected in sacct.stdout fail_msg: | Expected: diff --git a/ansible/ci/check_slurm.yml b/ansible/ci/check_slurm.yml index ff527da..45cda6c 100644 --- a/ansible/ci/check_slurm.yml +++ b/ansible/ci/check_slurm.yml @@ -1,9 +1,10 @@ +--- - hosts: login:!builder # won't have a slurm control daemon when in build - become: no + become: false gather_facts: false tasks: - name: Run sinfo - shell: 'sinfo --noheader --format="%N %P %a %l %D %t" | sort' # using --format ensures we control whitespace: Partition,partition_state,max_jobtime,num_nodes,node_state,node_name + ansible.builtin.shell: 'sinfo --noheader --format="%N %P %a %l %D %t" | sort' # noqa: risky-shell-pipe register: sinfo changed_when: false until: sinfo.stdout_lines == expected_sinfo diff --git a/ansible/ci/delete_images.yml b/ansible/ci/delete_images.yml index 78b5742..992fb8e 100644 --- a/ansible/ci/delete_images.yml +++ b/ansible/ci/delete_images.yml @@ -1,12 +1,12 @@ +--- - hosts: login:!builder - become: no - gather_facts: no + become: false + gather_facts: false tasks: - - import_tasks: get_image_ids.yml - - - name: Delete images - shell: + - ansible.builtin.import_tasks: get_image_ids.yml + - name: Delete images # noqa: no-changed-when + ansible.builtin.shell: cmd: | openstack image delete {{ item.artifact_id }} delegate_to: localhost - loop: "{{ manifest['builds'] }}" + loop: "{{ manifest['builds'] }}" # noqa: no-changed-when diff --git a/ansible/ci/get_image_ids.yml b/ansible/ci/get_image_ids.yml index 4a53b15..ede3a72 100644 --- a/ansible/ci/get_image_ids.yml +++ b/ansible/ci/get_image_ids.yml @@ -1,12 +1,13 @@ +--- - name: Read packer build manifest - set_fact: + ansible.builtin.set_fact: manifest: "{{ lookup('file', manifest_path) | from_json }}" vars: manifest_path: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}/packer/packer-manifest.json" delegate_to: localhost - name: Get latest image builds - set_fact: + ansible.builtin.set_fact: login_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'login'}) | last }}" compute_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'compute'}) | last }}" control_build: "{{ manifest['builds'] | 
selectattr('custom_data', 'eq', {'source': 'control'}) | last }}" diff --git a/ansible/ci/library/grafana_elasticsearch_query.py b/ansible/ci/library/grafana_elasticsearch_query.py index 3809565..7a1d603 100644 --- a/ansible/ci/library/grafana_elasticsearch_query.py +++ b/ansible/ci/library/grafana_elasticsearch_query.py @@ -1,10 +1,17 @@ #!/usr/bin/python +# pylint: disable=missing-module-docstring # Copyright: (c) 2022 Steve Brasier steve@stackhpc.com -from __future__ import (absolute_import, division, print_function) -__metaclass__ = type +from __future__ import absolute_import, division, print_function -DOCUMENTATION = r''' +import json + +import requests # pylint: disable=import-error +from ansible.module_utils.basic import AnsibleModule # pylint: disable=import-error + +__metaclass__ = type # pylint: disable=invalid-name + +DOCUMENTATION = r""" --- module: grafana_elasticsearch_query @@ -16,9 +23,9 @@ author: - Steve Brasier -''' +""" -EXAMPLES = r''' +EXAMPLES = r""" - name: Get elasticsearch hits grafana_elasticsearch_query: grafana_url: http://{{ grafana_api_address }}:{{ grafana_port }} @@ -26,63 +33,83 @@ grafana_password: "{{ vault_grafana_admin_password }}" datasource: slurmstats index_pattern: 'filebeat-*' -''' +""" -RETURN = r''' +RETURN = r""" # These are examples of possible return values, and in general should use other names for return values. docs: description: List of dicts with the original json in each document. returned: always type: list -''' - -from ansible.module_utils.basic import AnsibleModule -import requests -import json - -def run_module(): - module_args = dict( - grafana_url=dict(type="str", required=True), - grafana_username=dict(type="str", required=True), - grafana_password=dict(type="str", required=True), - datasource=dict(type="str", required=True), - index_pattern=dict(type="str", required=True), - ) +""" + + +def run_module(): # pylint: disable=missing-function-docstring + module_args = { + "grafana_url": { + "type": "str", + "required": True, + }, + "grafana_username": { + "type": "str", + "required": True, + }, + "grafana_password": { + "type": "str", + "required": True, + }, + "datasource": { + "type": "str", + "required": True, + }, + "index_pattern": { + "type": "str", + "required": True, + }, + } - result = dict( - changed=False, - jobs=[] - ) + result = { + "changed": False, + "jobs": [], + } module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) - auth=(module.params['grafana_username'], module.params['grafana_password']) - + auth = (module.params["grafana_username"], module.params["grafana_password"]) + # list datasources: - datasources_api_url = module.params["grafana_url"] + '/api/datasources' + datasources_api_url = module.params["grafana_url"] + "/api/datasources" r = requests.get(datasources_api_url, auth=auth) datasources = json.loads(r.text) # select required datasource: - ds = [s for s in datasources if s['name'] == module.params["datasource"]][0] + ds = [s for s in datasources if s["name"] == module.params["datasource"]][0] # get documents: - datasource_proxy_url = module.params["grafana_url"] + '/api/datasources/proxy/' + str(ds['id']) + '/' + module.params['index_pattern'] + '/_search' + datasource_proxy_url = ( + module.params["grafana_url"] + + "/api/datasources/proxy/" + + str(ds["id"]) + + "/" + + module.params["index_pattern"] + + "/_search" + ) r = requests.get(datasource_proxy_url, auth=auth) search = json.loads(r.text) - # see 
https://www.elastic.co/guide/en/elasticsearch/reference/current/search-search.html#search-api-response-body: - docs = [h['_source']['json'] for h in search['hits']['hits']] + # see + # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-search.html#search-api-response-body: + docs = [h["_source"]["json"] for h in search["hits"]["hits"]] result = { - 'docs': docs, + "docs": docs, } module.exit_json(**result) -def main(): +def main(): # pylint: disable=missing-function-docstring run_module() -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/ansible/ci/output_vars.yml b/ansible/ci/output_vars.yml index 0e2bc4c..2963a58 100644 --- a/ansible/ci/output_vars.yml +++ b/ansible/ci/output_vars.yml @@ -1,7 +1,8 @@ +--- # Output specific hostvars to a file in a form which can be sourced by bash # NB: obviously the keys and values for the hostvars need to be suitable bash variables -- hosts: "{{ output_vars_hosts }}" - gather_facts: no +- hosts: "{{ output_vars_hosts }}" # noqa: syntax-check[specific] + gather_facts: false tasks: - copy: dest: "{{ output_vars_path }}" diff --git a/ansible/ci/retrieve_inventory.yml b/ansible/ci/retrieve_inventory.yml index d5f61bb..6e395ef 100644 --- a/ansible/ci/retrieve_inventory.yml +++ b/ansible/ci/retrieve_inventory.yml @@ -1,27 +1,28 @@ +--- # Retrieve inventory from a deployed CI arcus environment by reversing arcus/inventory/hooks/pre.yml # Usage example: # ansible-playbook ansible/ci/retrieve_inventory.yml -e cluster_prefix=ci4005969475 # - hosts: localhost - become: no - gather_facts: no + become: false + gather_facts: false vars: cluster_prefix: "{{ undef(hint='cluster_prefix must be defined') }}" # e.g. ci4005969475 ci_vars_file: "{{ appliances_environment_root + '/tofu/' + lookup('env', 'CI_CLOUD') }}.tfvars" cluster_network: "{{ lookup('ansible.builtin.ini', 'cluster_net', file=ci_vars_file, type='properties') | trim('\"') }}" tasks: - name: Get control host IP - set_fact: + ansible.builtin.set_fact: control_ip: "{{ (lookup('pipe', 'openstack server show -f json ' + cluster_prefix + '-control') | from_json)['addresses'][cluster_network][0] }}" - name: Add host into in-memory inventory - add_host: + ansible.builtin.add_host: name: cluster_control groups: control ansible_host: "{{ control_ip }}" - hosts: control - become: yes - gather_facts: no + become: true + gather_facts: false tasks: - ansible.builtin.fetch: src: "/etc/ci-config/{{ item | basename }}" diff --git a/ansible/ci/update_timestamps.yml b/ansible/ci/update_timestamps.yml index 8db4757..c6eb6f0 100644 --- a/ansible/ci/update_timestamps.yml +++ b/ansible/ci/update_timestamps.yml @@ -1,6 +1,7 @@ +--- - hosts: localhost tasks: - - name: Get latest timestamps from sources + - name: Get latest timestamps from sources # noqa: syntax-check[unknown-module] # ansible/library/latest_timestamps.py latest_timestamps: repos_dict: "{{ dnf_repos_default }}" content_url: "https://ark.stackhpc.com/pulp/content" diff --git a/ansible/cleanup.yml b/ansible/cleanup.yml index 3db6eb1..6b495d7 100644 --- a/ansible/cleanup.yml +++ b/ansible/cleanup.yml @@ -1,16 +1,17 @@ +--- # Clean up a Packer build VM -- meta: flush_handlers +- ansible.builtin.meta: flush_handlers -- name: Remove dnf caches - command: dnf clean all +- name: Remove dnf caches # noqa: no-changed-when + ansible.builtin.command: dnf clean all # If image build happens on a Neutron subnet with property dns_namservers defined, then cloud-init # disables NetworkManager's control of /etc/resolv.conf and appends 
nameservers itself. # We don't want network configuration during instance boot to depend on the configuration # of the network the builder was on, so we reset these aspects. - name: Delete /etc/resolv.conf - file: + ansible.builtin.file: path: /etc/resolv.conf state: absent when: "'resolv_conf' not in group_names" # if its been overriden, deleting it is the wrong thing to do @@ -19,25 +20,25 @@ # NB: This *doesn't* delete the 90-dns-none.conf file created by the resolv_conf role # as if nameservers are explicitly being set by that role we don't want to allow NM # to override it again. - file: + ansible.builtin.file: path: /etc/NetworkManager/conf.d/99-cloud-init.conf state: absent - name: Get remote environment for ansible_user - setup: + ansible.builtin.setup: gather_subset: env - become: no + become: false - name: Delete any injected ssh config for ansible_user - file: + ansible.builtin.file: path: "{{ ansible_env.HOME }}/.ssh/" state: absent -- name: Run cloud-init cleanup - command: cloud-init clean --logs --seed +- name: Run cloud-init cleanup # noqa: no-changed-when + ansible.builtin.command: cloud-init clean --logs --seed -- name: Cleanup /tmp - command : rm -rf /tmp/* +- name: Cleanup /tmp # noqa: no-changed-when + ansible.builtin.command: rm -rf /tmp/* - name: Delete files triggering vulnerability scans ansible.builtin.file: @@ -54,10 +55,10 @@ - /etc/ansible-init/playbooks/roles/mrlesmithjr.chrony/requirements.txt - name: Get package facts - package_facts: + ansible.builtin.package_facts: - name: Ensure image summary directory exists - file: + ansible.builtin.file: path: /var/lib/image/ state: directory owner: root @@ -65,9 +66,10 @@ mode: u=rwX,go=rX - name: Write image summary - copy: + ansible.builtin.copy: content: "{{ image_info | to_nice_json }}" dest: /var/lib/image/image.json + mode: "0644" vars: image_info: branch: "{{ lookup('pipe', 'git rev-parse --abbrev-ref HEAD') }}" @@ -79,5 +81,5 @@ cuda: "{{ ansible_facts.packages['cuda-toolkit'].0.version | default('-') }}" slurm-ohpc: "{{ ansible_facts.packages['slurm-ohpc'].0.version | default('-') }}" -- name: Show image summary - command: cat /var/lib/image/image.json +- name: Show image summary # noqa: no-changed-when + ansible.builtin.command: cat /var/lib/image/image.json diff --git a/ansible/extras.yml b/ansible/extras.yml index 08892e4..02b0d40 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -1,5 +1,6 @@ +--- - hosts: k3s_server:!builder - become: yes + become: true tags: k3s tasks: - name: Start k3s server @@ -10,7 +11,7 @@ # technically should be part of bootstrap.yml but hangs waiting on failed mounts # if runs before filesystems.yml after the control node has been reimaged - hosts: k3s_agent:!builder - become: yes + become: true tags: k3s tasks: - name: Start k3s agents @@ -19,13 +20,13 @@ tasks_from: agent-runtime.yml - hosts: basic_users:!builder - become: yes + become: true tags: - basic_users - users - gather_facts: yes + gather_facts: true tasks: - - import_role: + - ansible.builtin.import_role: name: basic_users - name: Setup EESSI @@ -35,57 +36,57 @@ gather_facts: false tasks: - name: Install / configure EESSI - include_role: + ansible.builtin.include_role: name: eessi tasks_from: "{{ 'configure.yml' if appliances_mode == 'configure' else 'main.yml' }}" - name: Setup CUDA hosts: cuda - become: yes - gather_facts: yes + become: true + gather_facts: true tags: cuda tasks: - - include_role: + - ansible.builtin.include_role: name: cuda tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' 
else 'install.yml' }}" - name: Setup vGPU hosts: vgpu - become: yes - gather_facts: yes + become: true + gather_facts: true tags: vgpu tasks: - - include_role: + - ansible.builtin.include_role: name: stackhpc.linux.vgpu tasks_from: "{{ 'configure.yml' if appliances_mode == 'configure' else 'install.yml' }}" handlers: - - name: reboot - fail: + - name: reboot # noqa: name[casing] + ansible.builtin.fail: msg: Reboot handler for stackhpc.linux.vgpu role fired unexpectedly. This was supposed to be unreachable. - name: Persist hostkeys across rebuilds # Must be after filesystems.yml (for storage) # and before portal.yml (where OOD login node hostkeys are scanned) hosts: persist_hostkeys:!builder - become: yes - gather_facts: no + become: true + gather_facts: false tasks: - - import_role: + - ansible.builtin.import_role: name: persist_hostkeys - name: Install k9s - become: yes + become: true hosts: k9s tags: k9s tasks: - - import_role: - name: k9s + - ansible.builtin.import_role: + name: k9s - hosts: extra_packages - become: yes + become: true tags: - - extra_packages + - extra_packages tasks: - - name: Install additional packages - dnf: - name: "{{ appliances_extra_packages }}" + - name: Install additional packages + ansible.builtin.dnf: + name: "{{ appliances_extra_packages }}" diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 46a99bc..8e8e58a 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -1,13 +1,14 @@ +--- # Builder version of site.yml just installing binaries - hosts: builder - become: no - gather_facts: no + become: false + gather_facts: false tasks: - name: Report hostname (= final image name) - command: hostname + ansible.builtin.command: hostname # noqa: no-changed-when - name: Report inventory groups - debug: + ansible.builtin.debug: var: group_names - name: Run pre.yml hook @@ -20,21 +21,21 @@ - name: Sync pulp repos with upstream hosts: pulp_site tasks: - - ansible.builtin.include_role: - name: pulp_site - tasks_from: sync.yml - apply: - delegate_to: localhost - when: appliances_mode != 'configure' + - ansible.builtin.include_role: + name: pulp_site + tasks_from: sync.yml + apply: + delegate_to: localhost + when: appliances_mode != 'configure' - import_playbook: bootstrap.yml - hosts: doca - become: yes - gather_facts: yes + become: true + gather_facts: true tasks: - name: Install NVIDIA DOCA - import_role: + ansible.builtin.import_role: name: doca - name: Run post-bootstrap.yml hook @@ -45,33 +46,33 @@ when: hook_path | exists - hosts: builder - become: yes - gather_facts: yes + become: true + gather_facts: true tasks: # - import_playbook: iam.yml - name: Install FreeIPA client - import_role: + ansible.builtin.import_role: name: freeipa tasks_from: client-install.yml when: "'freeipa_client' in group_names" - name: Install sssd - import_role: + ansible.builtin.import_role: name: sssd tasks_from: install.yml when: "'sssd' in group_names" # - import_playbook: filesystems.yml: - name: Install nfs packages - dnf: + ansible.builtin.dnf: name: nfs-utils when: "'nfs' in group_names" - name: Install Manila client packages - include_role: + ansible.builtin.include_role: name: stackhpc.os-manila-mount tasks_from: install.yml when: "'manila' in group_names" - name: Install Lustre packages - include_role: + ansible.builtin.include_role: name: lustre tasks_from: install.yml when: "'lustre' in group_names" @@ -82,46 +83,46 @@ - name: Install compute_init playbook hosts: compute_init tags: compute_init # tagged to allow running on cluster instances for dev - become: 
yes + become: true tasks: - - include_role: + - ansible.builtin.include_role: name: compute_init tasks_from: install.yml - name: Install gateway playbook hosts: gateway tags: gateway - become: yes - gather_facts: no + become: true + gather_facts: false tasks: - - include_role: + - ansible.builtin.include_role: name: gateway - hosts: builder - become: yes - gather_facts: yes + become: true + gather_facts: true tasks: # - import_playbook: slurm.yml: - name: Setup DB - include_role: + ansible.builtin.include_role: name: mysql tasks_from: install.yml when: "'mysql' in group_names" - name: Install rebuild - include_role: + ansible.builtin.include_role: name: rebuild tasks_from: install.yml - name: Install OpenHPC - import_role: + ansible.builtin.import_role: name: stackhpc.openhpc tasks_from: install.yml when: "'openhpc' in group_names" # - import_playbook: portal.yml - name: Open Ondemand server (packages) - include_role: + ansible.builtin.include_role: name: osc.ood tasks_from: install-package.yml vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml" @@ -129,90 +130,90 @@ # # FUTURE: install-apps.yml - this is git clones - name: Open Ondemand server (apps) - include_role: + ansible.builtin.include_role: name: osc.ood tasks_from: install-apps.yml vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml" when: "'openondemand' in group_names" - name: Open Ondemand remote desktop - import_role: + ansible.builtin.import_role: name: openondemand tasks_from: vnc_compute.yml when: "'openondemand_desktop' in group_names" - name: Open Ondemand jupyter node - import_role: + ansible.builtin.import_role: name: openondemand tasks_from: jupyter_compute.yml when: "'openondemand_jupyter' in group_names" - name: Install Apache PAM module # Extracted from start of roles/openondemand/tasks/pam_auth.yml to ensure only installed during build - yum: + ansible.builtin.dnf: name: mod_authnz_pam # - import_playbook: monitoring.yml: - - import_role: + - ansible.builtin.import_role: name: opensearch tasks_from: install.yml when: "'opensearch' in group_names" - - import_role: + - ansible.builtin.import_role: name: slurm_stats tasks_from: install.yml when: "'slurm_stats' in group_names" - - import_role: + - ansible.builtin.import_role: name: filebeat tasks_from: install.yml when: "'filebeat' in group_names" - - import_role: - # can't only run cloudalchemy.node_exporter/tasks/install.yml as needs vars from preflight.yml and triggers service start - # however starting node exporter is ok + - ansible.builtin.import_role: + # can't only run cloudalchemy.node_exporter/tasks/install.yml as needs vars from preflight.yml and triggers service start + # however starting node exporter is ok name: cloudalchemy.node_exporter when: "'node_exporter' in group_names" - - name: openondemand exporter - dnf: + - name: Openondemand exporter + ansible.builtin.dnf: name: ondemand_exporter when: "'openondemand' in group_names" - - name: slurm exporter - include_role: + - name: Slurm exporter + ansible.builtin.include_role: name: slurm_exporter tasks_from: install.yml when: "'slurm_exporter' in group_names" - name: Install alertmanager - include_role: + ansible.builtin.include_role: name: alertmanager tasks_from: install.yml when: "'alertmanager' in group_names" - name: Download HPL source - include_role: + ansible.builtin.include_role: name: hpctests tasks_from: source-hpl.yml - hosts: prometheus - become: yes - gather_facts: yes + become: true + gather_facts: true tasks: - - import_role: + - ansible.builtin.import_role: name: 
cloudalchemy.prometheus tasks_from: preflight.yml # can't run cloudalchemy.prometheus/tasks/install.yml as it triggers a unit start # so below is a partial extraction of this: - - name: create prometheus system group - group: + - name: Create prometheus system group + ansible.builtin.group: name: prometheus system: true state: present - - name: create prometheus system user - user: + - name: Create prometheus system user + ansible.builtin.user: name: prometheus system: true shell: "/usr/sbin/nologin" @@ -220,31 +221,33 @@ createhome: false home: "{{ prometheus_db_dir }}" - - name: download prometheus binary to local folder + - name: Download prometheus binary to local folder become: false - get_url: + ansible.builtin.get_url: + # yamllint disable-line rule:line-length url: "https://github.com/prometheus/prometheus/releases/download/v{{ prometheus_version }}/prometheus-{{ prometheus_version }}.linux-{{ go_arch }}.tar.gz" dest: "/tmp/prometheus-{{ prometheus_version }}.linux-{{ go_arch }}.tar.gz" checksum: "sha256:{{ __prometheus_checksum }}" + mode: "0644" register: _download_archive until: _download_archive is succeeded retries: 5 delay: 2 - - name: unpack prometheus binaries + - name: Unpack prometheus binaries become: false - unarchive: - remote_src: yes + ansible.builtin.unarchive: + remote_src: true src: "/tmp/prometheus-{{ prometheus_version }}.linux-{{ go_arch }}.tar.gz" dest: "/tmp" creates: "/tmp/prometheus-{{ prometheus_version }}.linux-{{ go_arch }}/prometheus" - - name: propagate official prometheus and promtool binaries - copy: - remote_src: yes + - name: Propagate official prometheus and promtool binaries + ansible.builtin.copy: + remote_src: true src: "/tmp/prometheus-{{ prometheus_version }}.linux-{{ go_arch }}/{{ item }}" dest: "{{ _prometheus_binary_install_dir }}/{{ item }}" - mode: 0755 + mode: "0755" owner: root group: root with_items: @@ -252,26 +255,26 @@ - promtool - hosts: grafana - become: yes - gather_facts: yes + become: true + gather_facts: true tasks: - name: Include distribution variables for cloudalchemy.grafana - include_vars: "{{ appliances_repository_root }}/ansible/roles/cloudalchemy.grafana/vars/redhat.yml" - - import_role: + ansible.builtin.include_vars: "{{ appliances_repository_root }}/ansible/roles/cloudalchemy.grafana/vars/redhat.yml" + - ansible.builtin.import_role: name: cloudalchemy.grafana tasks_from: install.yml - - import_role: + - ansible.builtin.import_role: name: cloudalchemy.grafana tasks_from: plugins.yml - - include_role: # done in same play so it can use handlers from cloudalchemy.grafana + - ansible.builtin.include_role: # done in same play so it can use handlers from cloudalchemy.grafana name: grafana-dashboards - name: Add support for NVIDIA GPU auto detection to Slurm hosts: slurm_recompile - become: yes + become: true tasks: - name: Recompile slurm - import_role: + ansible.builtin.import_role: name: slurm_recompile vars: slurm_recompile_with_nvml: "{{ groups.cuda | length > 0 }}" @@ -286,12 +289,11 @@ - import_playbook: final.yml - hosts: builder - become: yes - gather_facts: yes + become: true + gather_facts: true tags: finalise tasks: - name: Cleanup image - import_tasks: cleanup.yml - + ansible.builtin.import_tasks: cleanup.yml - name: Shutdown Packer VM community.general.shutdown: diff --git a/ansible/filesystems.yml b/ansible/filesystems.yml index 41a685d..804f7d2 100644 --- a/ansible/filesystems.yml +++ b/ansible/filesystems.yml @@ -1,11 +1,10 @@ --- - - name: Setup block devices hosts: block_devices - become: yes + become: 
true tags: block_devices tasks: - - include_role: + - ansible.builtin.include_role: name: block_devices - name: Setup NFS @@ -14,7 +13,7 @@ tags: - nfs tasks: - - include_role: + - ansible.builtin.include_role: name: stackhpc.nfs - name: Setup Manila share mounts @@ -22,7 +21,7 @@ become: true tags: manila tasks: - - include_role: + - ansible.builtin.include_role: name: stackhpc.os-manila-mount tasks_from: "{{ item }}" loop: "{{ ['lookup.yml', 'mount.yml'] if appliances_mode == 'configure' else ['main.yml'] }}" @@ -32,7 +31,7 @@ become: true tags: lustre tasks: - - include_role: + - ansible.builtin.include_role: name: lustre # NB install is ONLY run in builder tasks_from: configure.yml diff --git a/ansible/filter_plugins/utils.py b/ansible/filter_plugins/utils.py index 42b7107..33ad391 100644 --- a/ansible/filter_plugins/utils.py +++ b/ansible/filter_plugins/utils.py @@ -1,79 +1,84 @@ #!/usr/bin/python +# pylint: disable=missing-module-docstring # Copyright: (c) 2020, StackHPC # Apache 2 License -from ansible.errors import AnsibleError, AnsibleFilterError -from ansible.utils.display import Display -from collections import defaultdict -import jinja2 -from ansible.module_utils.six import string_types import os.path import re +from collections import defaultdict + +from ansible.utils.display import Display # pylint: disable=import-error + def prometheus_node_exporter_targets(hosts, hostvars, env_key, group): - """ Return a mapping in cloudalchemy.nodeexporter prometheus_targets - format. + """Return a mapping in cloudalchemy.nodeexporter prometheus_targets + format. - hosts: list of inventory_hostnames - hostvars: Ansible hostvars variable - env_key: key to lookup in each host's hostvars to add as label 'env' (default: 'ungrouped') - group: string to add as label 'group' + hosts: list of inventory_hostnames + hostvars: Ansible hostvars variable + env_key: key to lookup in each host's hostvars to add as label 'env' (default: 'ungrouped') + group: string to add as label 'group' """ result = [] per_env = defaultdict(list) for host in hosts: - host_env = hostvars[host].get(env_key, 'ungrouped') + host_env = hostvars[host].get(env_key, "ungrouped") per_env[host_env].append(host) - for env, hosts in per_env.items(): + for env, hosts in per_env.items(): # pylint: disable=redefined-argument-from-local target = { "targets": [f"{target}:9100" for target in hosts], - "labels": { - 'env': env, - 'group': group - } + "labels": {"env": env, "group": group}, } result.append(target) return result -def readfile(fpath): + +def readfile(fpath): # pylint: disable=missing-function-docstring if not os.path.isfile(fpath): return "" - with open(fpath) as f: + with open(fpath) as f: # pylint: disable=unspecified-encoding return f.read() -def exists(fpath): + +def exists(fpath): # pylint: disable=missing-function-docstring return os.path.isfile(fpath) + def to_ood_regex(items): - """ Convert a list of strings possibly containing digits into a regex containing \d+ - - eg {{ [compute-001, compute-002, control] | to_regex }} -> '(compute-\d+)|(control)' + """Convert a list of strings possibly containing digits into a regex containing \\d+ + + eg {{ [compute-001, compute-002, control] | to_regex }} -> '(compute-\\d+)|(control)' """ - + # NB: for python3.12+ the \d in this function & docstring - # need to be raw strings. See https://docs.python.org/3/reference/lexical_analysis.html + # need to be raw strings. 
See + # https://docs.python.org/3/reference/lexical_analysis.html # There's a python bug which means re.sub() can't use '\d' in the replacement so # have to do replacement in two stages: - r = [re.sub(r"\d+", 'XBACKSLASHX', v) for v in items] - r = [v.replace('XBACKSLASHX', '\d+') for v in set(r)] - r = ['(%s)' % v for v in r] - return '|'.join(r) + r = [re.sub(r"\d+", "XBACKSLASHX", v) for v in items] + r = [v.replace("XBACKSLASHX", r"\d+") for v in set(r)] + r = [f"({v})" for v in r] + return "|".join(r) + +# pylint: disable=useless-object-inheritance class FilterModule(object): - ''' Ansible core jinja2 filters ''' + """Ansible core jinja2 filters""" - def warn(self, message, **kwargs): + # pylint: disable=missing-function-docstring + def warn(self, message, **kwargs): # pylint: disable=unused-argument Display().warning(message) return message + # pylint: disable=missing-function-docstring def filters(self): return { # jinja2 overrides - 'readfile': readfile, - 'prometheus_node_exporter_targets': prometheus_node_exporter_targets, - 'exists': exists, - 'warn': self.warn, - 'to_ood_regex': to_ood_regex, + "readfile": readfile, + "prometheus_node_exporter_targets": prometheus_node_exporter_targets, + "exists": exists, + "warn": self.warn, + "to_ood_regex": to_ood_regex, } diff --git a/ansible/final.yml b/ansible/final.yml index 3e715df..d984204 100644 --- a/ansible/final.yml +++ b/ansible/final.yml @@ -1,5 +1,5 @@ - hosts: dnf_repos - become: yes + become: true tags: dnf_repos tasks: - name: Disable pulp repos @@ -12,19 +12,19 @@ hosts: compute_init:!builder # NB: done last so other roles can prepare configuration etc tags: compute_init - become: yes + become: true tasks: - - include_role: + - ansible.builtin.include_role: name: compute_init tasks_from: export.yml - hosts: proxy gather_facts: false tags: proxy - become: yes + become: true tasks: - - include_role: + - ansible.builtin.include_role: name: proxy vars: proxy_state: absent - when: proxy_remove | default(false) | bool == true + when: proxy_remove | default(false) | bool diff --git a/ansible/iam.yml b/ansible/iam.yml index 8b3bf6b..d570a7a 100644 --- a/ansible/iam.yml +++ b/ansible/iam.yml @@ -1,13 +1,14 @@ +--- - hosts: freeipa_client tags: - freeipa - freeipa_server # as this is only relevant if using freeipa_server - freeipa_host - gather_facts: no - become: yes + gather_facts: false + become: true tasks: - name: Ensure FreeIPA client hosts are added to the FreeIPA server - import_role: + ansible.builtin.import_role: name: freeipa tasks_from: addhost.yml when: groups['freeipa_server'] | length > 0 @@ -16,16 +17,16 @@ tags: - freeipa - freeipa_client - gather_facts: yes - become: yes + gather_facts: true + become: true tasks: - name: Install FreeIPA client - include_role: + ansible.builtin.include_role: name: freeipa tasks_from: client-install.yml when: "appliances_mode != 'configure'" - name: Enrol FreeIPA client - import_role: + ansible.builtin.import_role: name: freeipa tasks_from: enrol.yml @@ -34,19 +35,19 @@ - freeipa - freeipa_server - users - gather_facts: yes - become: yes + gather_facts: true + become: true tasks: - name: Add FreeIPA users - import_role: + ansible.builtin.import_role: name: freeipa tasks_from: users.yml - hosts: sssd - become: yes - gather_facts: no + become: true + gather_facts: false tags: sssd tasks: - name: Configure sssd - import_role: + ansible.builtin.import_role: name: sssd diff --git a/ansible/library/latest_timestamps.py b/ansible/library/latest_timestamps.py index 0de3883..6ac4549 100644 --- 
a/ansible/library/latest_timestamps.py +++ b/ansible/library/latest_timestamps.py @@ -1,25 +1,32 @@ -__metaclass__ = type +# pylint: disable=missing-module-docstring +import requests # pylint: disable=import-error +from ansible.module_utils.basic import AnsibleModule # pylint: disable=import-error +from bs4 import BeautifulSoup # pylint: disable=import-error, wrong-import-order -DOCUMENTATION = r''' +__metaclass__ = type # pylint: disable=invalid-name + +DOCUMENTATION = r""" --- module: latest_timestamps short_description: Gets the latest set of snapshots from Pulp version_added: "1.0.0" -description: Gets the latest set of snapshots from given source URLs and returns dictionary to replace 'appliances_repo_timestamps' with +description: > + Gets the latest set of snapshots from given source URLs + and returns dictionary to replace 'appliances_repo_timestamps' with author: - William Tripp - Steve Brasier -''' +""" -EXAMPLES = r''' +EXAMPLES = r""" - name: Get latest timestamps latest_timestamps: repos_dict: "{{ appliances_repo_timestamp_sources }}" content_url: "https://ark.stackhpc.com/pulp/content" register: result -''' +""" -RETURN = r''' +RETURN = r""" latest_dict: description: Dictionary with updated timestamps type: dict @@ -28,48 +35,58 @@ description: List of repos that have updated timestamps type: str[] returned: always -''' +""" -from ansible.module_utils.basic import AnsibleModule -import requests -from bs4 import BeautifulSoup -def run_module(): - module_args = dict( - repos_dict=dict(type='dict', required=True), - content_url=dict(type='str', required=True) - ) +def run_module(): # pylint: disable=missing-function-docstring + module_args = { + "repos_dict": { + "type": "dict", + "required": True, + }, + "content_url": { + "type": "str", + "required": True, + }, + } - result = dict( - changed=False, - original_message='', - message='' - ) + result = { + "changed": False, + "original_message": "", + "message": "", + } - module = AnsibleModule( - argument_spec=module_args, - supports_check_mode=True - ) + module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) - timestamps = dict(module.params['repos_dict']) + timestamps = dict(module.params["repos_dict"]) for repo in timestamps: for version in timestamps[repo]: html_txt = requests.get( - url= module.params['content_url'] + '/' + timestamps[repo][version]['pulp_path'] - ).text - timestamp_link_list = BeautifulSoup(html_txt,features="html.parser").body.find('pre').find_all() # getting raw list of timestamps from html - timestamp_link_list = map(lambda x: x.string,timestamp_link_list) # stripping xml tags - latest_timestamp = list(timestamp_link_list)[-1][:-1] # last timestamp in list with trailing / removed - timestamps[repo][version]['pulp_timestamp'] = latest_timestamp - result['timestamps'] = dict(sorted(timestamps.items())) + url=module.params["content_url"] + + "/" + + timestamps[repo][version]["pulp_path"] + ).text + timestamp_link_list = ( + BeautifulSoup(html_txt, features="html.parser") + .body.find("pre") + .find_all() + ) # getting raw list of timestamps from html + timestamp_link_list = map( + lambda x: x.string, timestamp_link_list + ) # stripping xml tags + latest_timestamp = list(timestamp_link_list)[-1][ + :-1 + ] # last timestamp in list with trailing / removed + timestamps[repo][version]["pulp_timestamp"] = latest_timestamp + result["timestamps"] = dict(sorted(timestamps.items())) module.exit_json(**result) -def main(): +def main(): # pylint: disable=missing-function-docstring run_module() -if 
__name__ == '__main__': +if __name__ == "__main__": main() diff --git a/ansible/library/user_namespace_facts.py b/ansible/library/user_namespace_facts.py index 022f63f..a68834b 100644 --- a/ansible/library/user_namespace_facts.py +++ b/ansible/library/user_namespace_facts.py @@ -1,11 +1,19 @@ #!/usr/bin/python +# pylint: disable=missing-module-docstring # Copyright: (c) 2020, Will Szumski -# GNU General Public License v3.0+ (see COPYING or https://www.gnu.org/licenses/gpl-3.0.txt) -from __future__ import (absolute_import, division, print_function) -__metaclass__ = type +# GNU General Public License v3.0+ (see COPYING or +# https://www.gnu.org/licenses/gpl-3.0.txt) +from __future__ import absolute_import, division, print_function -DOCUMENTATION = r''' +import csv +import os + +from ansible.module_utils.basic import AnsibleModule # pylint: disable=import-error + +__metaclass__ = type # pylint: disable=invalid-name + +DOCUMENTATION = r""" --- module: user_namepace_facts @@ -17,14 +25,14 @@ author: - Will Szumski (@jovial) -''' +""" -EXAMPLES = r''' +EXAMPLES = r""" - name: Return ansible_facts user_namepace_facts: -''' +""" -RETURN = r''' +RETURN = r""" # These are examples of possible return values, and in general should use other names for return values. ansible_facts: description: Facts to add to ansible_facts. @@ -41,20 +49,17 @@ type: str returned: always, empty dict if /etc/subgid doesn't exist sample: { "foo": {"size": 123, "start": 100000 }} -''' +""" -from ansible.module_utils.basic import AnsibleModule -import csv -import os -def parse(path): +def parse(path): # pylint: disable=missing-function-docstring result = {} if not os.path.exists(path): return result - with open(path) as f: - reader = csv.reader(f, delimiter=':') + with open(path) as f: # pylint: disable=unspecified-encoding + reader = csv.reader(f, delimiter=":") for row in reader: user = row[0] entry = { @@ -65,50 +70,43 @@ def parse(path): return result -def run_module(): + +def run_module(): # pylint: disable=missing-function-docstring # define available arguments/parameters a user can pass to the module - module_args = dict() + module_args = {} # seed the result dict in the object # we primarily care about changed and state # changed is if this module effectively modified the target # state will include any data that you want your module to pass back # for consumption, for example, in a subsequent task - result = dict( - changed=False, - ansible_facts=dict(), - ) + result = { + "changed": False, + "ansible_facts": {}, + } # the AnsibleModule object will be our abstraction working with Ansible # this includes instantiation, a couple of common attr would be the # args/params passed to the execution, as well as if the module # supports check mode - module = AnsibleModule( - argument_spec=module_args, - supports_check_mode=True - ) + module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) # manipulate or modify the state as needed (this is going to be the # part where your module will do what it needs to do) - result = { - 'ansible_facts': { - 'subuid': {}, - 'subgid': {} - } - } + result = {"ansible_facts": {"subuid": {}, "subgid": {}}} - result['ansible_facts']['subuid'] = parse('/etc/subuid') - result['ansible_facts']['subgid'] = parse('/etc/subgid') + result["ansible_facts"]["subuid"] = parse("/etc/subuid") + result["ansible_facts"]["subgid"] = parse("/etc/subgid") # in the event of a successful module execution, you will want to # simple AnsibleModule.exit_json(), passing the key/value results 
module.exit_json(**result) -def main(): +def main(): # pylint: disable=missing-function-docstring run_module() -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/ansible/monitoring.yml b/ansible/monitoring.yml index d34a65f..c8225a0 100644 --- a/ansible/monitoring.yml +++ b/ansible/monitoring.yml @@ -5,11 +5,11 @@ hosts: opensearch tags: opensearch tasks: - - import_role: + - ansible.builtin.import_role: name: opensearch tasks_from: install.yml become: true - - import_role: + - ansible.builtin.import_role: name: opensearch tasks_from: runtime.yml become: true @@ -18,7 +18,7 @@ hosts: slurm_stats tags: slurm_stats tasks: - - include_role: + - ansible.builtin.include_role: name: slurm_stats tasks_from: "{{ 'configure.yml' if appliances_mode == 'configure' else 'main.yml' }}" @@ -26,7 +26,7 @@ hosts: filebeat tags: filebeat tasks: - - include_role: + - ansible.builtin.include_role: name: filebeat tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'main.yml' }}" @@ -34,7 +34,7 @@ hosts: node_exporter tags: node_exporter tasks: - - import_role: + - ansible.builtin.import_role: name: cloudalchemy.node_exporter - name: Deploy OpenOndemand exporter @@ -44,7 +44,7 @@ - openondemand - openondemand_server tasks: - - import_role: + - ansible.builtin.import_role: name: openondemand tasks_from: exporter.yml @@ -53,7 +53,7 @@ become: true tags: slurm_exporter tasks: - - include_role: + - ansible.builtin.include_role: name: slurm_exporter tasks_from: "{{ 'configure.yml' if appliances_mode == 'configure' else 'main.yml' }}" @@ -62,7 +62,7 @@ tags: prometheus tasks: - name: Check for existing prometheus binaries - stat: + ansible.builtin.stat: path: /usr/local/bin/{{ item }} register: prometheus_binaries loop: @@ -70,10 +70,10 @@ - promtool - name: Skip prometheus install if prometheus binaries exist and prometheus_version not defined # i.e. if prometheus_version isn't defined we don't care, so use what's already there - set_fact: - prometheus_skip_install: "{{ false if prometheus_version is defined else true }}" + ansible.builtin.set_fact: + prometheus_skip_install: "{{ false if prometheus_version is defined else true }}" when: "(prometheus_binaries.results | map(attribute='stat') | map(attribute='exists')) + [prometheus_skip_install is not defined]" - - import_role: + - ansible.builtin.import_role: name: cloudalchemy.prometheus - name: Deploy grafana @@ -94,22 +94,22 @@ group: root mode: '0755' become: true - - include_role: + - ansible.builtin.include_role: name: cloudalchemy.grafana vars: # Internal role used to install dashboards as cloudalchemy role does not support all required options: grafana_dashboards: [] - - include_role: # done in same play so it can use handlers from cloudalchemy.grafana + - ansible.builtin.include_role: # done in same play so it can use handlers from cloudalchemy.grafana name: grafana-dashboards when: "appliances_mode != 'configure'" - name: Deploy alertmanager hosts: alertmanager tags: alertmanager - become: yes + become: true gather_facts: false tasks: - name: Configure alertmanager - include_role: + ansible.builtin.include_role: name: alertmanager tasks_from: configure.yml diff --git a/ansible/noop.yml b/ansible/noop.yml index adad248..4c1c5ea 100644 --- a/ansible/noop.yml +++ b/ansible/noop.yml @@ -1,5 +1,4 @@ --- - # This file exists so that we can conditionally import a playbook. 
The path # must exist, but we can use a when conditional so that it is not actually # run diff --git a/ansible/portal.yml b/ansible/portal.yml index 58ca69f..361a603 100644 --- a/ansible/portal.yml +++ b/ansible/portal.yml @@ -1,15 +1,16 @@ +--- - hosts: openondemand tags: - openondemand - openondemand_server - become: yes - gather_facts: yes # TODO + become: true + gather_facts: true # TODO tasks: - name: Skip openondemand apps installation in configure mode - set_fact: + ansible.builtin.set_fact: ood_install_apps: {} when: appliances_mode == 'configure' - - import_role: + - ansible.builtin.import_role: name: openondemand tasks_from: main.yml @@ -18,10 +19,10 @@ - openondemand - openondemand_desktop - openondemand_matlab - become: yes - gather_facts: yes + become: true + gather_facts: true tasks: - - import_role: + - ansible.builtin.import_role: name: openondemand tasks_from: vnc_compute.yml when: appliances_mode != 'configure' # is run during build @@ -30,10 +31,10 @@ tags: - openondemand - openondemand_jupyter - become: yes - gather_facts: yes + become: true + gather_facts: true tasks: - - import_role: + - ansible.builtin.import_role: name: openondemand tasks_from: jupyter_compute.yml when: appliances_mode != 'configure' # is run during build @@ -42,10 +43,10 @@ tags: - openondemand - openondemand_rstudio - become: yes - gather_facts: yes + become: true + gather_facts: true tasks: - - import_role: + - ansible.builtin.import_role: name: openondemand tasks_from: rstudio_compute.yml when: appliances_mode != 'configure' # is run during build @@ -54,10 +55,10 @@ tags: - openondemand - openondemand_codeserver - become: yes - gather_facts: yes + become: true + gather_facts: true tasks: - - import_role: + - ansible.builtin.import_role: name: openondemand tasks_from: codeserver_compute.yml - when: appliances_mode != 'configure' # is run during build \ No newline at end of file + when: appliances_mode != 'configure' # is run during build diff --git a/ansible/roles/alertmanager/README.md b/ansible/roles/alertmanager/README.md index f5bc23b..900e0e1 100644 --- a/ansible/roles/alertmanager/README.md +++ b/ansible/roles/alertmanager/README.md @@ -5,6 +5,7 @@ to route Prometheus alerts to a receiver. Currently Slack is the only supported receiver. Note that: + - HA configuration is not supported - Alertmanager state is not preserved when the node it runs on (by default, control node) is reimaged, so any alerts silenced via the GUI will reoccur. @@ -14,6 +15,7 @@ Alertmanager is enabled by default on the `control` node in the `site` environment's `inventory/groups` file. In general usage may only require: + - Enabling the Slack integration (see section below). - Possibly setting `alertmanager_web_external_url`. @@ -25,6 +27,7 @@ All variables are optional. See [defaults/main.yml](defaults/main.yml) for all default values. General variables: + - `alertmanager_version`: String, version (no leading 'v') - `alertmanager_download_checksum`: String, checksum for relevant version from [prometheus.io download page](https://prometheus.io/download/), in format @@ -43,14 +46,14 @@ The following variables are equivalent to similarly-named arguments to the `alertmanager` binary. See `man alertmanager` for more info: - `alertmanager_config_file`: String, path the main alertmanager config file - will be written to. Parent directory will be created if necessary. + will be written to. Parent directory will be created if necessary. 
- `alertmanager_web_config_file`: String, path alertmanager web config file - will be written to. Parent directory will be created if necessary. + will be written to. Parent directory will be created if necessary. - `alertmanager_storage_path`: String, base path for data storage. - `alertmanager_web_listen_addresses`: List of strings, defining addresses to listeen on. - `alertmanager_web_external_url`: String, the URL under which Alertmanager is - externally reachable - defaults to host IP address and `alertmanager_port`. - See man page for more details if proxying alertmanager. + externally reachable - defaults to host IP address and `alertmanager_port`. + See man page for more details if proxying alertmanager. - `alertmanager_data_retention`: String, how long to keep data for - `alertmanager_data_maintenance_interval`: String, interval between garbage collection and snapshotting to disk of the silences and the notification logs. @@ -59,6 +62,7 @@ The following variables are equivalent to similarly-named arguments to the - `alertmanager_default_receivers`: The following variables are templated into the alertmanager [main configuration](https://prometheus.io/docs/alerting/latest/configuration/): + - `alertmanager_config_template`: String, path to configuration template. The default is to template in `alertmanager_config_default` and `alertmanager_config_extra`. - `alertmanager_config_default`: Mapping with default configuration for the @@ -70,24 +74,27 @@ The following variables are templated into the alertmanager [main configuration] - `alertmanager_extra_receivers`: A list of additional [receiver](https://prometheus.io/docs/alerting/), mappings to add, by default empty. - `alertmanager_slack_receiver`: Mapping defining the [Slack receiver](https://prometheus.io/docs/alerting/latest/configuration/#slack_config). Note the default configuration for this is in -`environments/common/inventory/group_vars/all/alertmanager.yml`. + `environments/common/inventory/group_vars/all/alertmanager.yml`. - `alertmanager_slack_receiver_name`: String, name for the above Slack reciever. - `alertmanager_slack_receiver_send_resolved`: Bool, whether to send resolved alerts via the above Slack reciever. -- `alertmanager_null_receiver`: Mapping defining a `null` [receiver](https://prometheus.io/docs/alerting/latest/configuration/#receiver) so a receiver is always defined. +- `alertmanager_null_receiver`: Mapping defining a `null` [receiver](https://prometheus.io/docs/alerting/latest/configuration/#receiver) so a receiver is always defined. - `alertmanager_config_extra`: Mapping with additional configuration. Keys in this become top-level keys in the configuration. E.g this might be: - ```yaml - alertmanager_config_extra: - global: - smtp_from: smtp.example.org:587 - time_intervals: - - name: monday-to-friday - time_intervals: - - weekdays: ['monday:friday'] - ``` + + ```yaml + alertmanager_config_extra: + global: + smtp_from: smtp.example.org:587 + time_intervals: + - name: monday-to-friday + time_intervals: + - weekdays: ['monday:friday'] + ``` + Note that `route` and `receivers` keys should not be added here. The following variables are templated into the alertmanager [web configuration](https://prometheus.io/docs/alerting/latest/https/): + - `alertmanager_web_config_default`: Mapping with default configuration for `basic_auth_users` providing the default web user. 
- `alertmanager_alertmanager_web_config_extra`: Mapping with additional web diff --git a/ansible/roles/alertmanager/defaults/main.yml b/ansible/roles/alertmanager/defaults/main.yml index b303017..4b90994 100644 --- a/ansible/roles/alertmanager/defaults/main.yml +++ b/ansible/roles/alertmanager/defaults/main.yml @@ -1,5 +1,6 @@ -alertmanager_version: '0.28.1' -alertmanager_download_checksum: 'sha256:5ac7ab5e4b8ee5ce4d8fb0988f9cb275efcc3f181b4b408179fafee121693311' +--- +alertmanager_version: "0.28.1" +alertmanager_download_checksum: "sha256:5ac7ab5e4b8ee5ce4d8fb0988f9cb275efcc3f181b4b408179fafee121693311" alertmanager_download_dest: /tmp/alertmanager.tar.gz alertmanager_binary_dir: /usr/local/bin alertmanager_started: true @@ -11,13 +12,13 @@ alertmanager_config_file: /etc/alertmanager/alertmanager.yml alertmanager_web_config_file: /etc/alertmanager/alertmanager-web.yml alertmanager_storage_path: /var/lib/alertmanager -alertmanager_port: '9093' +alertmanager_port: "9093" alertmanager_web_listen_addresses: - ":{{ alertmanager_port }}" -alertmanager_web_external_url: '' # defined in environments/common/inventory/group_vars/all/alertmanager.yml for visibility +alertmanager_web_external_url: "" # defined in environments/common/inventory/group_vars/all/alertmanager.yml for visibility -alertmanager_data_retention: '120h' -alertmanager_data_maintenance_interval: '15m' +alertmanager_data_retention: "120h" +alertmanager_data_maintenance_interval: "15m" alertmanager_config_flags: {} # other command-line parameters as shown by `man alertmanager` alertmanager_config_template: alertmanager.yml.j2 alertmanager_web_config_template: alertmanager-web.yml.j2 @@ -35,7 +36,7 @@ alertmanager_alertmanager_web_config_extra: {} # top-level only # app_creds: alertmanager_null_receiver: - name: 'null' + name: "null" alertmanager_slack_receiver: {} # defined in environments/common/inventory/group_vars/all/alertmanager.yml as it needs prometheus_address alertmanager_extra_receivers: [] alertmanager_default_receivers: "{{ [alertmanager_null_receiver] + ([alertmanager_slack_receiver] if alertmanager_slack_integration is defined else []) }}" @@ -43,7 +44,8 @@ alertmanager_receivers: "{{ alertmanager_default_receivers + alertmanager_extra alertmanager_config_default: route: - group_by: ['...'] + group_by: + - "..." 
receiver: "{{ alertmanager_slack_receiver_name if alertmanager_slack_integration is defined else 'null' }}" receivers: "{{ alertmanager_receivers }}" diff --git a/ansible/roles/alertmanager/handlers/main.yml b/ansible/roles/alertmanager/handlers/main.yml index ee87e1e..6e427a6 100644 --- a/ansible/roles/alertmanager/handlers/main.yml +++ b/ansible/roles/alertmanager/handlers/main.yml @@ -1,5 +1,6 @@ +--- - name: Restart alertmanager - systemd: + ansible.builtin.systemd: name: alertmanager state: restarted daemon_reload: "{{ _alertmanager_service.changed | default(false) }}" diff --git a/ansible/roles/alertmanager/tasks/configure.yml b/ansible/roles/alertmanager/tasks/configure.yml index a43ec20..15f252f 100644 --- a/ansible/roles/alertmanager/tasks/configure.yml +++ b/ansible/roles/alertmanager/tasks/configure.yml @@ -1,3 +1,4 @@ +--- - name: Create alertmanager directories ansible.builtin.file: path: "{{ item }}" @@ -11,7 +12,7 @@ - "{{ alertmanager_storage_path }}" - name: Create alertmanager service file with immutable options - template: + ansible.builtin.template: src: alertmanager.service.j2 dest: /usr/lib/systemd/system/alertmanager.service owner: root @@ -38,10 +39,9 @@ mode: u=rw,go= notify: Restart alertmanager -- meta: flush_handlers - +- ansible.builtin.meta: flush_handlers - name: Ensure alertmanager service state - systemd: + ansible.builtin.systemd: name: alertmanager state: "{{ 'started' if alertmanager_started | bool else 'stopped' }}" enabled: "{{ alertmanager_enabled | bool }}" diff --git a/ansible/roles/alertmanager/tasks/install.yml b/ansible/roles/alertmanager/tasks/install.yml index 0f655da..f1cb9cd 100644 --- a/ansible/roles/alertmanager/tasks/install.yml +++ b/ansible/roles/alertmanager/tasks/install.yml @@ -1,3 +1,4 @@ +--- - name: Create alertmanager system user ansible.builtin.user: name: "{{ alertmanager_system_user }}" @@ -22,4 +23,6 @@ group: root mode: u=rwx,go=rx remote_src: true - extra_opts: ['--strip-components=1', '--show-stored-names'] + extra_opts: + - "--strip-components=1" + - "--show-stored-names" diff --git a/ansible/roles/basic_users/README.md b/ansible/roles/basic_users/README.md index 70ab154..23bea4c 100644 --- a/ansible/roles/basic_users/README.md +++ b/ansible/roles/basic_users/README.md @@ -1,9 +1,8 @@ - -basic_users -=========== +# basic_users Setup users on cluster nodes using `/etc/passwd` and manipulating `$HOME`, i.e. without requiring LDAP etc. Features: + - UID/GID is consistent across cluster (and explicitly defined). - SSH key generated and propagated to all nodes to allow login between cluster nodes. @@ -12,59 +11,56 @@ without requiring LDAP etc. Features: - When deleting users, systemd user sessions are terminated first. > [!IMPORTANT] The defaults for this role assumes that `$HOME` for users -managed by this role (e.g. not `rocky` and other system users) is on a shared -filesystem. The export of this shared filesystem may be root squashed if its -server is in the `basic_user` group - see configuration examples below. +> managed by this role (e.g. not `rocky` and other system users) is on a shared +> filesystem. The export of this shared filesystem may be root squashed if its +> server is in the `basic_user` group - see configuration examples below. -Role Variables --------------- +## Role Variables - `basic_users_homedir_server`: Optional inventory hostname in the `basic_users` group defining the host to use to create home directories. 
If the home - directory export is root squashed, this host *must* be the home directory + directory export is root squashed, this host _must_ be the home directory server. Default is the `control` node which is appropriate for the default appliance configuration. Not relevant if `create_home` is false for all users. - `basic_users_homedir_server_path`: Optional path prefix for home directories on - the `basic_users_homedir_server`, i.e. on the "server side". Default is - `/exports/home` which is appropriate for the default appliance configuration. + the `basic_users_homedir_server`, i.e. on the "server-side". Default is + `/exports/home` which is appropriate for the default appliance configuration. - `basic_users_homedir_client`: Optional inventory hostname in the `basic_users` - group defining the host to use to create ssh keys etc in home directories. + group defining the host to use to create SSH keys etc in home directories. This should be a host mounting the home directories. Default is the first node in the `login` group which is appropriate for the default appliance configuration. - `basic_users_users`: Optional, default empty list. A list of mappings defining - information for each user. In general, mapping keys/values are passed through - as parameters to [ansible.builtin.user](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/user_module.html) - and default values are as given there, with the following differences: + information for each user. In general, mapping keys/values are passed through + as parameters to [ansible.builtin.user](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/user_module.html) + and default values are as given there, with the following differences: - `generate_ssh_key`: Default is `true`, and the generated key is added to - the user's authorized keys. - - `ssh_key_comment`: Default is user name. - - `home`: Set automatically based on the user name and + the user's authorized keys. + - `ssh_key_comment`: Default is username. + - `home`: Set automatically based on the username and `basic_users_homedir_server_path`. Can be overriden for users with - non-standard home directory paths. + non-standard home directory paths. - `uid`: Should be set, so that the UID/GID is consistent across the cluster (which Slurm requires). - - `shell`: If *not* set will be `/sbin/nologin` on the `control` node to - prevent users logging in to this node, and the default shell on other - nodes. Explicitly setting this defines the shell for all nodes and if the - shared home directories are mounted on the control node will allow the - user to log in to the control node. + - `shell`: If _not_ set will be `/sbin/nologin` on the `control` node to + prevent users logging in to this node, and the default shell on other + nodes. Explicitly setting this defines the shell for all nodes and if the + shared home directories are mounted on the control node will allow the + user to log in to the control node. - `public_key`: Optional, define a key to log into the cluster with. - `sudo`: Optional, a (possibly multiline) string defining sudo rules for the - user. + user. - `ssh_key_type` defaults to `ed25519` instead of the `ansible.builtin.user` - default of `rsa`. + default of `rsa`. - Any other keys may present for other purposes (i.e. not used by this role). - `basic_users_groups`: Optional, default empty list. A list of mappings defining information for each group. 
Mapping keys/values are passed through as parameters to [ansible.builtin.group](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/group_module.html) and default values are as given there. - `basic_users_override_sssd`: Optional bool, default false. Whether to disable `sssd` when ensuring users/groups exist with this role. Permits creating local users/groups even if they clash with users provided via sssd (e.g. from LDAP). Ignored if host is not in group `sssd` as well. Note with this option active `sssd` will be stopped and restarted each time this role is run. -Dependencies ------------- +## Dependencies None. -Example Configurations ----------------------- +## Example Configurations With default appliance NFS configuration, create user `alice` with access to all nodes except the control node, and delete user `bob`: @@ -83,9 +79,10 @@ basic_users_users: ``` Using an external share which: - - does not root squash (so this role can create directories on it) - - is mounted to all nodes including the control node (so this role can set - authorized keys there) + +- does not root squash (so this role can create directories on it) +- is mounted to all nodes including the control node (so this role can set + authorized keys there) Create user `Carol`: @@ -99,7 +96,7 @@ basic_users_user: public_key: ssh-ed25519 ... ``` -Using an external share which *does* root squash, so home directories cannot be +Using an external share which _does_ root squash, so home directories cannot be created by this role and must already exist, create user `Dan`: ```yaml diff --git a/ansible/roles/basic_users/defaults/main.yml b/ansible/roles/basic_users/defaults/main.yml index 7b24ef7..8b4b66b 100644 --- a/ansible/roles/basic_users/defaults/main.yml +++ b/ansible/roles/basic_users/defaults/main.yml @@ -1,9 +1,10 @@ +--- basic_users_homedir_server: "{{ groups['control'] | first }}" # no way, generally, to find the nfs_server basic_users_homedir_server_path: /exports/home -basic_users_homedir_client: "{{ groups['login'] | first }}" +basic_users_homedir_client: "{{ groups['login'] | first }}" basic_users_userdefaults: state: present # need this here so don't have to add default() everywhere - generate_ssh_key: true + generate_ssh_key: true ssh_key_comment: "{{ item.name }}" ssh_key_type: ed25519 shell: "{{'/sbin/nologin' if 'control' in group_names else omit }}" diff --git a/ansible/roles/basic_users/filter_plugins/filter_keys.py b/ansible/roles/basic_users/filter_plugins/filter_keys.py index 119a430..12aa079 100644 --- a/ansible/roles/basic_users/filter_plugins/filter_keys.py +++ b/ansible/roles/basic_users/filter_plugins/filter_keys.py @@ -1,22 +1,27 @@ -""" Filter a dict to remove specified keys """ +"""Filter a dict to remove specified keys""" import copy -USER_MODULE_PARAMS = ('append authorization comment create_home createhome expires force generate_ssh_key group ' - 'groups hidden home local login_class move_home name user non_unique password password_expire_min ' - 'password_expire_max password_lock profile remove role seuser shell skeleton ssh_key_bits ' - 'ssh_key_comment ssh_key_file ssh_key_passphrase ssh_key_type state system uid update_password').split() +USER_MODULE_PARAMS = ( + "append authorization comment create_home createhome expires force generate_ssh_key group " + "groups hidden home local login_class move_home name user non_unique password " + "password_expire_min password_expire_max password_lock profile remove role seuser shell " + "skeleton ssh_key_bits ssh_key_comment 
ssh_key_file ssh_key_passphrase ssh_key_type state " + "system uid update_password" +).split() -class FilterModule(object): - def filters(self): - return { - 'filter_user_params': self.filter_user_params - } +class FilterModule( + object +): # pylint: disable=missing-class-docstring, useless-object-inheritance + + def filters(self): # pylint: disable=missing-function-docstring + return {"filter_user_params": self.filter_user_params} def filter_user_params(self, d): - ''' Return a copy of dict `d` containing only keys which are parameters for the user module''' - + # pylint: disable-next=line-too-long + """Return a copy of dict `d` containing only keys which are parameters for the user module""" + user_dict = copy.deepcopy(d) remove_keys = set(user_dict).difference(USER_MODULE_PARAMS) for key in remove_keys: diff --git a/ansible/roles/basic_users/library/terminate_user_sessions.py b/ansible/roles/basic_users/library/terminate_user_sessions.py index 711b373..542c338 100644 --- a/ansible/roles/basic_users/library/terminate_user_sessions.py +++ b/ansible/roles/basic_users/library/terminate_user_sessions.py @@ -1,11 +1,15 @@ #!/usr/bin/python +# pylint: disable=missing-module-docstring # Copyright: (c) 2021, Steve Brasier # Apache V2 licence -from __future__ import (absolute_import, division, print_function) -__metaclass__ = type +from __future__ import absolute_import, division, print_function -DOCUMENTATION = r''' +from ansible.module_utils.basic import AnsibleModule # pylint: disable=import-error + +__metaclass__ = type # pylint: disable=invalid-name + +DOCUMENTATION = r""" --- module: terminate_user_sessions @@ -22,54 +26,60 @@ description: Name of user required: true type: str - + author: - Steve Brasier (stackhpc.com) -''' +""" -EXAMPLES = r''' +EXAMPLES = r""" - terminate_user_sessions: name: fred -''' - -RETURN = r''' -''' +""" -from ansible.module_utils.basic import AnsibleModule +RETURN = r""" +""" -def run_module(): - # define available arguments/parameters a user can pass to the module - module_args = dict( - user=dict(type='str', required=True), - ) +def run_module(): # pylint: disable=missing-function-docstring + # define available arguments/parameters a user can pass to the module] + module_args = { + "user": { + "type": "str", + "required": True, + } + } - result = dict(changed=False) + result = { + "changed": False, + } - module = AnsibleModule( - argument_spec=module_args, - supports_check_mode=True - ) + module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) if module.check_mode: module.exit_json(**result) - _, sessions_stdout, _ = module.run_command("loginctl --no-legend list-sessions", check_rc=True) + _, sessions_stdout, _ = module.run_command( + "loginctl --no-legend list-sessions", check_rc=True + ) for line in sessions_stdout.splitlines(): session_info = line.split() user = session_info[1] session_id = session_info[0] - if user == module.params['user']: - _, sessions_stdout, _ = module.run_command("loginctl terminate-session %s" % session_id, check_rc=True) - result['changed'] = True - + if user == module.params["user"]: + _, sessions_stdout, _ = module.run_command( + # pylint: disable-next=consider-using-f-string + "loginctl terminate-session %s" % session_id, + check_rc=True, + ) + result["changed"] = True + # successful module exit: module.exit_json(**result) -def main(): +def main(): # pylint: disable=missing-function-docstring run_module() -if __name__ == '__main__': - main() \ No newline at end of file +if __name__ == "__main__": + main() diff 
--git a/ansible/roles/basic_users/tasks/main.yml b/ansible/roles/basic_users/tasks/main.yml index 6abba9c..cd01430 100644 --- a/ansible/roles/basic_users/tasks/main.yml +++ b/ansible/roles/basic_users/tasks/main.yml @@ -9,7 +9,7 @@ - "item.state | default('present') == 'absent'" - name: Stop sssd if required - systemd: + ansible.builtin.systemd: name: sssd state: stopped register: _stop_sssd @@ -18,11 +18,11 @@ - basic_users_override_sssd | bool - name: Create groups - ansible.builtin.group: "{{ item }}" - loop: "{{ basic_users_groups }}" + ansible.builtin.group: "{{ item }}" # noqa: args[module] + loop: "{{ basic_users_groups }}" - name: Create users - user: "{{ basic_users_userdefaults | combine(item) | filter_user_params() | combine(_disable_homedir) }}" + ansible.builtin.user: "{{ basic_users_userdefaults | combine(item) | filter_user_params() | combine(_disable_homedir) }}" # noqa: args[module] loop: "{{ basic_users_users }}" loop_control: label: "{{ item.name }}" @@ -32,10 +32,11 @@ generate_ssh_key: false - name: Write sudo rules - blockinfile: + ansible.builtin.blockinfile: path: /etc/sudoers.d/80-{{ item.name }}-user block: "{{ item.sudo }}" create: true + mode: "0440" loop: "{{ basic_users_users }}" loop_control: label: "{{ item.name }}" @@ -44,10 +45,10 @@ - "'sudo' in item" - name: Restart sssd if required - systemd: + ansible.builtin.systemd: name: sssd state: started - when: _stop_sssd is changed + when: _stop_sssd is changed # noqa: no-handler # This task runs only on the home directory server so it can handle # root-squashed exports @@ -56,7 +57,7 @@ ansible.builtin.copy: remote_src: true src: "{{ item.skeleton | default('/etc/skel/') }}" - dest: "{{ item.home | default( basic_users_homedir_server_path + '/' + item.name ) }}" + dest: "{{ item.home | default(basic_users_homedir_server_path + '/' + item.name) }}" owner: "{{ item.name }}" group: "{{ item.name }}" mode: u=rwX,go= @@ -72,12 +73,13 @@ # paths are easily constructed, becoming each user so that root-squash # doesn't matter - name: Create ~/.ssh directories - file: + ansible.builtin.file: state: directory path: ~/.ssh/ owner: "{{ item.name }}" group: "{{ item.name }}" mode: u=rwX,go= + become: true become_user: "{{ item.name }}" loop: "{{ basic_users_users }}" loop_control: @@ -89,11 +91,12 @@ - name: Generate cluster ssh key community.crypto.openssh_keypair: - path: "{{ item.ssh_key_file | default('~/.ssh/id_' + _ssh_key_type )}}" # NB: ssh_key_file is from ansible.builtin.user + path: "{{ item.ssh_key_file | default('~/.ssh/id_' + _ssh_key_type) }}" # NB: ssh_key_file is from ansible.builtin.user type: "{{ _ssh_key_type }}" comment: "{{ item.ssh_key_comment | default(item.name) }}" vars: _ssh_key_type: "{{ item.ssh_key_type | default('ed25519') }}" + become: true become_user: "{{ item.name }}" loop: "{{ basic_users_users }}" loop_control: @@ -111,6 +114,7 @@ manage_dir: false key: "{{ item.public_key }}" path: ~/.ssh/authorized_keys + become: true become_user: "{{ item.item.name }}" loop: "{{ _cluster_ssh_keypair.results }}" loop_control: @@ -128,6 +132,7 @@ manage_dir: false key: "{{ item.public_key }}" path: ~/.ssh/authorized_keys + become: true become_user: "{{ item.name }}" loop: "{{ basic_users_users }}" loop_control: diff --git a/ansible/roles/block_devices/README.md b/ansible/roles/block_devices/README.md index 0d326d3..ac642ce 100644 --- a/ansible/roles/block_devices/README.md +++ b/ansible/roles/block_devices/README.md @@ -1,9 +1,9 @@ -block_devices -============= +# block_devices Manage filesystems on 
block devices (such as OpenStack volumes), including creating partitions, creating filesystems and mounting filesystems. This is a convenience wrapper around the ansible modules: + - community.general.parted - community.general.filesystem - ansible.buildin.file @@ -15,13 +15,11 @@ To avoid issues with device names changing after e.g. reboots, devices are ident [^1]: See `environments/common/inventory/group_vars/builder/defaults.yml` -Requirements ------------- +## Requirements N/A. -Role Variables --------------- +## Role Variables - `block_devices_partition_state`: Optional. Partition state, 'present' or 'absent' (as for parted) or 'skip'. Defaults to 'present'. - `block_devices_serial`: Required. Serial number of block device. For an OpenStack volume this is the volume ID. @@ -36,20 +34,18 @@ Role Variables Multiple NFS client/server configurations may be provided by defining `block_devices_configurations`. This should be a list of mappings with keys/values are as per the variables above without the `block_devices_` prefix. Omitted keys/values are filled from the corresponding variable. -Dependencies ------------- +## Dependencies See top of page. -Example Playbook ----------------- +## Example Playbook ```yaml - hosts: servers become: true tasks: - - include_role: - name: block_devices + - include_role: + name: block_devices ``` The example variables below create an `ext4` partition on `/dev/sdb1` and mount it as `/mnt/files` with the default owner/group: @@ -71,12 +67,10 @@ block_devices_configurations: path: /mnt/files ``` -License -------- +## License Apache V2 -Author Information ------------------- +## Author Information stackhpc.com diff --git a/ansible/roles/block_devices/defaults/main.yml b/ansible/roles/block_devices/defaults/main.yml index 0f997bf..1a9da7b 100644 --- a/ansible/roles/block_devices/defaults/main.yml +++ b/ansible/roles/block_devices/defaults/main.yml @@ -1,9 +1,11 @@ -block_devices_configurations: [{}] +--- +block_devices_configurations: + - {} block_devices_partition_state: present # 'present', 'absent' (as for parted) or 'skip' block_devices_device: # Path to block device, e.g. '/dev/sda'. See community.general.parted:device and community.general.filesystem:dev block_devices_number: # Partition number, e.g 1 for /dev/sda1. See community.general.parted:number block_devices_fstype: # Filesystem type, e.g. e.g. 'ext4'. See community.general.filesystem:fstype -block_devices_resizefs: no # Grow filesystem into block device space (yes or no). See community.general.filesystem:resizefs +block_devices_resizefs: false # Grow filesystem into block device space (yes or no). See community.general.filesystem:resizefs block_devices_filesystem_state: present # 'present', 'absent' (as for community.general.filesystem:state) or 'skip' block_devices_path: # Path to mount point, e.g. 
'/mnt/files' block_devices_mount_state: mounted # Mount state, see ansible.posix.mount:state diff --git a/ansible/roles/block_devices/library/block_devices.py b/ansible/roles/block_devices/library/block_devices.py index ac34f2b..4a598fc 100644 --- a/ansible/roles/block_devices/library/block_devices.py +++ b/ansible/roles/block_devices/library/block_devices.py @@ -1,9 +1,14 @@ #!/usr/bin/python +# pylint: disable=missing-module-docstring # Copyright: (c) 2021, StackHPC # Apache 2 License -DOCUMENTATION = r''' +import json + +from ansible.module_utils.basic import AnsibleModule # pylint: disable=import-error + +DOCUMENTATION = r""" --- module: block_devices @@ -13,32 +18,30 @@ author: - Steve Brasier (@sjpb) -''' +""" -RETURN = r''' +RETURN = r""" devices: description: dict with device serial numbers as keys and full paths (e.g. /dev/sdb) as values type: dict return: always -''' +""" -import json -from ansible.module_utils.basic import AnsibleModule - -def run_module(): - module_args = dict() +def run_module(): # pylint: disable=missing-function-docstring + module_args = {} module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) result = {"changed": False} _, stdout, _ = module.run_command("lsblk --paths --json -O", check_rc=True) - - device_info = json.loads(stdout)['blockdevices'] - result['devices'] = dict((item['serial'], item['name']) for item in device_info) + + device_info = json.loads(stdout)["blockdevices"] + result["devices"] = dict((item["serial"], item["name"]) for item in device_info) module.exit_json(**result) -def main(): + +def main(): # pylint: disable=missing-function-docstring run_module() -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/ansible/roles/block_devices/tasks/main.yml b/ansible/roles/block_devices/tasks/main.yml index efaec3c..4ce7925 100644 --- a/ansible/roles/block_devices/tasks/main.yml +++ b/ansible/roles/block_devices/tasks/main.yml @@ -1,5 +1,6 @@ +--- - name: Warn role is deprecated - debug: + ansible.builtin.debug: msg: "{{ 'Role block_devices is deprecated, see ansible/roles/block_devices/README.md' | warn }}" when: block_devices_configurations | length > 0 @@ -7,18 +8,18 @@ block_devices: register: _block_devices -- name: Create partitions - parted: +- name: Create partitions + community.general.parted: device: "{{ _device }}" number: "{{ item.get('number', block_devices_number) }}" state: "{{ item.get('partition_state', block_devices_partition_state) }}" when: "item.get('partition_state', block_devices_partition_state) != 'skip'" loop: "{{ block_devices_configurations }}" vars: - _device: "{{ _block_devices.devices[ item.get('serial', block_devices_serial) ] }}" + _device: "{{ _block_devices.devices[item.get('serial', block_devices_serial)] }}" - name: Create filesystems - filesystem: + community.general.filesystem: fstype: "{{ item.get('fstype', block_devices_fstype) }}" dev: "{{ _device }}{{ item.get('number', block_devices_number) }}" resizefs: "{{ item.get('resizefs', block_devices_resizefs) }}" @@ -26,26 +27,27 @@ when: "item.get('filesystem_state', block_devices_filesystem_state) != 'skip'" loop: "{{ block_devices_configurations }}" vars: - _device: "{{ _block_devices.devices[ item.get('serial', block_devices_serial) ] }}" + _device: "{{ _block_devices.devices[item.get('serial', block_devices_serial)] }}" - name: Get filesystem UUIDs - command: + ansible.builtin.command: cmd: "lsblk {{ _device }}{{ item.get('number', block_devices_number) }} --noheadings --output UUID" loop: "{{ 
block_devices_configurations }}" vars: - _device: "{{ _block_devices.devices[ item.get('serial', block_devices_serial) ] }}" + _device: "{{ _block_devices.devices[item.get('serial', block_devices_serial)] }}" register: block_devices_uuids changed_when: false - check_mode: no + check_mode: false - name: Ensure mount point exists - file: + ansible.builtin.file: path: "{{ item.get('path', block_devices_path) }}" state: directory + mode: "0755" loop: "{{ block_devices_configurations }}" - name: Mount filesystems by UUID - mount: + ansible.posix.mount: path: "{{ item.get('path', block_devices_path) }}" src: "UUID={{ _uuid }}" fstype: "{{ item.get('fstype', block_devices_fstype) }}" @@ -57,10 +59,11 @@ index_var: block_devices_idx - name: Set owner/group for mounted directory - file: + ansible.builtin.file: path: "{{ item.get('path', block_devices_path) }}" state: directory owner: "{{ item.get('owner', block_devices_owner) | default(omit) }}" group: "{{ item.get('group', block_devices_group) | default(omit) }}" + mode: "0755" when: "item.get('owner', block_devices_owner) or item.get('group', block_devices_group)" loop: "{{ block_devices_configurations }}" diff --git a/ansible/roles/cacerts/defaults/main.yml b/ansible/roles/cacerts/defaults/main.yml index c1f940f..d53992a 100644 --- a/ansible/roles/cacerts/defaults/main.yml +++ b/ansible/roles/cacerts/defaults/main.yml @@ -1,3 +1,4 @@ -#cacerts_dest_dir: /etc/pki/ca-trust/source/anchors/ +--- +# cacerts_dest_dir: /etc/pki/ca-trust/source/anchors/ cacerts_cert_dir: "{{ appliances_environment_root }}/cacerts" cacerts_update: true diff --git a/ansible/roles/cacerts/tasks/configure.yml b/ansible/roles/cacerts/tasks/configure.yml index 5001f44..a23f275 100644 --- a/ansible/roles/cacerts/tasks/configure.yml +++ b/ansible/roles/cacerts/tasks/configure.yml @@ -1,16 +1,15 @@ --- - - name: Copy all certificates - copy: + ansible.builtin.copy: src: "{{ item }}" dest: /etc/pki/ca-trust/source/anchors/ owner: root group: root - mode: 0644 + mode: "0644" with_fileglob: - "{{ cacerts_cert_dir }}/*" become: true -- name: Update trust store - command: update-ca-trust extract +- name: Update trust store # noqa: no-changed-when + ansible.builtin.command: update-ca-trust extract become: true diff --git a/ansible/roles/cacerts/tasks/export.yml b/ansible/roles/cacerts/tasks/export.yml index c9c6471..8e036a1 100644 --- a/ansible/roles/cacerts/tasks/export.yml +++ b/ansible/roles/cacerts/tasks/export.yml @@ -1,10 +1,11 @@ +--- - name: Copy cacerts from deploy host to /exports/cluster/cacerts/ - copy: + ansible.builtin.copy: src: "{{ item }}" dest: /exports/cluster/cacerts/ owner: slurm group: root - mode: 0644 + mode: "0644" with_fileglob: - "{{ cacerts_cert_dir }}/*" delegate_to: "{{ groups['control'] | first }}" diff --git a/ansible/roles/cacerts/tasks/main.yml b/ansible/roles/cacerts/tasks/main.yml index 84f4934..ec83d2b 100644 --- a/ansible/roles/cacerts/tasks/main.yml +++ b/ansible/roles/cacerts/tasks/main.yml @@ -1 +1,2 @@ -- import_tasks: configure.yml +--- +- ansible.builtin.import_tasks: configure.yml diff --git a/ansible/roles/cluster_infra/defaults/main.yml b/ansible/roles/cluster_infra/defaults/main.yml index f2f9637..3b1f6c7 100644 --- a/ansible/roles/cluster_infra/defaults/main.yml +++ b/ansible/roles/cluster_infra/defaults/main.yml @@ -1,2 +1,3 @@ +--- ansible_init_collections: [] ansible_init_playbooks: [] diff --git a/ansible/roles/cluster_infra/tasks/main.yml b/ansible/roles/cluster_infra/tasks/main.yml index f62c257..91c2ab3 100644 --- 
a/ansible/roles/cluster_infra/tasks/main.yml +++ b/ansible/roles/cluster_infra/tasks/main.yml @@ -1,4 +1,5 @@ -- debug: +--- +- ansible.builtin.debug: msg: | terraform_backend_type: {{ terraform_backend_type }} terraform_state: {{ terraform_state }} @@ -8,55 +9,57 @@ # if we we have cluster_floating_ip, otherwise assume that we're # assigning the FIP in Terraform and that it will be available in # outputs.cluster_gateway_ip. -- block: +- when: + - cluster_floating_ip is defined + - cluster_floating_ip + block: - name: Look up floating IP azimuth_cloud.terraform.os_floating_ip_info: - floating_ip: "{{ cluster_floating_ip }}" + floating_ip: "{{ cluster_floating_ip }}" register: cluster_floating_ip_info - name: Set floating IP address fact - set_fact: + ansible.builtin.set_fact: cluster_floating_ip_address: "{{ cluster_floating_ip_info.floating_ip.floating_ip_address }}" - when: - - cluster_floating_ip is defined - - cluster_floating_ip - - name: Install Terraform binary - include_role: + ansible.builtin.include_role: name: azimuth_cloud.terraform.install - name: Make Terraform project directory - file: + ansible.builtin.file: path: "{{ terraform_project_path }}" state: directory + mode: "0755" - name: Write backend configuration - copy: + ansible.builtin.copy: content: | terraform { backend "{{ terraform_backend_type }}" { } } dest: "{{ terraform_project_path }}/backend.tf" + mode: "0644" - name: Template Terraform files into project directory - template: + ansible.builtin.template: src: >- - {{ + {{ "{}{}.j2".format( ( - cluster_terraform_template_dir ~ "/" - if cluster_terraform_template_dir is defined + cluster_terraform_template_dir ~ "/" + if cluster_terraform_template_dir is defined else "" ), item ) }} dest: "{{ terraform_project_path }}/{{ item }}" + mode: "0644" loop: - outputs.tf - providers.tf - resources.tf - name: Provision infrastructure - include_role: + ansible.builtin.include_role: name: azimuth_cloud.terraform.infra diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md index 7a95d2b..cc8b2de 100644 --- a/ansible/roles/compute_init/README.md +++ b/ansible/roles/compute_init/README.md @@ -8,6 +8,7 @@ Allow compute nodes to rejoin the cluster after a reboot without running the > required configuration may change with further development. To enable this: + 1. Add the `compute` group (or a subset) into the `compute_init` group. 2. Build an image which includes the `compute_init` group. This is the case for StackHPC-built release images. @@ -35,67 +36,67 @@ property described above. If a role is marked as requiring a custom image then it also requires an image build with the role name added to the [Packer inventory_groups variable](../../../docs/image-build.md). -| Playbook | Role (or functionality) | Support | Custom image reqd.? | -| -------------------------|-------------------------|---------------------------------|---------------------| -| hooks/pre.yml | ? 
| None at present | n/a | -| validate.yml | n/a | Not relevant during boot | n/a | -| bootstrap.yml | (wait for ansible-init) | Not relevant during boot | n/a | -| bootstrap.yml | resolv_conf | Fully supported | No | -| bootstrap.yml | etc_hosts | Fully supported | No | -| bootstrap.yml | chrony | Fully supported | No | -| bootstrap.yml | proxy | None at present | No | -| bootstrap.yml | (/etc permissions) | None required - use image build | No | -| bootstrap.yml | (ssh /home fix) | None required - use image build | No | -| bootstrap.yml | (system users) | None required - use image build | No | -| bootstrap.yml | systemd | None required - use image build | No | -| bootstrap.yml | selinux | None required - use image build | Maybe [1] | -| bootstrap.yml | sshd | Fully supported | No | -| bootstrap.yml | dnf_repos | None at present [2] | - | -| bootstrap.yml | cacerts | Supported [3] | - | -| bootstrap.yml | squid | Not relevant for compute nodes | n/a | -| bootstrap.yml | tuned | Fully supported | No | -| bootstrap.yml | freeipa_server | Not relevant for compute nodes | n/a | -| bootstrap.yml | cockpit | None required - use image build | No | -| bootstrap.yml | firewalld | Not relevant for compute nodes | n/a | -| bootstrap.yml | fail2ban | Not relevant for compute nodes | n/a | -| bootstrap.yml | podman | Not relevant for compute nodes | n/a | -| bootstrap.yml | update | Not relevant during boot | n/a | -| bootstrap.yml | reboot | Not relevant for compute nodes | n/a | -| bootstrap.yml | ofed | Not relevant during boot | Yes | -| bootstrap.yml | ansible_init (install) | Not relevant during boot | n/a | -| bootstrap.yml | k3s (install) | Not relevant during boot | n/a | -| hooks/post-bootstrap.yml | ? | None at present | n/a | -| iam.yml | freeipa_client | None at present [4] | Yes | -| iam.yml | freeipa_server | Not relevant for compute nodes | n/a | -| iam.yml | sssd | Fully supported | No | -| filesystems.yml | block_devices | None required - role deprecated | n/a | -| filesystems.yml | nfs | All client functionality | No | -| filesystems.yml | manila | All functionality | No [5] | -| filesystems.yml | lustre | All functionality | Yes | -| extras.yml | basic_users | All functionality [6] | No | -| extras.yml | eessi | All functionality [7] | No | -| extras.yml | cuda | None required - use image build | Yes [8] | -| extras.yml | vgpu | All functionality | Yes | -| extras.yml | persist_hostkeys | Not relevant for compute nodes | n/a | -| extras.yml | compute_init (export) | Not relevant for compute nodes | n/a | -| extras.yml | k9s (install) | Not relevant during boot | n/a | -| extras.yml | extra_packages | None at present [9] | - | -| slurm.yml | mysql | Not relevant for compute nodes | n/a | -| slurm.yml | rebuild | Not relevant for compute nodes | n/a | -| slurm.yml | openhpc [10] | All slurmd functionality | No | -| slurm.yml | (set memory limits) | Fully supported | No | -| slurm.yml | (block ssh) | Fully supported | No | -| slurm.yml | nhc | Fully supported | No | -| portal.yml | (openondemand server) | Not relevant for compute nodes | n/a | -| portal.yml | (openondemand vnc desktop) | None required - use image build | No | -| portal.yml | (openondemand jupyter server) | None required - use image build | No | -| monitoring.yml | node_exporter | None required - use image build | No | -| monitoring.yml | (other monitoring) | Not relevant for compute nodes | - | -| disable-repos.yml | dnf_repos | None at present [2] | - | -| hooks/post.yml | ? 
| None at present | - | - +| Playbook | Role (or functionality) | Support | Custom image reqd.? | +| ------------------------ | ----------------------------- | ------------------------------- | ------------------- | +| hooks/pre.yml | ? | None at present | n/a | +| validate.yml | n/a | Not relevant during boot | n/a | +| bootstrap.yml | (wait for ansible-init) | Not relevant during boot | n/a | +| bootstrap.yml | resolv_conf | Fully supported | No | +| bootstrap.yml | etc_hosts | Fully supported | No | +| bootstrap.yml | chrony | Fully supported | No | +| bootstrap.yml | proxy | None at present | No | +| bootstrap.yml | (/etc permissions) | None required - use image build | No | +| bootstrap.yml | (SSH /home fix) | None required - use image build | No | +| bootstrap.yml | (system users) | None required - use image build | No | +| bootstrap.yml | systemd | None required - use image build | No | +| bootstrap.yml | selinux | None required - use image build | Maybe [1] | +| bootstrap.yml | sshd | Fully supported | No | +| bootstrap.yml | dnf_repos | None at present [2] | - | +| bootstrap.yml | cacerts | Supported [3] | - | +| bootstrap.yml | squid | Not relevant for compute nodes | n/a | +| bootstrap.yml | tuned | Fully supported | No | +| bootstrap.yml | freeipa_server | Not relevant for compute nodes | n/a | +| bootstrap.yml | cockpit | None required - use image build | No | +| bootstrap.yml | firewalld | Not relevant for compute nodes | n/a | +| bootstrap.yml | fail2ban | Not relevant for compute nodes | n/a | +| bootstrap.yml | podman | Not relevant for compute nodes | n/a | +| bootstrap.yml | update | Not relevant during boot | n/a | +| bootstrap.yml | reboot | Not relevant for compute nodes | n/a | +| bootstrap.yml | ofed | Not relevant during boot | Yes | +| bootstrap.yml | ansible_init (install) | Not relevant during boot | n/a | +| bootstrap.yml | k3s (install) | Not relevant during boot | n/a | +| hooks/post-bootstrap.yml | ? 
| None at present | n/a | +| iam.yml | freeipa_client | None at present [4] | Yes | +| iam.yml | freeipa_server | Not relevant for compute nodes | n/a | +| iam.yml | sssd | Fully supported | No | +| filesystems.yml | block_devices | None required - role deprecated | n/a | +| filesystems.yml | nfs | All client functionality | No | +| filesystems.yml | manila | All functionality | No [5] | +| filesystems.yml | lustre | All functionality | Yes | +| extras.yml | basic_users | All functionality [6] | No | +| extras.yml | eessi | All functionality [7] | No | +| extras.yml | cuda | None required - use image build | Yes [8] | +| extras.yml | vgpu | All functionality | Yes | +| extras.yml | persist_hostkeys | Not relevant for compute nodes | n/a | +| extras.yml | compute_init (export) | Not relevant for compute nodes | n/a | +| extras.yml | k9s (install) | Not relevant during boot | n/a | +| extras.yml | extra_packages | None at present [9] | - | +| slurm.yml | MySQL | Not relevant for compute nodes | n/a | +| slurm.yml | rebuild | Not relevant for compute nodes | n/a | +| slurm.yml | openhpc [10] | All slurmd functionality | No | +| slurm.yml | (set memory limits) | Fully supported | No | +| slurm.yml | (block SSH) | Fully supported | No | +| slurm.yml | nhc | Fully supported | No | +| portal.yml | (openondemand server) | Not relevant for compute nodes | n/a | +| portal.yml | (openondemand vnc desktop) | None required - use image build | No | +| portal.yml | (openondemand jupyter server) | None required - use image build | No | +| monitoring.yml | node_exporter | None required - use image build | No | +| monitoring.yml | (other monitoring) | Not relevant for compute nodes | - | +| disable-repos.yml | dnf_repos | None at present [2] | - | +| hooks/post.yml | ? | None at present | - | Notes: + 1. `selinux` is set to disabled in StackHPC images. 2. Requirement for this functionality is TBD. 3. `cacerts_cert_dir` must be the same on all nodes. @@ -105,32 +106,32 @@ Notes: 6. Assumes home directory already exists on shared storage. 7. Assumes `cvmfs_config` is the same on control node and all compute nodes. 8. If `cuda` role was run during build, the nvidia-persistenced is enabled - and will start during boot. + and will start during boot. 9. Would require `dnf_repos`. 10. `openhpc` does not need to be added to `compute_init_enable`, this is automatically enabled by adding `compute`. ## Approach + This works as follows: + 1. During image build, an ansible-init playbook and supporting files -(e.g. templates, filters, etc) are installed. + (e.g. templates, filters, etc) are installed. 2. Cluster instances are created as usual; the above compute-init playbook does -not run. + not run. 3. The `site.yml` playbook is run as usual to configure all the instances into -a cluster. In addition, with `compute-init` enabled, a `/exports/cluster` NFS -share is created on the control node containing: - - an /etc/hosts file for the cluster - - Hostvars for each compute node + a cluster. In addition, with `compute-init` enabled, a `/exports/cluster` NFS + share is created on the control node containing: - an /etc/hosts file for the cluster - Hostvars for each compute node 4. On reboot of a compute node, ansible-init runs the compute-init playbook -which: - a. Checks whether the `enable_compute` metadata flag is set, and exits if - not. - b. Tries to mount the above `/exports/cluster` NFS share from the control - node, and exits if it cannot. - c. 
Configures itself using the exported hostvars, depending on the - `enable_*` flags set in metadata. - d. Issues an `scontrol` command to resume the node (because Slurm will - consider it as "unexpectedly rebooted"). + which: + a. Checks whether the `enable_compute` metadata flag is set, and exits if + not. + b. Tries to mount the above `/exports/cluster` NFS share from the control + node, and exits if it cannot. + c. Configures itself using the exported hostvars, depending on the + `enable_*` flags set in metadata. + d. Issues an `scontrol` command to resume the node (because Slurm will + consider it as "unexpectedly rebooted"). The check in 4b. above is what prevents the compute-init script from trying to configure the node before the services on the control node are available @@ -147,35 +148,43 @@ a new image: 2. Reimage the compute nodes: - ansible-playbook --limit compute ansible/adhoc/rebuild.yml +```shell +ansible-playbook --limit compute ansible/adhoc/rebuild.yml +``` 3. Add metadata to a compute node e.g. via Horizon to turn on compute-init playbook functionality. 4. Stop ansible-init from running - ansible all -ba "systemctl stop ansible-init" +```shell +ansible all -ba "systemctl stop ansible-init" +``` 5. Fake an image build to deploy the compute-init playbook: - ansible-playbook ansible/fatimage.yml --tags compute_init +```shell +ansible-playbook ansible/fatimage.yml --tags compute_init +``` - NB: This will also re-export the compute hostvars, as the nodes are not - in the builder group, which conveniently means any changes made to that - play also get picked up. +NB: This will also reexport the compute hostvars, as the nodes are not +in the builder group, which conveniently means any changes made to that +play also get picked up. 6. Fake a reimage of compute to run ansible-init and the updated compute-init playbook: - ansible all -ba "rm -f /var/lib/ansible-init.done && systemctl restart ansible-init" +```shell +ansible all -ba "rm -f /var/lib/ansible-init.done && systemctl restart ansible-init" +``` - Use `systemctl status ansible-init` to view stdout/stderr from Ansible. +Use `systemctl status ansible-init` to view stdout/stderr from Ansible. Steps 4/5/6 can be repeated with changes to the compute script. If required, reimage the compute node(s) first as in step 2 and/or add additional metadata as in step 3. - ## Design notes + - Duplicating code in roles into the `compute-init` script is unfortunate, but does allow developing this functionality without wider changes to the appliance. @@ -188,7 +197,6 @@ as in step 3. 1. Control node copies files resulting from role into cluster exports, compute-init copies to local disk. Only works if files are not host-specific Examples: etc_hosts, eessi config? - 2. Re-implement the role. Works if the role vars are not too complicated, (else they all need to be duplicated in compute-init). Could also only support certain subsets of role functionality or variables @@ -197,29 +205,29 @@ as in step 3. 
- Some variables are defined using hostvars from other nodes, which aren't available v the current approach: - ``` - [root@rl9-compute-0 rocky]# grep hostvars /mnt/cluster/hostvars/rl9-compute-0/hostvars.yml - "grafana_address": "{{ hostvars[groups['grafana'].0].api_address }}", - "grafana_api_address": "{{ hostvars[groups['grafana'].0].internal_address }}", - "mysql_host": "{{ hostvars[groups['mysql'] | first].api_address }}", - "nfs_server_default": "{{ hostvars[groups['control'] | first ].internal_address }}", - "openhpc_slurm_control_host": "{{ hostvars[groups['control'].0].api_address }}", - "openondemand_address": "{{ hostvars[groups['openondemand'].0].api_address if groups['openondemand'] | count > 0 else '' }}", - "openondemand_node_proxy_directives": "{{ _opeonondemand_unset_auth if (openondemand_auth == 'basic_pam' and 'openondemand_host_regex' and groups['grafana'] | length > 0 and hostvars[ groups['grafana'] | first]._grafana_auth_is_anonymous) else '' }}", - "openondemand_servername": "{{ hostvars[ groups['openondemand'] | first].ansible_host }}", - "prometheus_address": "{{ hostvars[groups['prometheus'].0].api_address }}", - "{{ hostvars[groups['freeipa_server'].0].ansible_host }}" - ``` - - More generally, there is nothing to stop any group var depending on a - "{{ hostvars[] }}" interpolation ... - - Only `nfs_server_default` and `openhpc_slurm_control_host` are of concern - for compute nodes - both of these indirect via `api_address` to - `inventory_hostname`. This has been worked around by replacing this with - "{{ groups['control'] | first }}" which does result in the control node - inventory hostname when templating. - - Note that although `groups` is defined in the templated hostvars, when - the hostvars are loaded using `include_vars:` is is ignored as it is a - "magic variable" determined by ansible itself and cannot be set. + ```text + [root@rl9-compute-0 rocky]# grep hostvars /mnt/cluster/hostvars/rl9-compute-0/hostvars.yml + "grafana_address": "{{ hostvars[groups['grafana'].0].api_address }}", + "grafana_api_address": "{{ hostvars[groups['grafana'].0].internal_address }}", + "mysql_host": "{{ hostvars[groups['mysql'] | first].api_address }}", + "nfs_server_default": "{{ hostvars[groups['control'] | first ].internal_address }}", + "openhpc_slurm_control_host": "{{ hostvars[groups['control'].0].api_address }}", + "openondemand_address": "{{ hostvars[groups['openondemand'].0].api_address if groups['openondemand'] | count > 0 else '' }}", + "openondemand_node_proxy_directives": "{{ _opeonondemand_unset_auth if (openondemand_auth == 'basic_pam' and 'openondemand_host_regex' and groups['grafana'] | length > 0 and hostvars[ groups['grafana'] | first]._grafana_auth_is_anonymous) else '' }}", + "openondemand_servername": "{{ hostvars[ groups['openondemand'] | first].ansible_host }}", + "prometheus_address": "{{ hostvars[groups['prometheus'].0].api_address }}", + "{{ hostvars[groups['freeipa_server'].0].ansible_host }}" + ``` + + More generally, there is nothing to stop any group var depending on a + "{{ hostvars[] }}" interpolation ... + + Only `nfs_server_default` and `openhpc_slurm_control_host` are of concern + for compute nodes - both of these indirect via `api_address` to + `inventory_hostname`. This has been worked around by replacing this with + "{{ groups['control'] | first }}" which does result in the control node + inventory hostname when templating. 
+ + Note that although `groups` is defined in the templated hostvars, when + the hostvars are loaded using `include_vars:` is is ignored as it is a + "magic variable" determined by ansible itself and cannot be set. diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 397da01..0ff647a 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -1,8 +1,7 @@ --- - - name: Compute node initialisation hosts: localhost - become: yes + become: true vars: os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}" server_node_ip: "{{ os_metadata.meta.control_address }}" @@ -12,7 +11,7 @@ enable_cacerts: "{{ os_metadata.meta.cacerts | default(false) | bool }}" enable_sssd: "{{ os_metadata.meta.sssd | default(false) | bool }}" enable_sshd: "{{ os_metadata.meta.sshd | default(false) | bool }}" - enable_tuned: "{{ os_metadata.meta.tuned | default(false) | bool }}" + enable_tuned: "{{ os_metadata.meta.tuned | default(false) | bool }}" enable_nfs: "{{ os_metadata.meta.nfs | default(false) | bool }}" enable_manila: "{{ os_metadata.meta.manila | default(false) | bool }}" enable_lustre: "{{ os_metadata.meta.lustre | default(false) | bool }}" @@ -24,7 +23,6 @@ # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects resolv_conf_nameservers: [] - tuned_profile_baremetal: hpc-compute tuned_profile_vm: virtual-guest tuned_profile: "{{ tuned_profile_baremetal if ansible_virtualization_role != 'guest' else tuned_profile_vm }}" @@ -47,17 +45,16 @@ - nosuid tasks: - - block: + - when: not enable_compute + block: - name: Report skipping initialization if not compute node # meta: end_play produces no output - debug: + ansible.builtin.debug: msg: "Skipping compute initialization: Metadata enable_compute is not true" - - - meta: end_play - when: not enable_compute + - ansible.builtin.meta: end_play - name: Ensure the mount directory exists - file: + ansible.builtin.file: path: /mnt/cluster state: directory owner: slurm @@ -76,46 +73,46 @@ # exits from playbook if this failed below, allowing ansible-init to # finish, which allows site.yml to continue on initial deploy - - block: + - when: _mount_mnt_cluster.failed + block: - name: Report skipping initialization if cannot mount nfs # meta: end_play produces no output - debug: + ansible.builtin.debug: msg: "Skipping compute initialization: Failed to mount /exports/cluster from control node {{ server_node_ip }}" - - - meta: end_play - when: _mount_mnt_cluster.failed + - ansible.builtin.meta: end_play - name: Check if hostvars exist + become: true become_user: slurm - stat: + ansible.builtin.stat: path: "/mnt/cluster/hostvars/{{ ansible_hostname }}/hostvars.yml" register: hostvars_stat - - block: + - when: not hostvars_stat.stat.exists + block: - name: Report skipping initialization if host vars does not exist # meta: end_play produces no output - debug: + ansible.builtin.debug: msg: "Skipping compute initialization: hostvars does not exist" - - meta: end_play - when: not hostvars_stat.stat.exists - + - ansible.builtin.meta: end_play - name: Sync /mnt/cluster to /var/tmp + become: true become_user: slurm - synchronize: + ansible.posix.synchronize: src: "/mnt/cluster/" dest: "/var/tmp/cluster/" - archive: yes - recursive: yes + archive: true + recursive: true - name: Unmount /mnt/cluster after sync - mount: + ansible.posix.mount: path: /mnt/cluster state: 
unmounted - name: Load hostvars # this is higher priority than vars block = normal ansible's hostvars - include_vars: + ansible.builtin.include_vars: file: "/var/tmp/cluster/hostvars/{{ ansible_hostname }}/hostvars.yml" - name: Run chrony role @@ -129,6 +126,7 @@ when: enable_chrony - name: Configure resolve.conf + when: enable_resolv_conf block: - name: Set nameservers in /etc/resolv.conf ansible.builtin.template: @@ -151,16 +149,14 @@ ansible.builtin.systemd: name: NetworkManager state: reloaded - when: _copy_nm_config.changed | default(false) - when: enable_resolv_conf - + when: _copy_nm_config.changed | default(false) # noqa: no-handler - name: Copy cluster /etc/hosts - copy: + ansible.builtin.copy: src: /var/tmp/cluster/hosts dest: /etc/hosts owner: root group: root - mode: 0644 + mode: "0644" when: enable_etc_hosts - name: Configure cacerts @@ -178,7 +174,7 @@ when: enable_sshd - name: Configure tuned - include_tasks: tasks/tuned.yml + ansible.builtin.include_tasks: tasks/tuned.yml when: enable_tuned - name: Configure sssd @@ -200,12 +196,15 @@ loop: "{{ nfs_configurations }}" - name: Manila mounts + when: + - enable_manila + - os_manila_mount_shares | length > 0 block: - name: Read manila share info from nfs file - include_vars: + ansible.builtin.include_vars: file: /var/tmp/cluster/manila_share_info.yml no_log: true # contains secrets - + - name: Ensure Ceph configuration directory exists ansible.builtin.file: path: "{{ os_manila_mount_ceph_conf_path }}" @@ -267,10 +266,6 @@ loop_control: label: "{{ item.share_name }}" when: item.mount_state | default(os_manila_mount_state) in ['mounted' or 'ephemeral'] - when: - - enable_manila - - os_manila_mount_shares | length > 0 - - name: Configure lustre ansible.builtin.include_role: name: lustre @@ -278,27 +273,29 @@ when: enable_lustre - name: Basic users - ansible.builtin.include_role: + ansible.builtin.include_role: name: basic_users when: enable_basic_users - name: EESSI + when: enable_eessi + # NB: don't need conditional block on enable_compute as have already exited + # if not the case block: - name: Copy cvmfs config - copy: + ansible.builtin.copy: src: /var/tmp/cluster/cvmfs/default.local dest: /etc/cvmfs/default.local owner: root group: root - mode: 0644 + mode: "0644" - - name: Ensure CVMFS config is setup - command: + - name: Ensure CVMFS config is setup # noqa: no-changed-when + ansible.builtin.command: cmd: "cvmfs_config setup" - when: enable_eessi - name: Configure VGPUs - include_role: + ansible.builtin.include_role: name: stackhpc.linux.vgpu tasks_from: 'configure.yml' when: enable_vgpu @@ -306,54 +303,54 @@ # NB: don't need conditional block on enable_compute as have already exited # if not the case - name: Write Munge key - copy: + ansible.builtin.copy: # NB: openhpc_munge_key is *binary* and may not survive json encoding # so do same as environments/common/inventory/group_vars/all/openhpc.yml content: "{{ vault_openhpc_mungekey | b64decode }}" dest: "/etc/munge/munge.key" owner: munge group: munge - mode: 0400 + mode: "0400" - name: Set slurmctld location for configless operation - lineinfile: + ansible.builtin.lineinfile: path: /etc/sysconfig/slurmd line: "SLURMD_OPTIONS='--conf-server {{ openhpc_slurm_control_host_address | default(openhpc_slurm_control_host) }}'" regexp: "^SLURMD_OPTIONS=" - create: yes + create: true owner: root group: root - mode: 0644 + mode: "0644" - name: Ensure Munge service state - service: + ansible.builtin.service: name: munge enabled: true state: started - name: Set locked memory limits on 
user-facing nodes - lineinfile: + ansible.builtin.lineinfile: path: /etc/security/limits.conf - regexp: '\* soft memlock unlimited' + regexp: "\\* soft memlock unlimited" line: "* soft memlock unlimited" - name: Configure sshd pam module - blockinfile: + ansible.builtin.blockinfile: path: /etc/pam.d/sshd - insertafter: 'account\s+required\s+pam_nologin.so' + insertafter: "account\\s+required\\s+pam_nologin.so" block: | account sufficient pam_access.so account required pam_slurm.so - name: Configure login access control - blockinfile: + ansible.builtin.blockinfile: path: /etc/security/access.conf block: | +:adm:ALL -:ALL:ALL - name: Ensure slurmd service state - service: + ansible.builtin.service: name: slurmd enabled: true state: started @@ -364,9 +361,9 @@ tasks_from: boot.yml when: enable_nhc - - name: Ensure node is resumed + - name: Ensure node is resumed # noqa: no-changed-when # TODO: consider if this is always safe for all job states? - command: scontrol update state=resume nodename={{ ansible_hostname }} + ansible.builtin.command: scontrol update state=resume nodename={{ ansible_hostname }} register: _scontrol_update failed_when: - _scontrol_update.rc > 0 diff --git a/ansible/roles/compute_init/tasks/export.yml b/ansible/roles/compute_init/tasks/export.yml index f5c594c..5b31bd6 100644 --- a/ansible/roles/compute_init/tasks/export.yml +++ b/ansible/roles/compute_init/tasks/export.yml @@ -1,5 +1,6 @@ +--- - name: Ensure the /exports/cluster directory exists - file: + ansible.builtin.file: path: /exports/cluster state: directory owner: slurm @@ -9,7 +10,7 @@ delegate_to: "{{ groups['control'] | first }}" - name: Copy /etc/hosts to /exports/cluster - copy: + ansible.builtin.copy: src: /etc/hosts dest: /exports/cluster/hosts owner: slurm @@ -20,7 +21,7 @@ delegate_to: "{{ groups['control'] | first }}" - name: Create hostvars directory - file: + ansible.builtin.file: path: /exports/cluster/hostvars/{{ inventory_hostname }}/ state: directory owner: slurm @@ -29,7 +30,7 @@ delegate_to: "{{ groups['control'] | first }}" - name: Template out hostvars - template: + ansible.builtin.template: src: hostvars.yml.j2 dest: /exports/cluster/hostvars/{{ inventory_hostname }}/hostvars.yml owner: slurm @@ -38,7 +39,7 @@ delegate_to: "{{ groups['control'] | first }}" - name: Copy manila share info to /exports/cluster - copy: + ansible.builtin.copy: content: "{{ os_manila_mount_share_info_var | to_nice_yaml }}" dest: /exports/cluster/manila_share_info.yml owner: slurm @@ -52,22 +53,22 @@ os_manila_mount_share_info: "{{ os_manila_mount_share_info }}" - name: Ensure /exports/cluster/cvmfs directory exists - file: + ansible.builtin.file: path: /exports/cluster/cvmfs state: directory owner: slurm group: root - mode: 0755 + mode: "0755" run_once: true delegate_to: "{{ groups['control'] | first }}" - name: Copy EESSI CVMFS config to /exports/cluster - copy: + ansible.builtin.copy: src: /etc/cvmfs/default.local dest: /exports/cluster/cvmfs/default.local owner: slurm group: root - mode: 0644 + mode: "0644" remote_src: true run_once: true delegate_to: "{{ groups['control'] | first }}" @@ -79,7 +80,7 @@ when: "'cacerts' in group_names" - name: Create hostconfig directory - file: + ansible.builtin.file: path: "/exports/cluster/hostconfig/{{ inventory_hostname }}/" state: directory owner: slurm @@ -87,20 +88,20 @@ mode: u=rX,g=rwX,o= delegate_to: "{{ groups['control'] | first }}" -- name: Template sssd config - import_role: +- name: Template sssd config + ansible.builtin.import_role: name: sssd tasks_from: 
export.yml when: "'sssd' in group_names" -- name: Template sshd config - import_role: +- name: Template sshd config + ansible.builtin.import_role: name: sshd tasks_from: export.yml when: "'sshd' in group_names" - name: Export generated NHC config - import_role: + ansible.builtin.import_role: name: nhc tasks_from: export.yml when: "'nhc' in group_names" diff --git a/ansible/roles/compute_init/tasks/install.yml b/ansible/roles/compute_init/tasks/install.yml index 67f339c..f7ee876 100644 --- a/ansible/roles/compute_init/tasks/install.yml +++ b/ansible/roles/compute_init/tasks/install.yml @@ -1,12 +1,11 @@ --- - - name: Ensure directories exist - file: + ansible.builtin.file: path: "/etc/ansible-init/playbooks/{{ item }}" state: directory owner: root group: root - mode: 0755 + mode: "0755" loop: - templates - files @@ -16,11 +15,15 @@ - roles - name: Inject files from roles - synchronize: - src: '{{ item.src }}' - dest: '/etc/ansible-init/playbooks/{{ item.dest }}' + ansible.posix.synchronize: + src: "{{ item.src }}" + dest: "/etc/ansible-init/playbooks/{{ item.dest }}" archive: false - rsync_opts: ["-p", "--chmod=D770,F644", "--owner=root", "--group=root"] + rsync_opts: + - "-p" + - "--chmod=D770,F644" + - "--owner=root" + - "--group=root" recursive: true use_ssh_args: true become: true @@ -53,18 +56,18 @@ dest: roles/ - name: Add filter_plugins to ansible.cfg - lineinfile: + ansible.builtin.lineinfile: path: /etc/ansible-init/ansible.cfg line: "filter_plugins = /etc/ansible-init/filter_plugins" state: present owner: root group: root - mode: 0644 + mode: "0644" - name: Add compute initialisation playbook - copy: + ansible.builtin.copy: src: compute-init.yml dest: /etc/ansible-init/playbooks/10-compute-init.yml owner: root group: root - mode: 0644 + mode: "0644" diff --git a/ansible/roles/cuda/defaults/main.yml b/ansible/roles/cuda/defaults/main.yml index e4e785b..692301d 100644 --- a/ansible/roles/cuda/defaults/main.yml +++ b/ansible/roles/cuda/defaults/main.yml @@ -1,3 +1,5 @@ +--- +# yamllint disable-line rule:line-length cuda_repo_url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel{{ ansible_distribution_major_version }}/{{ ansible_architecture }}/cuda-rhel{{ ansible_distribution_major_version }}.repo" cuda_nvidia_driver_stream: '580-open' cuda_nvidia_driver_pkg: "nvidia-open-3:580.82.07-1.el{{ ansible_distribution_major_version }}" diff --git a/ansible/roles/cuda/tasks/facts.yml b/ansible/roles/cuda/tasks/facts.yml index 0d60457..787f026 100644 --- a/ansible/roles/cuda/tasks/facts.yml +++ b/ansible/roles/cuda/tasks/facts.yml @@ -1,4 +1,4 @@ --- - name: Set cuda_facts_version_short - set_fact: + ansible.builtin.set_fact: cuda_facts_version_short: "{{ cuda_version_short }}" diff --git a/ansible/roles/cuda/tasks/install.yml b/ansible/roles/cuda/tasks/install.yml index 39bd20d..91af515 100644 --- a/ansible/roles/cuda/tasks/install.yml +++ b/ansible/roles/cuda/tasks/install.yml @@ -1,10 +1,11 @@ - +--- # Based on https://docs.nvidia.com/datacenter/tesla/driver-installation-guide/ - name: Install cuda repo - get_url: + ansible.builtin.get_url: dest: "/etc/yum.repos.d/cuda-rhel{{ ansible_distribution_major_version }}.repo" url: "{{ cuda_repo_url }}" + mode: "0644" - name: Check if nvidia driver module is enabled ansible.builtin.command: dnf module list --enabled nvidia-driver @@ -24,7 +25,7 @@ register: _cuda_driver_install - name: Check kernel has not been modified - assert: + ansible.builtin.assert: that: "'kernel ' not in _cuda_driver_install.stdout | default('')" # space 
ensures we don't flag e.g. kernel-devel-matched fail_msg: "{{ _cuda_driver_install.stdout_lines | default([]) | select('search', 'kernel ') }}" @@ -37,13 +38,13 @@ register: cuda_package_install - name: Add cuda binaries to path - lineinfile: + ansible.builtin.lineinfile: path: /etc/profile.d/sh.local - line: 'export PATH=$PATH:$(ls -1d /usr/local/cuda-* | sort -V | tail -1)/bin' + line: "export PATH=$PATH:$(ls -1d /usr/local/cuda-* | sort -V | tail -1)/bin" when: cuda_package_version != 'none' - name: Enable NVIDIA Persistence Daemon - systemd: + ansible.builtin.systemd: name: nvidia-persistenced enabled: true state: "{{ cuda_persistenced_state }}" @@ -51,9 +52,9 @@ - name: Reboot ansible.builtin.reboot: post_reboot_delay: 30 - when: cuda_package_install.changed + when: cuda_package_install.changed # noqa: no-handler - name: Wait for hosts to be reachable - wait_for_connection: + ansible.builtin.wait_for_connection: sleep: 15 - when: cuda_package_install.changed + when: cuda_package_install.changed # noqa: no-handler diff --git a/ansible/roles/cuda/tasks/runtime.yml b/ansible/roles/cuda/tasks/runtime.yml index c16a48c..e2dfab3 100644 --- a/ansible/roles/cuda/tasks/runtime.yml +++ b/ansible/roles/cuda/tasks/runtime.yml @@ -1,5 +1,6 @@ +--- - name: Ensure NVIDIA Persistence Daemon state - systemd: + ansible.builtin.systemd: name: nvidia-persistenced enabled: true state: "{{ cuda_persistenced_state }}" diff --git a/ansible/roles/cuda/tasks/samples.yml b/ansible/roles/cuda/tasks/samples.yml index b2bccd7..392a295 100644 --- a/ansible/roles/cuda/tasks/samples.yml +++ b/ansible/roles/cuda/tasks/samples.yml @@ -1,13 +1,15 @@ +--- - name: Ensure cuda_samples_path exists - file: + ansible.builtin.file: state: directory path: "{{ cuda_samples_path }}" owner: "{{ ansible_user }}" group: "{{ ansible_user }}" + mode: "0755" - name: Download CUDA samples release - unarchive: - remote_src: yes + ansible.builtin.unarchive: + remote_src: true src: "{{ cuda_samples_release_url }}" dest: "{{ cuda_samples_path }}" owner: "{{ ansible_user }}" @@ -15,12 +17,13 @@ creates: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}" - name: Create CUDA samples build directory - file: + ansible.builtin.file: state: directory path: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}/build" + mode: "0755" - name: Build CUDA samples - shell: + ansible.builtin.shell: # We need to source /etc/profile.d/sh.local to add CUDA to the PATH cmd: . /etc/profile.d/sh.local && cmake .. && make -j {{ ansible_processor_vcpus }} chdir: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}/build" diff --git a/ansible/roles/dnf_repos/README.md b/ansible/roles/dnf_repos/README.md index ff22c79..a7c6bc2 100644 --- a/ansible/roles/dnf_repos/README.md +++ b/ansible/roles/dnf_repos/README.md @@ -1,38 +1,34 @@ -dnf_repos -========= +# dnf_repos -Modifies repo definitions for repofiles in `/etc/yum.repos.d` to point to snapshots in StackHPC's Ark Pulp server or mirrors of them +Modifies repository definitions for repofiles in `/etc/yum.repos.d` to point to snapshots in StackHPC's Ark Pulp server or mirrors of them on a local Pulp server. -Requirements ------------- +## Requirements Requires Ark credentials if using StackHPC's upstream Ark server. -Role Variables --------------- +## Role Variables -Variables in this role are also required by `pulp_site` so set in +Variables in this role are also required by `pulp_site` so set in `environments/common/inventory/groups_vars/all/dnf_repos.yml`. 
See that file for detailed default values. - `dnf_repos_repos`: Dict of dicts containing information to construct URLs for Ark snapshots from the target Pulp server for each Rocky version. For example: - ``` - dnf_repos_repos: - appstream: # ansible.builtin.yum_repository:name - '8.10': # ansible_distribution_version or ansible_distribution_major_version - repo_file: Rocky-AppStream # yum_repository: file - # repo_name: # optional, override yum_repository:name - pulp_path: rocky/8.10/AppStream/x86_64/os # The subpath of the the upstream Ark server's content endpoint URL for the repo's snapshots, see https://ark.stackhpc.com/pulp/content/ - pulp_timestamp: 20250614T013846 - # pulp_content_url: # optional, dnf_repos_pulp_content_url - '9.6': - ... - ``` + ```yaml + dnf_repos_repos: + appstream: # ansible.builtin.yum_repository:name + "8.10": # ansible_distribution_version or ansible_distribution_major_version + repo_file: Rocky-AppStream # yum_repository: file + # repo_name: # optional, override yum_repository:name + pulp_path: rocky/8.10/AppStream/x86_64/os # The subpath of the the upstream Ark server's content endpoint URL for the repo's snapshots, see https://ark.stackhpc.com/pulp/content/ + pulp_timestamp: 20250614T013846 + # pulp_content_url: # optional, dnf_repos_pulp_content_url + "9.6": ... + ``` - `dnf_repos_default`: Appliance default repos to use Ark snapshots for. Following same format as `dnf_repos_repos`. - See for appliance default repo list `environments/common/inventory/group_vars/all/dnf_repo_timestamps.yml`. + See for appliance default repository list `environments/common/inventory/group_vars/all/dnf_repo_timestamps.yml`. - `dnf_repos_extra`: Additional repos to use Ark snapshots for. Follows same format as `dnf_repos_repos`. Defaults to `{}` -- `dnf_repos_pulp_content_url`: Optional str. Content URL of Pulp server to use Ark snapshots from. +- `dnf_repos_pulp_content_url`: Optional str. Content URL of Pulp server to use Ark snapshots from. Defaults to `{{ appliances_pulp_url }}/pulp/content` - `dnf_repos_username`: Optional str. Username for Ark. 
Should be set if using upstream StackHPC Ark Pulp server, but omitted if using local Pulp server (see `ansible/roles/pulp_site`) diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml index fe3c44e..112c5c7 100644 --- a/ansible/roles/dnf_repos/defaults/main.yml +++ b/ansible/roles/dnf_repos/defaults/main.yml @@ -1,3 +1,4 @@ +--- dnf_repos_repos: {} # see environments/common/inventory/group_vars/all/{dnf_repos,timestamps}.yml dnf_repos_pulp_content_url: "{{ appliances_pulp_url }}/pulp/content" dnf_repos_username: "{{ omit }}" diff --git a/ansible/roles/dnf_repos/tasks/disable_repos.yml b/ansible/roles/dnf_repos/tasks/disable_repos.yml index 4db073b..0339f5b 100644 --- a/ansible/roles/dnf_repos/tasks/disable_repos.yml +++ b/ansible/roles/dnf_repos/tasks/disable_repos.yml @@ -27,5 +27,5 @@ path: "{{ item.path }}" regexp: '^enabled\ ?=\ ?1' replace: 'enabled=0' - backup: yes + backup: true loop: "{{ _dnf_repo_files.files }}" diff --git a/ansible/roles/doca/defaults/main.yml b/ansible/roles/doca/defaults/main.yml index 8fb5e92..7f28ef8 100644 --- a/ansible/roles/doca/defaults/main.yml +++ b/ansible/roles/doca/defaults/main.yml @@ -1,3 +1,5 @@ -doca_version: '2.9.3' # 2.9 is LTS, last to support ConnectX-4, 3 years for bug fixes and CVE updates +--- + +doca_version: "2.9.3" # 2.9 is LTS, last to support ConnectX-4, 3 years for bug fixes and CVE updates doca_profile: doca-ofed doca_repo_url: "https://linux.mellanox.com/public/repo/doca/{{ doca_version }}/rhel{{ ansible_distribution_version }}/{{ ansible_architecture }}/" diff --git a/ansible/roles/doca/tasks/install-kernel-devel.yml b/ansible/roles/doca/tasks/install-kernel-devel.yml index 6a1943a..9968058 100644 --- a/ansible/roles/doca/tasks/install-kernel-devel.yml +++ b/ansible/roles/doca/tasks/install-kernel-devel.yml @@ -1,24 +1,27 @@ +--- - name: Get installed kernels - command: dnf list --installed kernel + ansible.builtin.command: dnf list --installed kernel register: _ofed_dnf_kernels changed_when: false - name: Determine running kernel - command: uname -r # e.g. 4.18.0-513.18.1.el8_9.x86_64 + ansible.builtin.command: uname -r register: _ofed_loaded_kernel changed_when: false - name: Check current kernel is newest installed - assert: + ansible.builtin.assert: that: _ofed_kernel_current == _ofed_dnf_kernels_newest fail_msg: "Kernel {{ _ofed_loaded_kernel.stdout }} is loaded but newer {{ _ofed_dnf_kernels_newest }} is installed: consider rebooting?" vars: + # yamllint disable rule:line-length _ofed_kernel_current: >- {{ _ofed_loaded_kernel.stdout | regex_replace('\.(?:.(?!\.))+$', '') | regex_replace('\.(?:.(?!\.))+$', '') }} _ofed_dnf_kernels_newest: >- {{ _ofed_dnf_kernels.stdout_lines[1:] | map('split') | map(attribute=1) | map('regex_replace', '\.(?:.(?!\.))+$', '') | community.general.version_sort | last }} - # dnf line format e.g. "kernel.x86_64 4.18.0-513.18.1.el8_9 @baseos " + # yamllint enable rule:line-length + # dnf line format e.g. 
"kernel.x86_64 4.18.0-513.18.1.el8_9 @baseos " - name: Install matching kernel-devel package - dnf: + ansible.builtin.dnf: name: "kernel-devel-{{ _ofed_loaded_kernel.stdout | trim }}" diff --git a/ansible/roles/doca/tasks/install.yml b/ansible/roles/doca/tasks/install.yml index d26fda7..e21218e 100644 --- a/ansible/roles/doca/tasks/install.yml +++ b/ansible/roles/doca/tasks/install.yml @@ -1,5 +1,5 @@ -- import_tasks: install-kernel-devel.yml - +--- +- ansible.builtin.import_tasks: install-kernel-devel.yml - name: Install DOCA repo ansible.builtin.yum_repository: name: doca @@ -13,21 +13,21 @@ ansible.builtin.dnf: name: doca-extra -- name: Build DOCA kernel modules - ansible.builtin.shell: +- name: Build DOCA kernel modules # noqa: no-changed-when + ansible.builtin.command: cmd: /opt/mellanox/doca/tools/doca-kernel-support register: _doca_kernel_build - - name: Find generated doca-kernel-repo - ansible.builtin.shell: 'find /tmp/DOCA.* -name doca-kernel-repo-*' + ansible.builtin.shell: "find /tmp/DOCA.* -name doca-kernel-repo-*" register: _doca_kernel_repo # e.g. /tmp/DOCA.WVMchs2QWo/doca-kernel-repo-24.10.1.1.4.0-1.kver.5.14.0.427.31.1.el9.4.x86.64.x86_64.rpm changed_when: false -- name: Create dnf cache +- name: Create dnf cache # noqa: no-changed-when ansible.builtin.command: dnf makecache - name: Install DOCA repository package + # checkov:skip=CKV2_ANSIBLE_4: "Ensure that packages with untrusted or missing GPG signatures are not used by dnf" ansible.builtin.dnf: name: "{{ _doca_kernel_repo.stdout }}" disable_gpg_check: true @@ -41,11 +41,11 @@ state: absent path: "{{ (_doca_kernel_repo.stdout | split('/'))[:3] | join('/') }}" # leading / means 1st element of split list is '' -- name: Update initramfs +- name: Update initramfs # noqa: no-changed-when ansible.builtin.command: cmd: dracut -f register: _doca_dracut failed_when: _doca_dracut.stderr != '' # appears rc is always 0 -- name: Load the new driver +- name: Load the new driver # noqa: no-changed-when ansible.builtin.command: /etc/init.d/openibd restart diff --git a/ansible/roles/doca/tasks/main.yml b/ansible/roles/doca/tasks/main.yml index e7a272f..df97825 100644 --- a/ansible/roles/doca/tasks/main.yml +++ b/ansible/roles/doca/tasks/main.yml @@ -1 +1,2 @@ -- include_tasks: install.yml +--- +- ansible.builtin.include_tasks: install.yml diff --git a/ansible/roles/eessi/README.md b/ansible/roles/eessi/README.md index d48e009..df9e835 100644 --- a/ansible/roles/eessi/README.md +++ b/ansible/roles/eessi/README.md @@ -1,26 +1,23 @@ -EESSI -===== +# EESSI Configure the EESSI pilot respository for use on given hosts. -Requirements ------------- +## Requirements None. -Role Variables --------------- +## Role Variables - `cvmfs_quota_limit_mb`: Optional int. Maximum size of local package cache on each node in MB. -- `cvmfs_config_overrides`: Optional dict. Set of key-value pairs for additional CernVM-FS settings see [official docs](https://cvmfs.readthedocs.io/en/stable/cpt-configure.html) for list of options. Each dict key should correspond to a valid config variable (e.g. `CVMFS_HTTP_PROXY`) and the corresponding dict value will be set as the variable value (e.g. `https://my-proxy.com`). These configuration parameters will be written to the `/etc/cvmfs/default.local` config file on each host in the form `KEY=VALUE`. +- `cvmfs_config_overrides`: Optional dict. Set of key-value pairs for additional CernVM-FS settings see [official docs](https://cvmfs.readthedocs.io/en/stable/cpt-configure.html) for list of options. 
+ Each dict key should correspond to a valid config variable (e.g. `CVMFS_HTTP_PROXY`) and the corresponding dict value will be set as the variable value (e.g. `https://my-proxy.com`). + These configuration parameters will be written to the `/etc/cvmfs/default.local` config file on each host in the form `KEY=VALUE`. -Dependencies ------------- +## Dependencies None. -Example Playbook ----------------- +## Example Playbook ```yaml - name: Setup EESSI diff --git a/ansible/roles/eessi/defaults/main.yaml b/ansible/roles/eessi/defaults/main.yaml index 60e61f1..581c24f 100644 --- a/ansible/roles/eessi/defaults/main.yaml +++ b/ansible/roles/eessi/defaults/main.yaml @@ -7,7 +7,6 @@ cvmfs_config_default: CVMFS_QUOTA_LIMIT: "{{ cvmfs_quota_limit_mb }}" cvmfs_config_overrides: {} - cvmfs_config: "{{ cvmfs_config_default | combine(cvmfs_config_overrides) }}" cvmfs_gpg_checksum: "sha256:4ac81adff957565277cfa6a4a330cdc2ce5a8fdd73b8760d1a5a32bef71c4bd6" diff --git a/ansible/roles/eessi/tasks/configure.yml b/ansible/roles/eessi/tasks/configure.yml index b308376..2c765d2 100644 --- a/ansible/roles/eessi/tasks/configure.yml +++ b/ansible/roles/eessi/tasks/configure.yml @@ -7,10 +7,11 @@ option: "{{ item.key }}" value: "{{ item.value }}" no_extra_spaces: true + mode: "0644" loop: "{{ cvmfs_config | dict2items }}" # NOTE: Not clear how to make this idempotent -- name: Ensure CVMFS config is setup - command: +- name: Ensure CVMFS config is setup # noqa: no-changed-when + ansible.builtin.command: cmd: "cvmfs_config setup" diff --git a/ansible/roles/eessi/tasks/install.yml b/ansible/roles/eessi/tasks/install.yml index a4adb0b..50b939c 100644 --- a/ansible/roles/eessi/tasks/install.yml +++ b/ansible/roles/eessi/tasks/install.yml @@ -1,34 +1,37 @@ --- - name: Download Cern GPG key + # checkov:skip=CKV2_ANSIBLE_2: "Ensure that HTTPS url is used with get_url" ansible.builtin.get_url: url: http://cvmrepo.web.cern.ch/cvmrepo/yum/RPM-GPG-KEY-CernVM dest: ./cvmfs-key.gpg checksum: "{{ cvmfs_gpg_checksum }}" + mode: "0644" -- name: Import downloaded GPG key - command: rpm --import cvmfs-key.gpg - +- name: Import downloaded GPG key # noqa: no-changed-when + ansible.builtin.command: rpm --import cvmfs-key.gpg # noqa: command-instead-of-module - name: Add CVMFS repo - dnf: + # checkov:skip=CKV2_ANSIBLE_4: "Ensure that packages with untrusted or missing GPG signatures are not used by dnf" + ansible.builtin.dnf: name: https://ecsft.cern.ch/dist/cvmfs/cvmfs-release/cvmfs-release-latest.noarch.rpm disable_gpg_check: true - name: Install CVMFS - dnf: + ansible.builtin.dnf: name: cvmfs - name: Install EESSI CVMFS config - dnf: + # checkov:skip=CKV2_ANSIBLE_4: "Ensure that packages with untrusted or missing GPG signatures are not used by dnf" + ansible.builtin.dnf: name: https://github.com/EESSI/filesystem-layer/releases/download/latest/cvmfs-config-eessi-latest.noarch.rpm # NOTE: Can't find any docs on obtaining gpg key - maybe downloading directly from github is ok? 
disable_gpg_check: true # Alternative version using official repo - still no GPG key :( # - name: Add EESSI repo -# dnf: +# ansible.builtin.dnf: # name: http://repo.eessi-infra.org/eessi/rhel/8/noarch/eessi-release-0-1.noarch.rpm # - name: Install EESSI CVMFS config -# dnf: +# ansible.builtin.dnf: # name: cvmfs-config-eessi diff --git a/ansible/roles/eessi/tasks/main.yml b/ansible/roles/eessi/tasks/main.yml index 79d326c..e5e0787 100644 --- a/ansible/roles/eessi/tasks/main.yml +++ b/ansible/roles/eessi/tasks/main.yml @@ -1,4 +1,4 @@ --- -- include_tasks: install.yml -- include_tasks: configure.yml +- ansible.builtin.include_tasks: install.yml +- ansible.builtin.include_tasks: configure.yml diff --git a/ansible/roles/etc_hosts/README.md b/ansible/roles/etc_hosts/README.md index 0ad9568..8c1c422 100644 --- a/ansible/roles/etc_hosts/README.md +++ b/ansible/roles/etc_hosts/README.md @@ -3,11 +3,12 @@ Hosts in the `etc_hosts` groups have `/etc/hosts` created with entries of the format `IP_address canonical_hostname [alias]`. By default, an entry is created for each host in this group as follows: + - The value of `ansible_host` is used as the IP_address. - If `node_fqdn` is defined then that is used as the canonical hostname and `inventory_hostname` as an alias. Otherwise `inventory_hostname` is used as the canonical hostname. -This may need overriding for multi-homed hosts or hosts with multiple aliases. + This may need overriding for multi-homed hosts or hosts with multiple aliases. -# Variables +## Variables - `etc_hosts_template`: Template file to use. Default is the in-role template. - `etc_hosts_hostvars`: A list of variable names, used (in the order supplied) to create the entry for each host. Default is described above. diff --git a/ansible/roles/etc_hosts/defaults/main.yml b/ansible/roles/etc_hosts/defaults/main.yml index c2ecbca..bf7dbe5 100644 --- a/ansible/roles/etc_hosts/defaults/main.yml +++ b/ansible/roles/etc_hosts/defaults/main.yml @@ -1,3 +1,4 @@ +--- etc_hosts_template: hosts.j2 etc_hosts_hostvars: "{{ ['ansible_host'] + (['node_fqdn'] if node_fqdn is defined else []) + ['inventory_hostname'] }}" -etc_hosts_extra_hosts: '' +etc_hosts_extra_hosts: "" diff --git a/ansible/roles/etc_hosts/tasks/main.yml b/ansible/roles/etc_hosts/tasks/main.yml index 6fdabf5..452b58f 100644 --- a/ansible/roles/etc_hosts/tasks/main.yml +++ b/ansible/roles/etc_hosts/tasks/main.yml @@ -1,8 +1,9 @@ +--- - name: Template out /etc/hosts - template: + ansible.builtin.template: src: "{{ etc_hosts_template }}" dest: /etc/hosts owner: root group: root - mode: 0644 - become: yes + mode: "0644" + become: true diff --git a/ansible/roles/fail2ban/README.md b/ansible/roles/fail2ban/README.md index 0e744fd..dec727e 100644 --- a/ansible/roles/fail2ban/README.md +++ b/ansible/roles/fail2ban/README.md @@ -1,27 +1,23 @@ -fail2ban -========= +# fail2ban Setup fail2ban to protect SSH on a host. Note that no email alerts are set up so logs (at `/var/log/fail2ban.log`) will have to be manually reviewed if required. -Requirements ------------- +## Requirements - An EL8 system. - `firewalld` running. -Role Variables --------------- +## Role Variables + None. -Dependencies ------------- +## Dependencies None. 
-Example Playbook ----------------- +## Example Playbook ```yaml - hosts: fail2ban @@ -34,12 +30,10 @@ Example Playbook name: fail2ban ``` -License -------- +## License Apache v2 -Author Information ------------------- +## Author Information stackhpc.com diff --git a/ansible/roles/fail2ban/handlers/main.yml b/ansible/roles/fail2ban/handlers/main.yml index d578c29..9db9b01 100644 --- a/ansible/roles/fail2ban/handlers/main.yml +++ b/ansible/roles/fail2ban/handlers/main.yml @@ -1,7 +1,6 @@ --- - - name: Restart fail2ban - service: + ansible.builtin.service: name: fail2ban state: restarted enabled: true diff --git a/ansible/roles/fail2ban/meta/main.yml b/ansible/roles/fail2ban/meta/main.yml index 02d6a2f..1005726 100644 --- a/ansible/roles/fail2ban/meta/main.yml +++ b/ansible/roles/fail2ban/meta/main.yml @@ -1,6 +1,8 @@ +--- galaxy_info: author: Steve Brasier company: stackhpc + description: Setup fail2ban to protect SSH on a host # If the issue tracker for your role is not on github, uncomment the # next line and provide a value @@ -15,7 +17,7 @@ galaxy_info: # - CC-BY-4.0 license: Apache-2.0 - min_ansible_version: 2.1 + min_ansible_version: "2.1" # If this a Container Enabled role, provide the minimum Ansible Container version. # min_ansible_container_version: @@ -27,9 +29,9 @@ galaxy_info: # https://galaxy.ansible.com/api/v1/platforms/ # platforms: - - name: EL - versions: - - 8 + - name: EL + versions: + - "8" galaxy_tags: [] # List tags for your role here, one per line. A tag is a keyword that describes diff --git a/ansible/roles/fail2ban/tasks/configure.yml b/ansible/roles/fail2ban/tasks/configure.yml index e4951f7..6bde88a 100644 --- a/ansible/roles/fail2ban/tasks/configure.yml +++ b/ansible/roles/fail2ban/tasks/configure.yml @@ -1,15 +1,16 @@ --- - name: Create config - template: + ansible.builtin.template: dest: /etc/fail2ban/jail.local src: jail.local.j2 + mode: "0644" notify: Restart fail2ban -- name: flush handlers - meta: flush_handlers +- name: Flush handlers + ansible.builtin.meta: flush_handlers - name: Ensure fail2ban running even if no config change - service: + ansible.builtin.service: name: fail2ban state: started enabled: true diff --git a/ansible/roles/fail2ban/tasks/install.yml b/ansible/roles/fail2ban/tasks/install.yml index 65f3bfe..e745a4f 100644 --- a/ansible/roles/fail2ban/tasks/install.yml +++ b/ansible/roles/fail2ban/tasks/install.yml @@ -1,10 +1,10 @@ --- - name: Install EPEL repo - package: + ansible.builtin.package: name: epel-release - name: Install fail2ban packages - package: + ansible.builtin.package: name: - fail2ban-server - fail2ban-firewalld diff --git a/ansible/roles/fail2ban/tasks/main.yml b/ansible/roles/fail2ban/tasks/main.yml index 410e943..8a0a795 100644 --- a/ansible/roles/fail2ban/tasks/main.yml +++ b/ansible/roles/fail2ban/tasks/main.yml @@ -1,4 +1,4 @@ --- -- import_tasks: install.yml -- import_tasks: configure.yml +- ansible.builtin.import_tasks: install.yml +- ansible.builtin.import_tasks: configure.yml diff --git a/ansible/roles/filebeat/defaults/main.yml b/ansible/roles/filebeat/defaults/main.yml index bdd02a2..1701427 100644 --- a/ansible/roles/filebeat/defaults/main.yml +++ b/ansible/roles/filebeat/defaults/main.yml @@ -1,6 +1,6 @@ --- -#filebeat_config_path: undefined # REQUIRED. Path to filebeat.yml configuration file template +# filebeat_config_path: undefined # REQUIRED. 
Path to filebeat.yml configuration file template filebeat_debug: false # Note all the below can only be set/changed using the install.yml task file: diff --git a/ansible/roles/filebeat/handlers/main.yml b/ansible/roles/filebeat/handlers/main.yml index 77b9363..8fa3862 100644 --- a/ansible/roles/filebeat/handlers/main.yml +++ b/ansible/roles/filebeat/handlers/main.yml @@ -1,9 +1,8 @@ --- - - name: Restart filebeat container - systemd: + ansible.builtin.systemd: name: filebeat.service state: restarted - enabled: yes - daemon_reload: yes + enabled: true + daemon_reload: true become: true diff --git a/ansible/roles/filebeat/tasks/install.yml b/ansible/roles/filebeat/tasks/install.yml index 6514e30..74c3b09 100644 --- a/ansible/roles/filebeat/tasks/install.yml +++ b/ansible/roles/filebeat/tasks/install.yml @@ -1,8 +1,9 @@ --- - name: Create systemd unit file - template: + ansible.builtin.template: dest: /etc/systemd/system/filebeat.service src: filebeat.service.j2 + mode: "0644" become: true register: _filebeat_unit @@ -10,9 +11,10 @@ containers.podman.podman_image: name: "docker.elastic.co/beats/filebeat-oss" tag: "{{ filebeat_version }}" + become: true become_user: "{{ filebeat_podman_user }}" -- name: Reload filebeat unit file - command: systemctl daemon-reload - when: _filebeat_unit.changed +- name: Reload filebeat unit file # noqa: no-changed-when + ansible.builtin.command: systemctl daemon-reload # noqa: command-instead-of-module + when: _filebeat_unit.changed # noqa: no-handler become: true diff --git a/ansible/roles/filebeat/tasks/main.yml b/ansible/roles/filebeat/tasks/main.yml index 849683c..7a1e329 100644 --- a/ansible/roles/filebeat/tasks/main.yml +++ b/ansible/roles/filebeat/tasks/main.yml @@ -1,2 +1,3 @@ -- import_tasks: install.yml -- import_tasks: runtime.yml +--- +- ansible.builtin.import_tasks: install.yml +- ansible.builtin.import_tasks: runtime.yml diff --git a/ansible/roles/filebeat/tasks/runtime.yml b/ansible/roles/filebeat/tasks/runtime.yml index 1197450..cc2bd91 100644 --- a/ansible/roles/filebeat/tasks/runtime.yml +++ b/ansible/roles/filebeat/tasks/runtime.yml @@ -1,38 +1,36 @@ --- - - name: Collect usernamespace facts user_namespace_facts: - name: Set facts containing sub-ids - set_fact: + ansible.builtin.set_fact: # filebeat user is 1000 filebeat_host_user_id: "{{ ansible_facts.subuid[filebeat_podman_user]['start'] + 1000 - 1 }}" filebeat_host_group_id: "{{ ansible_facts.subgid[filebeat_podman_user]['start'] + 1000 - 1 }}" - name: Ensure parent directory exists - file: + ansible.builtin.file: state: directory path: "/etc/filebeat" owner: "{{ filebeat_host_user_id }}" group: "{{ filebeat_host_group_id }}" - mode: 0770 + mode: "0770" become: true - name: Template configuration files - template: - src: "{{ filebeat_config_path }}" - dest: /etc/filebeat/filebeat.yml - owner: "{{ filebeat_host_user_id }}" - group: "{{ filebeat_host_group_id }}" - mode: 0600 + ansible.builtin.template: + src: "{{ filebeat_config_path }}" + dest: /etc/filebeat/filebeat.yml + owner: "{{ filebeat_host_user_id }}" + group: "{{ filebeat_host_group_id }}" + mode: "0600" notify: Restart filebeat container become: true - name: Flush handlers - meta: flush_handlers - + ansible.builtin.meta: flush_handlers - name: Ensure filebeat service state - systemd: + ansible.builtin.systemd: name: filebeat.service state: started enabled: true diff --git a/ansible/roles/filebeat/tasks/validate.yml b/ansible/roles/filebeat/tasks/validate.yml index b493620..0787938 100644 --- 
a/ansible/roles/filebeat/tasks/validate.yml +++ b/ansible/roles/filebeat/tasks/validate.yml @@ -1,5 +1,5 @@ --- - name: Assert that filebeat_config_path is defined - assert: - that: filebeat_config_path is defined \ No newline at end of file + ansible.builtin.assert: + that: filebeat_config_path is defined diff --git a/ansible/roles/firewalld/README.md b/ansible/roles/firewalld/README.md index 2d75b6b..280e828 100644 --- a/ansible/roles/firewalld/README.md +++ b/ansible/roles/firewalld/README.md @@ -1,48 +1,44 @@ -Role Name -========= +# Role Name Install and configure the `firewalld` firewall. -Requirements ------------- +## Requirements EL8 host -Role Variables --------------- +## Role Variables - `firewalld_enabled`: Optional. Whether `firewalld` service is enabled (starts at boot). Default `yes`. - `firewalld_state`: Optional. State of `firewalld` service. Default `started`. Other values: `stopped`. - `firewalld_configs`: Optional. List of dicts giving parameters for [ansible.posix.firewalld module](https://docs.ansible.com/ansible/latest/collections/ansible/posix/firewalld_module.html). Default is an empty list. Note that the default configuration for firewalld on Rocky Linux 8.5 is as follows: + ```shell # firewall-offline-cmd --list-all public target: default icmp-block-inversion: no - interfaces: - sources: + interfaces: + sources: services: cockpit dhcpv6-client ssh - ports: - protocols: + ports: + protocols: forward: no masquerade: no - forward-ports: - source-ports: - icmp-blocks: - rich rules: + forward-ports: + source-ports: + icmp-blocks: + rich rules: ``` -Dependencies ------------- +## Dependencies None. -Example Playbook ----------------- +## Example Playbook -``` +```yaml - hosts: firewalld gather_facts: false become: yes @@ -52,12 +48,10 @@ Example Playbook name: firewalld ``` -License -------- +## License BSD -Author Information ------------------- +## Author Information -An optional section for the role authors to include contact information, or a website (HTML is not allowed). +An optional section for the role authors to include contact information, or a site (HTML is not allowed). 
diff --git a/ansible/roles/firewalld/defaults/main.yml b/ansible/roles/firewalld/defaults/main.yml index d2bdac7..2720037 100644 --- a/ansible/roles/firewalld/defaults/main.yml +++ b/ansible/roles/firewalld/defaults/main.yml @@ -1,3 +1,4 @@ -firewalld_enabled: yes +--- +firewalld_enabled: true firewalld_state: started firewalld_configs: [] diff --git a/ansible/roles/firewalld/handlers/main.yml b/ansible/roles/firewalld/handlers/main.yml index c7a008a..0e8c3df 100644 --- a/ansible/roles/firewalld/handlers/main.yml +++ b/ansible/roles/firewalld/handlers/main.yml @@ -1,6 +1,6 @@ --- - name: Restart filewalld - service: + ansible.builtin.service: name: firewalld state: restarted when: firewalld_state != 'stopped' diff --git a/ansible/roles/firewalld/meta/main.yml b/ansible/roles/firewalld/meta/main.yml index c572acc..7e1dddb 100644 --- a/ansible/roles/firewalld/meta/main.yml +++ b/ansible/roles/firewalld/meta/main.yml @@ -1,7 +1,8 @@ +--- galaxy_info: - author: your name - description: your role description - company: your company (optional) + author: StackHPC Ltd + description: Install and configure the `firewalld` firewall + company: StackHPC Ltd # If the issue tracker for your role is not on github, uncomment the # next line and provide a value @@ -14,9 +15,9 @@ galaxy_info: # - GPL-3.0-only # - Apache-2.0 # - CC-BY-4.0 - license: license (GPL-2.0-or-later, MIT, etc) + license: (GPL-2.0-or-later, MIT, etc) - min_ansible_version: 2.1 + min_ansible_version: "2.1" # If this a Container Enabled role, provide the minimum Ansible Container version. # min_ansible_container_version: diff --git a/ansible/roles/firewalld/tasks/install.yml b/ansible/roles/firewalld/tasks/install.yml index 1709cfb..c30c064 100644 --- a/ansible/roles/firewalld/tasks/install.yml +++ b/ansible/roles/firewalld/tasks/install.yml @@ -1,3 +1,4 @@ +--- - name: Install firewalld package - dnf: + ansible.builtin.dnf: name: firewalld diff --git a/ansible/roles/firewalld/tasks/main.yml b/ansible/roles/firewalld/tasks/main.yml index 98a7aa7..7a1e329 100644 --- a/ansible/roles/firewalld/tasks/main.yml +++ b/ansible/roles/firewalld/tasks/main.yml @@ -1,3 +1,3 @@ --- -- import_tasks: install.yml -- import_tasks: runtime.yml +- ansible.builtin.import_tasks: install.yml +- ansible.builtin.import_tasks: runtime.yml diff --git a/ansible/roles/firewalld/tasks/runtime.yml b/ansible/roles/firewalld/tasks/runtime.yml index 2c9ab59..03a5356 100644 --- a/ansible/roles/firewalld/tasks/runtime.yml +++ b/ansible/roles/firewalld/tasks/runtime.yml @@ -1,10 +1,10 @@ -- name: Apply filewalld configs +--- +- name: Apply filewalld configs # noqa: args[module] ansible.posix.firewalld: "{{ item }}" notify: Restart filewalld loop: "{{ firewalld_configs }}" -- meta: flush_handlers - +- ansible.builtin.meta: flush_handlers - name: Ensure filewalld state ansible.builtin.systemd: name: firewalld diff --git a/ansible/roles/freeipa/README.md b/ansible/roles/freeipa/README.md index 0fd9c36..4bcf2f6 100644 --- a/ansible/roles/freeipa/README.md +++ b/ansible/roles/freeipa/README.md @@ -1,15 +1,15 @@ - # freeipa Support FreeIPA in the appliance. In production use it is expected the FreeIPA server(s) will be external to the cluster, implying that hosts and users are managed outside the appliance. However for testing and development the role can also deploy an "in-appliance" FreeIPA server, add hosts to it and manage users in FreeIPA. 
-# FreeIPA Client +## FreeIPA Client + +### FreeIPA Client Usage -## Usage - Add hosts to the `freeipa_client` group and run (at a minimum) the `ansible/iam.yml` playbook. -- Host names must match the domain name. By default (using the site OpenTofu) hostnames are of the form `nodename.cluster_name.cluster_domain_suffix` where `cluster_name` and `cluster_domain_suffix` are OpenTofu variables. +- Hostnames must match the domain name. By default (using the site OpenTofu) hostnames are of the form `nodename.cluster_name.cluster_domain_suffix` where `cluster_name` and `cluster_domain_suffix` are OpenTofu variables. - Hosts discover the FreeIPA server FQDN (and their own domain) from DNS records. If DNS servers are not set this is not set from DHCP, then use the `resolv_conf` role to configure this. For example when using the in-appliance FreeIPA development server: - + ```ini # environments//groups ... @@ -21,19 +21,20 @@ Support FreeIPA in the appliance. In production use it is expected the FreeIPA s ```yaml # environments//inventory/group_vars/all/resolv_conf.yml resolv_conf_nameservers: - - "{{ hostvars[groups['freeipa_server'] | first].ansible_host }}" + - "{{ hostvars[groups['freeipa_server'] | first].ansible_host }}" ``` - -- For production use with an external FreeIPA server, a random one-time password (OTP) must be generated when adding hosts to FreeIPA (e.g. using `ipa host-add --random ...`). This password should be set as a hostvar `freeipa_host_password`. Initial host enrolment will use this OTP to enrol the host. After this it becomes irrelevant so it does not need to be committed to git. This approach means the appliance does not require the FreeIPA administrator password. +- For production use with an external FreeIPA server, a random one-time password (OTP) must be generated when adding hosts to FreeIPA (e.g. using `ipa host-add --random ...`). + This password should be set as a hostvar `freeipa_host_password`. + Initial host enrolment will use this OTP to enrol the host. After this it becomes irrelevant so it does not need to be committed to Git. + This approach means the appliance does not require the FreeIPA administrator password. - For development use with the in-appliance FreeIPA server, `freeipa_host_password` will be automatically generated in memory. - The `control` host must define `appliances_state_dir` (on persistent storage). This is used to back-up keytabs to allow FreeIPA clients to automatically re-enrol after e.g. reimaging. Note that: - This is implemented when using the site OpenTofu; on the control node `appliances_state_dir` defaults to `/var/lib/state` which is mounted from a volume. - Nodes are not re-enroled by a [Slurm-driven reimage](../../collections/ansible_collections/stackhpc/slurm_openstack_tools/roles/rebuild/README.md) (as that does not run this role). - If both a backed-up keytab and `freeipa_host_password` exist, the former is used. - -## Role Variables for Clients +### Role Variables for Clients - `freeipa_host_password`. Required for initial enrolment only, FreeIPA host password as described above. - `freeipa_setup_dns`: Optional, whether to use the FreeIPA server as the client's nameserver. Defaults to `true` when `freeipa_server` contains a host, otherwise `false`. @@ -41,10 +42,12 @@ Support FreeIPA in the appliance. In production use it is expected the FreeIPA s See also use of `appliances_state_dir` on the control node as described above. 
-# FreeIPA Server +## FreeIPA Server + As noted above this is only intended for development and testing. Note it cannot be run on the `openondemand` node as no other virtual servers must be defined in the Apache configuration. -## Usage +### FreeIPA Server Usage + - Add a single host to the `freeipa_server` group and run (at a minimum) the `ansible/bootstrap.yml` and `ansible/iam.yml` playbooks. - As well as configuring the FreeIPA server, the role will also: - Add ansible hosts in the group `freeipa_client` as FreeIPA hosts. @@ -52,7 +55,7 @@ As noted above this is only intended for development and testing. Note it cannot The FreeIPA GUI will be available on `https:///ipa/ui`. -## Role Variables for Server +### Role Variables for Server These role variables are only required when using `freeipa_server`: @@ -60,10 +63,10 @@ These role variables are only required when using `freeipa_server`: - `freeipa_domain`: Optional, name of domain. Default is lowercased `freeipa_realm`. - `freeipa_ds_password`: Optional, password to be used by the Directory Server for the Directory Manager user (`ipa-server-install --ds-password`). Default is generated in `environments//inventory/group_vars/all/secrets.yml` - `freeipa_admin_password`: Optional, password for the IPA `admin` user. Default is generated as for `freeipa_ds_password`. -- `freeipa_server_ip`: Optional, IP address of freeipa_server host. Default is `ansible_host` of the `freeipa_server` host. Default `false`. +- `freeipa_server_ip`: Optional, IP address of freeipa_server host. Default is `ansible_host` of the `freeipa_server` host. Default `false`. - `freeipa_setup_dns`: Optional bool, whether to configure the FreeIPA server as an integrated DNS server and define a zone and records. NB: This also controls whether `freeipa_client` hosts use the `freeipa_server` host for name resolution. Default `true` when `freeipa_server` contains a host. - `freeipa_client_ip`: Optional, IP address of FreeIPA client. Default is `ansible_host`. - `freeipa_users`: A list of dicts defining users to add, with keys/values as for [community.general.ipa_user](https://docs.ansible.com/ansible/latest/collections/community/general/ipa_user_module.html): Note that: - `name`, `givenname` (firstname) and `sn` (surname) are required. - `ipa_host`, `ipa_port`, `ipa_prot`, `ipa_user`, `validate_certs` are automatically provided and cannot be overridden. - - If `password` is set, the value should *not* be a hash (unlike `ansible.builtin.user` as used by the `basic_users` role), and it must be changed on first login. `krbpasswordexpiration` does not appear to be able to override this. + - If `password` is set, the value should _not_ be a hash (unlike `ansible.builtin.user` as used by the `basic_users` role), and it must be changed on first login. `krbpasswordexpiration` does not appear to be able to override this. diff --git a/ansible/roles/freeipa/defaults/main.yml b/ansible/roles/freeipa/defaults/main.yml index f3482a4..364c0dc 100644 --- a/ansible/roles/freeipa/defaults/main.yml +++ b/ansible/roles/freeipa/defaults/main.yml @@ -1,8 +1,9 @@ -#freeipa_realm: +--- +# freeipa_realm: freeipa_domain: "{{ freeipa_realm | lower }}" -#freeipa_ds_password: -#freeipa_admin_password: -#freeipa_server_ip: +# freeipa_ds_password: +# freeipa_admin_password: +# freeipa_server_ip: freeipa_setup_dns: "{{ groups['freeipa_server'] | length > 0 }}" freeipa_client_ip: "{{ ansible_host }}" # when run on freeipa_client group! 
# freeipa_host_password: diff --git a/ansible/roles/freeipa/tasks/addhost.yml b/ansible/roles/freeipa/tasks/addhost.yml index 8020f80..f01cba0 100644 --- a/ansible/roles/freeipa/tasks/addhost.yml +++ b/ansible/roles/freeipa/tasks/addhost.yml @@ -1,3 +1,4 @@ +--- - name: Get ipa host information # This uses DNS to find the ipa server, which works as this is running on the enrolled ipa server # It doesn't fail even if the host doesn't exist @@ -10,7 +11,7 @@ validate_certs: false delegate_to: "{{ groups['freeipa_server'].0 }}" register: _ipa_host_check - check_mode: yes + check_mode: true changed_when: false - name: Add host to IPA @@ -29,6 +30,6 @@ register: _ipa_host_add - name: Set fact for ipa host password - set_fact: + ansible.builtin.set_fact: freeipa_host_password: "{{ _ipa_host_add.host.randompassword }}" - when: _ipa_host_add.changed + when: _ipa_host_add.changed # noqa: no-handler diff --git a/ansible/roles/freeipa/tasks/backup-keytabs.yml b/ansible/roles/freeipa/tasks/backup-keytabs.yml index 7fc77f9..1de3f7f 100644 --- a/ansible/roles/freeipa/tasks/backup-keytabs.yml +++ b/ansible/roles/freeipa/tasks/backup-keytabs.yml @@ -1,5 +1,6 @@ +--- - name: Retrieve keytabs to localhost - fetch: + ansible.builtin.fetch: src: "{{ _freeipa_keytab_backup_path }}" dest: "{{ appliances_environment_root }}/keytabs/{{ inventory_hostname }}/" flat: true @@ -7,8 +8,9 @@ tags: retrieve - name: Copy keytabs back to control node - copy: + ansible.builtin.copy: src: "{{ appliances_environment_root }}/keytabs/{{ inventory_hostname }}/" dest: "{{ _freeipa_keytab_backup_path | dirname }}" + mode: "0644" delegate_to: "{{ groups['control'].0 }}" tags: deploy diff --git a/ansible/roles/freeipa/tasks/client-install.yml b/ansible/roles/freeipa/tasks/client-install.yml index a164cd2..82f7901 100644 --- a/ansible/roles/freeipa/tasks/client-install.yml +++ b/ansible/roles/freeipa/tasks/client-install.yml @@ -1,4 +1,4 @@ - +--- - name: Install FreeIPA client package - dnf: + ansible.builtin.dnf: name: ipa-client diff --git a/ansible/roles/freeipa/tasks/enrol.yml b/ansible/roles/freeipa/tasks/enrol.yml index 9848f04..19e0ee2 100644 --- a/ansible/roles/freeipa/tasks/enrol.yml +++ b/ansible/roles/freeipa/tasks/enrol.yml @@ -1,14 +1,16 @@ +--- +# yamllint disable-line rule:line-length # based on https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/8/html/installing_identity_management/assembly_installing-an-idm-client_installing-identity-management - name: Retrieve persisted keytab from previous enrolement - slurp: + ansible.builtin.slurp: src: "{{ _freeipa_keytab_backup_path }}" delegate_to: "{{ groups['control'] | first }}" register: _slurp_persisted_keytab failed_when: false - name: Write persisted keytab from previous enrolment - copy: + ansible.builtin.copy: content: "{{ _slurp_persisted_keytab.content | b64decode }}" dest: /tmp/krb5.keytab owner: root @@ -33,7 +35,7 @@ # 3. New SSH keys are generated # 4. ipaUniqueID is preserved # and ALSO that the keytab is changed! - command: + ansible.builtin.command: cmd: > ipa-client-install --unattended @@ -52,7 +54,7 @@ - name: Enrol with FreeIPA using random password # Note --password is overloaded - it's bulkpassword unless --principal or --force-join is used in which case it's admin password - command: + ansible.builtin.command: cmd: > ipa-client-install --unattended @@ -75,19 +77,19 @@ # This service is installed by nfs-utils, which attempts to start it. # It has ConditionPathExists=/etc/krb5.keytab which fails if host is not enroled. 
# This task avoids a reboot. - systemd: + ansible.builtin.systemd: name: rpc-gssd.service state: started enabled: true - name: Retrieve current keytab - slurp: + ansible.builtin.slurp: src: /etc/krb5.keytab register: _slurp_current_keytab failed_when: false - name: Ensure keytab backup directory exists - file: + ansible.builtin.file: path: "{{ _freeipa_keytab_backup_path | dirname }}" state: directory owner: root @@ -96,7 +98,8 @@ delegate_to: "{{ groups['control'] | first }}" - name: Persist keytab - copy: + ansible.builtin.copy: content: "{{ _slurp_current_keytab.content | b64decode }}" dest: "{{ _freeipa_keytab_backup_path }}" + mode: "0644" delegate_to: "{{ groups['control'] | first }}" diff --git a/ansible/roles/freeipa/tasks/server.yml b/ansible/roles/freeipa/tasks/server.yml index e555ebe..b711998 100644 --- a/ansible/roles/freeipa/tasks/server.yml +++ b/ansible/roles/freeipa/tasks/server.yml @@ -1,20 +1,22 @@ +--- +# yamllint disable-line rule:line-length # Based on https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/8/html/installing_identity_management/preparing-the-system-for-ipa-server-installation_installing-identity-management#host-name-and-dns-requirements-for-ipa_preparing-the-system-for-ipa-server-installation - name: Install freeipa server packages - dnf: - name: '@idm:DL1/dns' + ansible.builtin.dnf: + name: "@idm:DL1/dns" state: present - name: Install ipa server -# TODO: make no-ui-redirect and dns configurable?? -# TODO: set file mask as per docs? Would be hard to cope with failures. Doesn't appear to be necessary actually. - command: + # TODO: make no-ui-redirect and dns configurable?? + # TODO: set file mask as per docs? Would be hard to cope with failures. Doesn't appear to be necessary actually. + ansible.builtin.command: cmd: > ipa-server-install --realm {{ freeipa_realm | quote }} --domain {{ freeipa_domain | lower | quote }} --ds-password {{ freeipa_ds_password | quote }} - --admin-password {{ freeipa_admin_password | quote }} + --admin-password {{ freeipa_admin_password | quote }} --ip-address={{ freeipa_server_ip }} {% if freeipa_setup_dns | bool %}--setup-dns{% endif %} --auto-reverse @@ -32,26 +34,26 @@ - name: Disable redirects to hard-coded domain # see https://pagure.io/freeipa/issue/7479 - replace: + ansible.builtin.replace: path: /etc/httpd/conf.d/ipa-rewrite.conf - regexp: '{{ item.regexp }}' - replace: '{{ item.replace }}' + regexp: "{{ item.regexp }}" + replace: "{{ item.replace }}" loop: # RewriteRule ^/$ https://${FQDN}/ipa/ui [L,NC,R=301] - irrelevant if using --no-ui-redirect - - regexp: '^(RewriteRule \^/\$) (https://.*)(/ipa/ui.*)$' - replace: '\1 \3' + - regexp: "^(RewriteRule \\^/\\$) (https://.*)(/ipa/ui.*)$" + replace: "\\1 \\3" # RewriteRule ^/ipa/(.*) - occurs twice - - regexp: '^(RewriteRule \^\/ipa\/\(.*)$' - replace: '#\1' - - regexp: '^(RewriteCond .*)$' - replace: '#\1' + - regexp: "^(RewriteRule \\^\\/ipa\\/\\(.*)$" + replace: "#\\1" + - regexp: "^(RewriteCond .*)$" + replace: "#\\1" # RewriteRule ^/(.*) https://${FQDN}/$1 [L,R=301] - - regexp: '^(RewriteRule \^/\(\.\*\).*)$' - replace: '#\1' + - regexp: "^(RewriteRule \\^/\\(\\.\\*\\).*)$" + replace: "#\\1" register: _replace_freeipa_rewrites - name: Get freeipa server facts - setup: + ansible.builtin.setup: - name: Fix HTTP_REFERER ansible.builtin.lineinfile: @@ -60,7 +62,7 @@ register: _http_referer - name: Reload apache configuration - service: + ansible.builtin.service: name: httpd state: reloaded when: _replace_freeipa_rewrites.changed or 
_http_referer.changed diff --git a/ansible/roles/freeipa/tasks/users.yml b/ansible/roles/freeipa/tasks/users.yml index bd1caca..97068fa 100644 --- a/ansible/roles/freeipa/tasks/users.yml +++ b/ansible/roles/freeipa/tasks/users.yml @@ -4,12 +4,12 @@ displayname: "{{ item.displayname | default(omit) }}" gidnumber: "{{ item.gidnumber | default(omit) }}" givenname: "{{ item.givenname }}" - #ipa_host + # ipa_host ipa_pass: "{{ freeipa_admin_password | quote }}" - #ipa_port - #ipa_prot + # ipa_port + # ipa_prot ipa_timeout: "{{ item.ipa_timeout | default(omit) }}" - #ipa_user + # ipa_user krbpasswordexpiration: "{{ item.krbpasswordexpiration | default(omit) }}" loginshell: "{{ item.loginshell | default(omit) }}" mail: "{{ item.mail | default(omit) }}" @@ -23,5 +23,5 @@ uidnumber: "{{ item.uidnumber | default(omit) }}" update_password: "{{ item.update_password | default(omit) }}" userauthtype: "{{ item.userauthtype | default(omit) }}" - #validate_certs + # validate_certs loop: "{{ freeipa_users }}" diff --git a/ansible/roles/freeipa/tasks/validate.yml b/ansible/roles/freeipa/tasks/validate.yml index 238f89e..39faba3 100644 --- a/ansible/roles/freeipa/tasks/validate.yml +++ b/ansible/roles/freeipa/tasks/validate.yml @@ -1,12 +1,13 @@ +--- - name: Get hostname as reported by command - command: hostname + ansible.builtin.command: hostname register: _freeipa_validate_hostname changed_when: false when: "'freeipa_server' in group_names" - name: Ensure hostname is fully-qualified # see section 2.7 of redhat guide to installing identity management - assert: + ansible.builtin.assert: that: _freeipa_validate_hostname.stdout | split('.') | length >= 3 fail_msg: "freeipa_server hostname '{{ _freeipa_validate_hostname.stdout }}' is not fully-qualified (a.b.c)" when: "'freeipa_server' in group_names" @@ -14,23 +15,23 @@ - name: Check for virtual servers in httpd configuration of freeipa_server # e.g. 
fatimage with OOD config; community.general.ipa_host fails with "401 Unauthorized: No session cookie found" # https://lists.fedoraproject.org/archives/list/freeipa-users@lists.fedorahosted.org/message/7RH7XDFR35KDPYJ7AQCQI2H2EOWIZCWA/ - find: + ansible.builtin.find: path: /etc/httpd/conf.d/ - contains: '- {{ @@ -25,24 +26,24 @@ # batch takes default '' because last devices doesn't have trailing blank line - name: Examine whether device address contains gateway_ip - set_fact: + ansible.builtin.set_fact: device_is_gateway_device: "{{ nmcli_devices | map(attribute='ip4_address') | map('ansible.utils.network_in_network', gateway_ip) }}" # list of bools - false if gateway_ip == '' - name: Get name of connection containing gateway_ip # might be empty string - set_fact: + ansible.builtin.set_fact: gateway_ip_connection: >- {{ nmcli_devices | map(attribute='connection') | - zip(device_is_gateway_device) | selectattr('1') | + zip(device_is_gateway_device) | selectattr('1') | map(attribute=0) | list | first | default ('') }} - name: Show debug info - debug: + ansible.builtin.debug: msg: "gateway_ip={{ gateway_ip }} access_ip={{ access_ip }} gateway_ip_connection={{ gateway_ip_connection }}" - name: Error if device has a gateway which is not the desired one - assert: + ansible.builtin.assert: that: item.gateway == gateway_ip fail_msg: "Device {{ item | to_nice_json }} has gateway: cannot apply gateway {{ gateway_ip }}" when: @@ -51,8 +52,8 @@ - item.ip4_gateway != gateway_ip loop: "{{ nmcli_devices }}" - - name: Remove undesired gateways - shell: | + - name: Remove undesired gateways # noqa: no-changed-when + ansible.builtin.shell: | nmcli connection modify '{{ item.connection }}' \ ipv4.never-default yes \ ipv6.never-default yes @@ -62,9 +63,9 @@ - item.ip4_gateway != '' - item.connection != gateway_ip_connection loop: "{{ nmcli_devices }}" - - - name: Add desired gateways - shell: | + + - name: Add desired gateways # noqa: no-changed-when + ansible.builtin.shell: | nmcli connection modify '{{ item.connection }}' \ ipv4.address {{ item.ip4_address }} \ ipv4.gateway {{ gateway_ip }} diff --git a/ansible/roles/gateway/tasks/main.yml b/ansible/roles/gateway/tasks/main.yml index c13ba5c..82b481a 100644 --- a/ansible/roles/gateway/tasks/main.yml +++ b/ansible/roles/gateway/tasks/main.yml @@ -1,7 +1,8 @@ +--- - name: Add gateway playbook - copy: + ansible.builtin.copy: src: gateway-init.yml dest: /etc/ansible-init/playbooks/05-gateway-init.yml owner: root group: root - mode: 0644 + mode: "0644" diff --git a/ansible/roles/grafana-dashboards/files/openhpc-slurm.json b/ansible/roles/grafana-dashboards/files/openhpc-slurm.json index fb4078c..4cc5a46 100644 --- a/ansible/roles/grafana-dashboards/files/openhpc-slurm.json +++ b/ansible/roles/grafana-dashboards/files/openhpc-slurm.json @@ -2072,4 +2072,4 @@ "title": "OpenHPC Slurm", "uid": "openhpc-slurm", "version": 2 -} \ No newline at end of file +} diff --git a/ansible/roles/grafana-dashboards/tasks/main.yml b/ansible/roles/grafana-dashboards/tasks/main.yml index 235088f..2292dac 100644 --- a/ansible/roles/grafana-dashboards/tasks/main.yml +++ b/ansible/roles/grafana-dashboards/tasks/main.yml @@ -25,7 +25,7 @@ - become: false block: - name: Create local grafana dashboard directory - tempfile: + ansible.builtin.tempfile: state: directory register: _tmp_dashboards changed_when: false @@ -52,10 +52,11 @@ tags: - skip_ansible_lint - - name: copy in-role grafana dashboards + - name: Copy in-role grafana dashboards ansible.builtin.copy: src: "{{ item.dashboard_file 
}}" dest: "{{ _tmp_dashboards.path }}" + mode: "0644" loop: "{{ grafana_dashboards }}" when: - grafana_dashboards | length > 0 @@ -109,7 +110,7 @@ - name: Create/Update dashboards file (provisioning) become: true - copy: + ansible.builtin.copy: dest: "/etc/grafana/provisioning/dashboards/ansible.yml" content: | apiVersion: 1 @@ -123,12 +124,12 @@ backup: false owner: root group: grafana - mode: 0640 + mode: "0640" notify: restart grafana - name: Register preexisting dashboards become: true - find: + ansible.builtin.find: paths: "{{ grafana_data_dir }}/dashboards" hidden: true patterns: @@ -137,15 +138,17 @@ - name: Import grafana dashboards become: true - copy: - remote_src: yes + ansible.builtin.copy: + remote_src: true src: "{{ _tmp_dashboards.path }}/" # Note trailing / to only copy contents, not directory itself dest: "{{ grafana_data_dir }}/dashboards/" + directory_mode: "0755" + mode: "0644" notify: "provisioned dashboards changed" - name: Register all installed dashboards become: true - find: + ansible.builtin.find: paths: "{{ grafana_data_dir }}/dashboards" hidden: true patterns: @@ -153,13 +156,13 @@ register: _dashboards_post - name: Get dashboard lists - set_fact: - _dashboards_pre_list: "{{ _dashboards_pre | json_query('files[*].path') | default([]) }}" + ansible.builtin.set_fact: + _dashboards_pre_list: "{{ _dashboards_pre | json_query('files[*].path') | default([]) }}" _dashboards_post_list: "{{ _dashboards_post | json_query('files[*].path') | default([]) }}" - name: Remove installed dashboards not defined through this role become: true - file: + ansible.builtin.file: path: "{{ item }}" state: absent with_items: "{{ _dashboards_pre_list | difference( _dashboards_post_list ) }}" diff --git a/ansible/roles/hpctests/README.md b/ansible/roles/hpctests/README.md index 2cb9b76..ed3d64e 100644 --- a/ansible/roles/hpctests/README.md +++ b/ansible/roles/hpctests/README.md @@ -1,53 +1,55 @@ -hpctests -========= +# hpctests An MPI-based test suite for Slurm appliance clusters. -This is intended as a replacement for [this test role](https://github.com/stackhpc/ansible_collection_slurm_openstack_tools/tree/main/roles/test/) but will be safe to run on clusters in production use as it does not use NFS exports for package installs. Instead it assumes the required packages are pre-installed, which is the case by default with this appliance. +This is intended as a replacement for [this test role](https://github.com/stackhpc/ansible_collection_slurm_openstack_tools/tree/main/roles/test/) but will be safe to run on clusters in production use as it does not use NFS exports for package installs. Instead it assumes the required packages are pre-installed, which is the case by default with this appliance. Tests (with corresponding tags) are: + - `pingpong`: Runs Intel MPI Benchmark's IMB-MPI1 pingpong between a pair of (scheduler-selected) nodes. Reports zero-size message latency and maximum bandwidth. - `pingmatrix`: Runs a similar pingpong test but between all pairs of nodes. Reports zero-size message latency & maximum bandwidth. - `hpl-solo`: Runs the HPL benchmark individually on all nodes. Reports Gflops. All tests use GCC 9 and OpenMPI 4 with UCX. The HPL-based tests use OpenBLAS. -Requirements ------------- +## Requirements - An OpenHPC v2.x cluster. 
- The following OpenHPC packages installed (note this is the default in the appliance, see `environments/common/inventory/group_vars/all/openhpc.yml:openhpc_default_packages`): - `ohpc-gnu9-openmpi4-perf-tools` - `openblas-gnu9-ohpc` -Role Variables --------------- +## Role Variables + - `hpctests_user`: Optional. User to run jobs as. Default is `ansible_user`. - `hpctests_rootdir`: Optional. Path to root of test directory tree. This must be a r/w filesystem shared to all cluster nodes under test. Default is `/home/{{ hpctests_user }}/hpctests`. **NB:** Do not use `~` in this path. - `hpctests_partition`: Optional. Name of partition to use, otherwise default partition is used. - `hpctests_nodes`: Optional. A Slurm node expression, e.g. `'compute-[0-15,19]'` defining the nodes to use. If not set all nodes in the selected partition are used. -- `hpctests_ucx_net_devices`: Optional. Control which network device/interface to use, e.g. `mlx5_1:0`. The default of `all` (as per UCX) may not be appropriate for multi-rail nodes with different bandwidths on each device. See [here](https://openucx.readthedocs.io/en/master/faq.html#what-is-the-default-behavior-in-a-multi-rail-environment) and [here](https://github.com/openucx/ucx/wiki/UCX-environment-parameters#setting-the-devices-to-use). Alternatively a mapping of partition name (as `hpctests_partition`) to device/interface can be used. For partitions not defined in the mapping the default of `all` is used. +- `hpctests_ucx_net_devices`: Optional. Control which network device/interface to use, e.g. `mlx5_1:0`. + The default of `all` (as per UCX) may not be appropriate for multi-rail nodes with different bandwidths on each device. See [here](https://openucx.readthedocs.io/en/master/faq.html#what-is-the-default-behavior-in-a-multi-rail-environment) and [here](https://github.com/openucx/ucx/wiki/UCX-environment-parameters#setting-the-devices-to-use). + Alternatively a mapping of partition name (as `hpctests_partition`) to device/interface can be used. For partitions not defined in the mapping the default of `all` is used. - `hpctests_outdir`: Optional. Directory to use for test output on local host. Defaults to `$HOME/hpctests` (for local user). - `hpctests_hpl_NB`: Optional, default 192. The HPL block size "NB" - for Intel CPUs see [here](https://software.intel.com/content/www/us/en/develop/documentation/onemkl-linux-developer-guide/top/intel-oneapi-math-kernel-library-benchmarks/intel-distribution-for-linpack-benchmark/configuring-parameters.html). - `hpctests_hpl_mem_frac`: Optional, default 0.3. The HPL problem size "N" will - be selected to target using this fraction of each node's memory - - **CAUTION: see note below**. + be selected to target using this fraction of each node's memory - + **CAUTION: see note below**. - `hpctests_hpl_arch`: Optional, default 'linux64'. Arbitrary architecture name for HPL build. HPL is compiled on the first compute node of those selected (see `hpctests_nodes`), so this can be used to create different builds for different types of compute node. - --- + **CAUTION** > The default of `hpctests_hpl_mem_frac=0.3` will not significantly load nodes. -Values up to ~0.8 may be appropriate for a stress test but ensure cloud -operators are aware in case this overloads e.g. power supplies or cooling. -Values > 0.8 require longer runtimes and increase the risk of out-of-memory -errors without normally significantly increasing the stress on the node. 
---- +> Values up to ~0.8 may be appropriate for a stress test but ensure cloud +> operators are aware in case this overloads e.g. power supplies or cooling. +> Values > 0.8 require longer runtimes and increase the risk of out-of-memory + +## errors without normally significantly increasing the stress on the node The following variables should not generally be changed: + - `hpctests_pre_cmd`: Optional. Command(s) to include in sbatch templates before module load commands. - `hpctests_pingmatrix_modules`: Optional. List of modules to load for pingmatrix test. Defaults are suitable for OpenHPC 2.x cluster using the required packages. - `hpctests_pingpong_modules`: As above but for pingpong test. @@ -55,13 +57,11 @@ The following variables should not generally be changed: - `hpctests_hpl_modules`: As above but for hpl tests. - `hpctests_hpl_version`: Version of HPL -Dependencies ------------- +## Dependencies None. -Example Playbook ----------------- +## Example Playbook The role should be run on a login node; @@ -76,12 +76,10 @@ The role should be run on a login node; name: hpctests ``` -License -------- +## License Apache v2 -Author Information ------------------- +## Author Information stackhpc.com diff --git a/ansible/roles/hpctests/defaults/main.yml b/ansible/roles/hpctests/defaults/main.yml index e514de5..fa1c3c2 100644 --- a/ansible/roles/hpctests/defaults/main.yml +++ b/ansible/roles/hpctests/defaults/main.yml @@ -2,16 +2,24 @@ hpctests_user: "{{ ansible_user }}" hpctests_group: "{{ hpctests_user }}" hpctests_rootdir: "/home/{{ hpctests_user }}/hpctests" -hpctests_pre_cmd: '' -hpctests_pingmatrix_modules: [gnu12 openmpi4] -hpctests_pingpong_modules: [gnu12 openmpi4 imb] -hpctests_pingpong_plot: yes -hpctests_hpl_modules: [gnu12 openmpi4 openblas] +hpctests_pre_cmd: "" +hpctests_pingmatrix_modules: + - gnu12 + - openmpi4 +hpctests_pingpong_modules: + - gnu12 + - openmpi4 + - imb +hpctests_pingpong_plot: true +hpctests_hpl_modules: + - gnu12 + - openmpi4 + - openblas hpctests_outdir: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/hpctests" hpctests_ucx_net_devices: all hpctests_hpl_version: "2.3" hpctests_hpl_NB: 192 hpctests_hpl_mem_frac: 0.3 hpctests_hpl_arch: linux64 -#hpctests_nodes: -#hpctests_partition: +# hpctests_nodes: +# hpctests_partition: diff --git a/ansible/roles/hpctests/files/.clang-format-ignore b/ansible/roles/hpctests/files/.clang-format-ignore new file mode 100644 index 0000000..72e8ffc --- /dev/null +++ b/ansible/roles/hpctests/files/.clang-format-ignore @@ -0,0 +1 @@ +* diff --git a/ansible/roles/hpctests/files/CPPLINT.cfg b/ansible/roles/hpctests/files/CPPLINT.cfg new file mode 100644 index 0000000..88e41cd --- /dev/null +++ b/ansible/roles/hpctests/files/CPPLINT.cfg @@ -0,0 +1 @@ +exclude_files=.*.c diff --git a/ansible/roles/hpctests/files/plot_imb_pingpong.py b/ansible/roles/hpctests/files/plot_imb_pingpong.py index dbf6398..eb15c4f 100644 --- a/ansible/roles/hpctests/files/plot_imb_pingpong.py +++ b/ansible/roles/hpctests/files/plot_imb_pingpong.py @@ -1,55 +1,76 @@ -import matplotlib as mpl -import matplotlib.pyplot as plt -from matplotlib import ticker -import numpy as np +# pylint: disable=missing-module-docstring import os -def sizeof_fmt(num, suffix='B'): - """ TODO: """ +import matplotlib.pyplot as plt # pylint: disable=import-error +from matplotlib import ticker # pylint: disable=import-error + + +def sizeof_fmt(num, suffix="B"): + """TODO:""" # from https://stackoverflow.com/a/1094933/916373 - for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']: + 
for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]: if abs(num) < 1024.0: - return "%3.1f%s%s" % (num, unit, suffix) + # pylint: disable-next=consider-using-f-string + return "%3.1f%s%s" % ( + num, + unit, + suffix, + ) num /= 1024.0 - return "%.1f%s%s" % (num, 'Yi', suffix) + return "%.1f%s%s" % (num, "Yi", suffix) # pylint: disable=consider-using-f-string + def read_imb_out(path): - """ Read stdout from an IMB-MPI1 run. - - Returns a dict with: - key:= int, total number of processes involved - value:= pandas dataframe, i.e. one per results table. Columns as per table. - - If multiple results tables are present it is assumed that they are all the same benchmark, - and only differ in the number of processes. + """Read stdout from an IMB-MPI1 run. + + Returns a dict with: + key:= int, total number of processes involved + value:= pandas dataframe, i.e. one per results table. Columns as per table. + + If multiple results tables are present it is assumed that they are all the same benchmark, + and only differ in the number of processes. """ data = {} - COLTYPES = { # all benchmark names here should be lowercase - 'uniband': (int, int, float, int), # #bytes #repetitions Mbytes/sec Msg/sec - 'biband': (int, int, float, int), - 'pingpong':(int, int, float, float), # #bytes #repetitions t[usec] Mbytes/sec - 'alltoall':(int, int, float, float, float) # #bytes #repetitions t_min[usec] t_max[usec] t_avg[usec] + COLTYPES = { # all benchmark names here should be lowercase # pylint: disable=invalid-name + # #bytes #repetitions Mbytes/sec Msg/sec + "uniband": (int, int, float, int), + "biband": (int, int, float, int), + # #bytes #repetitions t[usec] Mbytes/sec + "pingpong": (int, int, float, float), + "alltoall": ( + int, + int, + float, + float, + float, + ), # #bytes #repetitions t_min[usec] t_max[usec] t_avg[usec] } - with open(path) as f: + with open(path) as f: # pylint: disable=unspecified-encoding for line in f: - if line.startswith('# Benchmarking '): + if line.startswith("# Benchmarking "): benchmark = line.split()[-1].lower() if benchmark not in COLTYPES: - raise ValueError('Do not know how to read %r benchmark in %s' % (benchmark, path)) + raise ValueError( + "Do not know how to read %r benchmark in %s" # pylint: disable=consider-using-f-string + % (benchmark, path) + ) converters = COLTYPES[benchmark] line = next(f) - if not line.startswith('# #processes = '): - raise ValueError('expected %s, got %s' % (expect, nprocs_line)) - n_procs = int(line.split('=')[-1].strip()) - while line.startswith('#'): - line = next(f) # may or may not include line "# .. additional processes waiting in MPI_Barrier", plus other # lines + expected = "# #processes = " + if not line.startswith(expected): + raise ValueError(f"expected {expected}, got {line}") + n_procs = int(line.split("=")[-1].strip()) + while line.startswith("#"): + # may or may not include line "# .. 
additional processes + # waiting in MPI_Barrier", plus other # lines + line = next(f) rows = [] while True: line = next(f).strip() - if line == '': + if line == "": break rows.append([f(v) for (f, v) in zip(converters, line.split())]) # turn data around: @@ -60,26 +81,30 @@ def read_imb_out(path): data[n_procs] = cols return data -if __name__ == '__main__': + +if __name__ == "__main__": import sys + d = read_imb_out(sys.argv[1]) if len(d) > 1: - raise ValueError('Found > 1 benchmark in', sys.argv[1]) + raise ValueError("Found > 1 benchmark in", sys.argv[1]) outdir = os.path.dirname(sys.argv[1]) for n, df in d.items(): fig, ax1 = plt.subplots() ax2 = ax1.twinx() - ax1.plot(df[0], df[2], label='latency', color='b') - ax2.plot(df[0], df[3], label='bandwidth', color='r') - ax1.set_xscale('log', base=2) - ax1.set_yscale('log', base=10) - ax1.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: sizeof_fmt(x))) + ax1.plot(df[0], df[2], label="latency", color="b") + ax2.plot(df[0], df[3], label="bandwidth", color="r") + ax1.set_xscale("log", base=2) + ax1.set_yscale("log", base=10) + ax1.xaxis.set_major_formatter( + ticker.FuncFormatter(lambda x, pos: sizeof_fmt(x)) + ) ax1.grid(True, which="both") - ax1.set_xlabel('#bytes') - ax1.set_ylabel('latency ($\mu$s)', color='b') - ax2.set_ylabel('bandwidth (Mbytes/sec)', color='r') - fig.legend(loc='upper left') + ax1.set_xlabel("#bytes") + ax1.set_ylabel("latency ($\\mu$s)", color="b") + ax2.set_ylabel("bandwidth (Mbytes/sec)", color="r") + fig.legend(loc="upper left") plt.tight_layout() - figpath = os.path.join(outdir, 'pingpong.png') + figpath = os.path.join(outdir, "pingpong.png") plt.savefig(figpath) print(figpath) diff --git a/ansible/roles/hpctests/library/hpl_pq.py b/ansible/roles/hpctests/library/hpl_pq.py index 96eff80..0e017a6 100644 --- a/ansible/roles/hpctests/library/hpl_pq.py +++ b/ansible/roles/hpctests/library/hpl_pq.py @@ -1,11 +1,12 @@ #!/usr/bin/python +# pylint: disable=missing-module-docstring # -*- coding: utf-8 -*- # Copyright: (c) 2020, StackHPC # Apache 2 License -from ansible.module_utils.basic import AnsibleModule -import json + +from ansible.module_utils.basic import AnsibleModule # pylint: disable=import-error ANSIBLE_METADATA = { "metadata_version": "0.1", @@ -18,8 +19,9 @@ module: hpl_pq short_description: Calculate P and Q values for HPL. version_added: "0.0" -description: - - "Takes number of processes and returns a dict with keys 'P' and 'Q' giving appropriate values, i.e. with Q equal or slightly larger than P and P * Q == num_processes." +description: > + Takes number of processes and returns a dict with keys 'P' and 'Q' giving appropriate values, + i.e. with Q equal or slightly larger than P and P * Q == num_processes. options: num_processes: description: @@ -36,33 +38,39 @@ TODO """ + def factors(n): - """ Return a sequence of (a, b) tuples where a < b giving factors of n. - - Based on https://stackoverflow.com/a/6909532/916373 + """Return a sequence of (a, b) tuples where a < b giving factors of n. 
+ + Based on https://stackoverflow.com/a/6909532/916373 """ - return [(i, n//i) for i in range(1, int(n**0.5) + 1) if n % i == 0] + return [(i, n // i) for i in range(1, int(n**0.5) + 1) if n % i == 0] -def run_module(): - module_args = dict( - num_processes=dict(type="int", required=True), - ) + +def run_module(): # pylint: disable=missing-function-docstring + module_args = { + "num_processes": { + "type": "int", + "required": True, + }, + } module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) result = {"changed": False} if module.check_mode: module.exit_json(**result) - + num_processes = module.params["num_processes"] f = factors(num_processes) - p, q = f[-1] # nearest to square + p, q = f[-1] # nearest to square - result['grid'] = {'P':p, 'Q': q} + result["grid"] = {"P": p, "Q": q} module.exit_json(**result) -def main(): +def main(): # pylint: disable=missing-function-docstring run_module() + if __name__ == "__main__": main() diff --git a/ansible/roles/hpctests/library/plot_nxnlatbw.py b/ansible/roles/hpctests/library/plot_nxnlatbw.py index 0193b69..b1a9810 100644 --- a/ansible/roles/hpctests/library/plot_nxnlatbw.py +++ b/ansible/roles/hpctests/library/plot_nxnlatbw.py @@ -1,11 +1,13 @@ #!/usr/bin/python +# pylint: disable=missing-module-docstring # -*- coding: utf-8 -*- # Copyright: (c) 2020, StackHPC # Apache 2 License -from ansible.module_utils.basic import AnsibleModule -import json, os +import os + +from ansible.module_utils.basic import AnsibleModule # pylint: disable=import-error ANSIBLE_METADATA = { "metadata_version": "0.1", @@ -18,8 +20,10 @@ module: plot_nxnlatbw short_description: Read nxnlatbw output, report statistics and tabulate latencies version_added: "0.0" -description: - - "Reads output from running the nxnlatbw ping matrix. Return value includes a 'stats' key with min/max latency and bandwidth values. Generates an html table of pairwise latencies, coloured by value." +description: > + Reads output from running the nxnlatbw ping matrix. + Return value includes a 'stats' key with min/max latency and bandwidth values. + Generates an html table of pairwise latencies, coloured by value. options: src: description: @@ -32,8 +36,9 @@ required: true type: str nodes: - description: - - Comma-separated list of nodenames to label RANKS with - NB this should be provided in the same order as ranks + description: > + Comma-separated list of nodenames to label RANKS with - + NB this should be provided in the same order as ranks requirements: - "python >= 3.6" author: @@ -64,119 +69,179 @@ """ -def html_rows(rankAs, rankBs, nodes, data): - """ Create an HTML-format fragment defining table rows. - Args: - rankAs, rankBs: lists of ranks - nodes: list of nodenames in rank order - data: dict with keys (rankA, rankB) +def html_rows( + rankAs, rankBs, nodes, data +): # pylint: disable=invalid-name # pylint: disable=invalid-name + """Create an HTML-format fragment defining table rows. - Returns a string. + Args: + rankAs, rankBs: lists of ranks + nodes: list of nodenames in rank order + data: dict with keys (rankA, rankB) + + Returns a string. 
""" - + minv = min(data.values()) maxv = max(data.values()) rows = [] - for rankA in rankAs: # row + for rankA in rankAs: # row # pylint: disable=invalid-name if nodes: - outrow = ['%s [%s]' % (nodes[rankA], rankA)] + outrow = [ + # pylint: disable-next=consider-using-f-string + "%s [%s]" + % (nodes[rankA], rankA) + ] else: - outrow = ['%s' % rankA] - for rankB in rankBs: + outrow = [ + # pylint: disable-next=consider-using-f-string + "%s" + % rankA + ] + for rankB in rankBs: # pylint: disable=invalid-name val = data.get((rankA, rankB)) if val is not None: try: - lightness = 50 + (50 - 50 * ((val - minv) / (maxv - minv))) # want value in range LOW = 100 (white) -> HIGH 50(red) - except ZeroDivisionError: # no min-max spread + lightness = 50 + ( + 50 - 50 * ((val - minv) / (maxv - minv)) + ) # want value in range LOW = 100 (white) -> HIGH 50(red) + except ZeroDivisionError: # no min-max spread lightness = 100 - outrow += ['%.1f' % (lightness, val)] + outrow += [ + # pylint: disable-next=consider-using-f-string + '%.1f' + % (lightness, val) + ] else: - outrow += ['-'] - outrow += [''] - rows.append(' '.join(outrow)) - return '\n'.join(rows) - - -def run_module(): - module_args = dict( - src=dict(type="str", required=True), - dest=dict(type="str", required=True), - nodes=dict(type="str", required=False, default=None) - ) + outrow += ["-"] + outrow += [""] + rows.append(" ".join(outrow)) + return "\n".join(rows) + + +def run_module(): # pylint: disable=missing-function-docstring, too-many-locals + module_args = { + "src": { + "type": "str", + "required": True, + }, + "dest": { + "type": "str", + "required": True, + }, + "nodes": { + "type": "str", + "required": False, + "default": None, + }, + } module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) result = {"changed": False} - + src = os.path.expanduser(module.params["src"]) dest = os.path.expanduser(module.params["dest"]) nodes = module.params["nodes"] if nodes is not None: - nodes = nodes.split(',') - + nodes = nodes.split(",") + if module.check_mode: module.exit_json(**result) - # read latencies/bandwidths: + # read latencies/bandwidths: latencies = {} bandwidths = {} - with open(src) as nxn_f: + with open(src) as nxn_f: # pylint: disable=unspecified-encoding for ln, line in enumerate(nxn_f): - vals = line.split(',') - if vals[0] == 'src': + vals = line.split(",") + if vals[0] == "src": continue if len(vals) != 4: - print('warning: skipping line %i (%i values)' % (ln, len(vals))) + print( + # pylint: disable-next=consider-using-f-string + "warning: skipping line %i (%i values)" + % (ln, len(vals)) + ) continue + # pylint: disable=invalid-name try: - rankA, rankB, lat, bw = int(vals[0]), int(vals[1]), float(vals[2]), float(vals[3]) + ( + rankA, + rankB, + lat, + bw, + ) = ( + int(vals[0]), + int(vals[1]), + float(vals[2]), + float(vals[3]), + ) except ValueError: - print('warning: skipping line %i (%s) - parse failure' % (ln, line)) + print(f"warning: skipping line {ln} ({line}) - parse failure") continue latencies[rankA, rankB] = lat bandwidths[rankA, rankB] = bw - + # pylint: enable=invalid-name + # get list of node IDs: - rankAs = sorted(set(k[0] for k in latencies)) - rankBs = sorted(set(k[1] for k in latencies)) + rankAs = sorted(set(k[0] for k in latencies)) # pylint: disable=invalid-name + rankBs = sorted(set(k[1] for k in latencies)) # pylint: disable=invalid-name if rankAs != rankBs: module.fail_json("Ranks extracted from result columns differed", **result) if nodes and len(nodes) != len(rankAs): - 
module.fail_json("Results contained %i ranks but %i node names provided" % (len(rankAs), len(nodes)), **result) + module.fail_json( + "Results contained %i ranks but %i node names provided" # pylint: disable=consider-using-f-string + % (len(rankAs), len(nodes)), + **result, + ) # find min values: min_lat = min(latencies.values()) max_lat = max(latencies.values()) min_bw = min(bandwidths.values()) max_bw = max(bandwidths.values()) - + # create HTML fragments: - ranks = ' '.join('%s' % rankB for rankB in rankBs) + ranks = " ".join( + # pylint: disable-next=consider-using-f-string + "%s" % rankB + for rankB in rankBs + ) lat_rows = html_rows(rankAs, rankBs, nodes, latencies) bw_rows = html_rows(rankAs, rankBs, nodes, bandwidths) - page = HTML_TEMPLATE.format(min_lat=min_lat, max_lat=max_lat, min_bw=min_bw, max_bw=max_bw, ranks=ranks, lat_rows=lat_rows, bw_rows=bw_rows) + page = HTML_TEMPLATE.format( + min_lat=min_lat, + max_lat=max_lat, + min_bw=min_bw, + max_bw=max_bw, + ranks=ranks, + lat_rows=lat_rows, + bw_rows=bw_rows, + ) - with open(dest, 'w') as outf: + with open(dest, "w") as outf: # pylint: disable=unspecified-encoding outf.write(page) - result['changed'] = True - result['stats'] = { - 'min_latency (us)': min_lat, - 'max_latency (us)': max_lat, - 'min_bandwidth (MB/s)': min_bw, - 'max_bandwidth (MB/s)': max_bw, - 'min_bandwidth (Gbit/s)': min_bw / 125.0, - 'max_bandwidth (Gbit/s)': max_bw / 125.0, + result["changed"] = True + result["stats"] = { + "min_latency (us)": min_lat, + "max_latency (us)": max_lat, + "min_bandwidth (MB/s)": min_bw, + "max_bandwidth (MB/s)": max_bw, + "min_bandwidth (Gbit/s)": min_bw / 125.0, + "max_bandwidth (Gbit/s)": max_bw / 125.0, } module.exit_json(**result) -def main(): +def main(): # pylint: disable=missing-function-docstring run_module() + if __name__ == "__main__": main() diff --git a/ansible/roles/hpctests/library/read_imb_pingpong.py b/ansible/roles/hpctests/library/read_imb_pingpong.py index fb52ef4..808b6bb 100644 --- a/ansible/roles/hpctests/library/read_imb_pingpong.py +++ b/ansible/roles/hpctests/library/read_imb_pingpong.py @@ -1,11 +1,12 @@ #!/usr/bin/python +# pylint: disable=missing-module-docstring # -*- coding: utf-8 -*- # Copyright: (c) 2020, StackHPC # Apache 2 License -from ansible.module_utils.basic import AnsibleModule -import json + +from ansible.module_utils.basic import AnsibleModule # pylint: disable=import-error ANSIBLE_METADATA = { "metadata_version": "0.1", @@ -39,42 +40,47 @@ """ CONVERTERS = (int, int, float, float) -COLUMNS = ('bytes', 'repetitions', 'latency', 'bandwidth') +COLUMNS = ("bytes", "repetitions", "latency", "bandwidth") + -def run_module(): - module_args = dict( - path=dict(type="str", required=True), - ) +def run_module(): # pylint: disable=missing-function-docstring + module_args = { + "path": { + "type": "str", + "required": True, + }, + } module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) result = {"changed": False} - + path = module.params["path"] if module.check_mode: module.exit_json(**result) columns = ([], [], [], []) - with open(path) as f: + with open(path) as f: # pylint: disable=unspecified-encoding for line in f: - if line == ' #bytes #repetitions t[usec] Mbytes/sec\n': + if line == " #bytes #repetitions t[usec] Mbytes/sec\n": while True: line = next(f).strip() - if line == '': + if line == "": break for ix, v in enumerate(line.split()): columns[ix].append(CONVERTERS[ix](v)) - - result['columns'] = { - 'bytes': columns[0], - 'repetitions': columns[1], - 'latency': 
columns[2], - 'bandwidth': columns[3], + + result["columns"] = { + "bytes": columns[0], + "repetitions": columns[1], + "latency": columns[2], + "bandwidth": columns[3], } module.exit_json(**result) -def main(): +def main(): # pylint: disable=missing-function-docstring run_module() + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/ansible/roles/hpctests/library/slurm_node_info.py b/ansible/roles/hpctests/library/slurm_node_info.py index 52e6800..dd3e0b3 100644 --- a/ansible/roles/hpctests/library/slurm_node_info.py +++ b/ansible/roles/hpctests/library/slurm_node_info.py @@ -1,11 +1,12 @@ #!/usr/bin/python +# pylint: disable=missing-module-docstring # -*- coding: utf-8 -*- # Copyright: (c) 2020, StackHPC # Apache 2 License -from ansible.module_utils.basic import AnsibleModule -import json + +from ansible.module_utils.basic import AnsibleModule # pylint: disable=import-error ANSIBLE_METADATA = { "metadata_version": "0.1", @@ -18,8 +19,10 @@ module: slurm_node_info short_description: Get information about Slurm nodes version_added: "0.0" -description: - - "Gets all the available information from Slurm's `sinfo` command about specified nodes. The returned `info` property is a dict with keys from sinfo --All parameters and values a list of strings in specified node order." +description: > + Gets all the available information from Slurm's `sinfo` command about specified nodes. + The returned `info` property is a dict with keys from sinfo -- + All parameters and values a list of strings in specified node order. options nodes: description: @@ -37,32 +40,42 @@ """ -def run_module(): - module_args = dict( - nodes=dict(type="list", required=True), - ) +def run_module(): # pylint: disable=missing-function-docstring + module_args = { + "nodes": { + "type": "list", + "required": True, + } + } module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) result = {"changed": False} if module.check_mode: module.exit_json(**result) - - _, stdout,_ = module.run_command("sinfo --Format All --Node", check_rc=True) # `--nodes` doesn't filter enough, other partitions are still shown + + _, stdout, _ = module.run_command( + "sinfo --Format All --Node", check_rc=True + ) # `--nodes` doesn't filter enough, other partitions are still shown lines = stdout.splitlines() info = {} - params = [v.strip() for v in lines[0].split('|')] - values = [line.split('|') for line in lines[1:]] - nodelist_ix = params.index('NODELIST') + params = [v.strip() for v in lines[0].split("|")] + values = [line.split("|") for line in lines[1:]] + nodelist_ix = params.index("NODELIST") print(values) for ix, param in enumerate(params): - info[param] = [nodeinfo[ix].strip() for nodeinfo in values if nodeinfo[nodelist_ix].strip() in module.params['nodes']] - result['info'] = info - + info[param] = [ + nodeinfo[ix].strip() + for nodeinfo in values + if nodeinfo[nodelist_ix].strip() in module.params["nodes"] + ] + result["info"] = info + module.exit_json(**result) -def main(): +def main(): # pylint: disable=missing-function-docstring run_module() + if __name__ == "__main__": main() diff --git a/ansible/roles/hpctests/meta/main.yml b/ansible/roles/hpctests/meta/main.yml index 8d471f0..af60695 100644 --- a/ansible/roles/hpctests/meta/main.yml +++ b/ansible/roles/hpctests/meta/main.yml @@ -1,6 +1,8 @@ +--- galaxy_info: author: Steve Brasier company: StackHPC + description: HPC Tests - Meta # If the issue tracker for your role is not on github, uncomment the # next line and provide a value @@ -15,7 
+17,7 @@ galaxy_info: # - CC-BY-4.0 license: Apache-2.0 - min_ansible_version: 2.1 + min_ansible_version: "2.1" # If this a Container Enabled role, provide the minimum Ansible Container version. # min_ansible_container_version: diff --git a/ansible/roles/hpctests/tasks/build-hpl.yml b/ansible/roles/hpctests/tasks/build-hpl.yml index 7f6d48b..7339d9b 100644 --- a/ansible/roles/hpctests/tasks/build-hpl.yml +++ b/ansible/roles/hpctests/tasks/build-hpl.yml @@ -1,60 +1,62 @@ --- - - name: Make directory - file: + ansible.builtin.file: path: "{{ hpctests_rootdir }}/hpl" state: directory + mode: "0755" - name: Unarchive HPL sources from /opt/hpl - unarchive: + ansible.builtin.unarchive: src: "/opt/hpl/hpl-{{ hpctests_hpl_version }}.tar.gz" dest: "{{ hpctests_rootdir }}/hpl" - remote_src: yes + remote_src: true owner: "{{ hpctests_user }}" group: "{{ hpctests_group }}" - mode: '0755' - keep_newer: yes + mode: "0755" + keep_newer: true - name: Copy BLAS makefile - copy: + ansible.builtin.copy: src: "{{ hpctests_hpl_srcdir }}/setup/Make.Linux_PII_CBLAS" dest: "{{ hpctests_hpl_srcdir }}/Make.{{ hpctests_hpl_arch }}" - remote_src: yes + remote_src: true + mode: "0644" - name: Modify make file - replace: + ansible.builtin.replace: path: "{{ hpctests_hpl_srcdir }}/Make.{{ hpctests_hpl_arch }}" regexp: "{{ item.regexp }}" replace: "{{ item.replace }}" loop: - - regexp: '^TOPdir.*$' + - regexp: "^TOPdir.*$" replace: "TOPdir = {{ hpctests_hpl_srcdir }}" - - regexp: '^ARCH\s+=.*$' + - regexp: "^ARCH\\s+=.*$" replace: "ARCH = {{ hpctests_hpl_arch }}" - - regexp: '^MPdir.*$' + - regexp: "^MPdir.*$" replace: "MPdir = $(MPI_DIR)" - - regexp: '^MPinc.*$' + - regexp: "^MPinc.*$" replace: "MPinc = -I$(MPI_DIR)/include" - - regexp: '^MPlib.*$' + - regexp: "^MPlib.*$" replace: "MPlib = $(MPI_DIR)/lib/libmpi.so" - - regexp: '^LAdir.*$' + - regexp: "^LAdir.*$" replace: "LAdir = $(OPENBLAS_DIR)" - - regexp: '^LAinc.*$' - replace: "LAinc =" # not sure if this one is needed? - - regexp: '^LAlib.*$' + - regexp: "^LAinc.*$" + replace: "LAinc =" # not sure if this one is needed? 
+ - regexp: "^LAlib.*$" replace: "LAlib = $(OPENBLAS_LIB)/libopenblas.so" - - regexp: '^CC\s+=.*$' + - regexp: "^CC\\s+=.*$" replace: "CC = mpicc" - - regexp: '^LINKER\s+=.*$' + - regexp: "^LINKER\\s+=.*$" replace: "LINKER = mpicc" - name: Create build job script - template: + ansible.builtin.template: src: "hpl-build.sh.j2" dest: "{{ hpctests_hpl_srcdir }}/hpl-build-{{ hpctests_hpl_arch }}.sh" - + mode: "0644" + - name: Build HPL executable - shell: + ansible.builtin.command: cmd: "bash -l -c 'sbatch --wait hpl-build-{{ hpctests_hpl_arch }}.sh'" # need login shell for module command chdir: "{{ hpctests_hpl_srcdir }}" creates: "bin/{{ hpctests_hpl_arch }}/xhpl" diff --git a/ansible/roles/hpctests/tasks/hpl-solo.yml b/ansible/roles/hpctests/tasks/hpl-solo.yml index 4c49531..f131733 100644 --- a/ansible/roles/hpctests/tasks/hpl-solo.yml +++ b/ansible/roles/hpctests/tasks/hpl-solo.yml @@ -1,12 +1,14 @@ +--- # For further information on tuning HPL see e.g.: # - https://ulhpc-tutorials.readthedocs.io/en/latest/parallel/mpi/HPL/ # - https://community.arm.com/developer/tools-software/hpc/b/hpc-blog/posts/profiling-and-tuning-linpack-step-step-guide # - http://www.crc.nd.edu/~rich/CRC_Summer_Scholars_2014/HPL-HowTo.pdf - name: Make directory - file: + ansible.builtin.file: path: "{{ hpctests_rootdir }}/hpl-solo" state: directory + mode: "0755" - name: Get Slurm node info slurm_node_info: @@ -14,7 +16,7 @@ register: hpctests_nodeinfo - name: Check nodes are homogenous - assert: + ansible.builtin.assert: that: "{{ hpctests_nodeinfo.info[item] | unique | length == 1 }}" fail_msg: "Selected nodes are not homogenous: {{ item }} ({{ hpctests_nodeinfo.info['NODELIST'] }}) = {{ hpctests_nodeinfo.info[item] }}" loop: @@ -26,7 +28,7 @@ - name: Calculate number of processes (per node) # Will run array job, which is SAME on each node, so only need to deal with a single node's processors here # Also ignore any hyperthreading TODO: document - set_fact: + ansible.builtin.set_fact: hpctests_hplsolo_ntasks: "{{ (hpctests_nodeinfo.info['SOCKETS'][0]) | int * (hpctests_nodeinfo.info['CORES'][0] | int) }}" - name: Calculate problem shape @@ -37,50 +39,58 @@ - name: Calculate problem size # Based on example shown in http://www.crc.nd.edu/~rich/CRC_Summer_Scholars_2014/HPL-HowTo.pdf but we have MB not GB - set_fact: - hpctests_hplsolo_N: "{{ ((((( (hpctests_nodeinfo.info['MEMORY'][0] | int) * (hpctests_hpl_mem_frac | float) * 1024 * 1024 * 1) / 8) | root) / hpctests_hpl_NB) | int ) * hpctests_hpl_NB }}" -- debug: - msg: "Using {{ hpctests_hplsolo_ntasks }} process per node with P={{ hpctests_hplsolo_pq.grid.P }}, Q={{ hpctests_hplsolo_pq.grid.Q }} targeting {{ (hpctests_hpl_mem_frac | float) * 100 }}% of {{ hpctests_nodeinfo.info['MEMORY'][0] }} MB memory per node, block size (NB) = {{ hpctests_hpl_NB }}, problem size (N) = {{ hpctests_hplsolo_N }}" + ansible.builtin.set_fact: + # yamllint disable-line rule:line-length + hpctests_hplsolo_N: "{{ ((((((hpctests_nodeinfo.info['MEMORY'][0] | int) * (hpctests_hpl_mem_frac | float) * 1024 * 1024 * 1) / 8) | root) / hpctests_hpl_NB) + | int) * hpctests_hpl_NB }}" +- ansible.builtin.debug: + # yamllint disable rule:line-length + msg: "Using {{ hpctests_hplsolo_ntasks }} process per node with P={{ hpctests_hplsolo_pq.grid.P }}, Q={{ hpctests_hplsolo_pq.grid.Q }} targeting {{ (hpctests_hpl_mem_frac + | float) * 100 }}% of {{ hpctests_nodeinfo.info['MEMORY'][0] }} MB memory per node, block size (NB) = {{ hpctests_hpl_NB }}, problem size (N) = {{ hpctests_hplsolo_N + }}" + # 
yamllint enable rule:line-length - name: Get all nodes in partition - shell: "sinfo --Node --noheader --format %N --partition={{ hpctests_partition }}" + ansible.builtin.command: "sinfo --Node --noheader --format %N --partition={{ hpctests_partition }}" register: all_nodes changed_when: false - name: Calculate excluded nodes - set_fact: + ansible.builtin.set_fact: hpctests_hplsolo_excluded_nodes: "{{ all_nodes.stdout_lines | difference(hpctests_computes.stdout_lines) }}" - name: Copy HPL binary - copy: + ansible.builtin.copy: src: "{{ hpctests_hpl_srcdir }}/bin/{{ hpctests_hpl_arch }}/xhpl" dest: "{{ hpctests_rootdir }}/hpl-solo/xhpl-{{ hpctests_hpl_arch }}" mode: "u+x" - remote_src: yes + remote_src: true - name: Template out HPL.dat - template: + ansible.builtin.template: src: "HPL.dat.j2" dest: "{{ hpctests_rootdir }}/hpl-solo/HPL.dat" + mode: "0644" vars: - hpctests_hpl_N: "{{ hpctests_hplsolo_N }}" - hpctests_hpl_P: "{{ hpctests_hplsolo_pq.grid.P }}" - hpctests_hpl_Q: "{{ hpctests_hplsolo_pq.grid.Q }}" + hpctests_hpl_N: "{{ hpctests_hplsolo_N }}" + hpctests_hpl_P: "{{ hpctests_hplsolo_pq.grid.P }}" + hpctests_hpl_Q: "{{ hpctests_hplsolo_pq.grid.Q }}" - name: Create sbatch script - template: + ansible.builtin.template: src: hpl-solo.sh.j2 dest: "{{ hpctests_rootdir }}/hpl-solo/hpl-solo.sh" + mode: "0755" vars: hpctests_hplsolo_ntasks: 2 # TODO: FIXME -- name: Remove previous outputs +- name: Remove previous outputs # noqa: no-changed-when # As depending on the number of nodes there will be different numbers of output files for different partitions so won't all get overwritten - shell: + ansible.builtin.shell: cmd: "rm -f {{ hpctests_rootdir }}/hpl-solo/hpl-solo.sh.*.out" -- name: Run hpl-solo - shell: bash -l -c 'sbatch --wait hpl-solo.sh' # need login shell for module command +- name: Run hpl-solo # noqa: no-changed-when + ansible.builtin.command: bash -l -c 'sbatch --wait hpl-solo.sh' args: chdir: "{{ hpctests_rootdir }}/hpl-solo" async: "{{ 20 * 60 }}" # wait for up to 20 minutes @@ -89,7 +99,7 @@ - name: Check HPL completed OK tags: postpro - shell: "grep '1 tests completed and passed residual checks' *.out" + ansible.builtin.shell: "grep '1 tests completed and passed residual checks' *.out" args: chdir: "{{ hpctests_rootdir }}/hpl-solo" changed_when: false @@ -105,7 +115,7 @@ # HPL_pdgesv() start time Thu Feb 25 19:58:25 2021 # tags: postpro - shell: "grep '^W[R|C]' *.out | tr -s ' ' | cut -d ' ' -f 7" # tr -s squeezes multiple spaces to single, then take gflops column + ansible.builtin.shell: "set -o pipefail && grep '^W[R|C]' *.out | tr -s ' ' | cut -d ' ' -f 7" args: chdir: "{{ hpctests_rootdir }}/hpl-solo" changed_when: false @@ -113,7 +123,8 @@ - name: Summarise results tags: postpro - debug: + ansible.builtin.debug: + # yamllint disable rule:line-length msg: | Summary for hpl-solo on {{ hpctests_computes.stdout_lines | length }} nodes in '{{ hpctests_partition }}' partition, job ID {{ hpctests_hplsolo_sbatch.stdout.split()[-1] }}, device '{{ hpctests_ucx_net_devices }}': @@ -122,4 +133,5 @@ Mean: {{ (perf.stdout_lines | map('float') | sum) / (hpctests_computes.stdout_lines | length) }} gflops Individual node results (gflops): - {{ dict(hpctests_computes.stdout_lines | zip(perf.stdout_lines | map('float') )) | to_nice_yaml }} + {{ dict(hpctests_computes.stdout_lines | zip(perf.stdout_lines | map('float'))) | to_nice_yaml }} + # yamllint enable rule:line-length diff --git a/ansible/roles/hpctests/tasks/main.yml b/ansible/roles/hpctests/tasks/main.yml index f0f0817..bee1b76 
100644 --- a/ansible/roles/hpctests/tasks/main.yml +++ b/ansible/roles/hpctests/tasks/main.yml @@ -1,38 +1,39 @@ -- name: setup - block: - - include_tasks: setup.yml +--- +- name: Setup become: true become_user: "{{ hpctests_user }}" tags: always -- name: pingpong block: - - include_tasks: pingpong.yml - when: hpctests_computes.stdout_lines | length > 1 + - ansible.builtin.include_tasks: setup.yml +- name: Pingpong become: true become_user: "{{ hpctests_user }}" tags: pingpong -- name: pingmatrix block: - - include_tasks: pingmatrix.yml + - ansible.builtin.include_tasks: pingpong.yml when: hpctests_computes.stdout_lines | length > 1 +- name: Pingmatrix become: true become_user: "{{ hpctests_user }}" tags: pingmatrix -- name: build HPL block: - - include_tasks: build-hpl.yml + - ansible.builtin.include_tasks: pingmatrix.yml + when: hpctests_computes.stdout_lines | length > 1 +- name: Build HPL become: true become_user: "{{ hpctests_user }}" tags: - hpl-solo -- name: run HPL on individual nodes block: - - include_tasks: hpl-solo.yml + - ansible.builtin.include_tasks: build-hpl.yml +- name: Run HPL on individual nodes become: true become_user: "{{ hpctests_user }}" tags: - hpl-solo + block: + - ansible.builtin.include_tasks: hpl-solo.yml diff --git a/ansible/roles/hpctests/tasks/pingmatrix.yml b/ansible/roles/hpctests/tasks/pingmatrix.yml index 3d20b78..5d5d41f 100644 --- a/ansible/roles/hpctests/tasks/pingmatrix.yml +++ b/ansible/roles/hpctests/tasks/pingmatrix.yml @@ -1,40 +1,44 @@ --- - - name: Make directory - file: + ansible.builtin.file: path: "{{ hpctests_rootdir }}/pingmatrix" state: directory + mode: "0755" - name: Copy source - copy: + ansible.builtin.copy: src: mpi_nxnlatbw.c dest: "{{ hpctests_rootdir }}/pingmatrix/mpi_nxnlatbw.c" + mode: "0644" - name: Create sbatch script - template: + ansible.builtin.template: src: pingmatrix.sh.j2 dest: "{{ hpctests_rootdir }}/pingmatrix/pingmatrix.sh" + mode: "0755" -- name: Run ping matrix - shell: bash -l -c 'sbatch --wait pingmatrix.sh' # need login shell for module command +- name: Run ping matrix # noqa: no-changed-when + ansible.builtin.command: bash -l -c 'sbatch --wait pingmatrix.sh' args: chdir: "{{ hpctests_rootdir }}/pingmatrix" register: hpctests_pingmatrix_sbatch -# nxnlatbw outputs ranks, not nodenames which would be more useful for finding issues. The sbatch manpage says nodes provided via --nodelist are sorted, but doesn't specify how. -# Some testing using a "helloworld" program showed it is NOT sorted the same as python's sorted(), it's lexicographical. So we use scontrol to guarantee the same sort order. +# nxnlatbw outputs ranks, not nodenames which would be more useful for finding issues. +# The sbatch manpage says nodes provided via --nodelist are sorted, but doesn't specify how. +# Some testing using a "helloworld" program showed it is NOT sorted the same as python's sorted(), +# it's lexicographical. So we use scontrol to guarantee the same sort order. # Note this still doesn't fix any non-unique names but we should get a length mis-match at least with that. 
# although this looks a bit crazy: -- name: Expand node list - shell: "scontrol show hostnames {{ hpctests_nodes if hpctests_nodes is defined else (hpctests_computes.stdout_lines | join(',')) }}" +- name: Expand node list # noqa: no-changed-when + ansible.builtin.command: "scontrol show hostnames {{ hpctests_nodes if hpctests_nodes is defined else (hpctests_computes.stdout_lines | join(',')) }}" register: scontrol_hostnames -- name: Create sorted node expression - shell: "scontrol show hostlistsorted {{ scontrol_hostnames.stdout_lines | join(',') }}" +- name: Create sorted node expression # noqa: no-changed-when + ansible.builtin.command: "scontrol show hostlistsorted {{ scontrol_hostnames.stdout_lines | join(',') }}" register: scontrol_hostlistsorted -- name: Expand node list again - shell: "scontrol show hostnames {{ scontrol_hostlistsorted.stdout_lines | join(',') }}" +- name: Expand node list again # noqa: no-changed-when + ansible.builtin.command: "scontrol show hostnames {{ scontrol_hostlistsorted.stdout_lines | join(',') }}" register: slurm_names - name: Process output @@ -45,16 +49,18 @@ register: nxnlatbw - name: Fetch html results table to ansible control host - fetch: + ansible.builtin.fetch: src: "{{ hpctests_rootdir }}/pingmatrix/pingmatrix.html" dest: "{{ hpctests_outdir }}/pingmatrix.html" - flat: yes + flat: true - name: Summarise results - debug: + ansible.builtin.debug: + # yamllint disable rule:line-length msg: | Summary for pingmatrix pairwise over {{ slurm_names.stdout_lines | length }} nodes in '{{ hpctests_partition }}' partition, job ID {{ hpctests_pingmatrix_sbatch.stdout.split()[-1] }}, device '{{ hpctests_ucx_net_devices }}': - + {{ nxnlatbw['stats'] | to_nice_yaml }} - + Tabular output on ansible control host at {{ hpctests_outdir }}/pingmatrix.html + # yamllint enable rule:line-length diff --git a/ansible/roles/hpctests/tasks/pingpong.yml b/ansible/roles/hpctests/tasks/pingpong.yml index 3cde8c2..6c80065 100644 --- a/ansible/roles/hpctests/tasks/pingpong.yml +++ b/ansible/roles/hpctests/tasks/pingpong.yml @@ -1,45 +1,46 @@ --- - - name: Make directory - file: + ansible.builtin.file: path: "{{ hpctests_rootdir }}/pingpong" state: directory + mode: "0755" - name: Create sbatch script - template: + ansible.builtin.template: src: pingpong.sh.j2 dest: "{{ hpctests_rootdir }}/pingpong/pingpong.sh" + mode: "0755" - name: Run pingpong block: - - name: Submit jobscript - shell: bash -l -c 'sbatch --wait pingpong.sh' # need login shell for module command + - name: Submit jobscript # noqa: command-instead-of-shell no-changed-when + ansible.builtin.shell: bash -l -c 'sbatch --wait pingpong.sh' # need login shell for module command args: chdir: "{{ hpctests_rootdir }}/pingpong" register: hpctests_pingpong_sbatch rescue: - name: Get slurm job output - slurp: + ansible.builtin.slurp: src: "{{ hpctests_rootdir }}/pingpong/pingpong.sh.out" register: _pingpong_out - name: Show job output - debug: + ansible.builtin.debug: msg: | PingPong output was: - + {{ _pingpong_out.content | b64decode }} failed_when: true -- set_fact: +- ansible.builtin.set_fact: _pingpong_jobid: "{{ hpctests_pingpong_sbatch.stdout.split()[-1] }}" -- set_fact: - _pingpong_local_output: "{{ hpctests_outdir }}/pingpong/{{_pingpong_jobid}}/pingpong.sh.out" +- ansible.builtin.set_fact: + _pingpong_local_output: "{{ hpctests_outdir }}/pingpong/{{ _pingpong_jobid }}/pingpong.sh.out" - name: Retrieve results file ansible.builtin.fetch: src: "{{ hpctests_rootdir }}/pingpong/pingpong.sh.out" dest: "{{ 
_pingpong_local_output }}" - flat: yes + flat: true - name: Read pingpong results read_imb_pingpong: @@ -48,22 +49,23 @@ delegate_to: localhost become: false -- name: Read nodes used - shell: "grep 'SLURM_JOB_NODELIST:' {{ _pingpong_local_output }}" +- name: Read nodes used # noqa: no-changed-when + ansible.builtin.command: "grep 'SLURM_JOB_NODELIST:' {{ _pingpong_local_output }}" register: hpctests_pingpong_run_nodes delegate_to: localhost become: false - name: Plot image - shell: + ansible.builtin.command: cmd: "python {{ role_path }}/files/plot_imb_pingpong.py {{ _pingpong_local_output }}" creates: "{{ _pingpong_local_output | dirname }}/latency.png" register: _pingpong_plot delegate_to: localhost become: false when: hpctests_pingpong_plot | bool - -- debug: + +- ansible.builtin.debug: + # yamllint disable rule:line-length msg: | Summary for pingpong using 2x scheduler-selected nodes in '{{ hpctests_partition }}' partition, job ID {{ _pingpong_jobid }}, device '{{ hpctests_ucx_net_devices }}': @@ -75,3 +77,4 @@ See plot on localhost: {{ _pingpong_plot.stdout }} {% endif %} + # yamllint enable rule:line-length diff --git a/ansible/roles/hpctests/tasks/setup.yml b/ansible/roles/hpctests/tasks/setup.yml index 316b328..cc9832a 100644 --- a/ansible/roles/hpctests/tasks/setup.yml +++ b/ansible/roles/hpctests/tasks/setup.yml @@ -1,34 +1,36 @@ --- - - name: Get partition information - shell: "sinfo --format %P --noheader" + ansible.builtin.command: "sinfo --format %P --noheader" register: _sinfo_partitions changed_when: false - name: Select default partition if hpctests_partition not given - set_fact: - hpctests_partition: "{{ (_sinfo_partitions.stdout_lines | select('contains', '*') | first)[:-1] }}" + ansible.builtin.set_fact: + hpctests_partition: "{{ (_sinfo_partitions.stdout_lines | select('contains', '*') | first)[:-1] }}" when: hpctests_partition is not defined - name: Get info about compute nodes - shell: "sinfo --Node --noheader{%if hpctests_nodes is defined %} --nodes {{hpctests_nodes}}{% endif %} --partition {{hpctests_partition}} --format %N" + # yamllint disable-line rule:line-length + ansible.builtin.command: "sinfo --Node --noheader{%if hpctests_nodes is defined %} --nodes {{hpctests_nodes}}{% endif %} --partition {{hpctests_partition}} --format + %N" register: hpctests_computes changed_when: false failed_when: hpctests_computes.rc != 0 - name: Check compute node selection valid - assert: + ansible.builtin.assert: that: hpctests_computes.stdout_lines | length > 0 fail_msg: "No nodes selected - was variable `hpctests_nodes` set (correctly)?" 
- name: Create test root directory - file: + ansible.builtin.file: path: "{{ hpctests_rootdir }}" state: directory owner: "{{ hpctests_user }}" group: "{{ hpctests_group }}" + mode: "0755" - name: Set fact for UCX_NET_DEVICES - set_fact: + ansible.builtin.set_fact: hpctests_ucx_net_devices: "{{ hpctests_ucx_net_devices.get(hpctests_partition, 'all') }}" when: hpctests_ucx_net_devices is mapping diff --git a/ansible/roles/hpctests/tasks/source-hpl.yml b/ansible/roles/hpctests/tasks/source-hpl.yml index 43585d3..6083240 100644 --- a/ansible/roles/hpctests/tasks/source-hpl.yml +++ b/ansible/roles/hpctests/tasks/source-hpl.yml @@ -1,7 +1,7 @@ --- - name: Make directory - file: + ansible.builtin.file: path: "/opt/hpl" state: directory owner: root @@ -9,7 +9,8 @@ mode: '0755' - name: Download HPL tarball - get_url: + # checkov:skip=CKV2_ANSIBLE_2: "Ensure that HTTPS url is used with get_url" + ansible.builtin.get_url: url: "http://www.netlib.org/benchmark/hpl/hpl-{{ hpctests_hpl_version }}.tar.gz" dest: "/opt/hpl/hpl-{{ hpctests_hpl_version }}.tar.gz" owner: root diff --git a/ansible/roles/hpctests/templates/hpl-build.sh.j2 b/ansible/roles/hpctests/templates/hpl-build.sh.j2 old mode 100644 new mode 100755 diff --git a/ansible/roles/hpctests/templates/hpl-solo.sh.j2 b/ansible/roles/hpctests/templates/hpl-solo.sh.j2 old mode 100644 new mode 100755 diff --git a/ansible/roles/hpctests/templates/pingmatrix.sh.j2 b/ansible/roles/hpctests/templates/pingmatrix.sh.j2 old mode 100644 new mode 100755 diff --git a/ansible/roles/hpctests/templates/pingpong.sh.j2 b/ansible/roles/hpctests/templates/pingpong.sh.j2 old mode 100644 new mode 100755 diff --git a/ansible/roles/k3s/README.md b/ansible/roles/k3s/README.md index 68e8e24..4031a00 100644 --- a/ansible/roles/k3s/README.md +++ b/ansible/roles/k3s/README.md @@ -1,16 +1,12 @@ -k3s -===== +# k3s Installs k3s agent and server services on nodes and an ansible-init playbook to activate them. The service that each node will activate on init is determined by OpenStack metadata. Also includes Helm install. Currently only supports a single k3s-server (i.e one control node). Install based on the [official k3s ansible role](https://github.com/k3s-io/k3s-ansible). - -Requirements ------------- +## Requirements `azimuth_cloud.image_utils.linux_ansible_init` must have been run previously on targeted nodes during image build. -Role Variables --------------- +## Role Variables - `k3s_version`: Optional str. K3s version to install, see [official releases](https://github.com/k3s-io/k3s/releases/). 
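For context on the k3s role variables documented above, a minimal sketch of how a site might pin them is shown below; the group_vars path is an assumption for illustration only, and the values simply mirror the role defaults touched later in this patch.

```yaml
# Hypothetical override file, e.g. environments/site/inventory/group_vars/all/k3s.yml
# (path is illustrative; values mirror the role defaults)
k3s_version: "v1.31.0+k3s1"   # must be a tag from the official k3s releases page
k3s_helm_version: v3.11.0     # Helm version installed alongside k3s
```

Note that, as the role defaults warn, changing these after k3s is already installed in the image or cluster will not take effect without a rebuild.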
diff --git a/ansible/roles/k3s/defaults/main.yml b/ansible/roles/k3s/defaults/main.yml index 984c63d..38a5f73 100644 --- a/ansible/roles/k3s/defaults/main.yml +++ b/ansible/roles/k3s/defaults/main.yml @@ -1,8 +1,9 @@ +--- # Warning: changes to these variables won't be reflected in the cluster/image if k3s is already installed k3s_version: "v1.31.0+k3s1" k3s_selinux_release: v1.6.latest.1 k3s_selinux_rpm_version: 1.6-1 k3s_helm_version: v3.11.0 -k3s_bootstrap_token: '' # matches common environment default +k3s_bootstrap_token: "" # matches common environment default k3s_bootstrap_token_expiry: 10m k3s_server_name: "{{ None }}" # ansible managed diff --git a/ansible/roles/k3s/tasks/agent-runtime.yml b/ansible/roles/k3s/tasks/agent-runtime.yml index 8377817..732fcee 100644 --- a/ansible/roles/k3s/tasks/agent-runtime.yml +++ b/ansible/roles/k3s/tasks/agent-runtime.yml @@ -1,5 +1,4 @@ --- - - name: Template k3s agent env file when: k3s_bootstrap_token != '' ansible.builtin.template: @@ -7,16 +6,16 @@ src: k3s-agent.service.env.j2 owner: root group: root - mode: 0640 + mode: "0640" register: _k3s_agent_token_result - name: Ensure password directory exists - ansible.builtin.file: + ansible.builtin.file: path: "/etc/rancher/node" state: directory owner: root group: root - mode: 0640 + mode: "0640" - name: Write node password ansible.builtin.copy: @@ -24,10 +23,10 @@ content: "{{ vault_k3s_node_password }}" owner: root group: root - mode: 0640 # normal k3s install is 644 but that doesn't feel right + mode: "0640" # normal k3s install is 644 but that doesn't feel right - name: Start/restart k3s agent - when: _k3s_agent_token_result.changed + when: _k3s_agent_token_result.changed # noqa: no-handler ansible.builtin.systemd: name: k3s-agent daemon_reload: true diff --git a/ansible/roles/k3s/tasks/install.yml b/ansible/roles/k3s/tasks/install.yml index c250f87..79efb65 100644 --- a/ansible/roles/k3s/tasks/install.yml +++ b/ansible/roles/k3s/tasks/install.yml @@ -1,7 +1,6 @@ --- - - name: Check for existing k3s installation - stat: + ansible.builtin.stat: path: /var/lib/rancher/k3s register: stat_result @@ -9,62 +8,64 @@ # Using air-gapped install so containers are pre-installed to avoid rate-limiting from registries on cluster startup when: not stat_result.stat.exists block: + - name: Download k3s binary + ansible.builtin.get_url: + url: "https://github.com/k3s-io/k3s/releases/download/{{ k3s_version | urlencode }}/k3s" + dest: /usr/bin/k3s + owner: root + group: root + mode: "0755" - - name: Download k3s binary - ansible.builtin.get_url: - url: "https://github.com/k3s-io/k3s/releases/download/{{ k3s_version | urlencode }}/k3s" - dest: /usr/bin/k3s - owner: root - group: root - mode: "0755" - - - name: Install k3s SELinux policy package - yum: - name: "https://github.com/k3s-io/k3s-selinux/releases/download/{{ k3s_selinux_release }}/k3s-selinux-{{ k3s_selinux_rpm_version }}.el{{ ansible_distribution_major_version }}.noarch.rpm" - disable_gpg_check: true + - name: Install k3s SELinux policy package + ansible.builtin.dnf: + # yamllint disable-line rule:line-length + name: "https://github.com/k3s-io/k3s-selinux/releases/download/{{ k3s_selinux_release }}/k3s-selinux-{{ k3s_selinux_rpm_version }}.el{{ ansible_distribution_major_version }}.noarch.rpm" + disable_gpg_check: true - - name: Create image directory - ansible.builtin.file: - path: "/var/lib/rancher/k3s/agent/images" - state: directory + - name: Create image directory + ansible.builtin.file: + path: "/var/lib/rancher/k3s/agent/images" + state: 
directory + mode: "0755" - - name: Install k3s' internal images - ansible.builtin.get_url: - url: "https://github.com/k3s-io/k3s/releases/download/{{ k3s_version | urlencode }}/k3s-airgap-images-amd64.tar.zst" - dest: /var/lib/rancher/k3s/agent/images/k3s-airgap-images-amd64.tar.zst + - name: Install k3s' internal images + ansible.builtin.get_url: + url: "https://github.com/k3s-io/k3s/releases/download/{{ k3s_version | urlencode }}/k3s-airgap-images-amd64.tar.zst" + dest: /var/lib/rancher/k3s/agent/images/k3s-airgap-images-amd64.tar.zst + mode: "0644" - - name: Download k3s install script - ansible.builtin.get_url: - url: https://get.k3s.io/ - timeout: 120 - dest: /usr/bin/k3s-install.sh - owner: root - group: root - mode: "0755" + - name: Download k3s install script + ansible.builtin.get_url: + url: https://get.k3s.io/ + timeout: 120 + dest: /usr/bin/k3s-install.sh + owner: root + group: root + mode: "0755" - - name: Install k3s - ansible.builtin.shell: - cmd: /usr/bin/k3s-install.sh - environment: - INSTALL_K3S_VERSION: "{{ k3s_version }}" - INSTALL_K3S_EXEC: "{{ item }} --node-ip=${K3S_NODE_IP}" - INSTALL_K3S_SKIP_START: "true" - INSTALL_K3S_SKIP_ENABLE: "true" - INSTALL_K3S_BIN_DIR: "/usr/bin" - INSTALL_K3S_SKIP_DOWNLOAD: "true" - changed_when: true - loop: - - server --disable=traefik - - agent + - name: Install k3s + ansible.builtin.command: + cmd: /usr/bin/k3s-install.sh + environment: + INSTALL_K3S_VERSION: "{{ k3s_version }}" + INSTALL_K3S_EXEC: "{{ item }} --node-ip=${K3S_NODE_IP}" + INSTALL_K3S_SKIP_START: "true" + INSTALL_K3S_SKIP_ENABLE: "true" + INSTALL_K3S_BIN_DIR: "/usr/bin" + INSTALL_K3S_SKIP_DOWNLOAD: "true" + changed_when: true + loop: + - server --disable=traefik + - agent - name: Install helm - unarchive: + ansible.builtin.unarchive: src: "https://get.helm.sh/helm-{{ k3s_helm_version }}-linux-amd64.tar.gz" dest: /usr/bin extra_opts: "--strip-components=1" owner: root group: root - mode: 0755 + mode: "0755" remote_src: true - name: Add k3s kubeconfig as environment variable diff --git a/ansible/roles/k3s/tasks/server-runtime.yml b/ansible/roles/k3s/tasks/server-runtime.yml index 6c0878e..1221cda 100644 --- a/ansible/roles/k3s/tasks/server-runtime.yml +++ b/ansible/roles/k3s/tasks/server-runtime.yml @@ -1,9 +1,9 @@ --- - - name: Template k3s env file ansible.builtin.template: dest: /etc/systemd/system/k3s.service.env src: k3s.service.env.j2 + mode: "0644" register: _k3s_env_file_status - name: Start k3s server @@ -14,9 +14,9 @@ enabled: true # Possible race here as there is a delay between agents disconnecting and being registered as down, probably won't be hit in general use though -- name: Check which k3s agents are connected +- name: Check which k3s agents are connected # noqa: no-changed-when ansible.builtin.shell: - cmd: kubectl get nodes --no-headers | grep -w Ready + cmd: set -o pipefail && kubectl get nodes --no-headers | grep -w Ready register: _k3s_connected_nodes retries: 6 # task may fail if server is not ready yet delay: 10 @@ -24,12 +24,12 @@ - when: _k3s_connected_nodes.stdout_lines | length != groups['k3s'] | length block: - - name: Generate new bootstrap token if not all agents are connected - no_log: true - shell: - cmd: "k3s token create --ttl {{ k3s_bootstrap_token_expiry }}" - register: _k3s_token_output + - name: Generate new bootstrap token if not all agents are connected # noqa: no-changed-when + no_log: true + ansible.builtin.command: + cmd: "k3s token create --ttl {{ k3s_bootstrap_token_expiry }}" + register: _k3s_token_output - - name: Set 
bootstrap token as fact - set_fact: - k3s_bootstrap_token: "{{ _k3s_token_output.stdout }}" + - name: Set bootstrap token as fact + ansible.builtin.set_fact: + k3s_bootstrap_token: "{{ _k3s_token_output.stdout }}" diff --git a/ansible/roles/k3s/templates/k3s-agent.service.env.j2 b/ansible/roles/k3s/templates/k3s-agent.service.env.j2 index b994b06..9444765 100644 --- a/ansible/roles/k3s/templates/k3s-agent.service.env.j2 +++ b/ansible/roles/k3s/templates/k3s-agent.service.env.j2 @@ -1,3 +1,3 @@ -K3S_NODE_IP={{ ansible_host }} -K3S_TOKEN={{ k3s_bootstrap_token }} -K3S_URL=https://{{ k3s_server_name }}:6443 +K3S_NODE_IP="{{ ansible_host }}" +K3S_TOKEN="{{ k3s_bootstrap_token }}" +K3S_URL="https://{{ k3s_server_name }}:6443" diff --git a/ansible/roles/k3s/templates/k3s.service.env.j2 b/ansible/roles/k3s/templates/k3s.service.env.j2 index 746e6d8..38fb911 100644 --- a/ansible/roles/k3s/templates/k3s.service.env.j2 +++ b/ansible/roles/k3s/templates/k3s.service.env.j2 @@ -1 +1 @@ -K3S_NODE_IP={{ ansible_host }} +K3S_NODE_IP="{{ ansible_host }}" diff --git a/ansible/roles/k9s/tasks/main.yml b/ansible/roles/k9s/tasks/main.yml index 674b4df..bebe7b8 100644 --- a/ansible/roles/k9s/tasks/main.yml +++ b/ansible/roles/k9s/tasks/main.yml @@ -1,12 +1,12 @@ --- - - - name: Check if k9s is installed - ansible.builtin.stat: - path: "/usr/bin/k9s" - register: _k9s_stat_result +- name: Check if k9s is installed + ansible.builtin.stat: + path: "/usr/bin/k9s" + register: _k9s_stat_result - - name: Install k9s and clean up temporary files - block: +- name: Install k9s and clean up temporary files + when: not _k9s_stat_result.stat.exists + block: - name: Create install directory ansible.builtin.file: path: /tmp/k9s @@ -28,17 +28,16 @@ ansible.builtin.unarchive: src: /tmp/k9s/k9s_Linux_amd64.tar.gz dest: /tmp/k9s - remote_src: yes + remote_src: true - name: Add k9s to root path ansible.builtin.copy: src: /tmp/k9s/k9s dest: /usr/bin/k9s mode: u+rwx - remote_src: yes + remote_src: true - name: Cleanup k9s install directory ansible.builtin.file: path: /tmp/k9s state: absent - when: not _k9s_stat_result.stat.exists diff --git a/ansible/roles/lustre/README.md b/ansible/roles/lustre/README.md index 0269ad6..56e6b3a 100644 --- a/ansible/roles/lustre/README.md +++ b/ansible/roles/lustre/README.md @@ -7,22 +7,25 @@ Install and configure a Lustre client. This builds RPM packages from source. **NB:** Currently this only supports RockyLinux 9. ## Role Variables + The following variables control configuration of Lustre clients. + - `lustre_lnet_label`: Optional str. The "lnet label" part of the host's NID, e.g. `tcp0`. Only the `tcp` protocol type is currently supported. Default `tcp`. - `lustre_mgs_nid`: Required str. The NID(s) for the MGS, e.g. `192.168.227.11@tcp1` (separate mutiple MGS NIDs using `:`). - `lustre_mounts`: Required list. Define Lustre filesystems and mountpoints as a list of dicts with keys: - - `fs_name`: Required str. The name of the filesystem to mount - - `mount_point`: Required str. Path to mount filesystem at. - - `mount_state`: Optional mount state, as for [ansible.posix.mount](https://docs.ansible.com/ansible/latest/collections/ansible/posix/mount_module.html#parameter-state). Default is `lustre_mount_state`. - - `mount_options`: Optional mount options. Default is `lustre_mount_options`. + - `fs_name`: Required str. The name of the filesystem to mount + - `mount_point`: Required str. Path to mount filesystem at. 
+ - `mount_state`: Optional mount state, as for [ansible.posix.mount](https://docs.ansible.com/ansible/latest/collections/ansible/posix/mount_module.html#parameter-state). Default is `lustre_mount_state`. + - `mount_options`: Optional mount options. Default is `lustre_mount_options`. - `lustre_mount_state`. Optional default mount state for all mounts, as for [ansible.posix.mount](https://docs.ansible.com/ansible/latest/collections/ansible/posix/mount_module.html#parameter-state). Default is `mounted`. - `lustre_mount_options`. Optional default mount options. Default values are systemd defaults from [Lustre client docs](http://wiki.lustre.org/Mounting_a_Lustre_File_System_on_Client_Nodes). The following variables control the package build and and install: + - `lustre_version`: Optional str. Version of lustre to build, default `2.15.7` -- `lustre_repo`: Optional str. URL for Lustre repo. Default is `git://git.whamcloud.com/fs/lustre-release`.git. +- `lustre_repo`: Optional str. URL for Lustre repository. Default is `git://git.whamcloud.com/fs/lustre-release`.git. - `lustre_build_packages`: Optional list. Prerequisite packages required to build Lustre. See `defaults/main.yml`. - `lustre_build_dir`: Optional str. Path to build lustre at, default `/tmp/lustre-release`. - `lustre_configure_opts`: Optional list. Options to `./configure` command. Default builds client rpms supporting Mellanox OFED, without support for GSS keys. -- `lustre_rpm_globs`: Optional list. Shell glob patterns for rpms to install. Note order is important as the built RPMs are not in a yum repo. Default is just the `kmod-lustre-client` and `lustre-client` packages. +- `lustre_rpm_globs`: Optional list. Shell glob patterns for rpms to install. Note order is important as the built RPMs are not in a yum repository. Default is just the `kmod-lustre-client` and `lustre-client` packages. - `lustre_build_cleanup`: Optional bool. Whether to uninstall prerequisite packages and delete the build directories etc. Default `true`. 
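As a concrete illustration of the mount-related variables listed above, a minimal sketch of inventory group_vars for hosts in the Lustre client group might look like the following; the MGS NID, filesystem name and mount point are placeholder values for illustration, not defaults from this role:

```yaml
# hypothetical environments/site/inventory/group_vars/lustre.yml
lustre_mgs_nid: "192.168.227.11@tcp1"   # MGS NID(s); separate multiple NIDs with ':'
lustre_mounts:
  - fs_name: scratch                    # Lustre filesystem name to mount
    mount_point: /mnt/lustre/scratch    # client mountpoint
    # mount_state and mount_options are optional and fall back to
    # lustre_mount_state / lustre_mount_options respectively
```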
diff --git a/ansible/roles/lustre/defaults/main.yml b/ansible/roles/lustre/defaults/main.yml index 14ddc05..9958eec 100644 --- a/ansible/roles/lustre/defaults/main.yml +++ b/ansible/roles/lustre/defaults/main.yml @@ -1,9 +1,10 @@ +--- lustre_version: '2.15.7' lustre_lnet_label: tcp -#lustre_mgs_nid: +# lustre_mgs_nid: lustre_mounts: [] lustre_mount_state: mounted -lustre_mount_options: 'defaults,_netdev,noauto,x-systemd.automount,x-systemd.requires=lnet.service,nosuid,nodev' +lustre_mount_options: "defaults,_netdev,noauto,x-systemd.automount,x-systemd.requires=lnet.service,nosuid,nodev" # below variables are for build and should not generally require changes lustre_repo: "https://github.com/lustre/lustre-release.git" diff --git a/ansible/roles/lustre/tasks/configure.yml b/ansible/roles/lustre/tasks/configure.yml index be5ba35..fab9e60 100644 --- a/ansible/roles/lustre/tasks/configure.yml +++ b/ansible/roles/lustre/tasks/configure.yml @@ -1,5 +1,6 @@ +--- - name: Gather Lustre interface info - shell: + ansible.builtin.shell: cmd: | ip --json r get {{ _lustre_mgs_ip }} changed_when: false @@ -8,23 +9,23 @@ _lustre_mgs_ip: "{{ lustre_mgs_nid | split('@') | first }}" - name: Set facts for Lustre interface - set_fact: + ansible.builtin.set_fact: _lustre_interface: "{{ _lustre_ip_r_mgs_info.dev }}" _lustre_ip: "{{ _lustre_ip_r_mgs_info.prefsrc }}" vars: _lustre_ip_r_mgs_info: "{{ _lustre_ip_r_mgs.stdout | from_json | first }}" - name: Write LNet configuration file - template: + ansible.builtin.template: src: lnet.conf.j2 - dest: /etc/lnet.conf # exists from package install, expected by lnet service + dest: /etc/lnet.conf # exists from package install, expected by lnet service owner: root group: root mode: u=rw,go=r # from package install register: _lnet_conf - name: Ensure lnet service state - systemd: + ansible.builtin.systemd: name: lnet state: "{{ 'restarted' if _lnet_conf.changed else 'started' }}" @@ -32,6 +33,7 @@ ansible.builtin.file: path: "{{ item.mount_point }}" state: directory + mode: "0755" loop: "{{ lustre_mounts }}" when: "(item.mount_state | default(lustre_mount_state)) != 'absent'" diff --git a/ansible/roles/lustre/tasks/install.yml b/ansible/roles/lustre/tasks/install.yml index aedc3a5..7a91a38 100644 --- a/ansible/roles/lustre/tasks/install.yml +++ b/ansible/roles/lustre/tasks/install.yml @@ -1,25 +1,26 @@ +--- - name: Install lustre build prerequisites ansible.builtin.dnf: name: "{{ lustre_build_packages }}" register: _lustre_dnf_build_packages - + - name: Clone lustre git repo ansible.builtin.git: repo: "{{ lustre_repo }}" dest: "{{ lustre_build_dir }}" version: "{{ lustre_version }}" -- name: Prepare for lustre configuration +- name: Prepare for lustre configuration # noqa: no-changed-when ansible.builtin.command: cmd: sh ./autogen.sh chdir: "{{ lustre_build_dir }}" -- name: Configure lustre build +- name: Configure lustre build # noqa: no-changed-when ansible.builtin.command: cmd: "./configure {{ lustre_configure_opts | join(' ') }}" chdir: "{{ lustre_build_dir }}" -- name: Build lustre +- name: Build lustre # noqa: no-changed-when ansible.builtin.command: cmd: make rpms chdir: "{{ lustre_build_dir }}" @@ -32,17 +33,18 @@ register: _lustre_find_rpms - name: Check rpms found - assert: + ansible.builtin.assert: that: _lustre_find_rpms.files | length fail_msg: "No lustre repos found with lustre_rpm_globs = {{ lustre_rpm_globs }}" - name: Install lustre rpms + # checkov:skip=CKV2_ANSIBLE_4: "Ensure that packages with untrusted or missing GPG signatures are not used by dnf" 
ansible.builtin.dnf: - name: "{{ _lustre_find_rpms.files | map(attribute='path')}}" - disable_gpg_check: yes + name: "{{ _lustre_find_rpms.files | map(attribute='path') }}" + disable_gpg_check: true - name: Delete lustre build dir - file: + ansible.builtin.file: path: "{{ lustre_build_dir }}" state: absent when: lustre_build_cleanup | bool diff --git a/ansible/roles/lustre/tasks/validate.yml b/ansible/roles/lustre/tasks/validate.yml index 609a77f..6469ac1 100644 --- a/ansible/roles/lustre/tasks/validate.yml +++ b/ansible/roles/lustre/tasks/validate.yml @@ -1,20 +1,21 @@ +--- - name: Check kernel-devel package is installed - command: "dnf list --installed kernel-devel-{{ ansible_kernel }}" + ansible.builtin.command: "dnf list --installed kernel-devel-{{ ansible_kernel }}" changed_when: false # NB: we don't check here the kernel will remain the same after reboot etc, see ofed/install.yml - name: Ensure SELinux in permissive mode - assert: + ansible.builtin.assert: that: selinux_state in ['permissive', 'disabled'] fail_msg: "SELinux must be permissive for Lustre not '{{ selinux_state }}'; see variable selinux_state" - name: Ensure lustre_mgs_nid is defined - assert: + ansible.builtin.assert: that: lustre_mgs_nid is defined fail_msg: Variable lustre_mgs_nid must be defined - name: Ensure lustre_mounts entries define filesystem name and mount point - assert: + ansible.builtin.assert: that: - item.fs_name is defined - item.mount_point is defined diff --git a/ansible/roles/mysql/README.md b/ansible/roles/mysql/README.md index 2c735db..e85c173 100644 --- a/ansible/roles/mysql/README.md +++ b/ansible/roles/mysql/README.md @@ -1,18 +1,14 @@ -mysql -===== +# MySQL Deploy containerised `mysql` server using Podman. - -Requirements ------------- +## Requirements None. -Role Variables --------------- +## Role Variables -- `mysql_root_password`: Required str. Password to set for `root` mysql user. **NB** This cannot be changed by this role once mysql server has initialised. +- `mysql_root_password`: Required str. Password to set for `root` MySQL user. **NB** This cannot be changed by this role once MySQL server has initialised. - `mysql_tag`: Optional str. Tag for version of `mysql` container image to use. Default `8.0.30`. - `mysql_systemd_service_enabled`: Optional bool. Whether `mysql` service starts on boot. Default `yes`. - `mysql_state`: Optional str. As per `ansible.builtin.systemd:state`. Default is `started` or `restarted` as required. @@ -22,13 +18,11 @@ Role Variables - `mysql_users`: Optional list of dicts defining users as per `community.mysql.mysql_user`. Default `[]`. - `mysql_databases`: Optional list of dicts defining databases as per `community.mysql.mysql_db`. Default `[]`. -Dependencies ------------- +## Dependencies None. 
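As a hedged illustration of the `mysql_users` and `mysql_databases` variables described above: each entry is passed directly to `community.mysql.mysql_user` / `community.mysql.mysql_db`, so definitions might look like the sketch below. The database name, user and privileges are placeholders, and `vault_mysql_slurm_password` is assumed to be a vault-protected variable defined elsewhere in the inventory.

```yaml
mysql_databases:
  - name: slurm_acct_db                           # passed to community.mysql.mysql_db
mysql_users:
  - name: slurm                                   # passed to community.mysql.mysql_user
    host: "%"                                     # allow connections from any host
    password: "{{ vault_mysql_slurm_password }}"  # assumed vault-protected variable
    priv: "slurm_acct_db.*:ALL"
```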
-Example Playbook ----------------- +## Example Playbook ```yaml - name: Setup DB @@ -38,15 +32,13 @@ Example Playbook - mysql tasks: - include_role: - name: mysql + name: mysql ``` -License -------- +## License Apache v2 -Author Information ------------------- +## Author Information -Steve Brasier steveb@stackhpc.com +Steve Brasier diff --git a/ansible/roles/mysql/defaults/main.yml b/ansible/roles/mysql/defaults/main.yml index b15c800..9d549b3 100644 --- a/ansible/roles/mysql/defaults/main.yml +++ b/ansible/roles/mysql/defaults/main.yml @@ -1,9 +1,10 @@ +--- # required: # mysql_root_password: # TODO: make it possible to CHANGE root password mysql_tag: 8.0.30 -mysql_systemd_service_enabled: yes -#mysql_state: # default is started or restarted as required +mysql_systemd_service_enabled: true +# mysql_state: # default is started or restarted as required mysql_podman_user: "{{ ansible_user }}" mysql_datadir: /var/lib/mysql mysql_mysqld_options: [] # list of str options to mysqld, see `run -it --rm mysql:tag --verbose --help` diff --git a/ansible/roles/mysql/tasks/configure.yml b/ansible/roles/mysql/tasks/configure.yml index d4dd4cd..7bf9cb3 100644 --- a/ansible/roles/mysql/tasks/configure.yml +++ b/ansible/roles/mysql/tasks/configure.yml @@ -1,6 +1,7 @@ +--- - name: Create environment file for mysql server root password # NB: This doesn't trigger a restart on changes as it will be ignored once mysql is initialised - copy: + ansible.builtin.copy: dest: /etc/sysconfig/mysqld content: | MYSQL_INITIAL_ROOT_PASSWORD='{{ mysql_root_password }}' @@ -9,29 +10,29 @@ mode: u=rw,go= - name: Ensure mysql service state - systemd: + ansible.builtin.systemd: name: mysql state: "{{ mysql_state | default('restarted' if _mysql_unitfile.changed else 'started') }}" enabled: "{{ mysql_systemd_service_enabled }}" daemon_reload: "{{ _mysql_unitfile.changed }}" -- block: - - name: Wait for mysql to initialise +- when: "mysql_state | default('unspecified') != 'stopped'" + block: + - name: Wait for mysql to initialise # NB: It is not sufficent to wait_for the port - community.mysql.mysql_info: - login_user: root - login_password: "{{ mysql_root_password }}" - no_log: "{{ no_log | default(true) }}" - register: _mysql_info - until: "'version' in _mysql_info" - retries: 90 - delay: 2 + community.mysql.mysql_info: + login_user: root + login_password: "{{ mysql_root_password }}" + no_log: "{{ no_log | default(true) }}" + register: _mysql_info + until: "'version' in _mysql_info" + retries: 90 + delay: 2 - - name: Ensure mysql databases created - community.mysql.mysql_db: "{{ item }}" - loop: "{{ mysql_databases}}" + - name: Ensure mysql databases created + community.mysql.mysql_db: "{{ item }}" # noqa: args[module] + loop: "{{ mysql_databases}}" - - name: Ensure mysql users present - community.mysql.mysql_user: "{{ item }}" - loop: "{{ mysql_users }}" - when: "mysql_state | default('unspecified') != 'stopped'" + - name: Ensure mysql users present + community.mysql.mysql_user: "{{ item }}" # noqa: args[module] + loop: "{{ mysql_users }}" diff --git a/ansible/roles/mysql/tasks/install.yml b/ansible/roles/mysql/tasks/install.yml index 4ed5d30..0a108d2 100644 --- a/ansible/roles/mysql/tasks/install.yml +++ b/ansible/roles/mysql/tasks/install.yml @@ -1,22 +1,25 @@ +--- - name: Install pip - dnf: + ansible.builtin.dnf: name: python3-pip - name: Install python mysql client - pip: + ansible.builtin.pip: name: - pymysql - cryptography state: present - name: Create systemd mysql container unit file - template: + 
ansible.builtin.template: dest: /etc/systemd/system/mysql.service src: mysql.service.j2 + mode: "0644" register: _mysql_unitfile - name: Pull container image containers.podman.podman_image: name: docker.io/library/mysql tag: "{{ mysql_tag }}" + become: true become_user: "{{ mysql_podman_user }}" diff --git a/ansible/roles/mysql/tasks/main.yml b/ansible/roles/mysql/tasks/main.yml index 2b65e84..cc29fba 100644 --- a/ansible/roles/mysql/tasks/main.yml +++ b/ansible/roles/mysql/tasks/main.yml @@ -1,2 +1,3 @@ -- import_tasks: install.yml -- import_tasks: configure.yml +--- +- ansible.builtin.import_tasks: install.yml +- ansible.builtin.import_tasks: configure.yml diff --git a/ansible/roles/nhc/README.md b/ansible/roles/nhc/README.md index 8831e0e..a826932 100644 --- a/ansible/roles/nhc/README.md +++ b/ansible/roles/nhc/README.md @@ -22,6 +22,7 @@ compute ``` When the `anisble/site.yml` playbook is run this will automatically: + 1. Add NHC-related configuration to the `slurm.conf` Slurm configuration file. The default configuration is defined in `openhpc_config_nhc` (see [environments/common/inventory/group_vars/all/openhpc.yml](../../../environments/common/inventory/group_vars/all/openhpc.yml)). @@ -33,10 +34,11 @@ When the `anisble/site.yml` playbook is run this will automatically: 2. Template out node health check rules using Ansible facts for each compute node. Currently these check: - - Filesystem mounts - - Ethernet interfaces - See `/etc/nhc/nhc.conf` on a compute node for the full configuration. + - Filesystem mounts + - Ethernet interfaces + + See `/etc/nhc/nhc.conf` on a compute node for the full configuration. If a node healthcheck run fails, Slurm will mark the node `DOWN`. With the default [alerting configuration](../../../docs/alerting.md) this will trigger @@ -52,15 +54,17 @@ an alert. ## Structure This role contains 3x task files, which run at different times: + - `main.yml`: Runs from `site.yml` -> `slurm.yml`. Templates health check configuration to nodes. - `export.yml`: Runs from `site.yml` -> `final.yml` via role `compute_init` tasks `export.yml`. Templates health check configuration to the cluster NFS - share for compute-init. + share for compute-init. - `boot.yml`: Runs on boot via `compute_init/files/compute-init.yml`. Copies the node's generated health check configuration from the cluster share to local disk. Note that the `stackhpc.openhpc` role: + - Installs the required package - Configures slurm.conf parameterss diff --git a/ansible/roles/nhc/tasks/export.yml b/ansible/roles/nhc/tasks/export.yml index afa440f..d6b1120 100644 --- a/ansible/roles/nhc/tasks/export.yml +++ b/ansible/roles/nhc/tasks/export.yml @@ -3,4 +3,5 @@ ansible.builtin.template: src: "{{ nhc_config_template }}" dest: "/exports/cluster/hostconfig/{{ inventory_hostname }}/nhc.conf" + mode: "0644" delegate_to: "{{ groups['control'] | first }}" diff --git a/ansible/roles/nhc/tasks/main.yml b/ansible/roles/nhc/tasks/main.yml index 5f6034f..a507113 100644 --- a/ansible/roles/nhc/tasks/main.yml +++ b/ansible/roles/nhc/tasks/main.yml @@ -1,4 +1,4 @@ - +--- - name: Ensure NHC configuration directory exists # When running site.yml after login/control upgrade, nhc group might be # enabled in repo, but as the compute nodes have not yet been upgraded they diff --git a/ansible/roles/ofed/README.md b/ansible/roles/ofed/README.md index 7d4bb60..9eab86f 100644 --- a/ansible/roles/ofed/README.md +++ b/ansible/roles/ofed/README.md @@ -6,20 +6,21 @@ > instead. 
This role installs Mellanox OFED: + - It checks that the running kernel is the latest installed one, and errors if not. - Installation uses the `mlnxofedinstall` command, with support for the running kernel -and (by default) without firmware updates. + and (by default) without firmware updates. As OFED installation takes a long time generally this should only be used during image build, for example by setting: -``` +```yaml environments/groups//groups: [ofed:children] builder ``` -# Role variables +## Role variables See `defaults/main.yml` diff --git a/ansible/roles/ofed/defaults/main.yml b/ansible/roles/ofed/defaults/main.yml index 63caf24..422ccc1 100644 --- a/ansible/roles/ofed/defaults/main.yml +++ b/ansible/roles/ofed/defaults/main.yml @@ -1,4 +1,6 @@ -ofed_version: '24.10-3.2.5.0' # LTS +--- +ofed_version: "24.10-3.2.5.0" # LTS +# yamllint disable-line rule:line-length ofed_download_url: https://content.mellanox.com/ofed/MLNX_OFED-{{ ofed_version }}/MLNX_OFED_LINUX-{{ ofed_version }}-{{ ofed_distro }}{{ ofed_distro_version }}-{{ ofed_arch }}.tgz ofed_distro: rhel # NB: not expected to work on other distros due to installation differences ofed_distro_version: "{{ ansible_distribution_version }}" # e.g. '8.9' diff --git a/ansible/roles/ofed/tasks/install.yml b/ansible/roles/ofed/tasks/install.yml index 45f341b..1532fa4 100644 --- a/ansible/roles/ofed/tasks/install.yml +++ b/ansible/roles/ofed/tasks/install.yml @@ -1,30 +1,34 @@ +--- - name: Get installed kernels - command: dnf list --installed kernel + ansible.builtin.command: dnf list --installed kernel register: _ofed_dnf_kernels changed_when: false - name: Determine running kernel - command: uname -r # e.g. 4.18.0-513.18.1.el8_9.x86_64 + ansible.builtin.command: uname -r register: _ofed_loaded_kernel changed_when: false - name: Check current kernel is newest installed - assert: + ansible.builtin.assert: that: _ofed_kernel_current == _ofed_dnf_kernels_newest fail_msg: "Kernel {{ _ofed_loaded_kernel.stdout }} is loaded but newer {{ _ofed_dnf_kernels_newest }} is installed: consider rebooting?" vars: + # yamllint disable rule:line-length _ofed_kernel_current: >- {{ _ofed_loaded_kernel.stdout | regex_replace('\.(?:.(?!\.))+$', '') | regex_replace('\.(?:.(?!\.))+$', '') }} _ofed_dnf_kernels_newest: >- - {{ _ofed_dnf_kernels.stdout_lines[1:] | map('split') | map(attribute=1) | map('regex_replace', '\.(?:.(?!\.))+$', '') | community.general.version_sort | last }} - # dnf line format e.g. "kernel.x86_64 4.18.0-513.18.1.el8_9 @baseos " + {{ _ofed_dnf_kernels.stdout_lines[1:] | map('split') | map(attribute=1) | map('regex_replace', '\.(?:.(?!\.))+$', '') | community.general.version_sort | last + }} + # yamllint enable rule:line-length + # dnf line format e.g. 
"kernel.x86_64 4.18.0-513.18.1.el8_9 @baseos " - name: Enable epel - dnf: + ansible.builtin.dnf: name: epel-release - name: Check for existing OFED installation - command: ofed_info + ansible.builtin.command: ofed_info changed_when: false failed_when: - _ofed_info.rc > 0 @@ -32,7 +36,7 @@ register: _ofed_info - name: Install build prerequisites - dnf: + ansible.builtin.dnf: name: "{{ ofed_build_packages + (ofed_build_rl8_packages if ofed_distro_major_version == '8' else []) }}" when: "'MLNX_OFED_LINUX-' + ofed_version not in _ofed_info.stdout" # don't want to install a load of prereqs unnecessarily @@ -41,13 +45,13 @@ ansible.builtin.unarchive: src: "{{ ofed_download_url }}" dest: "{{ ofed_tmp_dir }}" - remote_src: yes - become: no + remote_src: true + become: false when: "'MLNX_OFED_LINUX-' + ofed_version not in _ofed_info.stdout" # Below from https://docs.nvidia.com/networking/display/mlnxofedv24010331/user+manual -- name: Run OFED install script - command: +- name: Run OFED install script # noqa: no-changed-when + ansible.builtin.command: cmd: > ./mlnxofedinstall --add-kernel-support @@ -63,13 +67,13 @@ async: "{{ 45 * 60 }}" # wait for up to 45 minutes poll: 15 # check every 15 seconds -- name: Update initramfs - command: +- name: Update initramfs # noqa: no-changed-when + ansible.builtin.command: cmd: dracut -f when: '"update your initramfs" in _ofed_install.stdout | default("")' failed_when: false # always shows errors due to deleted modules for inbox RDMA drivers -- name: Load the new driver - command: +- name: Load the new driver # noqa: no-changed-when + ansible.builtin.command: cmd: /etc/init.d/openibd restart when: '"To load the new driver" in _ofed_install.stdout | default("")' diff --git a/ansible/roles/ofed/tasks/main.yml b/ansible/roles/ofed/tasks/main.yml index e7a272f..df97825 100644 --- a/ansible/roles/ofed/tasks/main.yml +++ b/ansible/roles/ofed/tasks/main.yml @@ -1 +1,2 @@ -- include_tasks: install.yml +--- +- ansible.builtin.include_tasks: install.yml diff --git a/ansible/roles/openondemand/README.md b/ansible/roles/openondemand/README.md index 099276c..b1fb673 100644 --- a/ansible/roles/openondemand/README.md +++ b/ansible/roles/openondemand/README.md @@ -17,9 +17,14 @@ This uses the [osc.ood](https://github.com/OSC/ood-ansible) Ansible role to prov ### General - `openondemand_clusters`: Required. Synonym for [osc.ood: clusters](https://github.com/OSC/ood-ansible#clusters) role variable. -- `openondemand_servername`: Required. Synonym for [osc.ood: servername](https://github.com/OSC/ood-ansible/blob/master/defaults/main/ood_portal.yml#L27) role variable. This defines what the Open Ondemand portal's Apache server uses for the [name-based virtual host](https://httpd.apache.org/docs/current/mod/core.html#servername). It should be the IP or hostname(+domain) part of the URL used to access Open Ondemand in the browser, e.g. `ondemand.mysite.org`. **NB:** If a domain or external IP is not available, specify the host's internal IP here and use ssh with a `DynamicForward` option and a SOCKS proxy to access this address. Using ssh's `LocalForward` option is not recommended as the server name will have to be `localhost` which causes some issues. Changing this value on an already deployed cluster requires a reboot of the login node for OOD app state to be correctly refreshed. +- `openondemand_servername`: Required. Synonym for [osc.ood: servername](https://github.com/OSC/ood-ansible/blob/master/defaults/main/ood_portal.yml#L27) role variable. 
+ This defines what the Open Ondemand portal's Apache server uses for the [name-based virtual host](https://httpd.apache.org/docs/current/mod/core.html#servername). + It should be the IP or hostname(+domain) part of the URL used to access Open Ondemand in the browser, e.g. `ondemand.mysite.org`. **NB:** If a domain or external IP is not available, specify the host's internal IP here and use SSH with a `DynamicForward` option and a SOCKS proxy to access this address. + Using ssh's `LocalForward` option is not recommended as the server name will have to be `localhost` which causes some issues. + Changing this value on an already deployed cluster requires a reboot of the login node for OOD app state to be correctly refreshed. ### Authentication + See the Open Ondemand [Authentication docs](https://osc.github.io/ood-documentation/latest/authentication/overview.html) for an overview of the authentication process. - `openondemand_auth`: Required. Authentication method, either `'oidc'` or `'basic_pam'`. See relevant subsection below. @@ -28,36 +33,41 @@ See the Open Ondemand [Authentication docs](https://osc.github.io/ood-documentat - `openondemand_username`: The remote authenticated username. See also `openondemand_oidc_remote_user_claim` if using OIDC authentication. #### OIDC authentication + The following variables are active when `openondemand_auth` is `oidc`. This role uses the variables below plus a few required defaults to set the `osc.ood: ood_auth_openidc` [variable](https://github.com/OSC/ood-ansible#open-id-connect) - if the below is insufficent to correctly configure OIDC then set `ood_auth_openidc` directly. + - `openondemand_oidc_client_id`: Required. Client ID, as specified by the OIDC provider - `openondemand_oidc_client_secret`: Required. Client secret, as specified the OIDC provider (should be vault-protected). - `openondemand_oidc_provider_url`: Required. URL including protocol for the OIDC provider. - `openondemand_oidc_crypto_passphrase`: Required. Random string (should be vault protected) - `openondemand_oidc_scope`: Optional. A space-separated string giving the [OIDC scopes](https://auth0.com/docs/configure/apis/scopes/openid-connect-scopes) to request from the OIDC provider. What is available depends on the provider. Default: `openid profile preferred_username`. -- `openondemand_oidc_remote_user_claim`: Optional. A string giving the [OIDC claim](https://auth0.com/docs/configure/apis/scopes/openid-connect-scopes#standard-claims) to use as the remote user name. What is available depends on the provider and the claims made. Default: `preferred_username`. +- `openondemand_oidc_remote_user_claim`: Optional. A string giving the [OIDC claim](https://auth0.com/docs/configure/apis/scopes/openid-connect-scopes#standard-claims) to use as the remote username. What is available depends on the provider and the claims made. Default: `preferred_username`. The OIDC provider should be configured to redirect to `https://{{ openondemand_servername }}/oidc` with scopes as appropriate for `openondemand_oidc_scope`. - #### Basic/PAM authentication + This option uses HTTP Basic Authentication (i.e. browser prompt) to get a username and password. This is then checked against an existing local user using PAM. Note that HTTPS is configured by default, so the password is protected in transit, although there are [other](https://security.stackexchange.com/a/990) security concerns with Basic Authentication. No other authentication options are required for this method. 
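To make the authentication variables above concrete, a minimal sketch of the relevant group_vars for each method is shown below; the client ID, provider URL and vault variable names are illustrative assumptions rather than values shipped with this role.

```yaml
# OIDC authentication -- placeholder values:
openondemand_auth: oidc
openondemand_oidc_client_id: ondemand
openondemand_oidc_client_secret: "{{ vault_openondemand_oidc_client_secret }}"    # assumed vault-protected
openondemand_oidc_provider_url: https://keycloak.mysite.org/realms/hpc
openondemand_oidc_crypto_passphrase: "{{ vault_openondemand_crypto_passphrase }}" # assumed vault-protected

# Or, Basic/PAM authentication against existing local users:
# openondemand_auth: basic_pam
```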
### SSL Certificates + This role enables SSL on the Open Ondemand server, using the following self-signed certificate & key which are autogenerated by the `mod_ssl` package installed as part of the `ondemand-apache` package. Replace with your own keys if required. + - `openondemand_ssl_cert`: Optional. Default `/etc/pki/tls/certs/localhost.crt`. - `openondemand_ssl_cert_key`: Optional. Default `/etc/pki/tls/private/localhost.key` ### Dashboard and application configuration + - `openondemand_dashboard_docs_url`: Optional. URL of docs to show under Help in dashboard. Default `(undefined)`. - `openondemand_dashboard_links`: Optional. List of mappings defining additional links to add as menu items in the dashboard. Keys are: - - `name`: Required. User-facing name for the link. - - `category`: Required. Menu to add link under, either a default one (e.g. `Files`, `Jobs`, `Clusters`, `Interactive Apps`) or a new category to add. - - `icon`: Optional. URL of icon, defaults to Open Ondemand clock icon as used in standard menus. - - `url`: Required. URL of link. - - `new_window`: Optional. Whether to open link in new window. Bool, default `false`. - - `app_name`: Optional. Unique name for app appended to `/var/www/ood/apps/sys/`. Default is `name`, useful if that is not unique or not suitable as a path component. + - `name`: Required. User-facing name for the link. + - `category`: Required. Menu to add link under, either a default one (e.g. `Files`, `Jobs`, `Clusters`, `Interactive Apps`) or a new category to add. + - `icon`: Optional. URL of icon, defaults to Open Ondemand clock icon as used in standard menus. + - `url`: Required. URL of link. + - `new_window`: Optional. Whether to open link in new window. Bool, default `false`. + - `app_name`: Optional. Unique name for app appended to `/var/www/ood/apps/sys/`. Default is `name`, useful if that is not unique or not suitable as a path component. - `openondemand_dashboard_support_url`: Optional. URL or email etc to show as support contact under Help in dashboard. Default `(undefined)`. - `openondemand_desktop_partition`: Optional. Name of Slurm partition to use for remote desktops. Requires a corresponding group named "openondemand_desktop" and entry in openhpc_partitions. - `openondemand_desktop_screensaver`: Optional. Whether to enable screen locking/screensaver. **NB:** Users must have passwords if this is enabled. Bool, default `false`. @@ -65,16 +75,19 @@ This role enables SSL on the Open Ondemand server, using the following self-sign - `openondemand_jupyter_partition`: Required. Name of Slurm partition to use for Jupyter Notebook servers. Requires a corresponding group named "openondemand_jupyter" and entry in openhpc_partitions. ### Monitoring + - `openondemand_exporter`: Optional. Install the Prometheus [ondemand_exporter](https://github.com/OSC/ondemand_exporter) on the `openondemand` node to export metrics about Open Ondemand itself. Default `true`. ### Proxying + The Open Ondemand portal can proxy other servers. Variables: -- `openondemand_host_regex`: Synomyn for the `osc.ood: host_regex` [variable](https://osc.github.io/ood-documentation/latest/app-development/interactive/setup/enable-reverse-proxy.html). A Python regex matching servernames which Open Ondemand should proxy. Enables proxying and restricts which addresses are proxied (for security). E.g. 
this might be: +- `openondemand_host_regex`: Synomyn for the `osc.ood: host_regex` [variable](https://osc.github.io/ood-documentation/latest/app-development/interactive/setup/enable-reverse-proxy.html). A Python regular expression matching servernames which Open Ondemand should proxy. Enables proxying and restricts which addresses are proxied (for security). E.g. this might be: `'({{ openhpc_cluster_name }}-compute-\d+)|({{ groups["grafana"] | first }})'` to proxy: + - All "compute" nodes, e.g. for Open Ondemand interactive apps such as remote desktop and Jupyter notebook server. - The Grafana server - note a link to Grafana is always added to the Open Ondemand dashboard. @@ -83,21 +96,22 @@ The Open Ondemand portal can proxy other servers. Variables: - `openondemand_node_proxy_directives`: Optional, default ''. Multiline string to insert into Apache directives definition for `node_uri` ([docs](https://osc.github.io/ood-documentation/master/reference/files/ood-portal-yml.html#configure-reverse-proxy)). Note that: + - If Open Ondemand and Grafana are deployed, Grafana is automatically configured so that proxying it through Open Ondemand works. - The `osc.ood` role variables `node_uri` and `rnode_uri` are set automatically if `openondemand_host_regex` is set. -# Dependencies +## Dependencies - `osc.ood` role as described above. -# Example Playbook +## Example Playbook See `ansible/portal.yml`. Note the `main` playbook should be run on the `openondemand` node (i.e. the node to configure as hosting the Open Ondemand server/portal), and the other playbooks should be run on some subset of the `compute` group. -# License +## License Apache v2 -# Author Information +## Author Information Stackhpc Ltd. diff --git a/ansible/roles/openondemand/defaults/main.yml b/ansible/roles/openondemand/defaults/main.yml index 851804e..86fb49f 100644 --- a/ansible/roles/openondemand/defaults/main.yml +++ b/ansible/roles/openondemand/defaults/main.yml @@ -3,7 +3,6 @@ # Authentication: openondemand_auth: # "oidc" or "basic_pam" openondemand_mapping_users: [] - ## Variables for `openondemand_auth=oidc` : openondemand_oidc_client_id: openondemand_oidc_client_secret: @@ -19,10 +18,10 @@ openondemand_ssl_cert_key: /etc/pki/tls/private/localhost.key # Dashboard and application config: openondemand_dashboard_docs_url: (undefined) openondemand_dashboard_support_url: (undefined) -openondemand_desktop_partition: '' +openondemand_desktop_partition: "" openondemand_desktop_screensaver: false openondemand_filesapp_paths: [] -openondemand_jupyter_partition: '' +openondemand_jupyter_partition: "" openondemand_dashboard_links: [] openondemand_rstudio_partition: '' openondemand_matlab_partition: '' @@ -33,11 +32,10 @@ openondemand_exporter: true # Synonyms for osc:ood role vars: openondemand_clusters: {} # synonym for osc.ood:clusters -openondemand_servername: '' +openondemand_servername: "" openondemand_host_regex: - # Other: -openondemand_node_proxy_directives: '' # Added to Apache directives for `node_uri` forwarding. +openondemand_node_proxy_directives: "" # Added to Apache directives for `node_uri` forwarding. 
openondemand_auth_defaults: # Defaults for OIDC auth - keys are osc.ood vars & can be overriden using the osc.ood var name in inventory @@ -54,23 +52,23 @@ openondemand_auth_defaults: OIDCScope: "{{ openondemand_oidc_scope }}" OIDCRemoteUserClaim: "{{ openondemand_oidc_remote_user_claim }}" httpd_auth: # ood_portal.yml.j2 - - 'AuthType openid-connect' - - 'Require valid-user' - - 'ProxyPreserveHost On' # see under https://grafana.com/blog/2022/02/08/grafana-7.5.15-and-8.3.5-released-with-moderate-severity-security-fixes/ + - "AuthType openid-connect" + - "Require valid-user" + - "ProxyPreserveHost On" # see under https://grafana.com/blog/2022/02/08/grafana-7.5.15-and-8.3.5-released-with-moderate-severity-security-fixes/ user_map_cmd: /opt/ood/ood_auth_map/bin/ood_auth_map.mapfile user_map_match: none - + # Defaults for basic/PAM auth - see https://osc.github.io/ood-documentation/latest/authentication/pam.html basic_pam: httpd_auth: # ood_portal.yml.j2 - - 'AuthType Basic' + - "AuthType Basic" - 'AuthName "Open OnDemand"' - - 'AuthBasicProvider PAM' - - 'AuthPAMService ood' - - 'Require valid-user' - - 'ProxyPreserveHost On' # see under https://grafana.com/blog/2022/02/08/grafana-7.5.15-and-8.3.5-released-with-moderate-severity-security-fixes/ + - "AuthBasicProvider PAM" + - "AuthPAMService ood" + - "Require valid-user" + - "ProxyPreserveHost On" # see under https://grafana.com/blog/2022/02/08/grafana-7.5.15-and-8.3.5-released-with-moderate-severity-security-fixes/ user_map_cmd: null - user_map_match: '.*' + user_map_match: ".*" # The below mapping is used to override osc.ood defaults. Keys are osc.ood variable names. # If you need to override *these* defaults (i.e. this role's vars are not sufficent) just set the corresponding osc.ood var as normal. @@ -94,7 +92,7 @@ openondemand_osc_ood_defaults: - SSLHonorCipherOrder On - SSLCompression off - SSLSessionTickets Off - + # User mapping: user_map_cmd: "{{ openondemand_auth_defaults[openondemand_auth | lower].user_map_cmd }}" user_map_match: "{{ openondemand_auth_defaults[openondemand_auth | lower].user_map_match }}" @@ -106,4 +104,4 @@ openondemand_osc_ood_defaults: openondemand_code_server_version: 4.102.2 openondemand_rstudio_version: 2025.05.1-513 -openondemand_matlab_version: '' \ No newline at end of file +openondemand_matlab_version: '' diff --git a/ansible/roles/openondemand/files/missing_home_directory.html b/ansible/roles/openondemand/files/missing_home_directory.html index db790c9..512fb92 100644 --- a/ansible/roles/openondemand/files/missing_home_directory.html +++ b/ansible/roles/openondemand/files/missing_home_directory.html @@ -1,49 +1,54 @@ - + - - Home Directory Not Found - - - -

[Diff body unreadable: the HTML markup of missing_home_directory.html was lost during extraction. The change only re-indents and re-wraps the page; its visible text is unchanged: the "Home directory not found" heading, a notice that the user's home directory appears to be missing and that a first login over SSH may be needed to trigger its creation, and the "Open Shell to create home directory" and "Restart Web Server" actions.]
+ diff --git a/ansible/roles/openondemand/tasks/codeserver_compute.yml b/ansible/roles/openondemand/tasks/codeserver_compute.yml index 7b39bf7..6f178c5 100644 --- a/ansible/roles/openondemand/tasks/codeserver_compute.yml +++ b/ansible/roles/openondemand/tasks/codeserver_compute.yml @@ -1,24 +1,25 @@ - name: Download Code Server RPM ansible.builtin.get_url: - url: "https://github.com/coder/code-server/releases/download/v{{ openondemand_code_server_version }}/code-server-{{ openondemand_code_server_version }}-amd64.rpm" + url: "https://github.com/coder/code-server/releases/download/v{{ openondemand_code_server_version }}/code-server-{{ openondemand_code_server_version }}-amd64.rpm" # noqa: yaml[line-length] dest: /tmp/code-server.rpm mode: '0644' - name: Install Code Server + # checkov:skip=CKV2_ANSIBLE_4: "Ensure that packages with untrusted or missing GPG signatures are not used by dnf" ansible.builtin.dnf: name: /tmp/code-server.rpm state: present - disable_gpg_check: yes + disable_gpg_check: true - name: Create module directory for Code Server ansible.builtin.file: path: /opt/ohpc/pub/modulefiles/code-server state: directory mode: '0755' - recurse: yes + recurse: true - name: Create modulefile for Code Server - copy: + ansible.builtin.copy: dest: "/opt/ohpc/pub/modulefiles/code-server/{{ openondemand_code_server_version }}" mode: "0644" content: | diff --git a/ansible/roles/openondemand/tasks/config_changes.yml b/ansible/roles/openondemand/tasks/config_changes.yml index f83c670..835411d 100644 --- a/ansible/roles/openondemand/tasks/config_changes.yml +++ b/ansible/roles/openondemand/tasks/config_changes.yml @@ -1,5 +1,6 @@ +--- - name: Add Apache directives for node_uri forwarding - blockinfile: + ansible.builtin.blockinfile: path: /opt/ood/ood-portal-generator/templates/ood-portal.conf.erb block: "{{ openondemand_node_proxy_directives }}" insertafter: ' Header edit Set-Cookie "\^\(\[\^;\]\+\)" "\$1; Path=<%= @node_uri %>\/%{MATCH_HOST}e\/%{MATCH_PORT}e"' diff --git a/ansible/roles/openondemand/tasks/exporter.yml b/ansible/roles/openondemand/tasks/exporter.yml index e3c387a..f9100f7 100644 --- a/ansible/roles/openondemand/tasks/exporter.yml +++ b/ansible/roles/openondemand/tasks/exporter.yml @@ -1,10 +1,11 @@ +--- - name: Install ondemand prometheus exporter - yum: + ansible.builtin.dnf: name: ondemand_exporter when: openondemand_exporter - name: Start and enable ondemand prometheus exporter - service: + ansible.builtin.service: name: ondemand_exporter enabled: true state: started diff --git a/ansible/roles/openondemand/tasks/jupyter_compute.yml b/ansible/roles/openondemand/tasks/jupyter_compute.yml index a87d07d..6df0c78 100644 --- a/ansible/roles/openondemand/tasks/jupyter_compute.yml +++ b/ansible/roles/openondemand/tasks/jupyter_compute.yml @@ -1,32 +1,32 @@ +--- # Should be run on compute nodes you want to run jupyter notebook on # See https://osc.github.io/ood-documentation/latest/app-development/tutorials-interactive-apps/add-jupyter/software-requirements.html # - Will already have openssl and lmod - name: Ensure python3.9 installed - dnf: + ansible.builtin.dnf: name: python39 tags: install - name: Install jupyter venv # Requires separate step so that the upgraded pip is used to install packages - pip: + ansible.builtin.pip: name: pip - state: latest + state: latest # noqa: package-latest virtualenv: /opt/jupyter-py39 virtualenv_command: python3.9 -m venv tags: install - name: Copy jupyter requirements file - copy: + ansible.builtin.copy: src: jupyter_requirements.txt dest: 
/opt/jupyter-py39/jupyter_requirements.txt + mode: "0644" tags: install - name: Install jupyter package in venv - pip: + ansible.builtin.pip: virtualenv: /opt/jupyter-py39 virtualenv_command: python3.9 -m venv requirements: /opt/jupyter-py39/jupyter_requirements.txt tags: install - - diff --git a/ansible/roles/openondemand/tasks/main.yml b/ansible/roles/openondemand/tasks/main.yml index bd5706e..783be89 100644 --- a/ansible/roles/openondemand/tasks/main.yml +++ b/ansible/roles/openondemand/tasks/main.yml @@ -1,7 +1,6 @@ --- - - name: Set osc.ood variables from this role's defaults if no overriding inventory var - set_fact: + ansible.builtin.set_fact: "{{ item.key }}": "{{ lookup('vars', item.key, default=item.value) }}" loop: "{{ openondemand_osc_ood_defaults | dict2items }}" when: (item.key in hostvars[inventory_hostname]) or (item.value) @@ -14,47 +13,48 @@ file: "{{ playbook_dir }}/roles/osc.ood/vars/Rocky/{{ ansible_distribution_major_version }}.yml" # if using PAM auth we need apache installed but NOT started so split the osc.ood role up: -- include_role: +- ansible.builtin.include_role: name: osc.ood tasks_from: install-package.yml vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml" when: appliances_mode != 'configure' # can't set vars: from a dict hence the workaround above -- include_tasks: +- ansible.builtin.include_tasks: file: pam_auth.yml when: openondemand_auth | lower == 'basic_pam' -- include_tasks: +- ansible.builtin.include_tasks: file: config_changes.yml # The configure.yml playbook needs vars from Rocky (for nginx) and main if using OIDC auth. However vars_from doensn't take a list. # include_vars doens't interpolate from role vars, so we use that for main.yml which only requires things we override in the appliance inventory # and use vars_from for Rocky which requires interpolation from role vars. 
-#- include_vars: -# file: roles/osc.ood/vars/main.yml +# - include_vars: +# file: roles/osc.ood/vars/main.yml -- include_role: +- ansible.builtin.include_role: name: osc.ood tasks_from: configure.yml vars_from: main.yml - public: yes + public: true -- include_role: +- ansible.builtin.include_role: name: osc.ood tasks_from: install-apps.yml when: ood_install_apps -- include_role: +- ansible.builtin.include_role: name: osc.ood tasks_from: apps.yml # vars_from: Rocky.yml when: ood_apps - name: Ensure post_tasks dirs exists - file: + ansible.builtin.file: path: "{{ item }}" state: directory + mode: "0755" loop: # - /etc/ood/config/apps/dashboard/initializers - /etc/ood/config/locales @@ -62,15 +62,15 @@ - /etc/ood/config/pun/html - name: Create dashboard additional config directory - file: + ansible.builtin.file: path: /etc/ood/config/apps/dashboard/initializers state: directory - recurse: yes + recurse: true owner: root mode: o=rwX,go=rX - name: Create additional shortcuts in Files app - template: + ansible.builtin.template: src: files_shortcuts.rb.j2 dest: /etc/ood/config/apps/dashboard/initializers/ood.rb owner: root @@ -78,21 +78,22 @@ when: openondemand_filesapp_paths - name: Create job template directory - file: + ansible.builtin.file: path: "/etc/ood/config/apps/myjobs/templates/" state: directory - recurse: True + recurse: true owner: root group: root mode: o=rwX,go=rX - name: Copy web page to let users create their home directory - copy: + ansible.builtin.copy: src: missing_home_directory.html dest: /etc/ood/config/pun/html/missing_home_directory.html + mode: "0644" - name: Create mapping directory - file: + ansible.builtin.file: path: /etc/grid-security state: directory owner: root @@ -101,7 +102,7 @@ when: openondemand_mapping_users - name: Create mapping file - template: + ansible.builtin.template: dest: /etc/grid-security/grid-mapfile src: grid-mapfile.j2 owner: root @@ -110,15 +111,17 @@ when: openondemand_mapping_users - name: Create app directories for dashboard links - file: + ansible.builtin.file: path: /var/www/ood/apps/sys/{{ item.app_name | default(item.name) }} state: directory + mode: "0755" loop: "{{ openondemand_dashboard_links }}" - name: Create app manifests for dashboard links - template: + ansible.builtin.template: src: dashboard_app_links.yml.j2 dest: /var/www/ood/apps/sys/{{ item.app_name | default(item.name) }}/manifest.yml + mode: "0644" loop: "{{ openondemand_dashboard_links }}" # - name: Ensure ondemand-dex is running and active @@ -137,13 +140,13 @@ # - /usr/share/ondemand-dex/web/themes/ - name: Keyscan login host - command: + ansible.builtin.command: cmd: "ssh-keyscan {{ openondemand_clusters.slurm.v2.login.host }}" register: _openondemand_login_key changed_when: false - name: Add login hostkeys to known hosts - blockinfile: + ansible.builtin.blockinfile: path: /etc/ssh/ssh_known_hosts create: true block: "{{ _openondemand_login_key.stdout_lines | sort | join('\n') }}" diff --git a/ansible/roles/openondemand/tasks/pam_auth.yml b/ansible/roles/openondemand/tasks/pam_auth.yml index 6bc4bda..2cf8a5b 100644 --- a/ansible/roles/openondemand/tasks/pam_auth.yml +++ b/ansible/roles/openondemand/tasks/pam_auth.yml @@ -1,31 +1,31 @@ # https://osc.github.io/ood-documentation/latest/authentication/pam.html --- - name: Install Apache PAM module # Extracted from start of roles/openondemand/tasks/pam_auth.yml to ensure only installed during build - yum: + ansible.builtin.dnf: name: mod_authnz_pam - name: Enable Apache PAM module - lineinfile: + 
ansible.builtin.lineinfile: path: /etc/httpd/conf.modules.d/55-authnz_pam.conf line: LoadModule authnz_pam_module modules/mod_authnz_pam.so regexp: ^LoadModule authnz_pam_module modules/mod_authnz_pam.so - name: Set PAM service # TODO: might need subsequent modification?? - command: + ansible.builtin.command: cmd: cp /etc/pam.d/sshd /etc/pam.d/ood creates: /etc/pam.d/ood - name: Allow the Apache user to read /etc/shadow - file: + ansible.builtin.file: path: /etc/shadow - mode: 0640 + mode: "0640" group: apache - name: Allow httpd access to PAM in SELinux ansible.posix.seboolean: name: httpd_mod_auth_pam - state: yes - persistent: yes + state: true + persistent: true when: ansible_facts.selinux.status == 'enabled' # TODO: do we need to restart OOD here?? diff --git a/ansible/roles/openondemand/tasks/rstudio_compute.yml b/ansible/roles/openondemand/tasks/rstudio_compute.yml index 99dd83a..8cb3c91 100644 --- a/ansible/roles/openondemand/tasks/rstudio_compute.yml +++ b/ansible/roles/openondemand/tasks/rstudio_compute.yml @@ -9,22 +9,23 @@ - name: Download RStudio Server RPM ansible.builtin.get_url: - url: "https://download2.rstudio.org/server/rhel{{ ansible_distribution_major_version }}/x86_64/rstudio-server-rhel-{{ openondemand_rstudio_version }}-x86_64.rpm" + url: "https://download2.rstudio.org/server/rhel{{ ansible_distribution_major_version }}/x86_64/rstudio-server-rhel-{{ openondemand_rstudio_version }}-x86_64.rpm" # noqa: yaml[line-length] dest: /tmp/rstudio-server.rpm mode: '0644' - name: Install RStudio Server + # checkov:skip=CKV2_ANSIBLE_4: "Ensure that packages with untrusted or missing GPG signatures are not used by dnf" ansible.builtin.dnf: name: /tmp/rstudio-server.rpm state: present - disable_gpg_check: yes + disable_gpg_check: true - name: Create module directory for RStudio Server ansible.builtin.file: path: /opt/ohpc/pub/modulefiles/rstudio-server state: directory mode: '0755' - recurse: yes + recurse: true - name: Write modulefile for RStudio Server ansible.builtin.copy: diff --git a/ansible/roles/openondemand/tasks/validate.yml b/ansible/roles/openondemand/tasks/validate.yml index 92e83d3..b22f51b 100644 --- a/ansible/roles/openondemand/tasks/validate.yml +++ b/ansible/roles/openondemand/tasks/validate.yml @@ -1,4 +1,5 @@ +--- - name: Check Open Ondemand servername is defined - assert: + ansible.builtin.assert: that: openondemand_servername != '' fail_msg: "Variable `openondemand_servername` must be set on openondemand and (by default) grafana hosts. 
See ansible/roles/openondemand/README.md" diff --git a/ansible/roles/openondemand/tasks/vnc_compute.yml b/ansible/roles/openondemand/tasks/vnc_compute.yml index 8b6f6cd..1fba0cd 100644 --- a/ansible/roles/openondemand/tasks/vnc_compute.yml +++ b/ansible/roles/openondemand/tasks/vnc_compute.yml @@ -1,13 +1,15 @@ +--- # Should be run on compute nodes you want to run the graphical desktop on - name: Enable TurboVNC repo tags: install - get_url: + ansible.builtin.get_url: url: https://raw.githubusercontent.com/TurboVNC/repo/main/TurboVNC.repo dest: /etc/yum.repos.d/TurboVNC.repo + mode: "0644" - name: Install EPEL tags: install - yum: + ansible.builtin.dnf: name: epel-release - name: Check /etc/init.d @@ -28,7 +30,7 @@ - name: Install VNC-related packages tags: install - dnf: + ansible.builtin.dnf: name: - turbovnc-3.0.1 - nmap-ncat @@ -37,7 +39,7 @@ - name: Stop turbovnc service # This is not actually required - systemd: + ansible.builtin.systemd: name: tvncserver state: stopped enabled: false @@ -47,16 +49,18 @@ src: /etc/init.d.orig/ # trailing / to get contents dest: /etc/init.d remote_src: true + directory_mode: "preserve" + mode: "preserve" when: - init_d.stat.exists - not init_d.stat.islnk - name: Install Xfce desktop tags: install - yum: - name: '@Xfce' + ansible.builtin.dnf: + name: "@Xfce" when: appliances_mode != 'configure' # dnf group/module installs aren't idempotent so only run during build - + # - name: Ensure python3.9 installed # dnf: # name: python39 @@ -64,22 +68,22 @@ - name: Install websockify venv # Requires separate step so that the upgraded pip is used to install packages - pip: + ansible.builtin.pip: name: pip - state: latest + state: latest # noqa: package-latest virtualenv: /opt/websockify-py39 virtualenv_command: python3.9 -m venv tags: install - name: Install websockify package in venv - pip: + ansible.builtin.pip: name: websockify virtualenv: /opt/websockify-py39 virtualenv_command: python3 -m venv tags: install -- name: Symlink websockify to where Open Ondemand expects - file: "{{ item }}" +- name: Symlink websockify to where Open Ondemand expects # noqa: args[module] + ansible.builtin.file: "{{ item }}" loop: - path: /opt/websockify state: directory @@ -87,7 +91,7 @@ dest: /opt/websockify/run state: link - name: Disable screensaver # as users might not have passwords - yum: + ansible.builtin.dnf: name: xfce4-screensaver state: absent when: not (openondemand_desktop_screensaver | bool) diff --git a/ansible/roles/opensearch/defaults/main.yml b/ansible/roles/opensearch/defaults/main.yml index 69e7f9c..1b05521 100644 --- a/ansible/roles/opensearch/defaults/main.yml +++ b/ansible/roles/opensearch/defaults/main.yml @@ -1,9 +1,9 @@ --- # Used to set passwords -#opensearch_internal_users_path: +# opensearch_internal_users_path: opensearch_podman_user: "{{ ansible_user }}" -opensearch_version: '2.9.0' # https://hub.docker.com/r/opensearchproject/opensearch/tags +opensearch_version: "2.9.0" # https://hub.docker.com/r/opensearchproject/opensearch/tags opensearch_config_path: /usr/share/opensearch/config opensearch_data_path: /usr/share/opensearch/data opensearch_state: started # will be restarted if required diff --git a/ansible/roles/opensearch/handlers/main.yml b/ansible/roles/opensearch/handlers/main.yml index d3a040d..61f5bbf 100644 --- a/ansible/roles/opensearch/handlers/main.yml +++ b/ansible/roles/opensearch/handlers/main.yml @@ -1,7 +1,6 @@ --- - - name: Restart opensearch service - systemd: + ansible.builtin.systemd: name: opensearch.service state: "{{ 
'restarted' if 'started' in opensearch_state else opensearch_state }}" enabled: "{{ opensearch_systemd_service_enabled }}" diff --git a/ansible/roles/opensearch/tasks/archive_data.yml b/ansible/roles/opensearch/tasks/archive_data.yml index 298f66a..cb3403e 100644 --- a/ansible/roles/opensearch/tasks/archive_data.yml +++ b/ansible/roles/opensearch/tasks/archive_data.yml @@ -1,8 +1,9 @@ +--- # Remove data which was NOT indexed by Slurm Job ID # It will be re-ingested by filebeat from the slurmdbd, with that index - name: Ensure opensearch stopped - systemd: + ansible.builtin.systemd: name: opensearch state: stopped register: _opensearch_stop @@ -15,3 +16,4 @@ path: "{{ opensearch_data_path }}" dest: "{{ opensearch_data_path | dirname }}/data-{{ lookup('pipe', 'date --iso-8601=minutes') }}.tar.gz" remove: true + mode: "0644" diff --git a/ansible/roles/opensearch/tasks/certs.yml b/ansible/roles/opensearch/tasks/certs.yml index e40f652..4eee580 100644 --- a/ansible/roles/opensearch/tasks/certs.yml +++ b/ansible/roles/opensearch/tasks/certs.yml @@ -1,5 +1,6 @@ +--- - name: Ensure host certs directory exists - file: + ansible.builtin.file: path: "{{ opensearch_config_path }}/certs" state: directory owner: "{{ opensearch_podman_user }}" diff --git a/ansible/roles/opensearch/tasks/install.yml b/ansible/roles/opensearch/tasks/install.yml index 9a0ffd3..0ca5ebd 100644 --- a/ansible/roles/opensearch/tasks/install.yml +++ b/ansible/roles/opensearch/tasks/install.yml @@ -1,25 +1,28 @@ +--- # safe to use during build - name: Increase maximum number of virtual memory maps # see https://opensearch.org/docs/2.0/opensearch/install/important-settings/ ansible.posix.sysctl: name: vm.max_map_count - value: '262144' + value: "262144" state: present - reload: yes + reload: true - name: Create systemd unit file - template: + ansible.builtin.template: dest: /etc/systemd/system/opensearch.service src: opensearch.service.j2 + mode: "0644" register: _opensearch_unit - name: Pull container image containers.podman.podman_image: name: docker.io/opensearchproject/opensearch tag: "{{ opensearch_version }}" + become: true become_user: "{{ opensearch_podman_user }}" -- name: Reload opensearch unit file - command: systemctl daemon-reload - when: _opensearch_unit.changed +- name: Reload opensearch unit file # noqa: no-changed-when + ansible.builtin.command: systemctl daemon-reload # noqa: command-instead-of-module + when: _opensearch_unit.changed # noqa: no-handler diff --git a/ansible/roles/opensearch/tasks/migrate-opendistro.yml b/ansible/roles/opensearch/tasks/migrate-opendistro.yml index 7cb5c81..fd239bc 100644 --- a/ansible/roles/opensearch/tasks/migrate-opendistro.yml +++ b/ansible/roles/opensearch/tasks/migrate-opendistro.yml @@ -1,3 +1,4 @@ +--- # Migrate data from existing containerised opendistro v1.12.0 to containerised opensearch 2.1.0. 
# # This relies on: @@ -22,7 +23,7 @@ dest: "{{ opensearch_data_path | dirname }}/" # copying a directory, so need to specify the parent for destination owner: "{{ opensearch_podman_user }}" group: "{{ opensearch_podman_user }}" - mode: 0770 + mode: "0770" vars: # from environments/common/inventory/group_vars/all/opendistro.yml: _default_opendistro_data_path: "{{ appliances_state_dir | default('/usr/share') }}/elasticsearch/data" diff --git a/ansible/roles/opensearch/tasks/runtime.yml b/ansible/roles/opensearch/tasks/runtime.yml index 7fe197a..7247f15 100644 --- a/ansible/roles/opensearch/tasks/runtime.yml +++ b/ansible/roles/opensearch/tasks/runtime.yml @@ -1,55 +1,54 @@ --- - - name: Check for existing opendistro service - stat: + ansible.builtin.stat: path: /etc/systemd/system/opendistro.service register: _opensearch_opendistro_service - name: Migrate opendistro data - import_tasks: + ansible.builtin.import_tasks: file: migrate-opendistro.yml when: _opensearch_opendistro_service.stat.exists - name: Remove opendistro service - file: + ansible.builtin.file: path: /etc/systemd/system/opendistro.service state: absent - name: Enumerate files in data directory - find: + ansible.builtin.find: path: "{{ opensearch_data_path }}" register: _find_opensearch_data - name: Archive incorrectly indexed data - import_tasks: archive_data.yml + ansible.builtin.import_tasks: archive_data.yml when: - _find_opensearch_data.files | length > 0 - "'slurm_jobid_index' not in _find_opensearch_data.files | map(attribute='path') | map('basename')" - name: Ensure required opensearch host directories exist - file: + ansible.builtin.file: state: directory path: "{{ item }}" owner: "{{ opensearch_podman_user }}" group: "{{ opensearch_podman_user }}" - mode: 0770 + mode: "0770" become: true loop: - "{{ opensearch_config_path }}" - "{{ opensearch_data_path }}" - name: Set indexed data flag - copy: + ansible.builtin.copy: dest: "{{ opensearch_data_path }}/slurm_jobid_index" content: | This is a flag file to indicate that filebeat is pushing data indexed by Slurm JobID to prevent duplicate OpenSearch records owner: "{{ opensearch_podman_user }}" group: "{{ opensearch_podman_user }}" + mode: "0644" - name: Create certs - import_tasks: certs.yml - + ansible.builtin.import_tasks: certs.yml - name: Template general configuration ansible.builtin.template: src: opensearch.yml.j2 @@ -58,27 +57,26 @@ group: "{{ opensearch_podman_user }}" # NOTE: root user in container maps to user on host, so this will appear as # owned by root in the container. - mode: 0660 + mode: "0660" notify: Restart opensearch service become: true - name: Template internal user configuration - template: - src: "{{ opensearch_internal_users_path }}" - dest: "{{ opensearch_config_path }}/internal_users.yml" - owner: "{{ opensearch_podman_user }}" - group: "{{ opensearch_podman_user }}" - # NOTE: root user in container maps to user on host, so this will appear as - # owned by root in the container. - mode: 0660 + ansible.builtin.template: + src: "{{ opensearch_internal_users_path }}" + dest: "{{ opensearch_config_path }}/internal_users.yml" + owner: "{{ opensearch_podman_user }}" + group: "{{ opensearch_podman_user }}" + # NOTE: root user in container maps to user on host, so this will appear as + # owned by root in the container. 
+ mode: "0660" notify: Restart opensearch service become: true - name: Flush handlers - meta: flush_handlers - + ansible.builtin.meta: flush_handlers - name: Ensure opensearch service state - systemd: + ansible.builtin.systemd: name: opensearch.service state: "{{ opensearch_state }}" enabled: "{{ opensearch_systemd_service_enabled }}" diff --git a/ansible/roles/passwords/defaults/main.yml b/ansible/roles/passwords/defaults/main.yml index 95e3b6a..a848431 100644 --- a/ansible/roles/passwords/defaults/main.yml +++ b/ansible/roles/passwords/defaults/main.yml @@ -1,7 +1,9 @@ --- slurm_appliance_secrets: + # yamllint disable-line rule:line-length vault_grafana_admin_password: "{{ secrets_openhpc_grafana_admin_password | default(vault_grafana_admin_password | default(lookup('password', '/dev/null'))) }}" + # yamllint disable-line rule:line-length vault_elasticsearch_admin_password: "{{ secrets_openhpc_elasticsearch_admin_password | default(vault_elasticsearch_admin_password | default(lookup('password', '/dev/null'))) }}" vault_mysql_root_password: "{{ secrets_openhpc_mysql_root_password | default(vault_mysql_root_password | default(lookup('password', '/dev/null'))) }}" vault_mysql_slurm_password: "{{ secrets_openhpc_mysql_slurm_password | default(vault_mysql_slurm_password | default(lookup('password', '/dev/null'))) }}" @@ -16,4 +18,5 @@ slurm_appliance_secrets: secrets_openhpc_mungekey_default: content: "{{ lookup('pipe', 'dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64') }}" +# yamllint disable-line rule:line-length openhpc_passwords_output_path: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') | default(undefined, true) | mandatory('You must define the APPLIANCES_ENVIRONMENT_ROOT environment variable') }}/inventory/group_vars/all/secrets.yml" diff --git a/ansible/roles/passwords/tasks/main.yml b/ansible/roles/passwords/tasks/main.yml index 743a6cd..cb41cbb 100644 --- a/ansible/roles/passwords/tasks/main.yml +++ b/ansible/roles/passwords/tasks/main.yml @@ -1,8 +1,8 @@ --- - - name: Template passwords - template: + ansible.builtin.template: src: passwords.yml dest: "{{ openhpc_passwords_output_path }}" + mode: "0644" delegate_to: localhost run_once: true diff --git a/ansible/roles/passwords/tasks/validate.yml b/ansible/roles/passwords/tasks/validate.yml index b30b069..6cde144 100644 --- a/ansible/roles/passwords/tasks/validate.yml +++ b/ansible/roles/passwords/tasks/validate.yml @@ -1,4 +1,5 @@ +--- - name: Assert secrets created - assert: + ansible.builtin.assert: that: (hostvars[inventory_hostname].keys() | select('contains', 'vault_') | length) > 1 # 1 as may have vault_demo_user_password defined in dev fail_msg: "No inventory variables 'vault_*' found: Has ansible/adhoc/generate-passwords.yml been run?" 
diff --git a/ansible/roles/persist_hostkeys/defaults/main.yml b/ansible/roles/persist_hostkeys/defaults/main.yml index 3c00004..0de0b71 100644 --- a/ansible/roles/persist_hostkeys/defaults/main.yml +++ b/ansible/roles/persist_hostkeys/defaults/main.yml @@ -1,2 +1,3 @@ +--- persist_hostkeys_state_server: "{{ groups['control'] | first }}" persist_hostkeys_state_dir: "{{ hostvars[persist_hostkeys_state_server]['appliances_state_dir'] }}/hostkeys" diff --git a/ansible/roles/persist_hostkeys/tasks/main.yml b/ansible/roles/persist_hostkeys/tasks/main.yml index deff112..139281a 100644 --- a/ansible/roles/persist_hostkeys/tasks/main.yml +++ b/ansible/roles/persist_hostkeys/tasks/main.yml @@ -1,47 +1,47 @@ --- - - name: Generate persistent hostkeys in state directory delegate_to: "{{ persist_hostkeys_state_server }}" block: - - name: Ensure hostkeys directory exists on persistent storage - file: - path: "{{ persist_hostkeys_state_dir }}" - state: directory - owner: root - group: root - mode: 0600 + - name: Ensure hostkeys directory exists on persistent storage + ansible.builtin.file: + path: "{{ persist_hostkeys_state_dir }}" + state: directory + owner: root + group: root + mode: "0600" + + - name: Check for existing hostkeys + ansible.builtin.find: + paths: "{{ persist_hostkeys_state_dir }}/" + register: _files_found - - name: Check for existing hostkeys - find: - paths: "{{ persist_hostkeys_state_dir }}/" - register: _files_found + - name: Generate hostkeys # noqa: no-changed-when + when: _files_found.matched == 0 + ansible.builtin.shell: + # ssh-keygen -A needs a directory with an /etc/ssh suffix to write hostkeys into + cmd: | + mkdir -p {{ persist_hostkeys_state_dir }}/etc/ssh + ssh-keygen -A -N '' -f {{ persist_hostkeys_state_dir }} + mv {{ persist_hostkeys_state_dir }}/etc/ssh/* {{ persist_hostkeys_state_dir }} + rm -rf {{ persist_hostkeys_state_dir }}/etc/ssh - - name: Generate hostkeys - when: _files_found.matched == 0 - shell: - # ssh-keygen -A needs a directory with an /etc/ssh suffix to write hostkeys into - cmd: | - mkdir -p {{ persist_hostkeys_state_dir }}/etc/ssh - ssh-keygen -A -N '' -f {{ persist_hostkeys_state_dir }} - mv {{ persist_hostkeys_state_dir }}/etc/ssh/* {{ persist_hostkeys_state_dir }} - rm -rf {{ persist_hostkeys_state_dir }}/etc/ssh - - - name: Get created key names - find: - path: "{{ persist_hostkeys_state_dir }}/" - register: _find_ssh_keys + - name: Get created key names + ansible.builtin.find: + path: "{{ persist_hostkeys_state_dir }}/" + register: _find_ssh_keys - - name: Create in-memory copies of keys - ansible.builtin.slurp: - src: "{{ item.path }}" - loop: "{{ _find_ssh_keys.files }}" - register: _slurp_keys + - name: Create in-memory copies of keys + ansible.builtin.slurp: + src: "{{ item.path }}" + loop: "{{ _find_ssh_keys.files }}" + register: _slurp_keys - name: Copy keys to hosts no_log: true - copy: + ansible.builtin.copy: content: "{{ item.content | b64decode }}" dest: "/etc/ssh/{{ item.source | regex_search('[^/]+$') }}" + mode: "preserve" loop: "{{ _slurp_keys.results }}" -- meta: reset_connection +- ansible.builtin.meta: reset_connection diff --git a/ansible/roles/persist_openhpc_secrets/tasks/main.yml b/ansible/roles/persist_openhpc_secrets/tasks/main.yml index e0f5865..dc12e2a 100644 --- a/ansible/roles/persist_openhpc_secrets/tasks/main.yml +++ b/ansible/roles/persist_openhpc_secrets/tasks/main.yml @@ -1,16 +1,16 @@ --- - name: Check if OpenHPC secrets exist in persistent storage - stat: + ansible.builtin.stat: path: "{{ appliances_state_dir 
}}/ansible.facts.d/openhpc_secrets.fact" register: openhpc_secrets_stat - name: Ensure Ansible facts directories exist - file: + ansible.builtin.file: path: "{{ item }}" state: directory owner: root - mode: 0600 + mode: "0600" loop: - "{{ appliances_state_dir }}/ansible.facts.d" - "/etc/ansible/facts.d" @@ -21,19 +21,19 @@ when: openhpc_secrets_stat.stat.exists - name: Write OpenHPC secrets - template: + ansible.builtin.template: src: openhpc_secrets.fact dest: "{{ appliances_state_dir }}/ansible.facts.d/openhpc_secrets.fact" owner: root - mode: 0600 + mode: "0600" - name: Symlink persistent facts to facts_path - file: + ansible.builtin.file: state: link src: "{{ appliances_state_dir }}/ansible.facts.d/openhpc_secrets.fact" dest: /etc/ansible/facts.d/openhpc_secrets.fact owner: root - + - name: Refresh facts to pick up any new secrets ansible.builtin.setup: filter: ansible_local diff --git a/ansible/roles/podman/defaults/main.yml b/ansible/roles/podman/defaults/main.yml index 8b3c9ef..fc76d06 100644 --- a/ansible/roles/podman/defaults/main.yml +++ b/ansible/roles/podman/defaults/main.yml @@ -1,2 +1,3 @@ +--- podman_users: - name: "{{ ansible_user }}" diff --git a/ansible/roles/podman/tasks/configure.yml b/ansible/roles/podman/tasks/configure.yml index 74cf1d5..962712f 100644 --- a/ansible/roles/podman/tasks/configure.yml +++ b/ansible/roles/podman/tasks/configure.yml @@ -1,7 +1,6 @@ --- - - name: Up default resource limits - copy: + ansible.builtin.copy: content: | # WARNING: This file is managed by ansible, do not modify. # This is so non-root containers can use more resources. This is useful @@ -11,6 +10,7 @@ * soft nofile 65536 * hard nofile 65536 dest: /etc/security/limits.d/custom.conf + mode: "0644" become: true - name: Up number of non-root kernel keys permitted per user @@ -36,17 +36,16 @@ value: '"cgroupfs"' become: true -- name: reset ssh connection to allow user changes to affect 'current login user' - meta: reset_connection - +- name: Reset ssh connection to allow user changes to affect 'current login user' + ansible.builtin.meta: reset_connection - name: Ensure podman users exist - user: "{{ item }}" + ansible.builtin.user: "{{ item }}" # noqa: args[module] with_items: "{{ podman_users }}" register: podman_user_info - become: yes + become: true - name: Clear up podman temporary files on startup - copy: + ansible.builtin.copy: content: | # Created by ansible # Delete ephemeral podman files to avoid issues where /tmp is not of type tmpfs and persists across reboots. 
@@ -59,5 +58,5 @@ dest: /etc/tmpfiles.d/podman-local.conf owner: root group: root - mode: 0660 + mode: "0660" become: true diff --git a/ansible/roles/podman/tasks/install.yml b/ansible/roles/podman/tasks/install.yml index 362d3a1..d7a4d86 100644 --- a/ansible/roles/podman/tasks/install.yml +++ b/ansible/roles/podman/tasks/install.yml @@ -1,8 +1,8 @@ --- - name: Install OS packages - yum: + ansible.builtin.dnf: name: - podman - python3 state: installed - become: true \ No newline at end of file + become: true diff --git a/ansible/roles/podman/tasks/main.yml b/ansible/roles/podman/tasks/main.yml index 2b65e84..2538c7f 100644 --- a/ansible/roles/podman/tasks/main.yml +++ b/ansible/roles/podman/tasks/main.yml @@ -1,2 +1,2 @@ -- import_tasks: install.yml -- import_tasks: configure.yml +- ansible.builtin.import_tasks: install.yml +- ansible.builtin.import_tasks: configure.yml diff --git a/ansible/roles/proxy/defaults/main.yml b/ansible/roles/proxy/defaults/main.yml index f87f340..289e819 100644 --- a/ansible/roles/proxy/defaults/main.yml +++ b/ansible/roles/proxy/defaults/main.yml @@ -1,3 +1,4 @@ +--- # proxy_http_proxy: proxy_https_proxy: "{{ proxy_http_proxy }}" proxy_no_proxy_defaults: "{{ ['localhost', '127.0.0.1', '169.254.169.254'] + groups['all'] + hostvars.values() | map(attribute='ansible_host') }}" diff --git a/ansible/roles/proxy/tasks/main.yml b/ansible/roles/proxy/tasks/main.yml index b6d880f..be3898d 100644 --- a/ansible/roles/proxy/tasks/main.yml +++ b/ansible/roles/proxy/tasks/main.yml @@ -7,9 +7,9 @@ for convenience variables to set this. - name: Define configuration in /etc/environment tags: proxy - lineinfile: + ansible.builtin.lineinfile: path: "/etc/environment" - create: yes + create: true owner: root group: root mode: o=rw,go=r @@ -25,7 +25,7 @@ value: "{{ proxy_no_proxy }}" - name: Define dnf proxy - ini_file: + community.general.ini_file: path: /etc/dnf/dnf.conf section: main option: "proxy" @@ -38,7 +38,7 @@ when: proxy_dnf | bool - name: Create systemd configuration directory - file: + ansible.builtin.file: path: /etc/systemd/system.conf.d/ state: directory owner: root @@ -52,9 +52,9 @@ section: Manager option: DefaultEnvironment value: >- - "http_proxy={{ proxy_http_proxy }}" - "https_proxy={{ proxy_http_proxy }}" - "no_proxy={{ proxy_no_proxy }}" + "http_proxy={{ proxy_http_proxy }}" + "https_proxy={{ proxy_http_proxy }}" + "no_proxy={{ proxy_no_proxy }}" no_extra_spaces: true state: "{{ proxy_state }}" owner: root @@ -63,12 +63,11 @@ register: _copy_systemd_proxy when: proxy_systemd | bool -- name: Restart systemd - command: systemctl daemon-reexec - when: +- name: Restart systemd # noqa: no-changed-when + ansible.builtin.command: systemctl daemon-reexec # noqa: command-instead-of-module + when: - proxy_systemd | bool - _copy_systemd_proxy.changed | default(false) - name: Reset connection to get new /etc/environment - meta: reset_connection - # NB: conditionals not supported + ansible.builtin.meta: reset_connection diff --git a/ansible/roles/pulp_site/README.md b/ansible/roles/pulp_site/README.md index 3af801c..f860954 100644 --- a/ansible/roles/pulp_site/README.md +++ b/ansible/roles/pulp_site/README.md @@ -1,21 +1,18 @@ -pulp_site -========= +# pulp_site -Contains playbooks to deploy a Pulp server and sync its content with repo snapshots in +Contains playbooks to deploy a Pulp server and sync its content with repository snapshots in StackHPC's Ark Pulp server -Requirements ------------- +## Requirements Requires Ark credentials. 
The VM you are deploying Pulp on must allow ingress on `pulp_site_port` and not be externally accessible (as the Pulp server's content is unauthenticated). Rocky Linux 9 has been tested as the target VM for deploying Pulp. -Role Variables --------------- +## Role Variables -- `pulp_site_url`: Required str. The base url from which Pulp content will be hosted. Defaults to `{{ appliances_pulp_url }}`. - Value to set for ``appliances_pulp_url` will be generated and output by the deploy.yml playbook. +- `pulp_site_url`: Required str. The base URL from which Pulp content will be hosted. Defaults to `{{ appliances_pulp_url }}`. + Value to set for ``appliances_pulp_url` will be generated and output by the deploy.yml playbook. - `pulp_site_port`: Optional str. Port to serve Pulp server on. Defaults to `8080`. - `pulp_site_username`: Optional str. Admin username for the Pulp server. Defaults to `admin`. - `pulp_site_password`: Required str. Admin password for the Pulp server. Defaults to `{{ vault_pulp_admin_password }}`. @@ -24,13 +21,13 @@ Role Variables - `pulp_site_upstream_content_url`: Optional str. Content URL of upstream Ark Pulp. Defaults to `https://ark.stackhpc.com/pulp/content`. - `pulp_site_install_dir`: Optional str. Directory on Pulp host to install config and persistent state to be mounted into Pulp container. Defaults to `/home/rocky/pulp`. - `pulp_site_target_facts`: Optional str. The `ansible_facts` of a host which will be pulling from your Pulp server, allowing the role to auto-discover the necessary repos to pull. - defaults to `{{ hostvars[groups['pulp'][0]]['ansible_facts'] }}`. + defaults to `{{ hostvars[groups['pulp'][0]]['ansible_facts'] }}`. - `pulp_site_target_distribution_version`: Optional str. The Rocky Linux minor release to sync repos from Ark for. Defaults to `{{ pulp_site_target_facts['distribution_version'] }}`. -- `pulp_site_rpm_repo_defaults`: Optional dict. Contains key value pairs for fields which are common to all repo definition in `pulp_site_rpm_repos`. Includes values for `remote_username`, - `remote_password` and `policy` by default. -- `pulp_site_rpm_repos`: Optional list of dicts. List of repo definitions in format required by the `stackhpc.pulp.pulp_repository`. Defaults to modified versions of repos defined in - `dnf_repos_all`. -- `pulp_site_rpm_publications`: Optional list of dicts. List of repo definitions in format required by the `stackhpc.pulp.pulp_publication`. Defaults to list of publications for repos defined in - `dnf_repos_all`. -- `pulp_site_rpm_distributions`: Optional list of dicts. List of repo definitions in format required by the `stackhpc.pulp.pulp_distribution`. Defaults to list of distributions for repos defined in - `dnf_repos_all`. +- `pulp_site_rpm_repo_defaults`: Optional dict. Contains key-value pairs for fields which are common to all repository definition in `pulp_site_rpm_repos`. Includes values for `remote_username`, + `remote_password` and `policy` by default. +- `pulp_site_rpm_repos`: Optional list of dicts. List of repository definitions in format required by the `stackhpc.pulp.pulp_repository`. Defaults to modified versions of repos defined in + `dnf_repos_all`. +- `pulp_site_rpm_publications`: Optional list of dicts. List of repository definitions in format required by the `stackhpc.pulp.pulp_publication`. Defaults to list of publications for repos defined in + `dnf_repos_all`. +- `pulp_site_rpm_distributions`: Optional list of dicts. 
List of repository definitions in format required by the `stackhpc.pulp.pulp_distribution`. Defaults to list of distributions for repos defined in + `dnf_repos_all`. diff --git a/ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py b/ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py index 41e995c..76c62c9 100644 --- a/ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py +++ b/ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py @@ -1,19 +1,23 @@ +# pylint: disable=invalid-name, missing-module-docstring +# pylint: disable-next=missing-class-docstring, useless-object-inheritance class FilterModule(object): - def filters(self): + + def filters(self): # pylint: disable=missing-function-docstring return { - 'to_rpm_repos': self.to_rpm_repos, - 'to_rpm_pubs': self.to_rpm_pubs, - 'to_rpm_distros': self.to_rpm_distros, - 'select_repos': self.select_repos, + "to_rpm_repos": self.to_rpm_repos, + "to_rpm_pubs": self.to_rpm_pubs, + "to_rpm_distros": self.to_rpm_distros, + "select_repos": self.select_repos, } - + def select_repos(self, dnf_repos, target_distro_ver): - """ Filter dnf_repos to only those for a relevant distribution version (M.m or M). Returns a list of dicts. - Also adds pulp_repo_name field to give the repository a unique name in Pulp to be referenced by subsequent - filters + """Filter dnf_repos to only those for a relevant distribution version (M.m or M). + Returns a list of dicts. + Also adds pulp_repo_name field to give the repository a unique name in Pulp + to be referenced by subsequent filters """ - - target_distro_ver_major = target_distro_ver.split('.')[0] + + target_distro_ver_major = target_distro_ver.split(".")[0] rpm_repos = [] for repokey in dnf_repos: @@ -23,41 +27,54 @@ def select_repos(self, dnf_repos, target_distro_ver): elif target_distro_ver_major in dnf_repos[repokey]: selected_ver = target_distro_ver_major else: - raise ValueError(f'No key matching {target_distro_ver_major} or {target_distro_ver} found in f{repokey}') + raise ValueError( + # pylint: disable-next=line-too-long + f"No key matching {target_distro_ver_major} or {target_distro_ver} found in f{repokey}" + ) repo_data = dnf_repos[repokey][selected_ver] - repo_data['pulp_repo_name'] = f"{repokey}-{selected_ver}-{dnf_repos[repokey][selected_ver]['pulp_timestamp']}" + repo_data["pulp_repo_name"] = ( + f"{repokey}-{selected_ver}-{dnf_repos[repokey][selected_ver]['pulp_timestamp']}" + ) rpm_repos.append(repo_data) return rpm_repos def to_rpm_repos(self, rpm_info, content_url, repo_defaults): - """ Filter repo object list given by select_repos into dict required by the pulp_repository_rpm_repos variable - from stackhpc.pulp.pulp_repository role + """Filter repo object list given by select_repos into dict required by the + pulp_repository_rpm_repos variable from stackhpc.pulp.pulp_repository role """ rpm_repos = [] for repo_data in rpm_info: - rpm_data = repo_defaults.copy() # NB: this changes behaviour vs before, so now defaults can correctly be overriden - rpm_data['name'] = repo_data['pulp_repo_name'] - rpm_data['url'] = '/'.join([content_url, repo_data['pulp_path'], repo_data['pulp_timestamp']]) - rpm_data['state'] = 'present' + rpm_data = ( + repo_defaults.copy() + ) # NB: this changes behaviour vs before, so now defaults can correctly be overriden + rpm_data["name"] = repo_data["pulp_repo_name"] + rpm_data["url"] = "/".join( + [content_url, repo_data["pulp_path"], repo_data["pulp_timestamp"]] + ) + rpm_data["state"] = "present" rpm_repos.append(rpm_data) return rpm_repos - 
def to_rpm_pubs(self, list): - """ Filter repo object list given by select_repos into dict required by the pulp_publication_rpm variable - from stackhpc.pulp.pulp_publication role + def to_rpm_pubs(self, _list): + """Filter repo object list given by select_repos into dict required by the + pulp_publication_rpm variable from stackhpc.pulp.pulp_publication role """ - pub_list = map(lambda x: { - 'repository': x['pulp_repo_name'], - 'state': 'present' }, list) + pub_list = map( + lambda x: {"repository": x["pulp_repo_name"], "state": "present"}, _list + ) return pub_list - - def to_rpm_distros(self, list): - """ Filter repo object list given by select_repos into dict required by the pulp_distirubtion_rpm variable - from stackhpc.pulp.pulp_distribution role + + def to_rpm_distros(self, _list): + """Filter repo object list given by select_repos into dict required by the + pulp_distirubtion_rpm variable from stackhpc.pulp.pulp_distribution role """ - distro_list = map(lambda x: { - 'name': x['pulp_repo_name'], - 'repository': x['pulp_repo_name'], - 'base_path': '/'.join([x['pulp_path'],x['pulp_timestamp']]), - 'state': 'present' }, list) + distro_list = map( + lambda x: { + "name": x["pulp_repo_name"], + "repository": x["pulp_repo_name"], + "base_path": "/".join([x["pulp_path"], x["pulp_timestamp"]]), + "state": "present", + }, + _list, + ) return distro_list diff --git a/ansible/roles/pulp_site/tasks/install.yml b/ansible/roles/pulp_site/tasks/install.yml index 75b0f66..3be89d0 100644 --- a/ansible/roles/pulp_site/tasks/install.yml +++ b/ansible/roles/pulp_site/tasks/install.yml @@ -1,24 +1,25 @@ --- - - name: Install packages - dnf: + ansible.builtin.dnf: name: - - podman + - podman - name: Create install directories ansible.builtin.file: state: directory path: "{{ pulp_site_install_dir }}/{{ item }}" + mode: "0755" loop: - - settings/certs - - pulp_storage - - pgsql - - containers + - settings/certs + - pulp_storage + - pgsql + - containers - name: Template settings file ansible.builtin.template: src: settings.py.j2 dest: "{{ pulp_site_install_dir }}/settings/settings.py" + mode: "0644" - name: Install pulp podman container containers.podman.podman_container: @@ -26,30 +27,31 @@ publish: - "{{ pulp_site_port }}:80" volume: - - "{{ pulp_site_install_dir }}/settings:/etc/pulp{{ _pulp_site_selinux_suffix }}" - - "{{ pulp_site_install_dir }}/pulp_storage:/var/lib/pulp{{ _pulp_site_selinux_suffix }}" - - "{{ pulp_site_install_dir }}/pgsql:/var/lib/pgsql{{ _pulp_site_selinux_suffix }}" - - "{{ pulp_site_install_dir }}/containers:/var/lib/containers{{ _pulp_site_selinux_suffix }}" + - "{{ pulp_site_install_dir }}/settings:/etc/pulp{{ _pulp_site_selinux_suffix }}" + - "{{ pulp_site_install_dir }}/pulp_storage:/var/lib/pulp{{ _pulp_site_selinux_suffix }}" + - "{{ pulp_site_install_dir }}/pgsql:/var/lib/pgsql{{ _pulp_site_selinux_suffix }}" + - "{{ pulp_site_install_dir }}/containers:/var/lib/containers{{ _pulp_site_selinux_suffix }}" device: /dev/fuse image: docker.io/pulp/pulp:3.68.1 state: present - name: Create systemd file - copy: + ansible.builtin.copy: src: pulp.service dest: /etc/systemd/system/pulp.service + mode: "0644" register: _pulp_service - + - name: Start Pulp service - systemd: + ansible.builtin.systemd: name: pulp state: "{{ 'started' if _pulp_service.changed else 'restarted' }}" daemon_reload: "{{ _pulp_service.changed }}" enabled: true - -- name: Reset admin password once container has initialised + +- name: Reset admin password once container has initialised # noqa: no-changed-when 
no_log: true - ansible.builtin.shell: + ansible.builtin.command: cmd: "podman exec pulp bash -c 'pulpcore-manager reset-admin-password -p {{ pulp_site_password }}'" register: _admin_reset_output until: 0 == _admin_reset_output.rc diff --git a/ansible/roles/pulp_site/tasks/sync.yml b/ansible/roles/pulp_site/tasks/sync.yml index 9a2a932..670a940 100644 --- a/ansible/roles/pulp_site/tasks/sync.yml +++ b/ansible/roles/pulp_site/tasks/sync.yml @@ -1,5 +1,4 @@ --- - - ansible.builtin.assert: that: pulp_site_upstream_password != '' quiet: true @@ -9,13 +8,14 @@ ansible.builtin.file: path: ~/.config/pulp state: directory + mode: "0755" - name: Create config file no_log: true ansible.builtin.template: src: cli.toml.j2 dest: ~/.config/pulp/cli.toml - mode: '0644' + mode: "0644" - name: Wait for Pulp server pulp.squeezer.status: @@ -27,25 +27,28 @@ retries: 30 delay: 20 -- block: - - name: Ensure squeezer cache exists - ansible.builtin.file: - path: "{{ _cache_dir }}" - state: directory +- vars: + _cache_dir: "~/.cache/squeezer/{{ pulp_site_url | regex_replace(':|/', '_') }}" - - name: Check if squeezer cache is populated - ansible.builtin.stat: - path: "{{ _cache_dir }}/api.json" - register: _cache_stat + block: + - name: Ensure squeezer cache exists + ansible.builtin.file: + path: "{{ _cache_dir }}" + state: directory + mode: "0755" - - name: Prepopulate squeezer cache # workaround for race on the cache - ansible.builtin.get_url: - url: "{{ pulp_site_url }}/pulp/api/v3/docs/api.json" - dest: "{{ _cache_dir }}/api.json" - timeout: 40 - when: not _cache_stat.stat.exists - vars: - _cache_dir: "~/.cache/squeezer/{{ pulp_site_url | regex_replace( ':|/' , '_' ) }}" + - name: Check if squeezer cache is populated + ansible.builtin.stat: + path: "{{ _cache_dir }}/api.json" + register: _cache_stat + + - name: Prepopulate squeezer cache # workaround for race on the cache + ansible.builtin.get_url: + url: "{{ pulp_site_url }}/pulp/api/v3/docs/api.json" + dest: "{{ _cache_dir }}/api.json" + mode: "0644" + timeout: 40 + when: not _cache_stat.stat.exists - name: Get Pulp repos from release train ansible.builtin.include_role: diff --git a/ansible/roles/rebuild/README.md b/ansible/roles/rebuild/README.md index 4e4e87a..affc7b6 100644 --- a/ansible/roles/rebuild/README.md +++ b/ansible/roles/rebuild/README.md @@ -1,17 +1,14 @@ -rebuild -========= +# rebuild -Enables reboot tool from https://github.com/stackhpc/slurm-openstack-tools.git +Enables reboot tool from to be run from control node. -Requirements ------------- +## Requirements An OpenStack clouds.yaml file containing credentials for a cloud under the "openstack" key. -Role Variables --------------- +## Role Variables The below is only used by this role's `main.yml` task file, i.e. 
when running the `ansible/site.yml` or `ansible/slurm.yml` playbooks: diff --git a/ansible/roles/rebuild/defaults/main.yml b/ansible/roles/rebuild/defaults/main.yml index 9482836..16e2141 100644 --- a/ansible/roles/rebuild/defaults/main.yml +++ b/ansible/roles/rebuild/defaults/main.yml @@ -4,9 +4,9 @@ rebuild_clouds_path: ~/.config/openstack/clouds.yaml rebuild_job_partitions: rebuild rebuild_job_name: "rebuild-{{ item }}" # item is nodename -rebuild_job_command: 'sleep 5' +rebuild_job_command: "sleep 5" rebuild_job_reboot: true -rebuild_job_options: '' +rebuild_job_options: "" rebuild_job_user: root rebuild_job_template: >- sbatch @@ -20,4 +20,4 @@ rebuild_job_template: >- --output=/dev/null --wrap="{{ rebuild_job_command }}" {{ rebuild_job_options }} -#rebuild_job_hostlist: \ No newline at end of file +# rebuild_job_hostlist: diff --git a/ansible/roles/rebuild/tasks/configure.yml b/ansible/roles/rebuild/tasks/configure.yml index 78a3b7b..801e2ea 100644 --- a/ansible/roles/rebuild/tasks/configure.yml +++ b/ansible/roles/rebuild/tasks/configure.yml @@ -1,7 +1,7 @@ --- - name: Create /etc/openstack - file: + ansible.builtin.file: path: /etc/openstack state: directory owner: slurm @@ -9,7 +9,7 @@ mode: u=rX,g=rwX - name: Copy out clouds.yaml - copy: + ansible.builtin.copy: src: "{{ rebuild_clouds_path }}" dest: /etc/openstack/clouds.yaml owner: slurm diff --git a/ansible/roles/rebuild/tasks/install.yml b/ansible/roles/rebuild/tasks/install.yml index 1152426..1c1b63a 100644 --- a/ansible/roles/rebuild/tasks/install.yml +++ b/ansible/roles/rebuild/tasks/install.yml @@ -1,3 +1,3 @@ - name: Setup slurm tools - include_role: + ansible.builtin.include_role: name: slurm_tools diff --git a/ansible/roles/rebuild/tasks/main.yml b/ansible/roles/rebuild/tasks/main.yml index 79d326c..e5e0787 100644 --- a/ansible/roles/rebuild/tasks/main.yml +++ b/ansible/roles/rebuild/tasks/main.yml @@ -1,4 +1,4 @@ --- -- include_tasks: install.yml -- include_tasks: configure.yml +- ansible.builtin.include_tasks: install.yml +- ansible.builtin.include_tasks: configure.yml diff --git a/ansible/roles/rebuild/tasks/rebuild.yml b/ansible/roles/rebuild/tasks/rebuild.yml index 466951f..bc202df 100644 --- a/ansible/roles/rebuild/tasks/rebuild.yml +++ b/ansible/roles/rebuild/tasks/rebuild.yml @@ -1,11 +1,11 @@ +--- - name: Create rebuild jobs for partition - include_tasks: + ansible.builtin.include_tasks: file: rebuild_partition.yml args: apply: - become: yes + become: true become_user: "{{ rebuild_job_user }}" loop: "{{ rebuild_job_partitions | split(',') }}" loop_control: loop_var: _rebuild_job_current_partition - diff --git a/ansible/roles/rebuild/tasks/rebuild_partition.yml b/ansible/roles/rebuild/tasks/rebuild_partition.yml index 3b319e6..35c748a 100644 --- a/ansible/roles/rebuild/tasks/rebuild_partition.yml +++ b/ansible/roles/rebuild/tasks/rebuild_partition.yml @@ -1,4 +1,5 @@ -- name: Get list of nodes in partition +--- +- name: Get list of nodes in partition # noqa: no-changed-when ansible.builtin.command: cmd: >- sinfo @@ -9,13 +10,13 @@ register: _sinfo_partition when: rebuild_job_hostlist is not defined -- name: Expand rebuild_job_hostlist to host names +- name: Expand rebuild_job_hostlist to host names # noqa: no-changed-when ansible.builtin.command: cmd: "scontrol show hostnames {{ rebuild_job_hostlist }}" register: _scontrol_hostnames when: rebuild_job_hostlist is defined -- name: Submit rebuild jobs +- name: Submit rebuild jobs # noqa: no-changed-when ansible.builtin.command: cmd: "{{ rebuild_job_template }}" 
loop: "{{ _scontrol_hostnames.stdout_lines | default(_sinfo_partition.stdout_lines) }}" diff --git a/ansible/roles/resolv_conf/README.md b/ansible/roles/resolv_conf/README.md index 3746407..781ec49 100644 --- a/ansible/roles/resolv_conf/README.md +++ b/ansible/roles/resolv_conf/README.md @@ -3,9 +3,11 @@ Template out `/etc/resolv.conf`. ## Role variables + - `resolv_conf_nameservers`: List of up to 3 nameserver addresses. Notes: + - `NetworkManager` (if used) will be prevented from rewriting this file on boot. - If `/etc/resolv.conf` includes `127.0.0.1` (e.g. due to a FreeIPA server installation), then `resolv_conf_nameservers` is ignored and this role does not change `/etc/resolv.conf` - For hosts in the `resolv_conf` group, the `/etc/resolv.conf` created with `resolv_conf_nameservers` will diff --git a/ansible/roles/resolv_conf/defaults/main.yml b/ansible/roles/resolv_conf/defaults/main.yml index 37c97b7..44e2d85 100644 --- a/ansible/roles/resolv_conf/defaults/main.yml +++ b/ansible/roles/resolv_conf/defaults/main.yml @@ -1 +1,2 @@ +--- resolv_conf_nameservers: [] diff --git a/ansible/roles/resolv_conf/tasks/main.yml b/ansible/roles/resolv_conf/tasks/main.yml index 486ec18..41ef9c1 100644 --- a/ansible/roles/resolv_conf/tasks/main.yml +++ b/ansible/roles/resolv_conf/tasks/main.yml @@ -1,3 +1,4 @@ +--- - name: Read nameservers from /etc/resolv.conf ansible.builtin.slurp: src: /etc/resolv.conf @@ -27,4 +28,4 @@ ansible.builtin.systemd: name: NetworkManager state: reloaded - when: _copy_nm_config.changed | default(false) + when: _copy_nm_config.changed | default(false) # noqa: no-handler diff --git a/ansible/roles/slurm_exporter/README.md b/ansible/roles/slurm_exporter/README.md index 7ade273..3b42f13 100644 --- a/ansible/roles/slurm_exporter/README.md +++ b/ansible/roles/slurm_exporter/README.md @@ -1,37 +1,34 @@ -slurm_exporter -============== +# slurm_exporter -Build, install and configure a Prometheus exporter for metrics about Slurm itself: https://github.com/vpenso/prometheus-slurm-exporter/ +Build, install and configure a Prometheus exporter for metrics about Slurm itself: -Requirements ------------- +## Requirements Rocky Linux 8.5 host. -Role Variables --------------- +## Role Variables See `defaults/main.yml` -Dependencies ------------- +## Dependencies None. -Example Playbook ----------------- +## Example Playbook - - name: Deploy Slurm exporter - hosts: control - become: true - tags: slurm_exporter - tasks: - - import_role: - name: slurm_exporter +```yaml +- name: Deploy Slurm exporter + hosts: control + become: true + tags: slurm_exporter + tasks: + - import_role: + name: slurm_exporter +``` Prometheus scrape configuration for this might look like: -``` +```text - job_name: "slurm_exporter" scrape_interval: 30s scrape_timeout: 30s @@ -40,12 +37,10 @@ Prometheus scrape configuration for this might look like: - "{{ openhpc_slurm_control_host }}:9341" ``` -License -------- +## License Apache v2 -Author Information ------------------- +## Author Information StackHPC Ltd. 
diff --git a/ansible/roles/slurm_exporter/defaults/main.yml b/ansible/roles/slurm_exporter/defaults/main.yml index eda259b..d0b5a0f 100644 --- a/ansible/roles/slurm_exporter/defaults/main.yml +++ b/ansible/roles/slurm_exporter/defaults/main.yml @@ -1,5 +1,5 @@ --- # see https://github.com/stackhpc/prometheus-slurm-exporter/releases - version follows upstream, release is stackhpc build -slurm_exporter_version: '0.21' -slurm_exporter_release: '1' +slurm_exporter_version: "0.21" +slurm_exporter_release: "1" slurm_exporter_state: started diff --git a/ansible/roles/slurm_exporter/handlers/main.yml b/ansible/roles/slurm_exporter/handlers/main.yml index b55c9c6..33266fe 100644 --- a/ansible/roles/slurm_exporter/handlers/main.yml +++ b/ansible/roles/slurm_exporter/handlers/main.yml @@ -1,7 +1,7 @@ --- - name: Restart slurm exporter become: true - systemd: + ansible.builtin.systemd: daemon_reload: true name: prometheus-slurm-exporter state: restarted diff --git a/ansible/roles/slurm_exporter/tasks/configure.yml b/ansible/roles/slurm_exporter/tasks/configure.yml index e511be0..d8f2aae 100644 --- a/ansible/roles/slurm_exporter/tasks/configure.yml +++ b/ansible/roles/slurm_exporter/tasks/configure.yml @@ -1,5 +1,5 @@ - name: Ensure slurm exporter state - systemd: + ansible.builtin.systemd: name: prometheus-slurm-exporter state: "{{ slurm_exporter_state }}" enabled: true diff --git a/ansible/roles/slurm_exporter/tasks/install.yml b/ansible/roles/slurm_exporter/tasks/install.yml index cba7aa9..48196dd 100644 --- a/ansible/roles/slurm_exporter/tasks/install.yml +++ b/ansible/roles/slurm_exporter/tasks/install.yml @@ -1,8 +1,10 @@ +--- - name: Install slurm_exporter package - dnf: - name: "https://github.com/stackhpc/prometheus-slurm-exporter/releases/download/{{ slurm_exporter_version }}/prometheus-slurm-exporter-{{ slurm_exporter_version }}-{{slurm_exporter_release}}.el8.x86_64.rpm" - disable_gpg_check: yes + # checkov:skip=CKV2_ANSIBLE_4: "Ensure that packages with untrusted or missing GPG signatures are not used by dnf" + ansible.builtin.dnf: + # yamllint disable-line rule:line-length + name: "https://github.com/stackhpc/prometheus-slurm-exporter/releases/download/{{ slurm_exporter_version }}/prometheus-slurm-exporter-{{ slurm_exporter_version }}-{{ slurm_exporter_release }}.el8.x86_64.rpm" + disable_gpg_check: true notify: Restart slurm exporter -- meta: flush_handlers - +- ansible.builtin.meta: flush_handlers diff --git a/ansible/roles/slurm_exporter/tasks/main.yml b/ansible/roles/slurm_exporter/tasks/main.yml index 0171113..cc29fba 100644 --- a/ansible/roles/slurm_exporter/tasks/main.yml +++ b/ansible/roles/slurm_exporter/tasks/main.yml @@ -1,3 +1,3 @@ --- -- import_tasks: install.yml -- import_tasks: configure.yml +- ansible.builtin.import_tasks: install.yml +- ansible.builtin.import_tasks: configure.yml diff --git a/ansible/roles/slurm_recompile/README.md b/ansible/roles/slurm_recompile/README.md index e42572a..27b162c 100644 --- a/ansible/roles/slurm_recompile/README.md +++ b/ansible/roles/slurm_recompile/README.md @@ -1,28 +1,22 @@ # slurm_recompile -================= Recompiles slurm from source RPMs and installs the packages that were built. -Requirements ------------- +## Requirements -Role Variables --------------- +## Role Variables See `defaults/main.yml`. 
-Dependencies ------------- +## Dependencies -Example Playbook ----------------- +## Example Playbook - hosts: compute tasks: - import_role: name: slurm_recompile -License -------- +## License Apache-2.0 diff --git a/ansible/roles/slurm_recompile/tasks/main.yml b/ansible/roles/slurm_recompile/tasks/main.yml index 4720a6a..22961d6 100644 --- a/ansible/roles/slurm_recompile/tasks/main.yml +++ b/ansible/roles/slurm_recompile/tasks/main.yml @@ -1,6 +1,6 @@ --- - name: Get facts about CUDA installation - import_role: + ansible.builtin.import_role: name: cuda tasks_from: facts.yml @@ -9,15 +9,16 @@ manager: auto - name: Set fact containing slurm package facts - set_fact: + ansible.builtin.set_fact: slurm_package: "{{ ansible_facts.packages['slurm-slurmd-ohpc'].0 }}" - name: Install build packages ansible.builtin.dnf: name: "{{ slurm_recompile_build_packages }}" -- name: Recompile and install slurm packages - shell: | +- name: Recompile and install slurm packages # noqa: no-changed-when + # yamllint disable rule:line-length + ansible.builtin.shell: | #!/bin/bash source /etc/profile set -eux @@ -27,17 +28,18 @@ dnf builddep -y slurm.spec rpmbuild -bb{% if slurm_recompile_with_nvml | bool %} -D "_with_nvml --with-nvml=/usr/local/cuda-{{ cuda_facts_version_short }}/targets/x86_64-linux/"{% endif %} slurm.spec dnf reinstall -y /root/rpmbuild/RPMS/x86_64/*.rpm + # yamllint enable rule:line-length become: true - name: Workaround missing symlink # Workaround path issue: https://groups.google.com/g/slurm-users/c/cvGb4JnK8BY - command: ln -s /lib64/libnvidia-ml.so.1 /lib64/libnvidia-ml.so + ansible.builtin.command: ln -s /lib64/libnvidia-ml.so.1 /lib64/libnvidia-ml.so args: creates: /lib64/libnvidia-ml.so when: slurm_recompile_with_nvml | bool -- name: Cleanup Dependencies - shell: | +- name: Cleanup Dependencies # noqa: no-changed-when + ansible.builtin.shell: | #!/bin/bash set -eux set -o pipefail diff --git a/ansible/roles/slurm_stats/README.md b/ansible/roles/slurm_stats/README.md index f8bd38c..c67e2c0 100644 --- a/ansible/roles/slurm_stats/README.md +++ b/ansible/roles/slurm_stats/README.md @@ -1,33 +1,25 @@ -stackhpc.slurm_openstack_tools.slurm-stats -========================================== +# stackhpc.slurm_openstack_tools.slurm-stats -Configures slurm-stats from https://github.com/stackhpc/slurm-openstack-tools.git which +Configures slurm-stats from which transforms sacct output into a form that is more amenable for importing into elasticsearch/loki. -Requirements ------------- +## Requirements -Role Variables --------------- +## Role Variables See `defaults/main.yml`. 
-Dependencies ------------- +## Dependencies -Example Playbook ----------------- +## Example Playbook - hosts: compute tasks: - import_role: name: slurm_stats - -License -------- +## License Apache-2.0 -Author Information ------------------- +## Author Information diff --git a/ansible/roles/slurm_stats/tasks/configure.yml b/ansible/roles/slurm_stats/tasks/configure.yml index 6bd87b2..e83c33f 100644 --- a/ansible/roles/slurm_stats/tasks/configure.yml +++ b/ansible/roles/slurm_stats/tasks/configure.yml @@ -1,13 +1,14 @@ --- - name: Create a directory to house the log files - file: + ansible.builtin.file: state: directory path: /var/log/slurm-stats + mode: "0755" become: true - name: Create cron job - cron: + ansible.builtin.cron: name: Generate slurm stats minute: "*/5" user: root @@ -17,7 +18,7 @@ become: true - name: Setup log rotate - copy: + ansible.builtin.copy: content: | # WARNING: This file is managed by ansible, do not modify. /var/log/slurm-stats/finished_jobs.json { @@ -27,4 +28,5 @@ delaycompress } dest: /etc/logrotate.d/slurm-stats + mode: "0644" become: true diff --git a/ansible/roles/slurm_stats/tasks/install.yml b/ansible/roles/slurm_stats/tasks/install.yml index 748272e..981bf84 100644 --- a/ansible/roles/slurm_stats/tasks/install.yml +++ b/ansible/roles/slurm_stats/tasks/install.yml @@ -1,5 +1,5 @@ --- - name: Setup slurm tools - include_role: + ansible.builtin.include_role: name: slurm_tools diff --git a/ansible/roles/slurm_stats/tasks/main.yml b/ansible/roles/slurm_stats/tasks/main.yml index 79d326c..e5e0787 100644 --- a/ansible/roles/slurm_stats/tasks/main.yml +++ b/ansible/roles/slurm_stats/tasks/main.yml @@ -1,4 +1,4 @@ --- -- include_tasks: install.yml -- include_tasks: configure.yml +- ansible.builtin.include_tasks: install.yml +- ansible.builtin.include_tasks: configure.yml diff --git a/ansible/roles/slurm_tools/README.md b/ansible/roles/slurm_tools/README.md index 9724c44..07911cb 100644 --- a/ansible/roles/slurm_tools/README.md +++ b/ansible/roles/slurm_tools/README.md @@ -1,10 +1,8 @@ -slurm_tools -========= +# slurm_tools -Install python-based tools from https://github.com/stackhpc/slurm-openstack-tools.git into `/opt/slurm-tools/bin/`. +Install python-based tools from into `/opt/slurm-tools/bin/`. -Role Variables --------------- +## Role Variables - `pytools_editable`: Optional bool. Whether to install the package using `pip`'s editable mode (installing source to `/opt/slurm-tools/src`). Default `false`. diff --git a/ansible/roles/slurm_tools/tasks/main.yml b/ansible/roles/slurm_tools/tasks/main.yml index deedb03..9f5eff0 100644 --- a/ansible/roles/slurm_tools/tasks/main.yml +++ b/ansible/roles/slurm_tools/tasks/main.yml @@ -1,33 +1,33 @@ --- -- name: install python3 - package: +- name: Install python3 + ansible.builtin.package: name: python3,git become: true - name: Create virtualenv directory - file: + ansible.builtin.file: path: /opt/slurm-tools owner: "{{ pytools_user }}" group: "{{ pytools_user }}" state: directory + mode: "0755" become: true -- block: - - name: Upgrade pip - # This needs to a separate step so that we use the updated version - # to install the packages below. 
- pip: - name: pip - - - name: Create virtualenv - pip: - name: "git+https://github.com/stackhpc/slurm-openstack-tools.git@{{ pytools_gitref }}#egg=slurm_openstack_tools" - editable: "{{ pytools_editable }}" - - module_defaults: +- module_defaults: ansible.builtin.pip: virtualenv: /opt/slurm-tools virtualenv_command: "{{ 'python3.9 -m venv' if ansible_distribution_major_version == '8' else 'python3 -m venv' }}" state: latest become: true become_user: "{{ pytools_user }}" + block: + - name: Upgrade pip + # This needs to a separate step so that we use the updated version + # to install the packages below. + ansible.builtin.pip: + name: pip + + - name: Create virtualenv + ansible.builtin.pip: + name: "git+https://github.com/stackhpc/slurm-openstack-tools.git@{{ pytools_gitref }}#egg=slurm_openstack_tools" + editable: "{{ pytools_editable }}" diff --git a/ansible/roles/squid/README.md b/ansible/roles/squid/README.md index e514c36..7b7b8db 100644 --- a/ansible/roles/squid/README.md +++ b/ansible/roles/squid/README.md @@ -35,5 +35,5 @@ Where noted these map to squid parameters of the same name without the `squid_` http_access allow localhost # Finally deny all other access to this proxy http_access deny all - + See squid parameter. diff --git a/ansible/roles/squid/defaults/main.yml b/ansible/roles/squid/defaults/main.yml index 7457bdc..b224d13 100644 --- a/ansible/roles/squid/defaults/main.yml +++ b/ansible/roles/squid/defaults/main.yml @@ -1,3 +1,4 @@ +--- squid_conf_template: squid.conf.j2 squid_started: true squid_enabled: true @@ -5,8 +6,8 @@ squid_enabled: true squid_cache_mem: "{{ undef(hint='squid_cache_mem required, e.g. \"12 GB\"') }}" squid_cache_dir: /var/spool/squid squid_cache_disk: "{{ undef(hint='squid_cache_disk (in MB) required, e.g. \"1024\"') }}" # always in MB -squid_maximum_object_size_in_memory: '64 MB' -squid_maximum_object_size: '200 MB' +squid_maximum_object_size_in_memory: "64 MB" +squid_maximum_object_size: "200 MB" squid_http_port: 3128 squid_acls: acl anywhere src all # rely on openstack security groups squid_http_access: | diff --git a/ansible/roles/squid/handlers/main.yml b/ansible/roles/squid/handlers/main.yml index 135d98d..7448a01 100644 --- a/ansible/roles/squid/handlers/main.yml +++ b/ansible/roles/squid/handlers/main.yml @@ -1,5 +1,6 @@ +--- - name: Restart squid - service: + ansible.builtin.service: name: squid state: restarted when: squid_started | bool diff --git a/ansible/roles/squid/tasks/configure.yml b/ansible/roles/squid/tasks/configure.yml index 0d4dec6..d1e49e3 100644 --- a/ansible/roles/squid/tasks/configure.yml +++ b/ansible/roles/squid/tasks/configure.yml @@ -1,5 +1,6 @@ +--- - name: Ensure squid cache directory exists - file: + ansible.builtin.file: path: "{{ squid_cache_dir }}" # based on what dnf package creates: owner: squid @@ -7,7 +8,7 @@ mode: u=rwx,g=rw,o= - name: Template squid configuration - template: + ansible.builtin.template: src: "{{ squid_conf_template }}" dest: /etc/squid/squid.conf owner: squid @@ -15,10 +16,9 @@ mode: ug=rwX,go= notify: Restart squid -- meta: flush_handlers - +- ansible.builtin.meta: flush_handlers - name: Ensure squid service state - systemd: + ansible.builtin.systemd: name: squid state: "{{ 'started' if squid_started | bool else 'stopped' }}" enabled: "{{ true if squid_enabled else false }}" diff --git a/ansible/roles/squid/tasks/install.yml b/ansible/roles/squid/tasks/install.yml index 672186c..d60af91 100644 --- a/ansible/roles/squid/tasks/install.yml +++ b/ansible/roles/squid/tasks/install.yml @@ -1,3 +1,4 
@@ +--- - name: Install squid package - dnf: + ansible.builtin.dnf: name: squid diff --git a/ansible/roles/squid/tasks/main.yml b/ansible/roles/squid/tasks/main.yml index 2b65e84..cc29fba 100644 --- a/ansible/roles/squid/tasks/main.yml +++ b/ansible/roles/squid/tasks/main.yml @@ -1,2 +1,3 @@ -- import_tasks: install.yml -- import_tasks: configure.yml +--- +- ansible.builtin.import_tasks: install.yml +- ansible.builtin.import_tasks: configure.yml diff --git a/ansible/roles/sshd/defaults/main.yml b/ansible/roles/sshd/defaults/main.yml index c7a83b8..ca2f8c7 100644 --- a/ansible/roles/sshd/defaults/main.yml +++ b/ansible/roles/sshd/defaults/main.yml @@ -1,3 +1,4 @@ +--- sshd_password_authentication: false sshd_disable_forwarding: true sshd_conf_src: sshd.conf.j2 diff --git a/ansible/roles/sshd/handlers/main.yml b/ansible/roles/sshd/handlers/main.yml index e11aa78..e3e8b1c 100644 --- a/ansible/roles/sshd/handlers/main.yml +++ b/ansible/roles/sshd/handlers/main.yml @@ -1,4 +1,5 @@ +--- - name: Restart sshd - systemd: + ansible.builtin.systemd: name: sshd state: restarted diff --git a/ansible/roles/sshd/tasks/configure.yml b/ansible/roles/sshd/tasks/configure.yml index 359d782..f47d48c 100644 --- a/ansible/roles/sshd/tasks/configure.yml +++ b/ansible/roles/sshd/tasks/configure.yml @@ -1,17 +1,18 @@ +--- - name: Grab facts to determine distribution - setup: + ansible.builtin.setup: - name: Ensure drop in directory exists - file: + ansible.builtin.file: path: /etc/ssh/sshd_config.d/ state: directory owner: root group: root - mode: 700 + mode: "0700" become: true - name: Ensure drop in configuration is included - blockinfile: + ansible.builtin.blockinfile: dest: /etc/ssh/sshd_config content: | # To modify the system-wide sshd configuration, create .conf @@ -32,7 +33,7 @@ # Include /etc/ssh/sshd_config.d/*.conf # early on, which is generally held to be the correct approach, so adding # values to the end of that file won't work - template: + ansible.builtin.template: src: "{{ sshd_conf_src }}" dest: "{{ sshd_conf_dest }}" owner: root diff --git a/ansible/roles/sshd/tasks/export.yml b/ansible/roles/sshd/tasks/export.yml index 0c153ca..a21daee 100644 --- a/ansible/roles/sshd/tasks/export.yml +++ b/ansible/roles/sshd/tasks/export.yml @@ -1,6 +1,7 @@ +--- # Exclusively used for compute-init - name: Inject host specific config template - template: + ansible.builtin.template: src: "{{ sshd_conf_src }}" dest: "/exports/cluster/hostconfig/{{ inventory_hostname }}/sshd.conf" owner: root diff --git a/ansible/roles/sshd/tasks/main.yml b/ansible/roles/sshd/tasks/main.yml index 84f4934..ec83d2b 100644 --- a/ansible/roles/sshd/tasks/main.yml +++ b/ansible/roles/sshd/tasks/main.yml @@ -1 +1,2 @@ -- import_tasks: configure.yml +--- +- ansible.builtin.import_tasks: configure.yml diff --git a/ansible/roles/sssd/README.md b/ansible/roles/sssd/README.md index 5c9b50e..ad6de4a 100644 --- a/ansible/roles/sssd/README.md +++ b/ansible/roles/sssd/README.md @@ -2,7 +2,6 @@ Install and configure [sssd](https://sssd.io/docs/introduction.html). - ## Role variables The only required configuration is to create a [sssd.conf](https://www.mankier.com/5/sssd.conf) template at the location specified by `sssd_conf_src`. 
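As the sssd README above notes, the only required configuration is an `sssd.conf` template at the location given by `sssd_conf_src`. A minimal illustrative sketch for an LDAP identity source follows; the domain name, server URI and search base are placeholders rather than appliance defaults, and `sssd_install_ldap: true` would be needed so the role installs the LDAP provider packages:

```ini
# sssd.conf.j2 - illustrative example only
[sssd]
services = nss, pam
domains = example

[domain/example]
id_provider = ldap
ldap_uri = ldaps://ldap.example.com
ldap_search_base = dc=example,dc=com
cache_credentials = true
```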
diff --git a/ansible/roles/sssd/defaults/main.yml b/ansible/roles/sssd/defaults/main.yml index 5bc58c9..605e746 100644 --- a/ansible/roles/sssd/defaults/main.yml +++ b/ansible/roles/sssd/defaults/main.yml @@ -1,3 +1,4 @@ +--- sssd_packages: - sssd-common sssd_install_ldap: false diff --git a/ansible/roles/sssd/handlers/main.yml b/ansible/roles/sssd/handlers/main.yml index 72c36e7..4965b15 100644 --- a/ansible/roles/sssd/handlers/main.yml +++ b/ansible/roles/sssd/handlers/main.yml @@ -1,5 +1,6 @@ +--- - name: Restart sssd - systemd: + ansible.builtin.systemd: name: sssd state: restarted when: sssd_started | bool diff --git a/ansible/roles/sssd/tasks/configure.yml b/ansible/roles/sssd/tasks/configure.yml index c8ebd82..66d86f6 100644 --- a/ansible/roles/sssd/tasks/configure.yml +++ b/ansible/roles/sssd/tasks/configure.yml @@ -1,5 +1,6 @@ +--- - name: Manage sssd.conf configuration - template: + ansible.builtin.template: src: "{{ sssd_conf_src }}" dest: "{{ sssd_conf_dest }}" owner: root @@ -7,29 +8,28 @@ mode: u=rw,go= notify: "Restart sssd" -- meta: flush_handlers - +- ansible.builtin.meta: flush_handlers - name: Ensure sssd service state - systemd: + ansible.builtin.systemd: name: sssd state: "{{ 'started' if sssd_started | bool else 'stopped' }}" enabled: "{{ sssd_enabled | bool }}" - name: Get current authselect configuration - command: authselect current --raw + ansible.builtin.command: authselect current --raw changed_when: false failed_when: - _authselect_current.rc != 0 - "'No existing configuration detected' not in _authselect_current.stdout" register: _authselect_current # stdout: sssd with-mkhomedir -- name: Configure nsswitch and PAM for SSSD - command: "authselect select sssd --force{% if sssd_enable_mkhomedir | bool %} with-mkhomedir{% endif %}" +- name: Configure nsswitch and PAM for SSSD # noqa: no-changed-when + ansible.builtin.command: "authselect select sssd --force{% if sssd_enable_mkhomedir | bool %} with-mkhomedir{% endif %}" when: "'sssd' not in _authselect_current.stdout" - name: "Ensure oddjob is started" - service: + ansible.builtin.service: name: oddjobd - state: 'started' + state: "started" enabled: true - when: sssd_enable_mkhomedir | bool \ No newline at end of file + when: sssd_enable_mkhomedir | bool diff --git a/ansible/roles/sssd/tasks/export.yml b/ansible/roles/sssd/tasks/export.yml index 0be6674..6078786 100644 --- a/ansible/roles/sssd/tasks/export.yml +++ b/ansible/roles/sssd/tasks/export.yml @@ -1,9 +1,10 @@ +--- # Exclusively used for compute-init - name: Inject host specific config template - template: + ansible.builtin.template: src: "{{ sssd_conf_src }}" dest: "/exports/cluster/hostconfig/{{ inventory_hostname }}/sssd.conf" owner: root group: root mode: u=rw,go= - delegate_to: "{{ groups['control'] | first }}" \ No newline at end of file + delegate_to: "{{ groups['control'] | first }}" diff --git a/ansible/roles/sssd/tasks/install.yml b/ansible/roles/sssd/tasks/install.yml index 97aa82a..b7c8f11 100644 --- a/ansible/roles/sssd/tasks/install.yml +++ b/ansible/roles/sssd/tasks/install.yml @@ -1,13 +1,14 @@ +--- - name: Ensure sssd packages are installed - dnf: + ansible.builtin.dnf: name: "{{ sssd_packages + sssd_ldap_packages if (sssd_install_ldap | bool) else [] }}" - name: Control if sssd should start on boot # Needs to be done here to prevent starting after image build, is enabled by default - systemd: + ansible.builtin.systemd: name: sssd enabled: "{{ sssd_enabled | bool }}" - name: Ensure mkhomedir packages are installed if required - dnf: + 
ansible.builtin.dnf: name: "{{ sssd_mkhomedir_packages }}" diff --git a/ansible/roles/sssd/tasks/main.yml b/ansible/roles/sssd/tasks/main.yml index 2b65e84..cc29fba 100644 --- a/ansible/roles/sssd/tasks/main.yml +++ b/ansible/roles/sssd/tasks/main.yml @@ -1,2 +1,3 @@ -- import_tasks: install.yml -- import_tasks: configure.yml +--- +- ansible.builtin.import_tasks: install.yml +- ansible.builtin.import_tasks: configure.yml diff --git a/ansible/roles/systemd/README.md b/ansible/roles/systemd/README.md index e18599f..9ec8cb8 100644 --- a/ansible/roles/systemd/README.md +++ b/ansible/roles/systemd/README.md @@ -2,18 +2,17 @@ Create drop-in files for systemd services. -# Role Variables +## Role Variables + - `systemd_dropins`: Required. A mapping where keys = systemd service name, values are a dict as follows: - - `group`: Required str. Inventory group this drop-in applies to. - - `comment`: Optional str. Comment describing reason for drop-in. - - `content`: Required str. Content of drop-in file. -# systemd + - `group`: Required str. Inventory group this drop-in applies to. + - `comment`: Optional str. Comment describing reason for drop-in. + - `content`: Required str. Content of drop-in file. -Create drop-in files for systemd services. +## Role Variables - optional restart -# Role Variables - `systemd_dropins`: Required. A mapping where keys = systemd service name, values are a dict as follows: - - `group`: Required str. Inventory group this drop-in applies to. - - `comment`: Optional str. Comment describing reason for drop-in. - - `content`: Required str. Content of drop-in file. + - `group`: Required str. Inventory group this drop-in applies to. + - `comment`: Optional str. Comment describing reason for drop-in. + - `content`: Required str. Content of drop-in file. - `systemd_restart`: Optional bool. Whether to reload unit definitions and restart services. Default `false`. diff --git a/ansible/roles/systemd/defaults/main.yml b/ansible/roles/systemd/defaults/main.yml index 7ca54aa..29b9b75 100644 --- a/ansible/roles/systemd/defaults/main.yml +++ b/ansible/roles/systemd/defaults/main.yml @@ -1,4 +1,5 @@ -#systemd_dropins: +--- +# systemd_dropins: # : # group: # comment: diff --git a/ansible/roles/systemd/tasks/main.yml b/ansible/roles/systemd/tasks/main.yml index 822a676..8fa6f48 100644 --- a/ansible/roles/systemd/tasks/main.yml +++ b/ansible/roles/systemd/tasks/main.yml @@ -1,11 +1,12 @@ +--- # NB: As `systemd_TODO:` is defined in group_vars/all, all tasks here are conditional on group. 
- name: Make directory for unit dropins - file: + ansible.builtin.file: path: "/etc/systemd/system/{{ item.key }}.service.d/" state: directory owner: root group: root - mode: 0644 + mode: "0644" loop: "{{ systemd_dropins | dict2items }}" when: "item.value.group in group_names" @@ -17,14 +18,14 @@ dest: "/etc/systemd/system/{{ item.key }}.service.d/slurm_app.conf" owner: root group: root - mode: 0644 + mode: "0644" loop: "{{ systemd_dropins | dict2items }}" register: _systemd_dropins when: "item.value.group in group_names" -- name: Reload unit definitions - ansible.builtin.shell: - cmd: systemctl daemon-reload +- name: Reload unit definitions # noqa: no-changed-when + ansible.builtin.command: + cmd: systemctl daemon-reload # noqa: command-instead-of-module when: - _systemd_dropins.changed - systemd_restart | default(false) | bool diff --git a/ansible/roles/topology/README.md b/ansible/roles/topology/README.md index 0571344..6bdeaae 100644 --- a/ansible/roles/topology/README.md +++ b/ansible/roles/topology/README.md @@ -1,5 +1,4 @@ -topology -======== +# topology Templates out /etc/slurm/topology.conf file based on an OpenStack project for use by Slurm's [topology/tree plugin.](https://slurm.schedmd.com/topology.html) Models @@ -12,22 +11,23 @@ reconfigure an already running cluster after a `ansible/site.yml` run. You will to run the `ansible/adhoc/restart-slurm.yml` playbook for changes to topology.conf to be recognised. -Role Variables --------------- +## Role Variables - `topology_nodes:`: Required list of strs. List of inventory hostnames of nodes to include in topology tree. Must be set to include all compute nodes in Slurm cluster. Default `[]`. - `topology_conf_template`: Optional str. Path to Jinja2 template of topology.conf file. Default `templates/topology.conf.j2` -- `topology_above_rack_topology`: Optionally multiline str. Used to define topology above racks/AZs if - you wish to partition racks further under different logical switches. New switches above should be - defined as [SwitchName lines](https://slurm.schedmd.com/topology.html#hierarchical) referencing - rack Availability Zones under that switch in their `Switches fields`. These switches must themselves - be under a top level switch. e.g - ``` - topology_above_rack_topology: | - SwitchName=rack-group-1 Switches=rack-az-1,rack-az-2 - SwitchName=rack-group-2 Switches=rack-az-3,rack-az-4 - SwitchName=top-level Switches=rack-group-1,rack-group-2 - ``` - Defaults to an empty string, which causes all AZs to be put under a - single top level switch. \ No newline at end of file +- `topology_above_rack_topology`: Optionally multiline str. Used to define topology above racks/AZs if + you wish to partition racks further under different logical switches. New switches above should be + defined as [SwitchName lines](https://slurm.schedmd.com/topology.html#hierarchical) referencing + rack Availability Zones under that switch in their `Switches fields`. These switches must themselves + be under a top level switch. e.g + + ```yaml + topology_above_rack_topology: | + SwitchName=rack-group-1 Switches=rack-az-1,rack-az-2 + SwitchName=rack-group-2 Switches=rack-az-3,rack-az-4 + SwitchName=top-level Switches=rack-group-1,rack-group-2 + ``` + + Defaults to an empty string, which causes all AZs to be put under a + single top level switch. 
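For orientation, the topology role renders the standard Slurm `topology.conf` syntax, where leaf switches carry `Nodes=` lists and higher-level switches reference them via `Switches=`. A sketch of the kind of output produced, grouping nodes by availability zone and hypervisor host-id prefix (the switch names and node names below are illustrative, not the template's exact naming):

```text
# /etc/slurm/topology.conf - illustrative only
SwitchName=az-1 Switches=4f2a,4f2b
SwitchName=4f2a Nodes=mycluster-compute-[0-1]
SwitchName=4f2b Nodes=mycluster-compute-[2-3]
```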
diff --git a/ansible/roles/topology/defaults/main.yml b/ansible/roles/topology/defaults/main.yml index 6b62243..87801e8 100644 --- a/ansible/roles/topology/defaults/main.yml +++ b/ansible/roles/topology/defaults/main.yml @@ -5,4 +5,3 @@ topology_nodes: [] topology_conf_template: templates/topology.conf.j2 topology_above_rack_topology: "" - diff --git a/ansible/roles/topology/library/map_hosts.py b/ansible/roles/topology/library/map_hosts.py index 1961132..42f22ee 100644 --- a/ansible/roles/topology/library/map_hosts.py +++ b/ansible/roles/topology/library/map_hosts.py @@ -1,10 +1,11 @@ #!/usr/bin/python +# pylint: disable=missing-module-docstring # Copyright: (c) 2025, StackHPC # Apache 2 License -from ansible.module_utils.basic import AnsibleModule -import openstack +import openstack # pylint: disable=import-error +from ansible.module_utils.basic import AnsibleModule # pylint: disable=import-error DOCUMENTATION = """ --- @@ -47,35 +48,39 @@ - mycluster-compute-1 """ + def min_prefix(uuids, start=4): - """ Take a list of uuids and return the smallest length >= start which keeps them unique """ + """Take a list of uuids and return the smallest length >= start which keeps them unique""" for length in range(start, len(uuids[0])): prefixes = set(uuid[:length] for uuid in uuids) if len(prefixes) == len(uuids): return length + # Fallback to returning the full length + return len(uuids[0]) + -def run_module(): - module_args = dict( - compute_vms=dict(type='list', elements='str', required=True) - ) +def run_module(): # pylint: disable=missing-function-docstring + module_args = {"compute_vms": {"type": "list", "elements": "str", "required": True}} module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) conn = openstack.connection.from_config() - servers = [s for s in conn.compute.servers() if s["name"] in module.params["compute_vms"]] + servers = [ + s for s in conn.compute.servers() if s["name"] in module.params["compute_vms"] + ] topo = {} all_host_ids = [] for s in servers: - az = s['availability_zone'] - host_id = s['host_id'] - if host_id != '': # empty string if e.g. server is shelved + az = s["availability_zone"] + host_id = s["host_id"] + if host_id != "": # empty string if e.g. server is shelved all_host_ids.append(host_id) if az not in topo: topo[az] = {} if host_id not in topo[az]: topo[az][host_id] = [] - topo[az][host_id].append(s['name']) + topo[az][host_id].append(s["name"]) uuid_len = min_prefix(list(set(all_host_ids))) @@ -83,14 +88,14 @@ def run_module(): topo[az] = dict((k[:uuid_len], v) for (k, v) in topo[az].items()) result = { - "changed": False, + "changed": False, "topology": topo, } - + module.exit_json(**result) -def main(): +def main(): # pylint: disable=missing-function-docstring run_module() diff --git a/ansible/roles/topology/tasks/main.yml b/ansible/roles/topology/tasks/main.yml index 8debdde..3872a0c 100644 --- a/ansible/roles/topology/tasks/main.yml +++ b/ansible/roles/topology/tasks/main.yml @@ -13,4 +13,4 @@ dest: /etc/slurm/topology.conf owner: root group: root - mode: 0644 + mode: "0644" diff --git a/ansible/roles/tuned/README.md b/ansible/roles/tuned/README.md index 34885af..a4626c4 100644 --- a/ansible/roles/tuned/README.md +++ b/ansible/roles/tuned/README.md @@ -1,14 +1,11 @@ -tuned -========= +# tuned This role configures the TuneD tool for system tuning, ensuring optimal performance based on the profile settings defined. 
-Role Variables --------------- +## Role Variables See the [TuneD documentation](https://docs.redhat.com/en/documentation/red_hat_enterprise_linux/9/html/monitoring_and_managing_system_status_and_performance/getting-started-with-tuned_monitoring-and-managing-system-status-and-performance) for profile details. - - `tuned_profile_baremetal`: Optional str. Name of default profile for non-virtualised hosts. Default `hpc-compute`. - `tuned_profile_vm`: Optional str. Name of default profile for virtualised hosts. Default `virtual-guest`. - `tuned_profile`: Optional str. Name of profile to apply to host. Defaults to `tuned_profile_baremetal` or `tuned_profile_vm` as appropriate. diff --git a/ansible/roles/tuned/defaults/main.yml b/ansible/roles/tuned/defaults/main.yml index 1426bbe..8ddb139 100644 --- a/ansible/roles/tuned/defaults/main.yml +++ b/ansible/roles/tuned/defaults/main.yml @@ -4,4 +4,4 @@ tuned_profile_baremetal: hpc-compute tuned_profile_vm: virtual-guest tuned_profile: "{{ tuned_profile_baremetal if ansible_virtualization_role != 'guest' else tuned_profile_vm }}" tuned_enabled: true -tuned_started: true +tuned_started: true diff --git a/ansible/roles/tuned/tasks/configure.yml b/ansible/roles/tuned/tasks/configure.yml index cf122d1..fa10648 100644 --- a/ansible/roles/tuned/tasks/configure.yml +++ b/ansible/roles/tuned/tasks/configure.yml @@ -12,7 +12,7 @@ register: _tuned_profile_current changed_when: false -- name: Set TuneD profile +- name: Set TuneD profile # noqa: no-changed-when ansible.builtin.command: cmd: "tuned-adm profile {{ tuned_profile }}" when: diff --git a/ansible/roles/tuned/tasks/install.yml b/ansible/roles/tuned/tasks/install.yml index 0a2db4e..0890684 100644 --- a/ansible/roles/tuned/tasks/install.yml +++ b/ansible/roles/tuned/tasks/install.yml @@ -12,5 +12,6 @@ path: /usr/lib/tuned/hpc-compute/tuned.conf section: sysctl option: vm.min_free_kbytes - value: '>135168' + value: ">135168" no_extra_spaces: true + mode: "0644" diff --git a/ansible/roles/tuned/tasks/main.yml b/ansible/roles/tuned/tasks/main.yml index ef0bea2..cc29fba 100644 --- a/ansible/roles/tuned/tasks/main.yml +++ b/ansible/roles/tuned/tasks/main.yml @@ -1,3 +1,3 @@ --- -- import_tasks: install.yml -- import_tasks: configure.yml \ No newline at end of file +- ansible.builtin.import_tasks: install.yml +- ansible.builtin.import_tasks: configure.yml diff --git a/ansible/roles/zenith_proxy/defaults/main.yml b/ansible/roles/zenith_proxy/defaults/main.yml index 02267cb..748ad71 100644 --- a/ansible/roles/zenith_proxy/defaults/main.yml +++ b/ansible/roles/zenith_proxy/defaults/main.yml @@ -15,7 +15,7 @@ zenith_proxy_pod_name: "{{ zenith_proxy_service_name }}" zenith_proxy_client_container_name: "{{ zenith_proxy_client_service_name }}" zenith_proxy_mitm_container_name: "{{ zenith_proxy_mitm_service_name }}" -zenith_proxy_image_tag: '0.14.0' +zenith_proxy_image_tag: "0.14.0" zenith_proxy_client_image_repository: ghcr.io/azimuth-cloud/zenith-client zenith_proxy_client_image: "{{ zenith_proxy_client_image_repository }}:{{ zenith_proxy_image_tag }}" @@ -27,14 +27,12 @@ zenith_proxy_upstream_scheme: http zenith_proxy_upstream_host: "{{ undef(hint = 'zenith_proxy_upstream_host is required') }}" zenith_proxy_upstream_port: "{{ undef(hint = 'zenith_proxy_upstream_port is required') }}" zenith_proxy_upstream_read_timeout: - zenith_proxy_client_token: "{{ undef(hint = 'zenith_proxy_client_token is required') }}" zenith_proxy_client_auth_skip: false zenith_proxy_client_auth_params: {} - -zenith_proxy_mitm_enabled: no 
+zenith_proxy_mitm_enabled: false zenith_proxy_mitm_listen_port: 8080 -zenith_proxy_mitm_auth_inject: none # valid values are 'basic' and 'bearer' +zenith_proxy_mitm_auth_inject: none # valid values are 'basic' and 'bearer' zenith_proxy_mitm_auth_basic_username: >- {{ undef(hint = 'zenith_proxy_mitm_auth_basic_username is required') diff --git a/ansible/roles/zenith_proxy/files/podman-pod-infra-attach.sh b/ansible/roles/zenith_proxy/files/podman-pod-infra-attach.sh old mode 100644 new mode 100755 index aab232a..0cdfae2 --- a/ansible/roles/zenith_proxy/files/podman-pod-infra-attach.sh +++ b/ansible/roles/zenith_proxy/files/podman-pod-infra-attach.sh @@ -14,4 +14,4 @@ echo "[INFO] Finding infra container for pod '$1'" INFRA_CONTAINER_ID="$(podman pod inspect --format '{{.InfraContainerID}}' "$1")" echo "[INFO] Attaching to infra container '${INFRA_CONTAINER_ID}'" -exec podman container attach --no-stdin ${INFRA_CONTAINER_ID} +exec podman container attach --no-stdin "${INFRA_CONTAINER_ID}" diff --git a/ansible/roles/zenith_proxy/tasks/main.yml b/ansible/roles/zenith_proxy/tasks/main.yml index 1a42b04..7a4c034 100644 --- a/ansible/roles/zenith_proxy/tasks/main.yml +++ b/ansible/roles/zenith_proxy/tasks/main.yml @@ -1,68 +1,71 @@ --- - - name: Install script for attaching to pod infra containers - copy: + ansible.builtin.copy: src: podman-pod-infra-attach.sh dest: /usr/bin/ mode: +x become: true - name: Create systemd unit for Zenith pod - template: + ansible.builtin.template: src: pod.service.j2 dest: /etc/systemd/system/{{ zenith_proxy_service_name }}.service + mode: "0644" become: true register: zenith_proxy_pod_systemd_unit - name: Ensure Zenith pod is started and enabled - service: + ansible.builtin.service: name: "{{ zenith_proxy_service_name }}.service" state: "{{ 'restarted' if zenith_proxy_pod_systemd_unit is changed else 'started' }}" - enabled: yes + enabled: true daemon_reload: "{{ zenith_proxy_pod_systemd_unit is changed }}" become: true -- block: +- become: true + when: zenith_proxy_mitm_enabled + block: - name: Create systemd unit file for MITM proxy - template: + ansible.builtin.template: src: mitm.service.j2 dest: /etc/systemd/system/{{ zenith_proxy_mitm_service_name }}.service + mode: "0644" register: zenith_proxy_mitm_systemd_unit - name: Ensure MITM proxy is started and enabled - service: + ansible.builtin.service: name: "{{ zenith_proxy_mitm_service_name }}.service" state: "{{ 'restarted' if zenith_proxy_mitm_systemd_unit is changed else 'started' }}" - enabled: yes + enabled: true daemon_reload: "{{ zenith_proxy_mitm_systemd_unit is changed }}" - become: true - when: zenith_proxy_mitm_enabled - - name: Ensure Zenith config directory exists - file: + ansible.builtin.file: path: /etc/zenith/{{ zenith_proxy_service_name }} state: directory + mode: "0755" become: true - name: Write Zenith client configuration - template: + ansible.builtin.template: src: zenith-client.yaml.j2 dest: /etc/zenith/{{ zenith_proxy_service_name }}/client.yaml + mode: "0644" become: true register: zenith_proxy_client_config_file - name: Create directory to persist SSH key - file: + ansible.builtin.file: path: "{{ appliances_state_dir }}/{{ zenith_proxy_service_name }}-ssh" state: directory owner: "{{ zenith_proxy_podman_user }}" group: "{{ zenith_proxy_podman_user }}" + mode: "0755" become: true - name: Initialise Zenith client # Use a foreground command rather than the podman_container module as I could not # work out the combination of parameters that produced the desired behaviour :-( - command: >- 
+ ansible.builtin.command: >- podman run --name {{ zenith_proxy_service_name }}-init --replace @@ -79,14 +82,15 @@ "token has already been used" not in zenith_proxy_client_init.stderr - name: Create systemd unit file for Zenith client - template: + ansible.builtin.template: src: client.service.j2 dest: /etc/systemd/system/{{ zenith_proxy_client_service_name }}.service + mode: "0644" become: true register: zenith_proxy_client_systemd_unit - name: Ensure Zenith client is started and enabled - service: + ansible.builtin.service: name: "{{ zenith_proxy_client_service_name }}.service" state: >- {{ @@ -98,6 +102,6 @@ ) else 'started' }} - enabled: yes + enabled: true daemon_reload: "{{ zenith_proxy_client_systemd_unit is changed }}" become: true diff --git a/ansible/site.yml b/ansible/site.yml index faeca23..79b71e1 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -1,41 +1,38 @@ --- - - name: Run pre.yml hook vars: # hostvars not available here, so have to recalculate environment root: appliances_environment_root: "{{ ansible_inventory_sources | last | dirname }}" hook_path: "{{ appliances_environment_root }}/hooks/pre.yml" - import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}" + ansible.builtin.import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}" when: hook_path | exists -- import_playbook: validate.yml +- ansible.builtin.import_playbook: validate.yml when: appliances_validate | default(true) -- import_playbook: bootstrap.yml +- ansible.builtin.import_playbook: bootstrap.yml - name: Run post-bootstrap.yml hook vars: # hostvars not available here, so have to recalculate environment root: appliances_environment_root: "{{ ansible_inventory_sources | last | dirname }}" hook_path: "{{ appliances_environment_root }}/hooks/post-bootstrap.yml" - import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}" + ansible.builtin.import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}" when: hook_path | exists -- import_playbook: iam.yml -- import_playbook: filesystems.yml -- import_playbook: extras.yml -- import_playbook: slurm.yml -- import_playbook: portal.yml -- import_playbook: monitoring.yml +- ansible.builtin.import_playbook: iam.yml +- ansible.builtin.import_playbook: filesystems.yml +- ansible.builtin.import_playbook: extras.yml +- ansible.builtin.import_playbook: slurm.yml +- ansible.builtin.import_playbook: portal.yml +- ansible.builtin.import_playbook: monitoring.yml - name: Run post.yml hook vars: # hostvars not available here, so have to recalculate environment root: appliances_environment_root: "{{ ansible_inventory_sources | last | dirname }}" hook_path: "{{ appliances_environment_root }}/hooks/post.yml" - import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}" + ansible.builtin.import_playbook: "{{ hook_path if hook_path | exists else 'noop.yml' }}" when: hook_path | exists -- import_playbook: final.yml - -... 
\ No newline at end of file +- ansible.builtin.import_playbook: final.yml diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 3529755..d6d306e 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -1,22 +1,21 @@ --- - - name: Setup DB hosts: mysql become: true tags: - mysql tasks: - - include_role: - name: mysql + - ansible.builtin.include_role: + name: mysql - name: Setup slurm-driven rebuild hosts: rebuild:!builder - become: yes + become: true tags: - rebuild - openhpc tasks: - - include_role: + - ansible.builtin.include_role: name: rebuild tasks_from: "{{ 'configure.yml' if appliances_mode == 'configure' else 'main.yml' }}" @@ -24,59 +23,59 @@ hosts: - compute - login - become: yes + become: true tags: - openhpc tasks: - - name: set memory limits - lineinfile: + - name: Set memory limits + ansible.builtin.lineinfile: path: /etc/security/limits.conf - regexp: '\* soft memlock unlimited' + regexp: "\\* soft memlock unlimited" line: "* soft memlock unlimited" - name: Block ssh to compute nodes for non-privileged users without running jobs hosts: compute - become: yes + become: true tags: - openhpc tasks: - name: Configure sshd pam module - blockinfile: + ansible.builtin.blockinfile: path: /etc/pam.d/sshd - insertafter: 'account\s+required\s+pam_nologin.so' + insertafter: "account\\s+required\\s+pam_nologin.so" block: | account sufficient pam_access.so account required pam_slurm.so - name: Configure login access control - blockinfile: + ansible.builtin.blockinfile: path: /etc/security/access.conf block: | +:adm:ALL -:ALL:ALL - # vagrant uses (deprecated) ansible_ssh_user + # vagrant uses (deprecated) ansible_ssh_user - name: Setup slurm hosts: openhpc - become: yes + become: true tags: - openhpc tasks: - - include_role: + - ansible.builtin.include_role: name: topology # Gated on topology group having compute nodes but role also # needs to run on control and login nodes when: - appliances_mode == 'configure' - groups['topology'] | length > 0 - - include_role: + - ansible.builtin.include_role: name: stackhpc.openhpc tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'main.yml' }}" - name: Setup Node Health Checks # Has to be done here as it requires openhpc repos etc for installation hosts: nhc:!builder - become: yes + become: true tags: nhc tasks: - - include_role: + - ansible.builtin.include_role: name: nhc diff --git a/ansible/validate.yml b/ansible/validate.yml index 034f469..2352fff 100644 --- a/ansible/validate.yml +++ b/ansible/validate.yml @@ -1,5 +1,4 @@ --- - # Fail early if configuration is invalid - name: Validate secrets created @@ -9,7 +8,7 @@ - validate - passwords tasks: - - import_role: + - ansible.builtin.import_role: name: passwords tasks_from: validate.yml @@ -26,7 +25,7 @@ # the actual installed version. # So this compares requirements.yml against a .last version produced by a # successful dev/setup-env.sh run. - - assert: + - ansible.builtin.assert: that: "{{ _requirements_current == _requirements_installed }}" fail_msg: | Ansible Galaxy installs are out of date: @@ -34,7 +33,12 @@ {% for req in _requirements_installed | difference(_requirements_current) %} {{ req }} {% endfor %} - + + _requirements_current: + .{{ _requirements_current }}. + _requirements_installed: + .{{ _requirements_installed }}. + Run dev/setup-env.sh to fix this. vars: # note difference filter requires lists, so need to rearrange yaml from files. 
@@ -53,7 +57,7 @@ - opentofu tasks: - name: Check templated groups - assert: + ansible.builtin.assert: that: - item in groups - groups[item] | length > 0 @@ -66,7 +70,7 @@ - compute - login - name: Check templated 'all' vars - assert: + ansible.builtin.assert: that: - openhpc_cluster_name is defined - cluster_domain_suffix is defined @@ -81,7 +85,7 @@ - validate - openhpc tasks: - - assert: + - ansible.builtin.assert: that: groups['control'] | length fail_msg: "no hosts found in group 'control' - has control node been deployed?" @@ -92,7 +96,7 @@ - validate - openhpc tasks: - - import_role: + - ansible.builtin.import_role: name: stackhpc.openhpc tasks_from: validate.yml @@ -103,7 +107,7 @@ - validate - filebeat tasks: - - import_role: + - ansible.builtin.import_role: name: filebeat tasks_from: validate.yml tags: validate @@ -119,17 +123,17 @@ - openondemand_server - grafana tasks: - - import_role: + - ansible.builtin.import_role: name: openondemand tasks_from: validate.yml - # This set of tasks will run if there are grafana hosts configured. - # It is a valid configuration to have a grafana group with hosts + # This set of tasks will run if there are grafana hosts configured. + # It is a valid configuration to have a grafana group with hosts # when *not* deploying openondemand. This would mean that openondemand # vars validated in the below task are not set in a way that passes # this set of validation tasks. To ensure that this validation does # not fail with a valid config, only run these tasks when the # openondemand group both exists *and* contains hosts. - when: + when: - "'openondemand' in groups" - groups['openondemand'] | length > 0 tags: @@ -143,7 +147,7 @@ - validate - freeipa tasks: - - import_role: + - ansible.builtin.import_role: name: freeipa tasks_from: validate.yml @@ -153,16 +157,16 @@ - validate - lustre tasks: - - import_role: + - ansible.builtin.import_role: name: lustre tasks_from: validate.yml - name: Validate vGPU configuration hosts: vgpu - become: yes - gather_facts: yes + become: true + gather_facts: true tags: vgpu tasks: - - include_role: + - ansible.builtin.include_role: name: stackhpc.linux.vgpu tasks_from: validate.yml diff --git a/cookiecutter/cookiecutter.json b/cookiecutter/cookiecutter.json index 93b8e7e..3eb7acf 100644 --- a/cookiecutter/cookiecutter.json +++ b/cookiecutter/cookiecutter.json @@ -1,4 +1,4 @@ { - "environment": "foo", - "description" : "Describe the environment here" + "environment": "foo", + "description": "Describe the environment here" } diff --git a/cookiecutter/{{cookiecutter.environment}}/README.md b/cookiecutter/{{cookiecutter.environment}}/README.md index 202ca67..89fe6b4 100644 --- a/cookiecutter/{{cookiecutter.environment}}/README.md +++ b/cookiecutter/{{cookiecutter.environment}}/README.md @@ -2,4 +2,4 @@ {{ cookiecutter.description }} -See the main README.md in the repo root for an overview and general install instructions. Any environment-specific instructions should be added here. \ No newline at end of file +See the main README.md in the repository root for an overview and general install instructions. Any environment-specific instructions should be added here. 
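The "Check templated 'all' vars" assertion shown above requires `openhpc_cluster_name` and `cluster_domain_suffix` to be defined on every host. These are normally templated into the environment's `hosts` file by the deployment automation; purely as an illustrative sketch, they could equally be supplied via group vars - the path and values below are assumptions, not part of this patch:

```yaml
# environments/$ENV/inventory/group_vars/all/cluster.yml (hypothetical path and values)
openhpc_cluster_name: mycluster
cluster_domain_suffix: internal
```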
diff --git a/cookiecutter/{{cookiecutter.environment}}/inventory/group_vars/all/basic_users.yml b/cookiecutter/{{cookiecutter.environment}}/inventory/group_vars/all/basic_users.yml index dc993c3..4b4287c 100644 --- a/cookiecutter/{{cookiecutter.environment}}/inventory/group_vars/all/basic_users.yml +++ b/cookiecutter/{{cookiecutter.environment}}/inventory/group_vars/all/basic_users.yml @@ -1,3 +1,4 @@ +--- basic_users_users: - name: demo_user password: "{% raw %}{{ vault_demo_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}{% endraw %}" # idempotent diff --git a/cookiecutter/{{cookiecutter.environment}}/inventory/group_vars/all/hpctests.yml b/cookiecutter/{{cookiecutter.environment}}/inventory/group_vars/all/hpctests.yml index e8cfcea..4724621 100644 --- a/cookiecutter/{{cookiecutter.environment}}/inventory/group_vars/all/hpctests.yml +++ b/cookiecutter/{{cookiecutter.environment}}/inventory/group_vars/all/hpctests.yml @@ -1 +1,2 @@ +--- hpctests_user: demo_user diff --git a/cookiecutter/{{cookiecutter.environment}}/tofu/main.tf b/cookiecutter/{{cookiecutter.environment}}/tofu/main.tf index 9aa4475..abbcf94 100644 --- a/cookiecutter/{{cookiecutter.environment}}/tofu/main.tf +++ b/cookiecutter/{{cookiecutter.environment}}/tofu/main.tf @@ -1,21 +1,23 @@ +# tflint-ignore: terraform_required_version + variable "environment_root" { - type = string - description = "Path to environment root, automatically set by activate script" + type = string + description = "Path to environment root, automatically set by activate script" } module "cluster" { - source = "../../site/tofu/" - environment_root = var.environment_root + source = "../../site/tofu/" + environment_root = var.environment_root - # Environment specific variables - # Note that some of the variables below may need to be moved to the site environment - # defaults e.g cluster_networks should be in site if your staging and prod - # environments use the same networks - cluster_name = - cluster_image_id = - control_node_flavor = - cluster_networks = - key_pair = - login = - compute = + # Environment specific variables + # Note that some of the variables below may need to be moved to the site environment + # defaults e.g cluster_networks should be in site if your staging and prod + # environments use the same networks + cluster_name = null + cluster_image_id = null + control_node_flavor = null + cluster_networks = null + key_pair = null + login = null + compute = null } diff --git a/dev/ansible-ssh b/dev/ansible-ssh index 1e7bf75..b2e13ff 100755 --- a/dev/ansible-ssh +++ b/dev/ansible-ssh @@ -1,23 +1,28 @@ #!/usr/bin/env python3 # This tool allows you to ssh into a host using the ansible inventory. 
-# Example: ansible-ssh compute[0] -o GlobalKnownHostsFile=/dev/null -o UserKnownHostsFile=/dev/null +# Example: ansible-ssh compute[0] -o GlobalKnownHostsFile=/dev/null -o +# UserKnownHostsFile=/dev/null -import sys -import subprocess -import shlex import json import os +import shlex +import subprocess +import sys from collections import defaultdict + def _optional_arg(prototype, *values): # returns empty string if any of the values are falsey filtered = [value for value in values if value] return prototype.format(*values) if len(values) == len(filtered) else "" + if __name__ == "__main__": if len(sys.argv) < 2: - msg = (f"Usage: {sys.argv[0]} [args to pass to ssh]") + msg = ( + f"Usage: { + sys.argv[0]} [args to pass to ssh]") print(msg, file=sys.stderr) sys.exit(-1) @@ -25,7 +30,8 @@ if __name__ == "__main__": host = shlex.quote(sys.argv[1]) try: - output = subprocess.check_output(f'ansible-inventory --host { host }', shell=True) + output = subprocess.check_output( + f'ansible-inventory --host {host}', shell=True) except (subprocess.CalledProcessError) as e: msg = (f"[ERROR]: Is {host} missing from the inventory?") print(msg, file=sys.stderr) @@ -56,7 +62,5 @@ if __name__ == "__main__": base = shlex.split(f'ssh {port} {identity} {opts}') extras = sys.argv[2:] cmd = base + extras + [host] - print(f"[INFO]: Running: { subprocess.list2cmdline(cmd) }") - os.execvp(cmd[0],cmd) - - + print(f"[INFO]: Running: {subprocess.list2cmdline(cmd)}") + os.execvp(cmd[0], cmd) diff --git a/dev/delete-cluster.py b/dev/delete-cluster.py index 05f53fb..f329e74 100755 --- a/dev/delete-cluster.py +++ b/dev/delete-cluster.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +# pylint: disable=invalid-name """ Delete infrastructure for a cluster without using Terraform. Useful for CI clusters. @@ -10,41 +11,53 @@ If --force is provided, it will delete all resources without confirmation. 
""" -import sys, json, subprocess +import json +import subprocess +import sys +CLUSTER_RESOURCES = ["server", "port", "volume"] -CLUSTER_RESOURCES = ['server', 'port', 'volume'] +# pylint: disable-next=missing-function-docstring, redefined-outer-name def delete_cluster(cluster_prefix, force=False): + to_delete = {} for resource_type in CLUSTER_RESOURCES: to_delete[resource_type] = [] - resource_list = subprocess.run(f'openstack {resource_type} list --format json', stdout=subprocess.PIPE, shell=True) + resource_list = subprocess.run( # pylint: disable=subprocess-run-check + f"openstack {resource_type} list --format json", + stdout=subprocess.PIPE, + shell=True, + ) resources = json.loads(resource_list.stdout) for item in resources: try: - if item['Name'] is not None and item['Name'].startswith(cluster_prefix): - print(resource_type, item['Name'], item['ID']) + if item["Name"] is not None and item["Name"].startswith(cluster_prefix): + print(resource_type, item["Name"], item["ID"]) to_delete[resource_type].append(item) - except: + except BaseException: print(resource_type, item) raise - - if force or input('Delete these (y/n)?:') == 'y': + + if force or input("Delete these (y/n)?:") == "y": for resource_type in CLUSTER_RESOURCES: - items = [v['ID'] for v in to_delete[resource_type]] + items = [v["ID"] for v in to_delete[resource_type]] if items: # delete all resources of each type in a single call for speed: - subprocess.run(f"openstack {resource_type} delete {' '.join(items)}", stdout=subprocess.PIPE, shell=True) - print(f'Deleted {len(items)} {resource_type}s') + subprocess.run( # pylint: disable=subprocess-run-check + f"openstack {resource_type} delete {' '.join(items)}", + stdout=subprocess.PIPE, + shell=True, + ) + print(f"Deleted {len(items)} {resource_type}s") else: - print('Cancelled - no resources deleted') + print("Cancelled - no resources deleted") -if __name__ == '__main__': + +if __name__ == "__main__": if len(sys.argv) < 2 or len(sys.argv) > 3: - print('ERROR: Incorrect argument(s).\n' + __doc__) - exit(1) - force_flag = '--force' in sys.argv + print("ERROR: Incorrect argument(s).\n" + __doc__) + exit(1) # pylint: disable=consider-using-sys-exit + force_flag = "--force" in sys.argv cluster_prefix = sys.argv[1] delete_cluster(cluster_prefix, force_flag) - diff --git a/dev/extract_logs.py b/dev/extract_logs.py index 65df014..3aecd35 100644 --- a/dev/extract_logs.py +++ b/dev/extract_logs.py @@ -1,7 +1,7 @@ #!/usr/bin/env python """ -Process packer build workflow logs into CSV. Useful for timing +Process packer build workflow logs into CSV. Useful for timing dissemination. 
Usage: @@ -13,70 +13,94 @@ import csv import re -import os import sys -def convert_time_to_seconds(time_str): - h, m, s = time_str.split(':') + +def convert_time_to_seconds(time_str): # pylint: disable=missing-function-docstring + h, m, s = time_str.split(":") return int(h) * 3600 + int(m) * 60 + float(s) -def extract_log_info_and_generate_csv(log_file_path, output_csv_path, target_directory): + +# pylint: disable-next=missing-function-docstring, too-many-locals +def extract_log_info_and_generate_csv( + # pylint: disable=redefined-outer-name + log_file_path, + output_csv_path, + target_directory, + # pylint: enable=redefined-outer-name +): data = [] - unwanted_chars = re.compile(r'(\x1B\[[0-9;]*m)|([^\x00-\x7F])') + unwanted_chars = re.compile(r"(\x1B\[[0-9;]*m)|([^\x00-\x7F])") - with open(log_file_path, 'r') as file: + with open(log_file_path, "r") as file: # pylint: disable=unspecified-encoding lines = file.readlines() previous_task = None - for i in range(len(lines)): + for i in range(len(lines)): # pylint: disable=consider-using-enumerate if "TASK [" in lines[i]: - task_name = lines[i].strip().split('TASK [')[1].split(']')[0] + task_name = lines[i].strip().split("TASK [")[1].split("]")[0] - full_task_path = lines[i + 1].strip().split('task path: ')[1] + full_task_path = lines[i + 1].strip().split("task path: ")[1] if target_directory in full_task_path: - start_index = full_task_path.find(target_directory) + len(target_directory) + start_index = full_task_path.find(target_directory) + len( + target_directory + ) partial_task_path = full_task_path[start_index:] else: partial_task_path = full_task_path - partial_task_path = unwanted_chars.sub('', partial_task_path).strip() + partial_task_path = unwanted_chars.sub("", partial_task_path).strip() - time_to_complete = lines[i + 2].strip().split('(')[1].split(')')[0] + time_to_complete = lines[i + 2].strip().split("(")[1].split(")")[0] if previous_task: - previous_task[2] = time_to_complete # Shift the time to the previous task + # pylint: disable-next=unsupported-assignment-operation + previous_task[2] = ( + time_to_complete # Shift the time to the previous task + ) data.append(previous_task) - previous_task = [task_name, partial_task_path, None] # Placeholder for the next time_to_complete + previous_task = [ + task_name, + partial_task_path, + None, + ] # Placeholder for the next time_to_complete if previous_task: - previous_task[2] = time_to_complete if time_to_complete else 'N/A' + previous_task[2] = time_to_complete if time_to_complete else "N/A" data.append(previous_task) for row in data: - if row[2] != 'N/A': + if row[2] != "N/A": row[2] = convert_time_to_seconds(row[2]) data.sort(key=lambda x: x[2], reverse=True) for row in data: if isinstance(row[2], float): - row[2] = f'{int(row[2] // 3600):02}:{int((row[2] % 3600) // 60):02}:{row[2] % 60:.3f}' + row[2] = ( + f"{int(row[2] // 3600):02}:{int((row[2] % 3600) // 60):02}:{row[2] % 60:.3f}" + ) - with open(output_csv_path, 'w', newline='') as csvfile: + # pylint: disable-next=unspecified-encoding + with open(output_csv_path, "w", newline="") as csvfile: csvwriter = csv.writer(csvfile) - csvwriter.writerow(['Task Name', 'Task Path', 'Time to Complete']) + csvwriter.writerow(["Task Name", "Task Path", "Time to Complete"]) csvwriter.writerows(data) print(f"Data extracted, sorted, and saved to {output_csv_path}") - + + if len(sys.argv) != 2: - print("Path to workflow log plain text file should be provided as the only arg to this script") + print( + "Path to workflow log plain text file 
should be provided as the only arg to this script" + ) sys.exit(1) -log_file_path = sys.argv[1] # Input workflow log name -output_csv_path = log_file_path.replace('.txt', '.csv') # Output CSV name -target_directory = '/ansible/' # Shared directory for task path +log_file_path = sys.argv[1] # Input workflow log name +output_csv_path = log_file_path.replace(".txt", ".csv") # Output CSV name +# pylint: disable-next=invalid-name +target_directory = "/ansible/" # Shared directory for task path extract_log_info_and_generate_csv(log_file_path, output_csv_path, target_directory) diff --git a/dev/image-share.sh b/dev/image-share.sh index 93a57ca..f109f16 100755 --- a/dev/image-share.sh +++ b/dev/image-share.sh @@ -13,18 +13,18 @@ DEST=$2 IMAGE_NAME=$3 export OS_CLOUD=$SOURCE -SOURCE_PROJECT=$(openstack project show -c id -f value $SOURCE) +SOURCE_PROJECT=$(openstack project show -c id -f value "$SOURCE") export OS_CLOUD=$DEST -DEST_PROJECT=$(openstack project show -c id -f value $DEST) +DEST_PROJECT=$(openstack project show -c id -f value "$DEST") export OS_CLOUD=$SOURCE -IMAGE=$(openstack image show -c id -f value $IMAGE_NAME) +IMAGE=$(openstack image show -c id -f value "$IMAGE_NAME") echo "Sharing $IMAGE_NAME ($IMAGE) from $SOURCE ($SOURCE_PROJECT) ..." -openstack image set --shared $IMAGE +openstack image set --shared "$IMAGE" echo "Adding destination project $DEST ($DEST_PROJECT) ..." -openstack image add project $IMAGE $DEST_PROJECT +openstack image add project "$IMAGE" "$DEST_PROJECT" export OS_CLOUD=$DEST echo "Accepting share ..." -openstack image set --accept $IMAGE +openstack image set --accept "$IMAGE" echo "Done" diff --git a/dev/output_manifest.py b/dev/output_manifest.py index b68ed49..04c9ffe 100755 --- a/dev/output_manifest.py +++ b/dev/output_manifest.py @@ -1,4 +1,6 @@ #!/usr/bin/env python +# pylint: disable=missing-module-docstring +# pylint: disable=line-too-long # Set github workflow output parameters defining image IDs from a packer manifest. # Usage: # ./packer/read_manifest.py packer/packer-manifest.json @@ -10,14 +12,23 @@ # which can be used in subsequent workflow steps: [1] # # [1]: https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#example-setting-a-value +# pylint: enable=line-too-long + +import json +import sys -import sys, json output = {} -with open(sys.argv[1]) as f: +with open(sys.argv[1]) as f: # pylint: disable=unspecified-encoding data = json.load(f) -for build in data['builds']: - node_type = build['custom_data']['source'] - image_id = build['artifact_id'] - output[node_type] = image_id # NB: this deliberately gets the LAST build for a node type +for build in data["builds"]: + node_type = build["custom_data"]["source"] + image_id = build["artifact_id"] + output[node_type] = ( + image_id # NB: this deliberately gets the LAST build for a node type + ) for node_type, image_id in output.items(): - print('::set-output name=NEW_%s_IMAGE_ID::%s' % (node_type.upper(), image_id)) + print( + # pylint: disable-next=consider-using-f-string + "::set-output name=NEW_%s_IMAGE_ID::%s" + % (node_type.upper(), image_id) + ) diff --git a/dev/setup-env.sh b/dev/setup-env.sh index c37978a..d0c14bb 100755 --- a/dev/setup-env.sh +++ b/dev/setup-env.sh @@ -5,33 +5,35 @@ set -euo pipefail PYTHON_VERSION=${PYTHON_VERSION:-} if [[ "$PYTHON_VERSION" == "" ]]; then - if [[ -f /etc/os-release ]]; then - . /etc/os-release - OS=$ID - OS_VERSION=$VERSION_ID - else - exit 1 - fi + if [[ -f /etc/os-release ]]; then + # shellcheck disable=SC1091 + . 
/etc/os-release + OS=$ID + OS_VERSION=$VERSION_ID + else + exit 1 + fi - MAJOR_VERSION=$(echo $OS_VERSION | cut -d. -f1) + MAJOR_VERSION=$(echo "$OS_VERSION" | cut -d. -f1) - if [[ "$OS" == "ubuntu" && "$MAJOR_VERSION" == "22" ]]; then - PYTHON_VERSION="/usr/bin/python3.10" - elif [[ "$OS" == "rocky" && "$MAJOR_VERSION" == "8" ]]; then - # python3.9+ doesn't have selinux bindings - PYTHON_VERSION="/usr/bin/python3.8" # use `sudo yum install python38` on Rocky Linux 8 to install this - elif [[ "$OS" == "rocky" && "$MAJOR_VERSION" == "9" ]]; then - PYTHON_VERSION="/usr/bin/python3.9" - else - echo "Unsupported OS version: $OS $MAJOR_VERSION" - exit 1 - fi + if [[ "$OS" == "ubuntu" && "$MAJOR_VERSION" == "22" ]]; then + PYTHON_VERSION="/usr/bin/python3.10" + elif [[ "$OS" == "rocky" && "$MAJOR_VERSION" == "8" ]]; then + # python3.9+ doesn't have selinux bindings + PYTHON_VERSION="/usr/bin/python3.8" # use `sudo yum install python38` on Rocky Linux 8 to install this + elif [[ "$OS" == "rocky" && "$MAJOR_VERSION" == "9" ]]; then + PYTHON_VERSION="/usr/bin/python3.9" + else + echo "Unsupported OS version: $OS $MAJOR_VERSION" + exit 1 + fi fi if [[ ! -d "venv" ]]; then - $PYTHON_VERSION -m venv venv + $PYTHON_VERSION -m venv venv fi +# shellcheck disable=SC1091 . venv/bin/activate pip install -U pip pip install -r requirements.txt diff --git a/docs/README.md b/docs/README.md index dfa9144..c66868a 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,6 +1,6 @@ # StackHPC Slurm Appliance Documentation -### Operator docs +## Operator docs [Image build](image-build.md) @@ -16,7 +16,7 @@ [Sequence diagrams](sequence.md) -### Configuration docs +## Configuration docs [Alerting](alerting.md) @@ -32,7 +32,7 @@ [Persistent state](persistent-state.md) -#### Experimental fetaures +### Experimental fetaures [Compute init](experimental/compute-init.md) @@ -40,6 +40,6 @@ [Slurm controlled rebuild](experimental/slurm-controlled-rebuild.md) -### Contributor docs +## Contributor docs [Adding functionality](adding-functionality.md) diff --git a/docs/adding-functionality.md b/docs/adding-functionality.md index 05bcbb5..da0b879 100644 --- a/docs/adding-functionality.md +++ b/docs/adding-functionality.md @@ -1,9 +1,10 @@ # Adding new functionality Please contact us for specific advice, but this generally involves: + - Adding a role. - Adding a play calling that role into an existing playbook in `ansible/`, or adding a new playbook there and updating `site.yml`. - Adding a new (empty) group named after the role into `environments/common/inventory/groups` and a non-empty example group into `environments/site/inventory/groups`. - Adding new default group vars into `environments/common/inventory/group_vars/all//`. - Updating the default Packer build variables in `environments/common/inventory/group_vars/builder/defaults.yml`. -- Updating READMEs. +- Updating readmes. diff --git a/docs/alerting.md b/docs/alerting.md index e030d23..38bfb05 100644 --- a/docs/alerting.md +++ b/docs/alerting.md @@ -4,10 +4,10 @@ The [prometheus.io docs](https://prometheus.io/docs/alerting/latest/overview/) describe the overall alerting process: > Alerting with Prometheus is separated into two parts. Alerting rules in - Prometheus servers send alerts to an Alertmanager. The Alertmanager then - manages those alerts, including silencing, inhibition, aggregation and - sending out notifications via methods such as email, on-call notification - systems, and chat platforms. +> Prometheus servers send alerts to an Alertmanager. 
The Alertmanager then +> manages those alerts, including silencing, inhibition, aggregation and +> sending out notifications via methods such as email, on-call notification +> systems, and chat platforms. The general Prometheus configuration is described in [monitoring-and-logging.md](./monitoring-and-logging.md#defaults-3) - note that @@ -21,37 +21,35 @@ must be configured to generate notifications. ## Enabling alertmanager 1. Ensure both the `prometheus` and `alertmanager` servers are deployed on the -control node - these are deployed by default in the site environment's groups: - - ```ini - # environments/site/groups: - [prometheus:children] - control - - [alertmanager:children] - control - ``` + control node - these are deployed by default in the site environment's groups: + +```ini +# environments/site/groups: +[prometheus:children] +control +[alertmanager:children] +control +``` -2. If the appliance was deployed before the alertmanager functionality was included, -generate a password for the alertmanager UI user: +2. If the appliance was deployed before the alertmanager functionality was included, generate a password for the alertmanager UI user: - ```shell - ansible-playbook ansible/adhoc/generate-passwords.yml - ``` +```shell +ansible-playbook ansible/adhoc/generate-passwords.yml +``` 3. Configure a receiver to generate notifications from alerts. Currently a Slack -integration is provided (see below) but alternative receivers could be defined -via overriding role defaults. - + integration is provided (see below) but alternative receivers could be defined + via overriding role defaults. + 4. If desired, any other [role defaults](../ansible/roles/alertmanager/README.md) -may be overriden in e.g. `environments/site/inventory/group_vars/all/alertmanager.yml`. + may be overriden in e.g. `environments/site/inventory/group_vars/all/alertmanager.yml`. 5. Run the `monitoring.yml` playbook (if the cluster is already up) to configure -both alertmanager and prometheus: + both alertmanager and prometheus: - ```shell - ansible-playbook ansible/monitoring.yml - ``` +```shell +ansible-playbook ansible/monitoring.yml +``` ## Access @@ -75,7 +73,7 @@ of alerts via Slack. 1. Create an app with a bot token: -- Go to https://api.slack.com/apps +- Go to - select "Create an App" - select "From scratch" - Set app name and workspace fields, select "Create" @@ -92,16 +90,20 @@ of alerts via Slack. - Uncomment `vault_alertmanager_slack_integration_app_creds` and add the token - Vault-encrypt that file: - ansible-vault encrypt environments/$ENV/inventory/group_vars/all/vault_alertmanager.yml +```shell +ansible-vault encrypt environments/$ENV/inventory/group_vars/all/vault_alertmanager.yml +``` - Open `environments/$ENV/inventory/group_vars/all/alertmanager.yml` - Uncomment the `alertmanager_slack_integration` mapping and set your alert channel name 3. Invite the bot to your alerts channel -- In the appropriate Slack channel type: - /invite @YOUR_BOT_NAME +- In the appropriate Slack channel type: +```text +/invite @YOUR_BOT_NAME +``` ## Alerting Rules @@ -111,15 +113,16 @@ which is defined for the appliance at Two [cloudalchemy.prometheus](https://github.com/cloudalchemy/ansible-prometheus) role variables are relevant: + - `prometheus_alert_rules_files`: Paths to check for files providing rules. Note these are copied to Prometheus config directly, so jinja expressions for Prometheus do not need escaping. - `prometheus_alert_rules`: Yaml-format rules. 
Jinja templating here will be -interpolated by Ansible, so templating intended for Prometheus must be escaped -using `{% raw %}`/`{% endraw %}` tags. + interpolated by Ansible, so templating intended for Prometheus must be escaped + using `{% raw %}`/`{% endraw %}` tags. By default, `prometheus_alert_rules_files` is set so that any `*.rules` files -in a directory `files/prometheus/rules` in the current environment or *any* +in a directory `files/prometheus/rules` in the current environment or _any_ parent environment are loaded. So usually, site-specific alerts should be added by creating additional rules files in `environments/site/files/prometheus/rules`. If the same file exists in more than one environment, the "child" file will take @@ -127,6 +130,7 @@ precedence and any rules in the "parent" file will be ignored. A set of default alert rule files is provided at `environments/common/files/prometheus/rules/`. These cover: + - Some node-exporter metrics for disk, filesystems, memory and clock. Note no alerts are triggered on memory for compute nodes due to the intended use of those nodes. @@ -136,6 +140,7 @@ These cover: When defining additional rules, note the [labels defined](./monitoring-and-logging.md#prometheus_node_exporter_targets) for node-exporter targets. In future more alerts may be added for: + - smartctl-exporter-based rules for baremetal nodes where there is no infrastructure-level smart monitoring - loss of "up" network interfaces diff --git a/docs/chrony.md b/docs/chrony.md index 0d6f8b1..a80cd40 100644 --- a/docs/chrony.md +++ b/docs/chrony.md @@ -4,7 +4,7 @@ Use variables from the [mrlesmithjr.chrony](https://github.com/mrlesmithjr/ansib For example in: `environments//inventory/group_vars/all/chrony`: -``` +```yaml --- chrony_ntp_servers: - server: ntp-0.example.org @@ -17,5 +17,4 @@ chrony_ntp_servers: - option: iburst - option: minpoll val: 8 - ``` diff --git a/docs/ci.md b/docs/ci.md index c6fa890..1352649 100644 --- a/docs/ci.md +++ b/docs/ci.md @@ -2,7 +2,6 @@ The `.github` directory contains a set of sample workflows which can be used by downstream site-specific configuration repositories to simplify ongoing maintainence tasks. These include: -- An [upgrade check](.github/workflows/upgrade-check.yml.sample) workflow which automatically checks this upstream stackhpc/ansible-slurm-appliance repo for new releases and proposes a pull request to the downstream site-specific repo when a new release is published. +- An [upgrade check](.github/workflows/upgrade-check.yml.sample) workflow which automatically checks this upstream stackhpc/ansible-slurm-appliance repository for new releases and proposes a pull request to the downstream site-specific repository when a new release is published. - An [image upload](.github/workflows/upload-s3-image.yml.sample) workflow which takes an image name, downloads it from StackHPC's public S3 bucket if available, and uploads it to the target OpenStack cloud. - diff --git a/docs/environments.md b/docs/environments.md index 183b775..ae23410 100644 --- a/docs/environments.md +++ b/docs/environments.md @@ -3,6 +3,7 @@ ## Overview An environment defines the configuration for a single instantiation of this Slurm appliance. Each environment is a directory in `environments/`, containing: + - Any deployment automation required - e.g. OpenTofu configuration or HEAT templates. - An Ansible `inventory/` directory. - An `activate` script which sets environment variables to point to this configuration. 
@@ -13,21 +14,23 @@ All environments load the inventory from the `common` environment first, with th ### Environment-specific inventory structure The ansible inventory for the environment is in `environments//inventory/`. It should generally contain: -- A `hosts` file. This defines the hosts in the appliance. Generally it should be templated out by the deployment automation so it is also a convenient place to define variables which depend on the deployed hosts such as connection variables, IP addresses, ssh proxy arguments etc. + +- A `hosts` file. This defines the hosts in the appliance. Generally it should be templated out by the deployment automation so it is also a convenient place to define variables which depend on the deployed hosts such as connection variables, IP addresses, SSH proxy arguments etc. - A `groups` file defining ansible groups, which essentially controls which features of the appliance are enabled and where they are deployed. This repository generally follows a convention where functionality is defined using ansible roles applied to a group -of the same name, e.g. `openhpc` or `grafana`. The meaning and use of each group is described in comments in `environments/common/inventory/groups`. As the groups defined there for the common environment are empty, functionality is disabled by default and must be -enabled in a specific environment's `groups` file. The `site` environment contains an ini file at `environments/site/inventory/groups` which enables groups for default appliance functionality across all environments. Additional groups should generally also be -enabled in this file to avoid divergence between staging and production environments. Note that enabling some groups may require a site-specific image build and Ark credentials (see [operations guide](operations.md)). + of the same name, e.g. `openhpc` or `grafana`. The meaning and use of each group is described in comments in `environments/common/inventory/groups`. As the groups defined there for the common environment are empty, functionality is disabled by default and must be + enabled in a specific environment's `groups` file. The `site` environment contains an ini file at `environments/site/inventory/groups` which enables groups for default appliance functionality across all environments. Additional groups should generally also be + enabled in this file to avoid divergence between staging and production environments. Note that enabling some groups may require a site-specific image build and Ark credentials (see [operations guide](operations.md)). - Optionally, group variable files in `group_vars//overrides.yml`, where the group names match the functional groups described above. These can be used to override the default configuration for each functionality, which are defined in `environments/common/inventory/group_vars/all/.yml` (the use of `all` here is due to ansible's precedence rules). Although most of the inventory uses the group convention described above there are a few special cases: + - The `control`, `login` and `compute` groups are special as they need to contain actual hosts rather than child groups, and so should generally be defined in the templated-out `hosts` file. - The cluster name must be set on all hosts using `openhpc_cluster_name`. Using an `[all:vars]` section in the `hosts` file is usually convenient. - `environments/common/inventory/group_vars/all/defaults.yml` contains some variables which are not associated with a specific role/feature. 
These are unlikely to need changing, but if necessary that could be done using a `environments//inventory/group_vars/all/overrides.yml` file. - The `ansible/adhoc/generate-passwords.yml` playbook sets secrets for all hosts in `environments//inventory/group_vars/all/secrets.yml`. - The Packer-based pipeline for building compute images creates a VM in groups `builder` and `compute`, allowing build-specific properties to be set in `environments/common/inventory/group_vars/builder/defaults.yml` or the equivalent inventory-specific path. - Each Slurm partition must have: - - An inventory group `_` defining the hosts it contains - these must be homogenous w.r.t CPU and memory. - - An entry in the `openhpc_slurm_partitions` mapping in `environments//inventory/group_vars/openhpc/overrides.yml`. + - An inventory group `_` defining the hosts it contains - these must be homogenous w.r.t CPU and memory. + - An entry in the `openhpc_slurm_partitions` mapping in `environments//inventory/group_vars/openhpc/overrides.yml`. See the [openhpc role documentation](https://github.com/stackhpc/ansible-role-openhpc#slurmconf) for more options. - On an OpenStack cloud, rebuilding/reimaging compute nodes from Slurm can be enabled by defining a `rebuild` group containing the relevant compute hosts (e.g. in the generated `hosts` file). diff --git a/docs/experimental/compute-init.md b/docs/experimental/compute-init.md index 8b5d5e3..dfad27b 100644 --- a/docs/experimental/compute-init.md +++ b/docs/experimental/compute-init.md @@ -2,7 +2,7 @@ See the role README.md -# Changes to image / tofu state +## Changes to image / tofu state When a compute group has the `ignore_image_changes` parameter set to true, changes to the `image_id` parameter (which defaults to `cluster_image_id`) are @@ -14,17 +14,21 @@ role templates out hostvars to the control node, which means the "target" image ID is then available on the control node. Subsequent work will use this to rebuild the node via slurm. -# CI workflow +## CI workflow The compute node rebuild is tested in CI after the tests for rebuilding the login and control nodes. The process follows 1. Compute nodes are reimaged: - ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml +```shell +ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml +``` 2. Ansible-init runs against newly reimaged compute nodes 3. Run sinfo and check nodes have expected slurm state - ansible-playbook -v ansible/ci/check_slurm.yml \ No newline at end of file +```shell +ansible-playbook -v ansible/ci/check_slurm.yml +``` diff --git a/docs/experimental/isolated-clusters.md b/docs/experimental/isolated-clusters.md index c136e99..5cf5a7b 100644 --- a/docs/experimental/isolated-clusters.md +++ b/docs/experimental/isolated-clusters.md @@ -11,68 +11,70 @@ all "default" features, i.e. roles/groups which are enabled either in the The full list of features and whether they are functional on such an "isolated" network is shown in the table below. Note that: -- Using [EESSI](https://www.eessi.io/docs/) necessarily requires outbound - network access for the CernVM File System. However this can be provided - via an authenticated proxy. While the proxy configuration on the cluster node - is readable by all users, this proxy could be limited via acls to only provide - access to EESSI's CVMFS Stratum 1 servers. +- Using [EESSI](https://www.eessi.io/docs/) necessarily requires outbound + network access for the CernVM File System. However this can be provided + via an authenticated proxy. 
While the proxy configuration on the cluster node + is readable by all users, this proxy could be limited via acls to only provide + access to EESSI's CVMFS Stratum 1 servers. ## Support by feature for isolated networks See above for definition of "Default" features. In the "Isolated?" column: + - "Y": Feature works without outbound internet access. - "N": Known not to work. - "?": Not investigated at present. -| Inventory group/role | Default? | Isolated? | -| ----------------------| -------- | --------- | -| alertmanager | Y | Y | -| ansible_init | Y | Y | -| basic_users | Y | Y | -| block_devices | Y | No (depreciated) | -| cacerts | - | Y | -| chrony | - | Y | -| compute_init | - | Y | -| cuda | - | ? | -| eessi | Y | Y - see above | -| etc_hosts | Y | Y | -| extra_packages | - | No | -| fail2ban | Y | Y | -| filebeat | Y | Y | -| firewalld | Y | Y | -| freeipa_client | - | Y - image build required | -| gateway | n/a | n/a - build only | -| grafana | Y | Y | -| hpctests | Y | Y | -| k3s_agent | - | ? | -| k3s_server | - | ? | -| k9s | - | ? | -| lustre | - | ? | -| manila | Y | Y | -| mysql | Y | Y | -| nfs | Y | Y | -| nhc | Y | Y | -| node_exporter | Y | Y | -| openhpc | Y | Y | -| openondemand | Y | Y | -| openondemand_desktop | Y | Y | -| openondemand_jupyter | Y | Y | -| opensearch | Y | Y | -| podman | Y | Y | -| persist_hostkeys | Y | Y | -| prometheus | Y | Y | -| proxy | - | Y | -| resolv_conf | - | ? | -| slurm_exporter | Y | Y | -| slurm_stats | Y | Y | -| squid | - | ? | -| sshd | - | ? | -| sssd | - | ? | -| systemd | Y | Y | -| tuned | - | Y | -| update | - | No | +| Inventory group/role | Default? | Isolated? | +| -------------------- | -------- | ------------------------ | +| alertmanager | Y | Y | +| ansible_init | Y | Y | +| basic_users | Y | Y | +| block_devices | Y | No (depreciated) | +| cacerts | - | Y | +| chrony | - | Y | +| compute_init | - | Y | +| cuda | - | ? | +| eessi | Y | Y - see above | +| etc_hosts | Y | Y | +| extra_packages | - | No | +| fail2ban | Y | Y | +| filebeat | Y | Y | +| firewalld | Y | Y | +| freeipa_client | - | Y - image build required | +| gateway | n/a | n/a - build only | +| grafana | Y | Y | +| hpctests | Y | Y | +| k3s_agent | - | ? | +| k3s_server | - | ? | +| k9s | - | ? | +| lustre | - | ? | +| manila | Y | Y | +| MySQL | Y | Y | +| nfs | Y | Y | +| nhc | Y | Y | +| node_exporter | Y | Y | +| openhpc | Y | Y | +| openondemand | Y | Y | +| openondemand_desktop | Y | Y | +| openondemand_jupyter | Y | Y | +| opensearch | Y | Y | +| podman | Y | Y | +| persist_hostkeys | Y | Y | +| prometheus | Y | Y | +| proxy | - | Y | +| resolv_conf | - | ? | +| slurm_exporter | Y | Y | +| slurm_stats | Y | Y | +| squid | - | ? | +| sshd | - | ? | +| sssd | - | ? | +| systemd | Y | Y | +| tuned | - | Y | +| update | - | No | ## Image build + A site image build may be required, either for features using packages not present in StackHPC images (e.g `freeipa_client`) or to [add additional packages](../operations.md#adding-additional-packages). Clearly in this case the build VM does require outbound internet access. For an @@ -82,7 +84,7 @@ proxy is available the image build can be configured to use that, e.g.: ```yaml # environments/$ENV/builder.pkrvars.hcl: -... 
+--- inventory_groups = 'proxy,freeipa_client' ``` @@ -96,7 +98,7 @@ proxy_http_address: squid.mysite.org ```yaml # environments/$ENV/group_vars/builder/vault_overrrides.yml: # NB: vault-encrypt this file -vault_proxy_basic_password: 'super-secret-password' +vault_proxy_basic_password: "super-secret-password" ``` See [ansible/roles/proxy/README.md](../../ansible/roles/proxy/README.md) and @@ -117,28 +119,32 @@ default security groups are less restrictive than these. Assuming nodes and the deploy host have a security group `isolated` applied then the following rules are required: - # allow outbound DNS - ALLOW IPv4 53/tcp to 0.0.0.0/0 - ALLOW IPv4 53/udp to 0.0.0.0/0 - - # allow everything within the cluster: - ALLOW IPv4 from isolated - ALLOW IPv4 to isolated - - # allow hosts to reach metadata server (e.g. for cloud-init keys): - ALLOW IPv4 80/tcp to 169.254.169.254/32 +```text +# allow outbound DNS +ALLOW IPv4 53/tcp to 0.0.0.0/0 +ALLOW IPv4 53/udp to 0.0.0.0/0 + +# allow everything within the cluster: +ALLOW IPv4 from isolated +ALLOW IPv4 to isolated - # optionally: allow hosts to reach squid proxy for EESSI: - ALLOW IPv4 3128/tcp to +# allow hosts to reach metadata server (e.g. for cloud-init keys): +ALLOW IPv4 80/tcp to 169.254.169.254/32 + +# optionally: allow hosts to reach squid proxy for EESSI: +ALLOW IPv4 3128/tcp to +``` Note that name resolution happens on the hosts, not on the proxy, hence DNS is required for nodes even with a proxy. -For nodes running OpenOndemand, inbound ssh and https are also required +For nodes running OpenOndemand, inbound SSH and https are also required (e.g. in a security group called `isolated-ssh-https`): - ALLOW IPv4 443/tcp from 0.0.0.0/0 - ALLOW IPv4 22/tcp from 0.0.0.0/0 +```text +ALLOW IPv4 443/tcp from 0.0.0.0/0 +ALLOW IPv4 22/tcp from 0.0.0.0/0 +``` If non-default security groups are required, then the OpenTofu variables `login_security_groups` and `nonlogin_security_groups` can be used to set diff --git a/docs/experimental/pulp.md b/docs/experimental/pulp.md index 582eec9..f0748c0 100644 --- a/docs/experimental/pulp.md +++ b/docs/experimental/pulp.md @@ -5,17 +5,18 @@ In order to ensure reproducible builds, the appliance can build images using rep ## Deploying/configuring Pulp Server ### Deploying a Pulp server + A playbook is provided to install and configure a Pulp server on a given host. Admin credentials for this server are automatically generated through the `ansible/adhoc/generate-passwords.yml` playbook. To use this, create an inventory file -defining a group `pulp_server` containing a single host, which requires at least 2 vCPUs and 4GB RAM. The group should be defined in your `site` environment's inventory so that a single Pulp server is shared between all environments and +defining a group `pulp_server` containing a single host, which requires at least 2 vCPUs and 4GB RAM. The group should be defined in your `site` environment's inventory so that a single Pulp server is shared between all environments and the same snapshots are tested in staging and production. -Deploying and syncing Pulp has been tested on an RL9 host. The hostvar `ansible_host` should be defined, giving the IP address Ansible should use for ssh. For example, you can create an ini file at `environments/site/inventory/pulp` with the contents: +Deploying and syncing Pulp has been tested on an RL9 host. The hostvar `ansible_host` should be defined, giving the IP address Ansible should use for SSH. 
For example, you can create an ini file at `environments/site/inventory/pulp` with the contents: -``` +```ini [pulp_server] pulp_host ansible_host= ``` -> [!WARNING] +> [!WARNING] > The inventory hostname cannot conflict with group names i.e can't be called `pulp_site` or `pulp_server`. Once complete, it will print a message giving a value to set for `appliances_pulp_url` (see example config below), assuming the `ansible_host` address is also the address the cluster @@ -24,6 +25,7 @@ should use to reach the Pulp server. Note access to this server's content isn't authenticated so this assumes the `pulp_server` host is not externally reachable. ### Using an existing Pulp server + An existing Pulp server can be used to host Ark repos by overriding `pulp_site_password` and `appliances_pulp_url` in the target environment. Note that this assumes the same configuration as the appliance deployed Pulp i.e no content authentication. ## Syncing Pulp content with Ark @@ -34,7 +36,7 @@ Content can also be synced by running `ansible/adhoc/sync-pulp.yml`. By default ## Example config in site variables -``` +```yaml # environments/site/inventory/group_vars/all/pulp_site.yml: appliances_pulp_url: "http://:8080" pulp_site_upstream_username: @@ -42,10 +44,11 @@ pulp_site_upstream_password: ``` ## Installing packages from Pulp at runtime + By default, system repos are overwritten to point at Pulp repos during [image builds,](../image-build.md) so using a site Pulp server will require a new fatimage. If you instead wish to install packages at runtime, you will need to add all host groups on which you will be installing packages to the `dnf_repos` group in `environments/site/inventory/groups` e.g: -``` +```yaml [dnf_repos:children] cluster ``` diff --git a/docs/experimental/slurm-controlled-rebuild.md b/docs/experimental/slurm-controlled-rebuild.md index 7f9efa2..fc654d3 100644 --- a/docs/experimental/slurm-controlled-rebuild.md +++ b/docs/experimental/slurm-controlled-rebuild.md @@ -9,6 +9,7 @@ This provides a way to upgrade nodes with less impact than the normal approach. > or usage may change with further development. In summary, the way this functionality works is as follows: + 1. The image references(s) are manually updated in the OpenTofu configuration in the normal way. 2. `tofu apply` is run which rebuilds the login and control nodes to the new @@ -20,7 +21,7 @@ In summary, the way this functionality works is as follows: and control nodes and the old image for the compute nodes. This playbook also: - Writes cluster configuration to the control node, using the - [compute_init](../../ansible/roles/compute_init/README.md) role. + [compute_init](../../ansible/roles/compute_init/README.md) role. - Configures an application credential and helper programs on the control node, using the [rebuild](../../ansible/roles/rebuild/README.md) role. 4. An admin submits Slurm jobs, one for each node, to a special "rebuild" @@ -34,7 +35,7 @@ In summary, the way this functionality works is as follows: configuration, and if it does not match, uses OpenStack to rebuild the node to the desired (updated) image. TODO: Describe the logic if they DO match -6. After a rebuild, the compute node runs various Ansible tasks during boot, +6. After a rebuild, the compute node runs various Ansible tasks during boot, controlled by the [compute_init](../../ansible/roles/compute_init/README.md) role, to fully configure the node again. It retrieves the required cluster configuration information from the control node via an NFS mount. 
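
As an illustration of step 5 above, the check performed by a rebuild job is conceptually similar to the sketch below. This is not the actual script installed by the `rebuild` role; the node name and desired image are hypothetical placeholders, and it assumes an OpenStack CLI that can authenticate with the application credential configured in step 3.

```bash
#!/usr/bin/env bash
# Conceptual sketch only - not the script shipped by the rebuild role.
# Assumes the OpenStack CLI can authenticate using the application credential
# configured on the control node. NODE and DESIRED_IMAGE are placeholders.
set -euo pipefail

NODE="$1"            # e.g. general-0
DESIRED_IMAGE="$2"   # image name or UUID from the OpenTofu configuration

# Compare the node's current image with the desired one...
current_image=$(openstack server show "$NODE" -f value -c image)

if [[ "$current_image" != *"$DESIRED_IMAGE"* ]]; then
    # ...and rebuild the node via OpenStack if they differ.
    openstack server rebuild --image "$DESIRED_IMAGE" --wait "$NODE"
fi
```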
@@ -47,7 +48,7 @@ In summary, the way this functionality works is as follows: To enable a compute node to rejoin the cluster after a rebuild, functionality must be built into the image. Before progressing you should check that all the functionality required for your cluster is currently supported by the -`compute_init` role. Review that role's [README](../../ansible/roles/compute_init/README.md) +`compute_init` role. Review that role's [Readme](../../ansible/roles/compute_init/README.md) against `environments/*/inventory/groups` files (and any similar files which define groups). Note that some functionality does not require support, e.g. because it does not run on compute nodes. @@ -55,9 +56,10 @@ because it does not run on compute nodes. ## Configuration The configuration of this is complex and involves: + - OpenTofu variables to stop tracking image changes on compute nodes - Definition of partition(s) to use for launching rebuild jobs -- Configuration of the [rebuild](../../ansible/roles/rebuild/README.md) role +- Configuration of the [rebuild](../../ansible/roles/rebuild/README.md) role to enable the Slurm controller to rebuild compute nodes via OpenStack. - Configuration of the [compute_init](../../ansible/roles/compute_init/README.md) role so that compute nodes rejoin the cluster after rebuilding - this is likely @@ -71,107 +73,110 @@ The configuration of this is complex and involves: relevant node group in the OpenTofu `compute` variable, set the parameter `ignore_image_changes: true`. E.g. - ```terraform - # environments/$ENV/main.tf: - ... - compute = { - general = { - nodes = ["general-0", "general-1"] - ignore_image_changes = true - ... - } - gpu = { - node = ["a100-0", "a100-1"] - ignore_image_changes = true - ... - } +```terraform +# environments/$ENV/main.tf: +... +compute = { + general = { + nodes = ["general-0", "general-1"] + ignore_image_changes = true + ... } - ... - ``` + gpu = { + node = ["a100-0", "a100-1"] + ignore_image_changes = true + ... + } +} +... +``` -3. Follow the [compute_init](../../ansible/roles/compute_init/README.md) README +3. Follow the [compute_init](../../ansible/roles/compute_init/README.md) readme to add OpenTofu and Ansible configuration for that role. The "rebootable" nodes should all be in the `compute_init` group with the `compute_init_enable` OpenTofu parameter set. -4. If the [compute_init](../../ansible/roles/compute_init/README.md) README +4. If the [compute_init](../../ansible/roles/compute_init/README.md) readme showed that a custom image is required for any entry in the `compute_init_enable` parameter, follow the usual process to build new images as required. 5. Update image references in the OpenTofu configuration. Normally these should be in: - - `environments/site/tofu/variables.tf`: `cluster_image_id` for the default - cluster image. - - `environments/$ENV/tofu/main.tf`: parameter `image_id` in node groups - defined in the `compute` or `login` variables, to override the default - image for specific node groups. -5. Ensure `openhpc_partitions` contains a partition covering the nodes to run + - `environments/site/tofu/variables.tf`: `cluster_image_id` for the default + cluster image. + - `environments/$ENV/tofu/main.tf`: parameter `image_id` in node groups + defined in the `compute` or `login` variables, to override the default + image for specific node groups. + +6. Ensure `openhpc_partitions` contains a partition covering the nodes to run rebuild jobs. 
The default definition in `environments/common/inventory/group_vars/all/openhpc.yml` will automatically include this via `openhpc_rebuild_partition` also in that file. If modifying this, note the important parameters are: - - - `name`: Partition name matching `rebuild` role variable `rebuild_partitions`, - default `rebuild`. - - `groups`: A list of nodegroup names, matching `openhpc_nodegroup` and - keys in the OpenTofu `compute` variable (see example in step 2 above). - Normally every compute node group should be listed here, unless - Slurm-controlled rebuild is not required for certain node groups. - - `default`: Must be set to `NO` so that it is not the default partition. - - `maxtime`: Maximum time to allow for rebuild jobs, in - [slurm.conf format](https://slurm.schedmd.com/slurm.conf.html#OPT_MaxTime). - The example here is 30 minutes, but see discussion below. - - `partition_params`: A mapping of additional parameters, which must be set - as follows: - - `PriorityJobFactor`: Ensures jobs in this partition (i.e. rebuild jobs) - are always scheduled before jobs in "normal" partitions on the same - nodes. This value is the highest which can be set. See - [slurm.conf docs](https://slurm.schedmd.com/slurm.conf.html#OPT_PriorityJobFactor). - Note this is used instead of `PriorityTier` as the latter (with the - default appliance configuration) allows rebuild jobs to preempt and - suspend running user jobs, which is probably undesirable. - - `Hidden`: Don't show this partition in e.g. `sinfo` for unpriviledged - users. - - `RootOnly`: Only allow the root user to submit jobs to this partition. - - `DisableRootJobs`: Don't disable the root user, in case this parameter - is set globally via `openhpc_config_extra`. - - `PreemptMode`: Don't allow reboot jobs to be preempted/suspended. - - `OverSubscribe`: Ensure that jobs run in this partition require the - entire node. This means they do not run on nodes as the same time as - user jobs running in partitions allowing non-exclusive use. - - The value for `maxtime` needs to be sufficent not just for a single node - to be rebuilt, but also to allow for any batching in either OpenTofu or - in Nova - see remarks in the [production docs](../production.md). - - If it is desirable to roll out changes more gradually, it is possible to - create multiple "rebuild" partitions, but it is necessary that: - - The rebuild partitions should not themselves overlap, else nodes may be - rebuilt more than once. - - Each rebuild partition should entirely cover one or more "normal" - partitions, to avoid the possibility of user jobs being scheduled to a - mix of nodes using old and new images. - -6. Configure the [rebuild](../../ansible/roles/rebuild/README.md) role: - - Add the `control` node into the `rebuild` group. - - Ensure an application credential to use for rebuilding nodes is available - on the deploy host (default location `~/.config/openstack/clouds.yaml`). - - If required, override `rebuild_clouds_path` or other variables in the site - environment. - -7. Run `tofu apply` as usual to apply the new OpenTofu configuration. - - > [!NOTE] - > If the cluster image references were updated at step 5, this will be - > a disruptive operation and should be planned as part of a normal upgrade - > cycle. - - > [!CAUTION] - > Due to OpenTofu/Terraform state limitations, this will plan to delete and - > recreate all compute nodes in node groups where `ignore_image_changes: true`. - > was not previously set. This is a one-time issue with adding this - > parameter, i.e. 
subsequent applys will not require this. + + - `name`: Partition name matching `rebuild` role variable `rebuild_partitions`, + default `rebuild`. + - `groups`: A list of nodegroup names, matching `openhpc_nodegroup` and + keys in the OpenTofu `compute` variable (see example in step 2 above). + Normally every compute node group should be listed here, unless + Slurm-controlled rebuild is not required for certain node groups. + - `default`: Must be set to `NO` so that it is not the default partition. + - `maxtime`: Maximum time to allow for rebuild jobs, in + [slurm.conf format](https://slurm.schedmd.com/slurm.conf.html#OPT_MaxTime). + The example here is 30 minutes, but see discussion below. + - `partition_params`: A mapping of additional parameters, which must be set + as follows: + - `PriorityJobFactor`: Ensures jobs in this partition (i.e. rebuild jobs) + are always scheduled before jobs in "normal" partitions on the same + nodes. This value is the highest which can be set. See + [slurm.conf docs](https://slurm.schedmd.com/slurm.conf.html#OPT_PriorityJobFactor). + Note this is used instead of `PriorityTier` as the latter (with the + default appliance configuration) allows rebuild jobs to preempt and + suspend running user jobs, which is probably undesirable. + - `Hidden`: Don't show this partition in e.g. `sinfo` for unpriviledged + users. + - `RootOnly`: Only allow the root user to submit jobs to this partition. + - `DisableRootJobs`: Don't disable the root user, in case this parameter + is set globally via `openhpc_config_extra`. + - `PreemptMode`: Don't allow reboot jobs to be preempted/suspended. + - `OverSubscribe`: Ensure that jobs run in this partition require the + entire node. This means they do not run on nodes as the same time as + user jobs running in partitions allowing non-exclusive use. + + The value for `maxtime` needs to be sufficent not just for a single node + to be rebuilt, but also to allow for any batching in either OpenTofu or + in Nova - see remarks in the [production docs](../production.md). + + If it is desirable to roll out changes more gradually, it is possible to + create multiple "rebuild" partitions, but it is necessary that: + + - The rebuild partitions should not themselves overlap, else nodes may be + rebuilt more than once. + - Each rebuild partition should entirely cover one or more "normal" + partitions, to avoid the possibility of user jobs being scheduled to a + mix of nodes using old and new images. + +7. Configure the [rebuild](../../ansible/roles/rebuild/README.md) role: + + - Add the `control` node into the `rebuild` group. + - Ensure an application credential to use for rebuilding nodes is available + on the deploy host (default location `~/.config/openstack/clouds.yaml`). + - If required, override `rebuild_clouds_path` or other variables in the site + environment. + +8. Run `tofu apply` as usual to apply the new OpenTofu configuration. + + > [!NOTE] + > If the cluster image references were updated at step 5, this will be + > a disruptive operation and should be planned as part of a normal upgrade + > cycle. + > + > [!CAUTION] + > Due to OpenTofu/Terraform state limitations, this will plan to delete and + > recreate all compute nodes in node groups where `ignore_image_changes: true`. + > was not previously set. This is a one-time issue with adding this + > parameter, i.e. subsequent applys will not require this. TODO: clarify whether, if the image is bumped at this point, the compute nodes actually get recreated on the new or the old image?? 
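
Before running step 8 on a production cluster, the proposed changes can be reviewed first. A minimal sketch, assuming the OpenTofu configuration lives in `environments/$ENV/tofu` as above and any site-specific variable files are already wired in:

```bash
cd environments/$ENV/tofu
# Write the plan to a file and check which resources would be destroyed,
# replaced or created before applying anything.
tofu plan -out=rebuild.tfplan
tofu show rebuild.tfplan | grep -E '(will be|must be) (destroyed|replaced|created)'
# Apply the saved plan only once it matches expectations.
tofu apply rebuild.tfplan
```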
@@ -193,7 +198,9 @@ However there is no need to drain compute nodes and create reservations etc. Triggering rebuild jobs is done using the following playbook: - ansible-playbook ansible/adhoc/rebuild-via-slurm.yml +```shell +ansible-playbook ansible/adhoc/rebuild-via-slurm.yml +``` This will create jobs to reimage every slurm-rebuildable node to the image currently defined in the OpenTofu configuration. @@ -204,17 +211,22 @@ example the following comand will run in a non-default partition and does not actually reboot/rebuild nodes, which may be useful for testing interactions with other priority or QOS settings: - ansible-playbook ansible/adhoc/rebuild-via-slurm.yml -e 'rebuild_job_partitions=test rebuild_job_reboot=false' +```shell +ansible-playbook ansible/adhoc/rebuild-via-slurm.yml -e 'rebuild_job_partitions=test rebuild_job_reboot=false' +``` ## Testing The below demonstrates testing this using the `.stackhpc` CI environment, using: + - A 2-node default "standard" partition. - A 2-node "extra" partition (note this does not usually have any nodes by default). In one terminal launch a watch of job state: - [root@RL9-control rocky]# clear && ~/ewatch/ewatch.py -n 1 -i '\d+:\d+' 'squeue --all --Format=PARTITION,NAME:25,USERNAME:11,STATE:12,NUMNODES:8,NODELIST' +```shell +[root@RL9-control rocky]# clear && ~/ewatch/ewatch.py -n 1 -i '\d+:\d+' 'squeue --all --Format=PARTITION,NAME:25,USERNAME:11,STATE:12,NUMNODES:8,NODELIST' +``` This uses [ewatch](https://github.com/sjpb/ewatch) to summarise changes in output. @@ -222,17 +234,24 @@ output. In a second terminal, launch 2x normal jobs into the default ("standard") partition: - [demo_user@RL9-login-0 ~]$ sbatch -N2 --job-name=JobA --wrap "sleep 20" && sbatch -N2 --job-name=JobB --wrap "sleep 10" +```shell +[demo_user@RL9-login-0 ~]$ sbatch -N2 --job-name=JobA --wrap "sleep 20" && sbatch -N2 --job-name=JobB --wrap "sleep 10" +``` In a third terminal, trigger rebuild jobs: - .stackhpc/ (venv) [rocky@steveb-dev slurm-app-rl9]$ ansible-playbook ansible/adhoc/rebuild-via-slurm.yml -e 'rebuild_job_reboot=false rebuild_job_command="sleep 30"' - +```shell +.stackhpc/ (venv) [rocky@steveb-dev slurm-app-rl9]$ ansible-playbook ansible/adhoc/rebuild-via-slurm.yml -e 'rebuild_job_reboot=false rebuild_job_command="sleep 30"' - +``` Back in the second terminal, submit more user jobs to either partition: - [demo_user@RL9-login-0 ~]$ sbatch -N2 --job-name=JobC --partition,standard,extra --wrap "sleep 10" +```shell +[demo_user@RL9-login-0 ~]$ sbatch -N2 --job-name=JobC --partition,standard,extra --wrap "sleep 10" +``` The output from the first terminal should show: + - Job A runs on submission in the default "standard" partition. - Job B pends for the default "standard" partition. 
- Rebuild jobs runs on submission in the "extra" partition and pend for the "standard" partition @@ -246,48 +265,49 @@ The output from the first terminal should show: - Job B runs in the "standard" partition Example output: -``` + +```text [2025-03-28T14:26:34.510466] -PARTITION NAME USER STATE NODES NODELIST -standard JobB demo_user PENDING 2 -standard JobA demo_user RUNNING 2 RL9-compute-[0-1] +PARTITION NAME USER STATE NODES NODELIST +standard JobB demo_user PENDING 2 +standard JobA demo_user RUNNING 2 RL9-compute-[0-1] [2025-03-28T14:26:38.530213] -PARTITION NAME USER STATE NODES NODELIST -rebuild rebuild-RL9-compute-1 root PENDING 1 -rebuild rebuild-RL9-compute-0 root PENDING 1 -rebuild rebuild-RL9-extra-0 root RUNNING 1 RL9-extra-0 -rebuild rebuild-RL9-extra-1 root RUNNING 1 RL9-extra-1 -standard JobB demo_user PENDING 2 -standard JobA demo_user RUNNING 2 RL9-compute-[0-1] -standard,extra JobC demo_user PENDING 2 +PARTITION NAME USER STATE NODES NODELIST +rebuild rebuild-RL9-compute-1 root PENDING 1 +rebuild rebuild-RL9-compute-0 root PENDING 1 +rebuild rebuild-RL9-extra-0 root RUNNING 1 RL9-extra-0 +rebuild rebuild-RL9-extra-1 root RUNNING 1 RL9-extra-1 +standard JobB demo_user PENDING 2 +standard JobA demo_user RUNNING 2 RL9-compute-[0-1] +standard,extra JobC demo_user PENDING 2 [2025-03-28T14:26:54.609651] -PARTITION NAME USER STATE NODES NODELIST -rebuild rebuild-RL9-compute-0 root RUNNING 1 RL9-compute-0 -rebuild rebuild-RL9-compute-1 root RUNNING 1 RL9-compute-1 -rebuild rebuild-RL9-extra-0 root RUNNING 1 RL9-extra-0 -rebuild rebuild-RL9-extra-1 root RUNNING 1 RL9-extra-1 -standard JobB demo_user PENDING 2 -standard,extra JobC demo_user PENDING 2 +PARTITION NAME USER STATE NODES NODELIST +rebuild rebuild-RL9-compute-0 root RUNNING 1 RL9-compute-0 +rebuild rebuild-RL9-compute-1 root RUNNING 1 RL9-compute-1 +rebuild rebuild-RL9-extra-0 root RUNNING 1 RL9-extra-0 +rebuild rebuild-RL9-extra-1 root RUNNING 1 RL9-extra-1 +standard JobB demo_user PENDING 2 +standard,extra JobC demo_user PENDING 2 [2025-03-28T14:28:39.091571] -PARTITION NAME USER STATE NODES NODELIST -extra JobC demo_user RUNNING 2 RL9-extra-[0-1] -rebuild rebuild-RL9-compute-0 root RUNNING 1 RL9-compute-0 -rebuild rebuild-RL9-compute-1 root RUNNING 1 RL9-compute-1 -standard JobB demo_user PENDING 2 +PARTITION NAME USER STATE NODES NODELIST +extra JobC demo_user RUNNING 2 RL9-extra-[0-1] +rebuild rebuild-RL9-compute-0 root RUNNING 1 RL9-compute-0 +rebuild rebuild-RL9-compute-1 root RUNNING 1 RL9-compute-1 +standard JobB demo_user PENDING 2 [2025-03-28T14:28:49.139349] -PARTITION NAME USER STATE NODES NODELIST -rebuild rebuild-RL9-compute-0 root RUNNING 1 RL9-compute-0 -rebuild rebuild-RL9-compute-1 root RUNNING 1 RL9-compute-1 -standard JobB demo_user PENDING 2 +PARTITION NAME USER STATE NODES NODELIST +rebuild rebuild-RL9-compute-0 root RUNNING 1 RL9-compute-0 +rebuild rebuild-RL9-compute-1 root RUNNING 1 RL9-compute-1 +standard JobB demo_user PENDING 2 [2025-03-28T14:28:55.168264] -PARTITION NAME USER STATE NODES NODELIST -standard JobB demo_user RUNNING 2 RL9-compute-[0-1] +PARTITION NAME USER STATE NODES NODELIST +standard JobB demo_user RUNNING 2 RL9-compute-[0-1] [2025-03-28T14:29:05.216346] -PARTITION NAME USER STATE NODES NODELIST +PARTITION NAME USER STATE NODES NODELIST ``` diff --git a/docs/filesystems.md b/docs/filesystems.md index 5509aef..14669f9 100644 --- a/docs/filesystems.md +++ b/docs/filesystems.md @@ -8,7 +8,7 @@ The Slurm appliance supports multiple ways of configuring shared filesystems, in - 
Lustre -# Manila +=# Manila The Slurm appliance supports mounting shared filesystems using [CephFS](https://docs.ceph.com/en/latest/cephfs/) via [OpenStack Manila](https://docs.openstack.org/manila/latest/). This section explains: @@ -24,65 +24,69 @@ The Slurm appliance requires that the Manila shares already exist on the system. If this is the first time Manila is being used on the system, a CephFS share type will need to be created. You will need admin credentials to do this. - ```bash - openstack share type create cephfs-type false --extra-specs storage_protocol=CEPHFS vendor_name=Ceph - ``` +```bash +openstack share type create cephfs-type false --extra-specs storage_protocol=CEPHFS vendor_name=Ceph +``` -Once this exists, create a share using credentials for the Slurm project. An access rule also needs to be created, where the `access_to` argument (`openstack share access create `) is a user that will be created in Ceph. This needs to be globally unique in Ceph, so needs to be different for each OpenStack project. Ideally, this share should include your environment name. In this example, the name is "production". +Once this exists, create a share using credentials for the Slurm project. +An access rule also needs to be created, where the `access_to` argument +(`openstack share access create `) is a user that will be created in Ceph. +This needs to be globally unique in Ceph, so needs to be different for each OpenStack project. +Ideally, this share should include your environment name. In this example, the name is "production". - ```bash - openstack share create CephFS 300 --description 'Scratch dir for Slurm prod' --name slurm-production-scratch --share-type cephfs-type --wait - openstack share access create slurm-production-scratch cephx slurm-production - ``` +```bash +openstack share create CephFS 300 --description 'Scratch dir for Slurm prod' --name slurm-production-scratch --share-type cephfs-type --wait +openstack share access create slurm-production-scratch cephx slurm-production +``` ## Configuring the Slurm Appliance for Manila To mount shares onto hosts in a group, add them to the `manila` group. - ```ini - # environments/site/inventory/groups: - [manila:children]: - login - compute - ``` +```yaml +# environments/site/inventory/groups: +[manila:children]: +login +compute +``` If you are running a different version of Ceph from the defaults in the [os-manila-mount role](https://github.com/stackhpc/ansible-role-os-manila-mount/blob/master/defaults/main.yml), you will need to update the package version by setting: - ```yaml - # environments/site/inventory/group_vars/manila.yml: - os_manila_mount_ceph_version: "18.2.4" - ``` +```yaml +# environments/site/inventory/group_vars/manila.yml: +os_manila_mount_ceph_version: "18.2.4" +``` -A [site-specific image](image-build.md) should be built which includes this package; add ``manila`` to the Packer ``inventory_groups`` variable. +A [site-specific image](image-build.md) should be built which includes this package; add `manila` to the Packer `inventory_groups` variable. Define the list of shares to be mounted, and the paths to mount them to. The example below parameterises the share name using the environment name. See the [stackhpc.os-manila-mount role](https://github.com/stackhpc/ansible-role-os-manila-mount) for further configuration options. 
- ```yaml - # environments/site/inventory/group_vars/manila.yml: - os_manila_mount_shares: - - share_name: "slurm-{{ appliances_environment_name }}-scratch" - mount_path: /scratch - ``` +```yaml +# environments/site/inventory/group_vars/manila.yml: +os_manila_mount_shares: + - share_name: "slurm-{{ appliances_environment_name }}-scratch" + mount_path: /scratch +``` ### Shared home directory -By default, the Slurm appliance configures the control node as an NFS server and exports a directory which is mounted on the other cluster nodes as `/home`. When using Manila + CephFS for the home directory instead, this will need to be disabled. To do this, set the tf var `home_volume_provisioning` to `None`. +By default, the Slurm appliance configures the control node as an NFS server and exports a directory which is mounted on the other cluster nodes as `/home`. When using Manila + CephFS for the home directory instead, this will need to be disabled. To do this, set the tf var `home_volume_provisioning` to `None`. Some `basic_users_homedir_*` parameters need overriding as the provided defaults are only satisfactory for the default root-squashed NFS share: - ```yaml - # environments/site/inventory/group_vars/all/basic_users.yml: - basic_users_homedir_server: "{{ groups['login'] | first }}" # if not mounting /home on control node - basic_users_homedir_server_path: /home - ``` +```yaml +# environments/site/inventory/group_vars/all/basic_users.yml: +basic_users_homedir_server: "{{ groups['login'] | first }}" # if not mounting /home on control node +basic_users_homedir_server_path: /home +``` Finally, add the home directory to the list of shares (the share should be already created in OpenStack). - ```yaml - # environments/site/inventory/group_vars/all/manila.yml: - os_manila_mount_shares: - - share_name: "slurm-{{ appliances_environment_name }}-scratch" - mount_path: /scratch - - share_name: "slurm-{{ appliances_environment_name }}-home" - mount_path: /home - ``` +```yaml +# environments/site/inventory/group_vars/all/manila.yml: +os_manila_mount_shares: + - share_name: "slurm-{{ appliances_environment_name }}-scratch" + mount_path: /scratch + - share_name: "slurm-{{ appliances_environment_name }}-home" + mount_path: /home +``` diff --git a/docs/image-build.md b/docs/image-build.md index dc968eb..71be030 100644 --- a/docs/image-build.md +++ b/docs/image-build.md @@ -3,59 +3,67 @@ The appliance contains code and configuration to use [Packer](https://developer.hashicorp.com/packer) with the [OpenStack builder](https://www.packer.io/plugins/builders/openstack) to build images. The Packer configuration defined here builds "fat images" which contain packages, binaries and container images but no cluster-specific configuration. Using these: + - Enables the image to be tested in CI before production use. - Ensures re-deployment of the cluster or deployment of additional nodes can be completed even if packages are changed in upstream repositories (e.g. due to RockyLinux or OpenHPC updates). - Improves deployment speed by reducing the number of package downloads to improve deployment speed. The fat images StackHPC builds and tests in CI are available from [GitHub releases](https://github.com/stackhpc/ansible-slurm-appliance/releases). However with some additional configuration it is also possible to: + 1. Build site-specific fat images from scratch. 2. Extend an existing fat image with additional functionality. 
- -# Usage +## Usage To build either a site-specific fat image from scratch, or to extend an existing StackHPC fat image: 1. Ensure the current OpenStack credentials have sufficient authorisation to upload images (this may or may not require the `member` role for an application credential, depending on your OpenStack configuration). 2. The provided dev credentials for StackHPC's "Ark" Pulp server must be added to the target environments. This is done by overriding `dnf_repos_username` and `dnf_repos_password` with your vault encrypted credentials in `environments//inventory/group_vars/all/pulp.yml`. See the [experimental docs](experimental/pulp.md) if you wish instead wish to use a local Pulp server. 3. Create a Packer [variable definition file](https://developer.hashicorp.com/packer/docs/templates/hcl_templates/variables#assigning-values-to-input-variables) at e.g. `environments//builder.pkrvars.hcl` containing at a minimum: - - ```hcl - flavor = "general.v1.small" # VM flavor to use for builder VMs - networks = ["26023e3d-bc8e-459c-8def-dbd47ab01756"] # List of network UUIDs to attach the VM to - source_image_name = "Rocky-9-GenericCloud-Base-9.4" # Name of image to create VM with, i.e. starting image - inventory_groups = "control,login,compute" # Additional inventory groups to add build VM to - - ``` - - Note that: - - The network used for the Packer VM must provide outbound internet access but does not need to provide access to resources which the final cluster nodes require (e.g. Slurm control node, network filesystem servers etc.). - - The flavor used must have sufficent memory for the build tasks, but otherwise does not need to match the final cluster nodes. Usually 8GB is sufficent. By default, the build VM is volume-backed to allow control of the root disk size (and hence final image size) so the flavor disk size does not matter. - - The source image should be either a RockyLinux GenericCloud image for a site-specific image build from scratch, or a StackHPC fat image if extending an existing image. - - The `inventory_groups` variable takes a comma-separated list of Ansible inventory groups to add the build VM to. This is in addition to the `builder` group which it is always added to. This controls which Ansible roles and functionality run during build, and hence what gets added to the image. All possible groups are listed in `environments/common/groups` but common options for this variable will be: - - `update,control,login,compute`: The resultant image has all packages in the source image updated, and then packages for all types of nodes in the cluster are added. When using a GenericCloud image for `source_image_name` this builds a site-specific fat image from scratch. - - One or more specific groups which are not enabled in the appliance by default, e.g. `lustre`. When using a StackHPC fat image for `source_image_name` this extends the image with just this additional functionality. + +```hcl +flavor = "general.v1.small" # VM flavor to use for builder VMs +networks = ["26023e3d-bc8e-459c-8def-dbd47ab01756"] # List of network UUIDs to attach the VM to +source_image_name = "Rocky-9-GenericCloud-Base-9.4" # Name of image to create VM with, i.e. starting image +inventory_groups = "control,login,compute" # Additional inventory groups to add build VM to +``` + +Note that: + +- The network used for the Packer VM must provide outbound internet access but does not need to provide access to resources which the final cluster nodes require (e.g. 
Slurm control node, network filesystem servers etc.). +- The flavor used must have sufficent memory for the build tasks, but otherwise does not need to match the final cluster nodes. Usually 8GB is sufficent. By default, the build VM is volume-backed to allow control of the root disk size (and hence final image size) so the flavor disk size does not matter. +- The source image should be either a RockyLinux GenericCloud image for a site-specific image build from scratch, or a StackHPC fat image if extending an existing image. +- The `inventory_groups` variable takes a comma-separated list of Ansible inventory groups to add the build VM to. This is in addition to the `builder` group which it is always added to. This controls which Ansible roles and functionality run during build, and hence what gets added to the image. + All possible groups are listed in `environments/common/groups` but common options for this variable will be: + - `update,control,login,compute`: The resultant image has all packages in the source image updated, and then packages for all types of nodes in the cluster are added. When using a GenericCloud image for `source_image_name` this builds a site-specific fat image from scratch. + - One or more specific groups which are not enabled in the appliance by default, e.g. `lustre`. When using a StackHPC fat image for `source_image_name` this extends the image with just this additional functionality. 4. Activate the venv and the relevant environment. 5. Build images using the relevant variable definition file, e.g.: - cd packer/ - PACKER_LOG=1 /usr/bin/packer build -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl +```shell +cd packer/ +PACKER_LOG=1 /usr/bin/packer build -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl +``` - **NB:** If the build fails while creating the volume, check if the source image has the `signature_verified` property: +**NB:** If the build fails while creating the volume, check if the source image has the `signature_verified` property: - openstack image show $SOURCE_IMAGE +```shell +openstack image show $SOURCE_IMAGE +``` - If it does, remove this property: +If it does, remove this property: - openstack image unset --property signature_verified $SOURCE_IMAGE +```shell +openstack image unset --property signature_verified $SOURCE_IMAGE +``` - then delete the failed volume, select cancelling the build when Packer queries, and then retry. This is [OpenStack bug 1823445](https://bugs.launchpad.net/cinder/+bug/1823445). +then delete the failed volume, select cancelling the build when Packer queries, and then retry. This is [OpenStack bug 1823445](https://bugs.launchpad.net/cinder/+bug/1823445). -6. The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc` and including a timestamp and a shortened git hash. +6. The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc` and including a timestamp and a shortened Git hash. -# Build Process +## Build Process In summary, Packer creates an OpenStack VM, runs Ansible on that, shuts it down, then creates an image from the root disk. @@ -66,6 +74,7 @@ shows the use of the environment variable `$PKR_VAR_environment_root` (which its using a path in a "parent" environment is likely to be more appropriate (as builds should not be environment-specific to allow testing before deployment to a production environment). 
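
As a sketch of the above, a build can be pointed at a parent `site` environment by setting the variable directly; it is normally set when an environment is activated, and the path below is illustrative only:

```bash
# Point the build at the parent "site" environment rather than a
# deployment-specific one. Normally this variable is set on environment
# activation; it is exported directly here only for illustration.
export PKR_VAR_environment_root="$PWD/environments/site"
cd packer/
PACKER_LOG=1 /usr/bin/packer build -on-error=ask \
  -var-file="$PKR_VAR_environment_root/builder.pkrvars.hcl" openstack.pkr.hcl
```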
What is Slurm Appliance-specific are the details of how Ansible is run: + - The build VM is always added to the `builder` inventory group, which differentiates it from nodes in a cluster. This allows Ansible variables to be set differently during Packer builds, e.g. to prevent services starting. The defaults for this are in `environments/common/inventory/group_vars/builder/`, which could be extended or overriden for site-specific fat image builds using `builder` groupvars for the relevant environment. It also runs some builder-specific code (e.g. to clean up the image). - The default fat image builds also add the build VM to the "top-level" `compute`, `control` and `login` groups. This ensures @@ -76,9 +85,10 @@ What is Slurm Appliance-specific are the details of how Ansible is run: groupvars is not sufficient (e.g. a role always attempts to configure or start services). There are some things to be aware of when developing Ansible to run in a Packer build VM: - - Only some tasks make sense. E.g. any services with a reliance on the network cannot be started, and should not be enabled if, when creating an instance with the resulting image, the remote service will not be immediately present. - - Nothing should be written to the persistent state directory `appliances_state_dir`, as this is on the root filesystem rather than an OpenStack volume. - - Care should be taken not to leave data on the root filesystem which is not wanted in the final image (e.g secrets). - - Build VM hostnames are not the same as for equivalent "real" hosts and do not contain `login`, `control` etc. Therefore variables used by the build VM must be defined as groupvars not hostvars. - - Ansible may need to use a proxyjump to reach cluster nodes, which can be defined via Ansible's `ansible_ssh_common_args` variable. If Packer should not use the same proxy - to connect to build VMs (e.g. because build happens on a different network), this proxy configuration should not be added to the `all` group. + +- Only some tasks make sense. E.g. any services with a reliance on the network cannot be started, and should not be enabled if, when creating an instance with the resulting image, the remote service will not be immediately present. +- Nothing should be written to the persistent state directory `appliances_state_dir`, as this is on the root filesystem rather than an OpenStack volume. +- Care should be taken not to leave data on the root filesystem which is not wanted in the final image (e.g secrets). +- Build VM hostnames are not the same as for equivalent "real" hosts and do not contain `login`, `control` etc. Therefore variables used by the build VM must be defined as groupvars not hostvars. +- Ansible may need to use a proxyjump to reach cluster nodes, which can be defined via Ansible's `ansible_ssh_common_args` variable. If Packer should not use the same proxy + to connect to build VMs (e.g. because build happens on a different network), this proxy configuration should not be added to the `all` group. diff --git a/docs/k3s.README.md b/docs/k3s.README.md index 1b66511..500a789 100644 --- a/docs/k3s.README.md +++ b/docs/k3s.README.md @@ -1,8 +1,10 @@ # Overview -A K3s cluster is deployed with the Slurm cluster. Both an agent and server instance of K3s is installed during image build and the correct service (determined by OpenStack metadata) will be -enabled during boot. Nodes with the `k3s_server` metadata field defined will be configured as K3s agents (this field gives them the address of the server). 
The Slurm control node is currently configured as a server while all other nodes are configured as agents. Using multiple K3s servers isn't supported. Currently only the root user on the control node has + +A K3s cluster is deployed with the Slurm cluster. Both an agent and server instance of K3s is installed during image build and the correct service (determined by OpenStack metadata) will be +enabled during boot. Nodes with the `k3s_server` metadata field defined will be configured as K3s agents (this field gives them the address of the server). The Slurm control node is currently configured as a server while all other nodes are configured as agents. Using multiple K3s servers isn't supported. Currently only the root user on the control node has access to the Kubernetes API. The `k3s` role installs Helm for package management. K9s is also installed in the image and can be used by the root user. -# Idempotency +## Idempotency + K3s is intended to only be installed during image build as it is configured by the appliance on first boot with `azimuth_cloud.image_utils.linux_ansible_init`. Therefore, the `k3s` role isn't idempotent and changes to variables will not be reflected in the image when running `site.yml`. diff --git a/docs/mig.md b/docs/mig.md index 0d52f96..b8eeae8 100644 --- a/docs/mig.md +++ b/docs/mig.md @@ -10,9 +10,9 @@ This page details how to configure Multi Instance GPU (MIG) in Slurm. ## Inventory -Add relevant hosts to the ``vgpu`` group, for example in `environments/$ENV/inventory/groups`: +Add relevant hosts to the `vgpu` group, for example in `environments/$ENV/inventory/groups`: -``` +```yaml [vgpu:children] cuda ``` @@ -23,24 +23,24 @@ Use variables from the [stackhpc.linux.vgpu](https://github.com/stackhpc/ansible For example in: `environments//inventory/group_vars/all/vgpu`: -``` +```yaml --- vgpu_definitions: - - pci_address: "0000:17:00.0" - mig_devices: - "1g.10gb": 4 - "4g.40gb": 1 - - pci_address: "0000:81:00.0" - mig_devices: - "1g.10gb": 4 - "4g.40gb": 1 + - pci_address: "0000:17:00.0" + mig_devices: + "1g.10gb": 4 + "4g.40gb": 1 + - pci_address: "0000:81:00.0" + mig_devices: + "1g.10gb": 4 + "4g.40gb": 1 ``` -The appliance will use the driver installed via the ``cuda`` role. +The appliance will use the driver installed via the `cuda` role. -Use ``lspci`` to determine the PCI addresses e.g: +Use `lspci` to determine the PCI addresses e.g: -``` +```text [root@io-io-gpu-02 ~]# lspci -nn | grep -i nvidia 06:00.0 3D controller [0302]: NVIDIA Corporation GH100 [H100 SXM5 80GB] [10de:2330] (rev a1) 0c:00.0 3D controller [0302]: NVIDIA Corporation GH100 [H100 SXM5 80GB] [10de:2330] (rev a1) @@ -51,7 +51,7 @@ Use ``lspci`` to determine the PCI addresses e.g: The supported profiles can be discovered by consulting the [NVIDIA documentation](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#supported-mig-profiles) or interactively by running the following on one of the compute nodes with GPU resources: -``` +```text [rocky@io-io-gpu-05 ~]$ sudo nvidia-smi -i 0 -mig 1 Enabled MIG Mode for GPU 00000000:06:00.0 All done. @@ -150,7 +150,7 @@ All done. ## compute_init configuration for slurm triggered rebuild (optional) You only need to configure this if you are using the slurm triggered rebuild -feature. Use the ``vgpu`` metadata option to enable creation of mig devices on +feature. Use the `vgpu` metadata option to enable creation of mig devices on rebuild. 
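
The profile names used in `vgpu_definitions` (e.g. `1g.10gb`) can also be checked interactively on a node once MIG mode has been enabled on a GPU. This is informational only and does not change any configuration:

```bash
# List the GPU instance profiles supported by GPU 0. Requires MIG mode to be
# enabled on that GPU (see above); names such as "1g.10gb" correspond to the
# keys used under mig_devices.
sudo nvidia-smi mig -i 0 -lgip
```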
## GRES configuration @@ -160,19 +160,19 @@ do this you need to determine the names of the GPU types as detected by slurm. F deploy slurm with the default nodegroup definitions to get a working cluster. Make a temporary copy of slurm.conf: -``` +```text cp /var/spool/slurm/conf-cache/slurm.conf /tmp/ ``` Then create a `/tmp/gres.conf` which enables autodetection: -``` +```text AutoDetect=nvml ``` You will then be able to run: `sudo slurmd -f /tmp/slurm.conf -G` on a compute node where GPU resources exist. An example is shown below: -``` +```text [rocky@io-io-gpu-02 ~]$ sudo slurmd -f /tmp/slurm.conf -G slurmd-io-io-gpu-02: Gres Name=gpu Type=nvidia_h100_80gb_hbm3 Count=1 Index=0 ID=7696487 File=/dev/nvidia0 Links=(null) Flags=HAS_FILE,HAS_TYPE,ENV_NVML,ENV_RSMI,ENV_ONEAPI ,ENV_OPENCL,ENV_DEFAULT @@ -201,24 +201,23 @@ NOTE: If you have configured a Gres= line in slurm.conf already. You may have to GRES resources can then be configured manually. An example is shown below (`environments//inventory/group_vars/all/openhpc.yml`): -``` +```yaml openhpc_partitions: - name: cpu - name: gpu openhpc_nodegroups: - - name: cpu - - name: gpu - gres_autodetect: nvml - gres: - - conf: "gpu:nvidia_h100_80gb_hbm3:2" - - conf: "gpu:nvidia_h100_80gb_hbm3_4g.40gb:2" - - conf: "gpu:nvidia_h100_80gb_hbm3_1g.10gb:6" + - name: cpu + - name: gpu + gres_autodetect: nvml + gres: + - conf: "gpu:nvidia_h100_80gb_hbm3:2" + - conf: "gpu:nvidia_h100_80gb_hbm3_4g.40gb:2" + - conf: "gpu:nvidia_h100_80gb_hbm3_1g.10gb:6" openhpc_config: GresTypes: - gpu - ``` Making sure the types (the identifier after `gpu:`) match those collected with `slurmd -G`. Substrings diff --git a/docs/monitoring-and-logging.md b/docs/monitoring-and-logging.md index 46b405a..ad6fc0c 100644 --- a/docs/monitoring-and-logging.md +++ b/docs/monitoring-and-logging.md @@ -39,14 +39,15 @@ Where `role_name` is the name of the internal role. ## Customising variables -You should only customise the variables in `environments/common` if you are working on a feature that you intend to contribute back. Instead you should override the variables in the environment relevant to your deployment. This is possible since inventories later in the inheritance chain have greater precedence. Please see [README.md](../README.md#environments) for a more detailed explanation. This notice exists to avoid the need to need to keep repeating this point in the following sections. Where it is noted that you should customise a variable, it is implied that this change should be made to your own environment e.g `environments/production` in preference to `environments/common`, even when +You should only customise the variables in `environments/common` if you are working on a feature that you intend to contribute back. Instead you should override the variables in the environment relevant to your deployment. This is possible since inventories later in the inheritance chain have greater precedence. Please see [README.md](../README.md#environments) for a more detailed explanation. +This notice exists to avoid the need to need to keep repeating this point in the following sections. Where it is noted that you should customise a variable, it is implied that this change should be made to your own environment e.g `environments/production` in preference to `environments/common`, even when this is not explicitly stated. ## filebeat This section details the configuration of filebeat. 
-### Defaults +### filebeat defaults Filebeat is configured by the internal `filebeat` role which can be found here: @@ -56,7 +57,7 @@ The appliance defaults for the `filebeat` role can be found at the following loc > [environments/common/inventory/group_vars/all/filebeat.yml](../environments/common/inventory/group_vars/all/filebeat.yml) -### Overview +### filebeat overview Filebeat is configured to scrape the output of slurm stats. Slurm stats produces a json log file in the following location on the host: @@ -73,9 +74,9 @@ This file is configurable by the `filebeat_config_path` variable. It is not currently possible to partially override `filebeat.yml`. You will have to configure `filebeat_config_path` to refer to another file, copying the parts of the default configuration you want to keep. Pull requests are welcomed to add the functionality needed to allow for partial overrides. -### Placement +### filebeat placement -The `filebeat` group controls the placement of the `filebeat` service. The default configuration scrapes the `slurm_stats` service output. This requires a `filebeat` instance to be co-located with the `slurm_stats` service. +The `filebeat` group controls the placement of the `filebeat` service. The default configuration scrapes the `slurm_stats` service output. This requires a `filebeat` instance to be colocateed with the `slurm_stats` service. In the simplest configuration, a single host should be assigned to the `filebeat` and `slurm_stats` group. The host assigned to the `slurm_stats` group should the same host as assigned to the `filebeat` group. More advanced configurations are possible, but require overriding `filebeat_config_path` using `group` or `host` variables. @@ -83,18 +84,18 @@ In the simplest configuration, a single host should be assigned to the `filebeat This section details the configuration of grafana. -### Defaults +### grafana defaults Internally, we use the [cloudalchemy.grafana](https://github.com/cloudalchemy/ansible-grafana) role. You can customise any of the variables that the role supports. For a full list, please see the [upstream documentation](https://github.com/cloudalchemy/ansible-grafana). The appliance defaults can be found here: > [environments/common/inventory/group_vars/all/grafana.yml](../environments/common/inventory/group_vars/all/grafana.yml) -### Placement +### grafana placement The `grafana` group controls the placement of the grafana service. Load balancing is currently unsupported so it is important that you only assign one host to this group. -### Access +### grafana access If Open OnDemand is enabled then by default this is used to proxy Grafana, otherwise Grafana is accessed through the first . See `grafana_url` in [environments/common/inventory/group_vars/all/grafana.yml](../environments/common/inventory/group_vars/all/grafana.yml). The port used (variable `grafana_port`) defaults to `3000`. @@ -159,7 +160,7 @@ This can be customised with the `grafana_datasources` variable. This section details the configuration of OpenSearch. -### Defaults +### opensearch defaults The internal `opensearch` role is used to configure the service. The list of variables that can be customised can found in: @@ -169,11 +170,11 @@ The appliance defaults are in the following file: > [environments/common/inventory/group_vars/all/opensearch.yml](../environments/common/inventory/group_vars/all/opensearch.yml) -### Placement +### opensearch placement The `opensearch` group determines the placement of the OpenSearch service. 
Load balancing is currently unsupported so it is important that you only assign one host to this group. -### Access +### opensearch access By default, OpenSearch only listens on the loopback interface. It should therefore be placed on the same node as `filebeat` and `grafana` which need to access the OpenSearch API. @@ -185,9 +186,9 @@ The default set of users is defined in: This defines an the following accounts: -| username | password | purpose | -| ------------- | ------------------------------------------------|-------------------------------------------| -| admin | | User of highest privilege | +| username | password | purpose | +| -------- | ------------------------------------ | ------------------------- | +| admin | | User of highest privilege | Where the password field refers to a variable containing the actual password. These are generated by the `generate-passwords.yml` adhoc playbook (see [README.md](../README.md#creating-a-slurm-appliance)). @@ -208,7 +209,7 @@ found in: This section details the configuration of prometheus. -### Defaults +### Prometheus defaults Internally, we use the [cloudalchemy.prometheus](https://github.com/cloudalchemy/ansible-prometheus) role. You can customise any of the variables that the role supports. For a full list, please see the [upstream documentation](https://github.com/cloudalchemy/ansible-prometheus). The appliance defaults can be found here: @@ -217,19 +218,20 @@ Internally, we use the [cloudalchemy.prometheus](https://github.com/cloudalchemy Prometheus will be functional by default but the following variables should commonly be modified: + - `prometheus_web_external_url` - `prometheus_storage_retention` - `prometheus_storage_retention_size` -### Placement +### Prometheus placement The `prometheus` group determines the placement of the prometheus service. Load balancing is currently unsupported so it is important that you only assign one host to this group. -### Access +### Prometheus access Prometheus is exposed on port `9090` on all hosts in the prometheus group. Currently, the configuration assumes a single host. Following the reference layout in `environments/site/inventory/groups`, this will be set to the slurm `control` node, prometheus would then be accessible from: - > http://:9090 +> http://:9090 The port can customised by overriding the `prometheus_web_external_url` variable. @@ -268,7 +270,7 @@ The list can be customised by overriding the `collect[]` parameter of the `node` > [environments/common/inventory/group_vars/all/prometheus.yml](../environments/common/inventory/group_vars/all/prometheus.yml). -Variables in this file should *not* be customised directly, but should be overridden in your `environment`. See [README.md](../README.md#environments) which details the process of overriding default variables in more detail. +Variables in this file should _not_ be customised directly, but should be overridden in your `environment`. See [README.md](../README.md#environments) which details the process of overriding default variables in more detail. ### custom ansible filters @@ -276,12 +278,13 @@ Variables in this file should *not* be customised directly, but should be overri Groups prometheus targets. Metrics from `node_exporter` hosts have two labels applied: - - `env`: This is set from the Ansible variable `prometheus_env` if present - (e.g. from hostvars or groupvars), defaulting to `ungrouped`. This can be - used to group metrics by some arbitrary "environment", e.g. rack. 
- - `group`: This refers to the "top-level" inventory group for the host and - is one of `control`, `login`, `compute` or `other`. This can be used to - define rules for specific host functionalities. + +- `env`: This is set from the Ansible variable `prometheus_env` if present + (e.g. from hostvars or groupvars), defaulting to `ungrouped`. This can be + used to group metrics by some arbitrary "environment", e.g. rack. +- `group`: This refers to the "top-level" inventory group for the host and + is one of `control`, `login`, `compute` or `other`. This can be used to + define rules for specific host functionalities. ## slurm-stats @@ -291,16 +294,12 @@ Slurm stats periodically queries the slurm accounting database to gather informa The polling of this data is controlled by a cron job. The default is to scrape the data every 5 minutes. -### Defaults +### slurm-stats defaults -slurm-stats is configured `slurm-stats` role in the [slurm_openstack_tools collection](https://github.com/stackhpc/ansible_collection_slurm_openstack_tools). Currently there is no customisation of this role in the common environment i.e we are just using role defaults. It is possible to override these by setting the relevant variable in your environment config. See [here](https://github.com/stackhpc/ansible_collection_slurm_openstack_tools/tree/main/roles/slurm-stats) for a list of variables that can be set. +slurm-stats is configured `slurm-stats` role in the [slurm_openstack_tools collection](https://github.com/stackhpc/ansible_collection_slurm_openstack_tools). Currently there is no customisation of this role in the common environment i.e we are just using role defaults. +It is possible to override these by setting the relevant variable in your environment config. See [here](https://github.com/stackhpc/ansible_collection_slurm_openstack_tools/tree/main/roles/slurm-stats) for a list of variables that can be set. - -### Placement +### slurm-stats placement The `slurm_stats` group controls the placement of the `slurm_stats` service. -This should be configured to be a group with a single host. That host must be co-located on the same host as the `filebeat` service that scrapes its output. - - - - +This should be configured to be a group with a single host. That host must be colocated on the same host as the `filebeat` service that scrapes its output. diff --git a/docs/networks.md b/docs/networks.md index 69b7ece..c2c1d12 100644 --- a/docs/networks.md +++ b/docs/networks.md @@ -2,6 +2,7 @@ The default OpenTofu configurations in the appliance do not provision networks, subnets or associated infrastructure such as routers. The requirements are that: + 1. At least one network exists. 2. The first network defined spans all nodes, referred to as the "access network". 3. Only one subnet per network is attached to nodes. @@ -36,6 +37,7 @@ Note that if an OpenStack subnet has a gateway IP defined then by default nodes with ports attached to that subnet get a default route set via that gateway. ## Single network + This is the simplest possible configuration. A single network and subnet is used for all nodes. The subnet provides outbound internet access via the default route defined by the subnet gateway (often an OpenStack router to an external @@ -52,6 +54,7 @@ cluster_networks = [ ``` ## Multiple homogenous networks + This is similar to the above, except each node has multiple networks. The first network, "netA" is the access network. 
Note that only one subnet must have a gateway defined, else default routes via both subnets will be present causing @@ -76,7 +79,6 @@ vnic_types = { ... ``` - ## Additional networks on some nodes This example shows how to modify variables for specific node groups. In this @@ -120,13 +122,14 @@ In some multiple network configurations it may be necessary to manage default routes rather than them being automatically created from a subnet gateway. This can be done using the tofu variable `gateway_ip` which can be set for the cluster and/or overriden on the compute and login groups. If this is set: + - a default route via that address will be created on the appropriate interface during boot if it does not exist - any other default routes will be removed For example the cluster configuration below has a "campus" network with a default gateway which provides inbound SSH / ondemand access and outbound -internet attached only to the login nodes, and a "data" network attached to +internet attached only to the login nodes, and a "data" network attached to all nodes. The "data" network has no gateway IP set on its subnet to avoid dual default routes and routing conflicts on the multi-homed login nodes, but does have outbound connectivity via a router: @@ -183,7 +186,7 @@ compute # environments/$SITE/inventory/group_vars/all/squid.yml: # these are just examples squid_cache_disk: 1024 # MB -squid_cache_mem: '12 GB' +squid_cache_mem: "12 GB" ``` Note that name resolution must still be possible and may require defining an diff --git a/docs/openondemand.md b/docs/openondemand.md index 5dd3029..cd33cd5 100644 --- a/docs/openondemand.md +++ b/docs/openondemand.md @@ -2,42 +2,46 @@ The appliance can deploy the Open OnDemand portal. This page describes how to enable this and the default appliance configuration/behaviour. Note that detailed configuration documentation is provided by: -- The README for the included `openondemand` role in this repo - [ansible/roles/openondemand/README.md](../ansible/roles/openondemand/README.md). -- The README and default variables for the underlying "official" role which the above wraps - [Open OnDemand Ansible Role](https://github.com/OSC/ood-ansible) +- The readme for the included `openondemand` role in this repository - [ansible/roles/openondemand/README.md](../ansible/roles/openondemand/README.md). +- The readme and default variables for the underlying "official" role which the above wraps - [Open OnDemand Ansible Role](https://github.com/OSC/ood-ansible) - The documentation for Open OnDemand [itself](https://osc.github.io/ood-documentation/latest/index.html) This appliance can deploy and configure: + - The Open OnDemand server itself (usually on a single login node). - User authentication using one of: - - An external OIDC provider. - - HTTP basic authentication and PAM. + - An external OIDC provider. + - HTTP basic authentication and PAM. - Virtual desktops on compute nodes. - Jupyter nodebook servers on compute nodes. - Proxying of Grafana (usually deployed on the control node) via the Open OnDemand portal. - Links to additional filesystems and pages from the Open OnDemand Dashboard. - A Prometheus exporter for the Open OnDemand server and related Grafana dashboard -For examples of all of the above see the `smslabs-example` environment in this repo. +For examples of all of the above see the `smslabs-example` environment in this repository. 
+ +## Enabling Open OnDemand -# Enabling Open OnDemand To enable the Open OnDemand server, add single host to the `openondemand` inventory group. Generally, this should be a node in the `login` group, as Open OnDemand must be able to access Slurm commands. -To enable compute nodes for virtual desktops, Jupyter notebooks, RStudio, VSCode, or MATLAB (accessed through the Open OnDemand portal), add nodes/groups to the `openondemand_desktop`, `openondemand_jupyter`, `openondemand_rstudio`, `openondemand_codeserver`, and `openondemand_matlab` inventory groups respectively. These may be all or a subset of the `compute` group. +To enable compute nodes for virtual desktops, Jupyter notebooks, RStudio, Visual Studio Code, or MATLAB (accessed through the Open OnDemand portal), add nodes/groups to the `openondemand_desktop`, `openondemand_jupyter`, `openondemand_rstudio`, `openondemand_codeserver`, and `openondemand_matlab` inventory groups respectively. These may be all or a subset of the `compute` group. The above functionality is configured by running the `ansible/portal.yml` playbook. This is automatically run as part of `ansible/site.yml`. ## MATLAB -*NB* Due to licensing, the MATLAB batch connect app requires a MATLAB intallation to be present on the relevant compute nodes. The MATLAB app is therefore disabled by default, and must be enabled by setting `openondemand_matlab_partition` in e.g. `environments/site/inventory/group_vars/all/openondemand.yml` to the name of the partition where MATLAB is available. + +_NB_ Due to licensing, the MATLAB batch connect app requires a MATLAB intallation to be present on the relevant compute nodes. The MATLAB app is therefore disabled by default, and must be enabled by setting `openondemand_matlab_partition` in e.g. `environments/site/inventory/group_vars/all/openondemand.yml` to the name of the partition where MATLAB is available. An Lmod modulefile also needs to be available on compute nodes - this is not provided by the appliance. See e.g.`roles/openondemand/tasks/rstudio_compute.yml` for an example. The modulefile must be named `matlab/$MATLAB_VERSION`, where the version matches thes `openondemand_matlab_version` variable. This variable is set to empty in the role default so must be defined in `environments/site/inventory/group_vars/all/openondemand.yml`. -As MATLAB requires a remote desktop, the TurboVNC and Xfce Desktop packages and configuration from the "openondemand_desktop" app will be automatically applied to nodes where the MATLAB app is enabled. +As MATLAB requires a remote desktop, the TurboVNC and Xfce Desktop packages and configuration from the "openondemand_desktop" app will be automatically applied to nodes where the MATLAB app is enabled. -# Default configuration +## Default configuration See the [ansible/roles/openondemand/README.md](../ansible/roles/openondemand/README.md) for more details on the variables described below. The following variables have been given default values to allow Open OnDemand to work in a newly created environment without additional configuration, but generally should be overridden in `environments/site/inventory/group_vars/all/` with site-specific values: + - `openondemand_servername` - this must be defined for both `openondemand` and `grafana` hosts (when Grafana is enabled). The default is `ansible_host` (i.e. the IP address) of the first host in the `openondemand` group. 
For production @@ -49,6 +53,7 @@ The following variables have been given default values to allow Open OnDemand to - `openondemand_desktop_partition`, `openondemand_jupyter_partition`, `openondemand_rstudio_partition`, and `openondemand_codeserver_partition` if the corresponding inventory groups are defined. Defaults to the first compute group defined in the `compute` OpenTofu variable in `environments/$ENV/tofu`. Note `openondemand_matlab_partition` is not set due to the additional requirements discussed above. It is also recommended to set: + - `openondemand_dashboard_support_url` - `openondemand_dashboard_docs_url` @@ -58,5 +63,8 @@ The appliance automatically configures Open OnDemand to proxy Grafana and adds a [^1]: Note that if `openondemand_auth` is `basic_pam` and anonymous Grafana login is enabled, the appliance will (by default) configure Open OnDemand's Apache server to remove the Authorisation header from proxying of all `node/` addresses. This is done as otherwise Grafana tries to use this header to authenticate, which fails with the default configuration where only the admin Grafana user `grafana` is created. Note that the removal of this header in this configuration means it cannot be used to authenticate proxied interactive applications - however the appliance-deployed remote desktop and Jupyter Notebook server applications use other authentication methods. An alternative if using `basic_pam` is not to enable anonymous Grafana login and to create Grafana users matching the local users (e.g. in `environments//hooks/post.yml`). -# Access -By default the appliance authenticates against OOD with basic auth through PAM. When creating a new environment, a new user with username `demo_user` will be created. Its password is found under `vault_openondemand_default_user` in the appliance secrets store in `environments/{ENV}/inventory/group_vars/all/secrets.yml`. Other users can be defined by overriding the `basic_users_users` variable in your environment (templated into `environments/{ENV}/inventory/group_vars/all/basic_users.yml` by default). +## Access + +By default the appliance authenticates against OOD with basic auth through PAM. When creating a new environment, a new user with username `demo_user` will be created. +Its password is found under `vault_openondemand_default_user` in the appliance secrets store in `environments/{ENV}/inventory/group_vars/all/secrets.yml`. +Other users can be defined by overriding the `basic_users_users` variable in your environment (templated into `environments/{ENV}/inventory/group_vars/all/basic_users.yml` by default). diff --git a/docs/operations.md b/docs/operations.md index 4c5c640..4064d44 100644 --- a/docs/operations.md +++ b/docs/operations.md @@ -3,6 +3,7 @@ This page describes the commands required for common operations. All subsequent sections assume that: + - Commands are run from the repository root, unless otherwise indicated by a `cd` command. - An Ansible vault secret is configured. - The correct private key is available to Ansible. @@ -15,24 +16,27 @@ All subsequent sections assume that: Review any [site-specific documentation](site/README.md) for more details on the above. -# Deploying a Cluster +## Deploying a Cluster This follows the same process as defined in the main [README.md](../README.md) for the default configuration. Note that tags as defined in the various sub-playbooks defined in `ansible/` may be used to only run part of the tasks in `site.yml`. 
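As an illustration of the above, a minimal sketch of a full run versus a tag-limited run is shown below. This assumes the environment has already been activated; `openhpc` is used purely as an example tag name - check the sub-playbooks under `ansible/` for the tags actually defined:

```shell
# Run the complete configuration:
ansible-playbook ansible/site.yml

# Re-run only tasks with a given tag, e.g. 'openhpc' (tag names are defined in the sub-playbooks):
ansible-playbook ansible/site.yml --tags openhpc

# List the available tags without running any tasks:
ansible-playbook ansible/site.yml --list-tags
```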
-# SSH to Cluster Nodes +## SSH to Cluster Nodes This depends on how the cluster is accessed. The script `dev/ansible-ssh` may generally be used to connect to a host specified by a `inventory_hostname` using the same connection details as Ansible. If this does not work: + - Instance IPs are normally defined in `ansible_host` variables in an inventory file `environments/$ENV/inventory/hosts{,.yml}`. -- The ssh user is defined by `ansible_user`, default is `rocky`. This may be overridden in your environment. +- The SSH user is defined by `ansible_user`, default is `rocky`. This may be overridden in your environment. - If a jump host is required the user and address may be defined in the above inventory file. -# Modifying general Slurm.conf parameters +## Modifying general Slurm.conf parameters + Parameters for [slurm.conf](https://slurm.schedmd.com/slurm.conf.html) can be added to an `openhpc_config_extra` mapping in `environments/$SITE_ENV/inventory/group_vars/all/openhpc.yml`. Note that values in this mapping may be: + - A string, which will be inserted as-is. - A list, which will be converted to a comma-separated string. @@ -40,9 +44,9 @@ This allows specifying `slurm.conf` contents in an yaml-format Ansible-native wa **NB:** The appliance provides some default values in `environments/common/inventory/group_vars/all/openhpc.yml:openhpc_config_default` which is combined with the above. The `enable_configless` flag in the `SlurmCtldParameters` key this sets must not be overridden - a validation step checks this has not happened. -See [Reconfiguring Slurm](#Reconfiguring-Slurm) to apply changes. +See [Reconfiguring Slurm](#reconfiguring-slurm) to apply changes. -# Modifying Slurm Partition-specific Configuration +## Modifying Slurm Partition-specific Configuration Modify the `openhpc_slurm_partitions` mapping usually in `environments/$SITE_ENV/inventory/group_vars/all/openhpc.yml` as described for [stackhpc.openhpc:slurmconf](https://github.com/stackhpc/ansible-role-openhpc#slurmconf) (note the relevant version of this role is defined in the `requirements.yml`) @@ -50,23 +54,22 @@ Note an Ansible inventory group for the partition is required. This is generally **NB:** `default:NO` must be set on all non-default partitions, otherwise the last defined partition will always be set as the default. -See [Reconfiguring Slurm](#Reconfiguring-Slurm) to apply changes. +See [Reconfiguring Slurm](#reconfiguring-slurm) to apply changes. + +## Adding an Additional Partition -# Adding an Additional Partition This is a usually a two-step process: - If new nodes are required, define a new node group by adding an entry to the `compute` mapping in `environments/$ENV/tofu/main.tf` assuming the default OpenTofu configuration: - - The key is the partition name. - - The value should be a mapping, with the parameters defined in `environments/$SITE_ENV/tofu/compute/variables.tf`, but in brief will need at least `flavor` (name) and `nodes` (a list of node name suffixes). -- Add a new partition to the partition configuration as described under [Modifying Slurm Partition-specific Configuration](#Modifying-Slurm-Partition-specific-Configuration). + - The key is the partition name. + - The value should be a mapping, with the parameters defined in `environments/$SITE_ENV/tofu/compute/variables.tf`, but in brief will need at least `flavor` (name) and `nodes` (a list of node name suffixes). 
+- Add a new partition to the partition configuration as described under [Modifying Slurm Partition-specific Configuration](#modifying-slurm-partition-specific-configuration).
 
-Deploying the additional nodes and applying these changes requires rerunning both OpenTofu and the Ansible site.yml playbook - follow [Deploying a Cluster](#Deploying-a-Cluster).
+Deploying the additional nodes and applying these changes requires rerunning both OpenTofu and the Ansible site.yml playbook - follow [Deploying a Cluster](#deploying-a-cluster).
 
-# Enabling additional functionality
-Roles in the appliance which are disabled by default can be enabled by adding the appropriate groups as children of the role's corresponding group in `environments/site/inventory/groups`. For example,
-to install a Squid proxy on nodes in the login group, you would modify the `squid` group definition in `environments/site/inventory/groups` to:
+## Package Repositories
 
-```
+```ini
 [squid:children]
 # Hosts to run squid proxy
 login
@@ -80,8 +83,10 @@ disabled during runtime to prevent Ark credentials from being leaked. To enable
 
 In both cases, Ark credentials will be required.
 
-# Adding Additional Packages
+## Adding Additional Packages
+
 By default, the following utility packages are installed during the StackHPC image build:
+
 - htop
 - nano
 - screen
@@ -90,22 +95,23 @@ By default, the following utility packages are installed during the StackHPC ima
 - bind-utils
 - net-tools
 - postfix
-- git
+- Git
 - latest python version for system (3.6 for for Rocky 8.9 and 3.12 for Rocky 9.4)
 - s-nail
 
 Additional packages can be added during image builds by:
+
 - adding the `extra_packages` group to the build `inventory_groups` (see
-[docs/image-build.md](./image-build.md))
+  [docs/image-build.md](./image-build.md))
 - defining a list of packages in `appliances_extra_packages_other` in e.g.
-`environments/$SITE_ENV/inventory/group_vars/all/defaults.yml`. For example:
+  `environments/$SITE_ENV/inventory/group_vars/all/defaults.yml`. For example:
 
-  ```yaml
-  # environments/foo-base/inventory/group_vars/all/defaults.yml:
-  appliances_extra_packages_other:
-    - somepackage
-    - anotherpackage
-  ```
+```yaml
+# environments/foo-base/inventory/group_vars/all/defaults.yml:
+appliances_extra_packages_other:
+  - somepackage
+  - anotherpackage
+```
 
 For packages which come from repositories mirrored by StackHPC's "Ark" Pulp server
 (including rocky, EPEL and OpenHPC repositories), this will require either [Ark
@@ -118,69 +124,85 @@ the OpenHPC installation guide (linked from the
 "user-facing" OpenHPC packages such as compilers, MPI libraries etc. include
 corresponding `lmod` modules.
 
-Packages *may* also be installed during the site.yml, by adding the `cluster`
+Packages _may_ also be installed during the site.yml, by adding the `cluster`
 group into the `extra_packages` group. An error will occur if Ark credentials
 are defined in this case, as they are readable by unprivileged users in the
 `.repo` files and a local Pulp mirror must be used instead.
 
-If additional repositories are required, these could be added/enabled as necessary in a play added to `environments/$SITE_ENV/hooks/{pre,post}.yml` as appropriate. Note such a play should NOT exclude the builder group, so that the repositories are also added to built images. There are various Ansible modules which might be useful for this:
- - `ansible.builtin.yum_repository`: Add a repo from an URL providing a 'repodata' directory.
- - `ansible.builtin.rpm_key` : Add a GPG key to the RPM database.
- - `ansible.builtin.get_url`: Can be used to install a repofile directly from an URL (e.g. https://turbovnc.org/pmwiki/uploads/Downloads/TurboVNC.repo) - - `ansible.builtin.dnf`: Can be used to install 'release packages' providing repos, e.g. `epel-release`, `ohpc-release`. +If additional repositories are required, these could be added/enabled as necessary in a play added to `environments/$SITE_ENV/hooks/{pre,post}.yml` as appropriate. +Note such a play should NOT exclude the builder group, so that the repositories are also added to built images. +There are various Ansible modules which might be useful for this: -The packages to be installed from that repo could also be defined in that play. Note using the `dnf` module with a list for its `name` parameter is more efficient and allows better dependency resolution than calling the module in a loop. +- `ansible.builtin.yum_repository`: Add a repository from a URL providing a 'repodata' directory. +- `ansible.builtin.rpm_key` : Add a GPG key to the RPM database. +- `ansible.builtin.get_url`: Can be used to install a repofile directly from a URL (e.g. ) +- `ansible.builtin.dnf`: Can be used to install 'release packages' providing repos, e.g. `epel-release`, `ohpc-release`. +The packages to be installed from that repository could also be defined in that play. Note using the `dnf` module with a list for its `name` parameter is more efficient and allows better dependency resolution than calling the module in a loop. Adding these repos/packages to the cluster/image would then require running: - ansible-playbook environments/$SITE_ENV/hooks/{pre,post}.yml +```shell +ansible-playbook environments/$SITE_ENV/hooks/{pre,post}.yml +``` as appropriate. TODO: improve description about adding these to extra images. - -# Reconfiguring Slurm +## Reconfiguring Slurm At a minimum run: - ansible-playbook ansible/slurm.yml --tags openhpc - +```shell +ansible-playbook ansible/slurm.yml --tags openhpc +``` **NB:** This will restart all daemons if the `slurm.conf` has any changes, even if technically only a `scontrol reconfigure` is required. - -# Running the MPI Test Suite +## Running the MPI Test Suite See [ansible/roles/hpctests/README.md](ansible/roles/hpctests/README.md) for a description of these. They can be run using - ansible-playbook ansible/adhoc/hpctests.yml +```shell +ansible-playbook ansible/adhoc/hpctests.yml +``` Note that: + - The above role provides variables to select specific partitions, nodes and interfaces which may be required. If not set in inventory, these can be passed as extravars: - ansible-playbook ansible/adhoc/hpctests.yml -e hpctests_myvar=foo +```shell +ansible-playbook ansible/adhoc/hpctests.yml -e hpctests_myvar=foo +``` + - The HPL-based test is only reasonably optimised on Intel processors due the libraries and default parallelisation scheme used. For AMD processors it is recommended this -is skipped using: + is skipped using: - ansible-playbook ansible/adhoc/hpctests.yml --skip-tags hpl-solo. +```shell +ansible-playbook ansible/adhoc/hpctests.yml --skip-tags hpl-solo. +``` Review any [site-specific documentation](site/README.md) for more details. -# Running CUDA Tests +## Running CUDA Tests + This uses the [cuda-samples](https://github.com/NVIDIA/cuda-samples/) utilities "deviceQuery" and "bandwidthTest" to test GPU functionality. 
It automatically runs on any host in the `cuda` inventory group: - ansible-playbook ansible/adhoc/cudatests.yml +```shell +ansible-playbook ansible/adhoc/cudatests.yml +``` **NB:** This test is not launched through Slurm, so confirm nodes are free/out of service or use `--limit` appropriately. -# Ad-hoc Commands and Playbooks +## Ad-hoc Commands and Playbooks A set of utility playbooks for managing a running appliance are provided in `ansible/adhoc` - run these by activating the environment and using: - ansible-playbook ansible/adhoc/$PLAYBOOK +```shell +ansible-playbook ansible/adhoc/$PLAYBOOK +``` Currently they include the following (see each playbook for links to documentation): @@ -191,6 +213,8 @@ Currently they include the following (see each playbook for links to documentati The `ansible` binary [can be used](https://docs.ansible.com/ansible/latest/command_guide/intro_adhoc.html) to run arbitrary shell commands against inventory groups or hosts, for example: - ansible [--become] -m shell -a "" +```shell +ansible [--become] -m shell -a "" +``` This can be useful for debugging and development but any modifications made this way will be lost if nodes are rebuilt/reimaged. diff --git a/docs/persistent-state.md b/docs/persistent-state.md index f5d4852..922ad16 100644 --- a/docs/persistent-state.md +++ b/docs/persistent-state.md @@ -3,6 +3,7 @@ To enable cluster state to persist beyond individual node lifetimes (e.g. to survive a cluster deletion or rebuild) set `appliances_state_dir` to the path of a directory on persistent storage, such as an OpenStack volume. At present this will affect the following: + - `slurmctld` state, i.e. the Slurm queue. - The MySQL database for `slurmdbd`, i.e. Slurm accounting information as shown by the `sacct` command. - Prometheus database @@ -23,6 +24,7 @@ The `site` environment supports persistent state in the default OpenTofu (see `e **NB: The default OpenTofu is provided as a working example and for internal CI use - therefore this volume is deleted when running `tofu destroy` - this may not be appropriate for a production environment.** In general, the Prometheus data is likely to be the only sizeable state stored. The size of this can be influenced through [Prometheus role variables](https://github.com/cloudalchemy/ansible-prometheus#role-variables), e.g.: + - `prometheus_storage_retention` - [default](../environments/common/inventory/group_vars/all/prometheus.yml) 31d - `prometheus_storage_retention_size` - [default](../environments/common/inventory/group_vars/all/prometheus.yml) 100GB - `prometheus_global.scrape_interval` and `scrape_interval` for [specific scrape definitions](../environments/common/inventory/group_vars/all/prometheus.yml) diff --git a/docs/production.md b/docs/production.md index 8808a56..83587f9 100644 --- a/docs/production.md +++ b/docs/production.md @@ -2,49 +2,49 @@ This page will guide you on how to create production-ready deployments. While you can start right away with this guide, you may find it useful to try with a -demo deployment first, as described in the [main README](../README.md). +demo deployment first, as described in the [main readme](../README.md). ## Prerequisites Before starting ensure that: - - You have root access on the deploy host. +- You have root access on the deploy host. - - You can create instances from the [latest Slurm appliance - image](https://github.com/stackhpc/ansible-slurm-appliance/releases), - which already contains the required packages. This is built and tested in - StackHPC's CI. 
+- You can create instances from the [latest Slurm appliance + image](https://github.com/stackhpc/ansible-slurm-appliance/releases), + which already contains the required packages. This is built and tested in + StackHPC's CI. - - You have an SSH keypair defined in OpenStack, with the private key - available on the deploy host. +- You have an SSH keypair defined in OpenStack, with the private key + available on the deploy host. - - Created instances have access to internet (note proxies can be setup - through the appliance if necessary). +- Created instances have access to internet (note proxies can be setup + through the appliance if necessary). - - Created instances have accurate/synchronised time (for VM instances this is - usually provided by the hypervisor; if not or for bare metal instances it - may be necessary to configure a time service via the appliance). +- Created instances have accurate/synchronised time (for VM instances this is + usually provided by the hypervisor; if not or for bare metal instances it + may be necessary to configure a time service via the appliance). - - Three security groups are present: ``default`` allowing intra-cluster - communication, ``SSH`` allowing external access via SSH and ``HTTPS`` - allowing access for Open OnDemand. +- Three security groups are present: `default` allowing intra-cluster + communication, `SSH` allowing external access via SSH and `HTTPS` + allowing access for Open OnDemand. - - Usually, you'll want to deploy the Slurm Appliance into its own dedicated - project. It's recommended that your OpenStack credentials are defined in a - [clouds.yaml](https://docs.openstack.org/python-openstackclient/latest/configuration/index.html#clouds-yaml) - file in a default location with the default cloud name of `openstack`. +- Usually, you'll want to deploy the Slurm Appliance into its own dedicated + project. It's recommended that your OpenStack credentials are defined in a + [clouds.yaml](https://docs.openstack.org/python-openstackclient/latest/configuration/index.html#clouds-yaml) + file in a default location with the default cloud name of `openstack`. ### Setup deploy host The following operating systems are supported for the deploy host: - - Rocky Linux 9 +- Rocky Linux 9 - - Rocky Linux 8 +- Rocky Linux 8 These instructions assume the deployment host is running Rocky Linux 8: -```bash +```shell sudo yum install -y git python38 git clone https://github.com/stackhpc/ansible-slurm-appliance cd ansible-slurm-appliance @@ -58,24 +58,24 @@ You will also need to install ## Version control A production deployment should be set up under version control, so you should -create a fork of this repo. +create a fork of this repository. First make an empty Git repository using your service of choice (e.g. GitHub or GitLab), then execute the following commands to turn the new empty repository into a copy of the ansible-slurm-appliance repository. - ```bash - git clone https://github.com/stackhpc/ansible-slurm-appliance.git - cd ansible-slurm-appliance - ``` +```shell +git clone https://github.com/stackhpc/ansible-slurm-appliance.git +cd ansible-slurm-appliance +``` Maintain the existing origin remote as upstream, and create a new origin remote for the repository location. 
- ```bash - git remote rename origin upstream - git remote add origin git@/ansible-slurm-appliance.git - ``` +```shell +git remote rename origin upstream +git remote add origin git@/ansible-slurm-appliance.git +``` You should use the [latest tagged release](https://github.com/stackhpc/ansible-slurm-appliance/releases). v1.161 @@ -83,11 +83,11 @@ has been used as an example here, make sure to change this. Do not use the default main branch, as this may have features that are still works in progress. - ```bash - git checkout v1.161 - git checkout -b site/main - git push -u origin site/main - ``` +```shell +git checkout v1.161 +git checkout -b site/main +git push -u origin site/main +``` ## Environment setup @@ -98,8 +98,9 @@ requires instance deletion/recreation. At least two environments should be created using cookiecutter, which will derive from the `site` base environment: - - `production`: production environment - - `staging`: staging environment + +- `production`: production environment +- `staging`: staging environment A `dev` environment should also be created if considered required, or this can be left until later. @@ -116,29 +117,29 @@ environment-specific (e.g. DNS names for `openondemand_servername`). Where possible hooks should also be placed in `environments/site/hooks/` and referenced from the `production` and `staging` environments, e.g.: - ```yaml - # environments/production/hooks/pre.yml: - - name: Import parent hook - import_playbook: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/../site/hooks/pre.yml" - ``` +```yaml +# environments/production/hooks/pre.yml: +- name: Import parent hook + import_playbook: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/../site/hooks/pre.yml" +``` OpenTofu configurations are defined in the `site` environment and referenced as a module by the site-specific cookie-cutter generated configurations. This will have been generated for you already under -``environments/$ENV/tofu/main.tf``. +`environments/$ENV/tofu/main.tf`. ### Cookiecutter instructions - Run the following from the repository root to activate the venv: - ```bash + ```shell . venv/bin/activate ``` - Use the `cookiecutter` template to create a new environment to hold your configuration: - ```bash + ```shell cd environments cookiecutter ../cookiecutter ``` @@ -149,14 +150,14 @@ will have been generated for you already under - Go back to the root folder and activate the new environment: - ```bash + ```shell cd .. . 
environments/$ENV/activate ``` And generate secrets for it: - ```bash + ```shell ansible-playbook ansible/adhoc/generate-passwords.yml ``` @@ -164,33 +165,33 @@ will have been generated for you already under Create an OpenTofu variables file to define the required infrastructure, e.g.: - ``` - # environments/$ENV/tofu/terraform.tfvars - cluster_name = "mycluster" - cluster_networks = [ - { - network = "some_network" # * - subnet = "some_subnet" # * - } - ] - key_pair = "my_key" # * - control_node_flavor = "some_flavor_name" - login = { - # Arbitrary group name for these login nodes - interactive = { - nodes: ["login-0"] - flavor: "login_flavor_name" # * - } - } - cluster_image_id = "rocky_linux_9_image_uuid" - compute = { - # Group name used for compute node partition definition - general = { - nodes: ["compute-0", "compute-1"] - flavor: "compute_flavor_name" # * - } +```text +# environments/$ENV/tofu/terraform.tfvars +cluster_name = "mycluster" +cluster_networks = [ + { + network = "some_network" # * + subnet = "some_subnet" # * } - ``` +] +key_pair = "my_key" # * +control_node_flavor = "some_flavor_name" +login = { + # Arbitrary group name for these login nodes + interactive = { + nodes: ["login-0"] + flavor: "login_flavor_name" # * + } +} +cluster_image_id = "rocky_linux_9_image_uuid" +compute = { + # Group name used for compute node partition definition + general = { + nodes: ["compute-0", "compute-1"] + flavor: "compute_flavor_name" # * + } +} +``` Variables marked `*` refer to OpenStack resources which must already exist. @@ -199,29 +200,30 @@ The above is a minimal configuration - for all variables and descriptions see Note that: - - Environment-specific variables (`cluster_name`) should be hardcoded into - the cluster module block. +- Environment-specific variables (`cluster_name`) should be hardcoded into + the cluster module block. - - Environment-independent variables (e.g. maybe `cluster_net` if the same - is used for staging and production) should be set as *defaults* in - `environments/site/tofu/variables.tf`, and then don't need to be passed - in to the module. +- Environment-independent variables (e.g. maybe `cluster_net` if the same + is used for staging and production) should be set as _defaults_ in + `environments/site/tofu/variables.tf`, and then don't need to be passed + in to the module. The cluster image used should match the release which you are deploying with. Published images are described in the release notes -[here](https://github.com/stackhpc/ansible-slurm-appliance/releases). +[here](https://github.com/stackhpc/ansible-slurm-appliance/releases). By default, the site OpenTofu configuration provisions two volumes and attaches them to the control node: - - "$cluster_name-home" for NFS-shared home directories - - "$cluster_name-state" for monitoring and Slurm data -The volumes mean this data is persisted when the control node is rebuilt. -However if the cluster is destroyed with `tofu destroy` then the volumes will -also be deleted. This is undesirable for production environments and usually -also for staging environments. Therefore the volumes should be manually -created, e.g. via the CLI: - ``` +- "$cluster_name-home" for NFS-shared home directories +- "$cluster_name-state" for monitoring and Slurm data + The volumes mean this data is persisted when the control node is rebuilt. + However if the cluster is destroyed with `tofu destroy` then the volumes will + also be deleted. 
This is undesirable for production environments and usually + also for staging environments. Therefore the volumes should be manually + created, e.g. via the CLI: + + ```shell openstack volume create --size 200 mycluster-home # size in GB openstack volume create --size 100 mycluster-state ``` @@ -229,10 +231,10 @@ created, e.g. via the CLI: and OpenTofu configured to use those volumes instead of managing them itself by setting: - ``` - home_volume_provisioning = "attach" - state_volume_provisioning = "attach" - ``` +```text +home_volume_provisioning = "attach" +state_volume_provisioning = "attach" +``` either for a specific environment within the cluster module block in `environments/$ENV/tofu/main.tf`, or as the site default by changing the @@ -245,19 +247,19 @@ allows for multiple clusters to be created with this environment. If no home volume at all is required because the home directories are provided by a parallel filesystem (e.g. Manila) set - ``` - home_volume_provisioning = "none" - ``` +```text +home_volume_provisioning = "none" +``` In this case the NFS share for home directories is automatically disabled. **NB:** To apply "attach" options to existing clusters, first remove the volume(s) from the tofu state, e.g.: - ``` - tofu state list # find the volume(s) - tofu state rm 'module.cluster.openstack_blockstorage_volume_v3.state[0]' - ``` +```shell +tofu state list # find the volume(s) +tofu state rm 'module.cluster.openstack_blockstorage_volume_v3.state[0]' +``` This leaves the volume itself intact, but means OpenTofu "forgets" it. Then set the "attach" options and run `tofu apply` again - this should show there are no @@ -271,15 +273,16 @@ the IPs into the OpenTofu `login` definition. Consider enabling topology aware scheduling. This is currently only supported if your cluster does not include any baremetal nodes. This can be enabled by: - 1. Creating Availability Zones in your OpenStack project for each physical - rack - 2. Setting the `availability_zone` fields of compute groups in your OpenTofu - configuration - 3. Adding the `compute` group as a child of `topology` in - `environments/$ENV/inventory/groups` - 4. (Optional) If you are aware of the physical topology of switches above the - rack-level, override `topology_above_rack_topology` in your groups vars - (see [topology docs](../ansible/roles/topology/README.md) for more detail) + +1. Creating Availability Zones in your OpenStack project for each physical + rack +2. Setting the `availability_zone` fields of compute groups in your OpenTofu + configuration +3. Adding the `compute` group as a child of `topology` in + `environments/$ENV/inventory/groups` +4. (Optional) If you are aware of the physical topology of switches above the + rack-level, override `topology_above_rack_topology` in your groups vars + (see [topology docs](../ansible/roles/topology/README.md) for more detail) Consider whether mapping of baremetal nodes to ironic nodes is required. See [PR 485](https://github.com/stackhpc/ansible-slurm-appliance/pull/485). @@ -287,12 +290,12 @@ Consider whether mapping of baremetal nodes to ironic nodes is required. See To deploy this infrastructure, ensure the venv and the environment are [activated](#cookiecutter-instructions) and run: - ```bash - export OS_CLOUD=openstack - cd environments/$ENV/tofu/ - tofu init - tofu apply - ``` +```shell +export OS_CLOUD=openstack +cd environments/$ENV/tofu/ +tofu init +tofu apply +``` and follow the prompts. 
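It can also be useful to review the planned changes before applying them; a minimal sketch, assuming the same working directory and `OS_CLOUD` setup as above:

```shell
# Show what OpenTofu would create, change or destroy, without applying anything:
tofu plan

# Optionally, save the plan and later apply exactly that plan:
tofu plan -out=cluster.tfplan
tofu apply cluster.tfplan
```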
Note the OS_CLOUD environment variable assumes that OpenStack credentials are defined using a @@ -305,9 +308,9 @@ number of concurrent operations to 10. This means that for example only 10 ports or 10 instances can be deployed at once. This should be raised by modifying `environments/$ENV/activate` to add a line like: - ```bash - export TF_CLI_ARGS_apply="-parallelism=25" - ``` +```shell +export TF_CLI_ARGS_apply="-parallelism=25" +``` The value chosen should be the highest value demonstrated during testing. Note that any time spent blocked due to this parallelism limit does not count @@ -328,14 +331,15 @@ against the (un-overridable) internal OpenTofu timeout of 30 minutes instances) it may be necessary to [configure chrony](./chrony.md). - Consider whether Prometheus storage configuration is required. By default: + - A 200GB state volume is provisioned (but see above) - The common environment [sets](../environments/common/inventory/group_vars/all/prometheus.yml) a maximum retention of 100 GB and 31 days. - These may or may not be appropriate depending on the number of nodes, the - scrape interval, and other uses of the state volume (primarily the `slurmctld` - state and the `slurmdbd` database). See - [docs/monitoring-and-logging](./monitoring-and-logging.md) for more options. + These may or may not be appropriate depending on the number of nodes, the + scrape interval, and other uses of the state volume (primarily the `slurmctld` + state and the `slurmdbd` database). See + [docs/monitoring-and-logging](./monitoring-and-logging.md) for more options. - Configure Open OnDemand - see [specific documentation](openondemand.md) which notes specific variables required. @@ -383,17 +387,22 @@ against the (un-overridable) internal OpenTofu timeout of 30 minutes ### Applying configuration To configure the appliance, ensure the venv and the environment are -[activated](#create-a-new-environment) and run: - ```bash - ansible-playbook ansible/site.yml - ``` +```text +[activated](#create-a-new-environment) +``` + +and run: + +```shell +ansible-playbook ansible/site.yml +``` Once it completes you can log in to the cluster using: - ```bash - ./dev/ansible-ssh login - ``` +```shell +./dev/ansible-ssh login +``` For further information, including additional configuration guides and operations instructions, see the [docs](README.md) directory. diff --git a/docs/sequence.md b/docs/sequence.md index 8723674..8149290 100644 --- a/docs/sequence.md +++ b/docs/sequence.md @@ -1,10 +1,9 @@ # Slurm Appliance Sequences - - ## Image build This sequence applies to both: + - "fatimage" builds, starting from GenericCloud images and using control,login,compute inventory groups to install all packages, e.g. StackHPC CI builds @@ -86,9 +85,10 @@ sequenceDiagram This sequence applies to active clusters, after running the `site.yml` playbook for the first time. Slurm controlled rebuild requires that: + - Compute groups in the OpenTofu `compute` variable have: - - `ignore_image_changes: true` - - `compute_init_enable: ['compute', ... ]` + - `ignore_image_changes: true` + - `compute_init_enable: ['compute', ... ]` - The Ansible `rebuild` inventory group contains the `control` group. TODO: should also document how compute-init does NOT run if the `site.yml` @@ -126,8 +126,9 @@ sequenceDiagram end nodes->>nodes: srun task completes ``` + Notes: + 1. And/or login/compute group overrides 2. Running on control node 3. 
On hosts targeted by job - diff --git a/docs/site/README.md b/docs/site/README.md index ee14787..6597ea4 100644 --- a/docs/site/README.md +++ b/docs/site/README.md @@ -2,5 +2,4 @@ This document is a placeholder for any site-specific documentation, e.g. environment descriptions. -#TODO: list things which should commonly be specified here. - +## TODO: list things which should commonly be specified here diff --git a/docs/upgrades.md b/docs/upgrades.md index b44cae9..5cf4bbe 100644 --- a/docs/upgrades.md +++ b/docs/upgrades.md @@ -6,51 +6,61 @@ Generally, upstream releases will happen roughly monthly. Releases may contain n Any site-specific instructions in [docs/site/README.md](site/README.md) should be reviewed in tandem with this. This document assumes the deployment repository has: + 1. Remotes: - - `origin` referring to the site-specific remote repository. - - `stackhpc` referring to the StackHPC repository at https://github.com/stackhpc/ansible-slurm-appliance.git. + - `origin` referring to the site-specific remote repository. + - `stackhpc` referring to the StackHPC repository at . 2. Branches: - - `main` - following `main/origin`, the current site-specific code deployed to production. - - `upstream` - following `main/stackhpc`, i.e. the upstream `main` branch from `stackhpc`. + - `main` - following `main/origin`, the current site-specific code deployed to production. + - `upstream` - following `main/stackhpc`, i.e. the upstream `main` branch from `stackhpc`. 3. The following environments: - - `$PRODUCTION`: a production environment, as defined by e.g. `environments/production/`. - - `$STAGING`: a production environment, as defined by e.g. `environments/staging/`. - - `$SITE_ENV`: a base site-specific environment, as defined by e.g. `environments/mysite/`. + - `$PRODUCTION`: a production environment, as defined by e.g. `environments/production/`. + - `$STAGING`: a production environment, as defined by e.g. `environments/staging/`. + - `$SITE_ENV`: a base site-specific environment, as defined by e.g. `environments/mysite/`. **NB:** Commands which should be run on the Slurm login node are shown below prefixed `[LOGIN]$`. All other commands should be run on the Ansible deploy host. 1. Update the `upstream` branch from the `stackhpc` remote, including tags: - git fetch stackhpc main --tags +```shell +git fetch stackhpc main --tags +``` 1. Identify the latest release from the [Slurm appliance release page](https://github.com/stackhpc/ansible-slurm-appliance/releases). Below this release is shown as `vX.Y`. 1. Ensure your local site branch is up to date and create a new branch from it for the site-specfic release code: - git checkout main - git pull --prune - git checkout -b update/vX.Y +```shell +git checkout main +git pull --prune +git checkout -b update/vX.Y +``` 1. Merge the upstream code into your release branch: - git merge vX.Y +```shell +git merge vX.Y +``` + +It is possible this will introduce merge conflicts; fix these following the usual Git +prompts. Generally merge conflicts should only exist where functionality which was added +for your site (not in a hook) has subsequently been merged upstream. - It is possible this will introduce merge conflicts; fix these following the usual git - prompts. Generally merge conflicts should only exist where functionality which was added - for your site (not in a hook) has subsequently been merged upstream. 
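A minimal sketch of the usual conflict-resolution steps (standard Git commands, shown only as a reminder):

```shell
# See which files are conflicted:
git status

# Edit each conflicted file to keep the appropriate site-specific and upstream changes,
# then mark it resolved and complete the merge:
git add <conflicted-file>
git merge --continue
```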
+Note that if upgrading from a release prior to v2.3, you will likely have merge conflicts +with existing site OpenTofu configurations in `environments/site/tofu`. Generally - Note that if upgrading from a release prior to v2.3, you will likely have merge conflicts - with existing site OpenTofu configurations in `environments/site/tofu`. Generally - - Changes to `default` values in `environments/site/tofu.variables.tf` should be rejected. - - All other changes to the OpenTofu configuration should be accepted, unless they overwrite - site-specific additional resources. +- Changes to `default` values in `environments/site/tofu.variables.tf` should be rejected. +- All other changes to the OpenTofu configuration should be accepted, unless they overwrite + site-specific additional resources. 1. Push this branch and create a PR: - git push - # follow instructions +```shell +git push +# follow instructions +``` 1. Review the PR to see if any added/changed functionality requires alteration of site-specific configuration. In general changes to existing functionality will aim to be @@ -58,17 +68,19 @@ All other commands should be run on the Ansible deploy host. necessary to use new functionality or where functionality has been upstreamed as above. Note that the upstream `environments/site/inventory/groups` file contains all possible groups which can be used to enable features. This will be updated when pulling changes - from the StackHPC repo, and any new groups should be enabled/disabled as required for + from the StackHPC repository, and any new groups should be enabled/disabled as required for your site. - Make changes as necessary. +Make changes as necessary. 1. Identify image(s) from the relevant [Slurm appliance release](https://github.com/stackhpc/ansible-slurm-appliance/releases), and download using the link on the release plus the image name, e.g. for an image `openhpc-RL9-250708-1547-1494192e`: - wget https://leafcloud.store/swift/v1/AUTH_f39848421b2747148400ad8eeae8d536/openhpc-images/openhpc-RL9-250708-1547-1494192e +```shell +wget https://leafcloud.store/swift/v1/AUTH_f39848421b2747148400ad8eeae8d536/openhpc-images/openhpc-RL9-250708-1547-1494192e +``` - Note that some releases may not include new images. In this case use the image from the latest previous release with new images. +Note that some releases may not include new images. In this case use the image from the latest previous release with new images. 1. If an "extra" image build with local modifications is required, update the Packer build configuration to use the above new image and run a build. See @@ -83,33 +95,42 @@ All other commands should be run on the Ansible deploy host. 1. Declare a future outage window to cluster users. A [Slurm reservation](https://slurm.schedmd.com/scontrol.html#lbAQ) can be used to prevent jobs running during that window, e.g.: - [LOGIN]$ sudo scontrol create reservation Flags=MAINT ReservationName="upgrade-vX.Y" StartTime=2024-10-16T08:00:00 EndTime=2024-10-16T10:00:00 Nodes=ALL Users=root +```shell +[LOGIN]$ sudo scontrol create reservation Flags=MAINT ReservationName="upgrade-vX.Y" StartTime=2024-10-16T08:00:00 EndTime=2024-10-16T10:00:00 Nodes=ALL Users=root +``` - Note a reservation cannot be created if it may overlap with currently running jobs (defined by job or partition time limits). +Note a reservation cannot be created if it may overlap with currently running jobs (defined by job or partition time limits). 1. 
At the outage window, check there are no jobs running: - [LOGIN]$ squeue +```shell +[LOGIN]$ squeue +``` 1. Deploy the branch created above to production, i.e. activate the production environment, run OpenTofu to reimage or -delete/recreate instances with the new images (depending on how the root disk is defined), and run Ansible's `site.yml` -playbook to reconfigure the cluster, e.g. as described in the main [README.md](../README.md). + delete/recreate instances with the new images (depending on how the root disk is defined), and run Ansible's `site.yml` + playbook to reconfigure the cluster, e.g. as described in the main [README.md](../README.md). 1. Check slurm is up: - [LOGIN]$ sinfo -R - - The `-R` shows the reason for any nodes being down. +```shell +[LOGIN]$ sinfo -R +``` + +The `-R` shows the reason for any nodes being down. 1. If the above shows nodes done for having been "unexpectedly rebooted", set them up again: - [LOGIN]$ sudo scontrol update state=RESUME nodename=$HOSTLIST_EXPR +```shell +[LOGIN]$ sudo scontrol update state=RESUME nodename=$HOSTLIST_EXPR +``` - where the hostlist expression might look like e.g. `general-[0-1]` to reset state for nodes 0 and 1 of the general partition. +where the hostlist expression might look like e.g. `general-[0-1]` to reset state for nodes 0 and 1 of the general partition. 1. Delete the reservation: - [LOGIN]$ sudo scontrol delete ReservationName="upgrade-slurm-v1.160" +```shell +[LOGIN]$ sudo scontrol delete ReservationName="upgrade-slurm-v1.160" +``` 1. Tell users the cluster is available again. - diff --git a/environments/.caas/README.md b/environments/.caas/README.md index 4a08433..8402845 100644 --- a/environments/.caas/README.md +++ b/environments/.caas/README.md @@ -3,9 +3,10 @@ Environment for default Azimuth Slurm. This is not intended to be manually deployed. Non-standard things for this environment: + - There is no activate script. -- `ansible.cgf` is provided in the repo root, as expected by the caas operator. -- `ANSIBLE_INVENTORY` is set in the cluster type template, using a path relative to the +- `ansible.cgf` is provided in the repository root, as expected by the caas operator. +- `ANSIBLE_INVENTORY` is set in the cluster type template, using a path relative to the runner project directory: azimuth_caas_stackhpc_slurm_appliance_template: @@ -13,6 +14,6 @@ Non-standard things for this environment: envVars: ANSIBLE_INVENTORY: environments/common/inventory,environments/.caas/inventory - Ansible then defines `ansible_inventory_sources` which contains absolute paths, and - that is used to derive the `appliances_environment_root` and - `appliances_repository_root`. + Ansible then defines `ansible_inventory_sources` which contains absolute paths, and + that is used to derive the `appliances_environment_root` and + `appliances_repository_root`. 
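For illustration only (this environment is not intended to be deployed manually), that inventory setting is roughly equivalent to exporting the variable before invoking Ansible from the repository root:

```shell
# Purely illustrative - the caas operator sets this via the cluster type template:
export ANSIBLE_INVENTORY=environments/common/inventory,environments/.caas/inventory
ansible-playbook ansible/site.yml
```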
diff --git a/environments/.caas/hooks/post.yml b/environments/.caas/hooks/post.yml index eaaeb23..cf606c7 100644 --- a/environments/.caas/hooks/post.yml +++ b/environments/.caas/hooks/post.yml @@ -1,9 +1,10 @@ +--- # Configure the Zenith clients that are required # Note zenith hosts are in podman group - hosts: grafana tasks: - name: Deploy the Zenith client for Grafana - include_role: + ansible.builtin.include_role: name: zenith_proxy vars: zenith_proxy_service_name: zenith-monitoring @@ -11,7 +12,7 @@ zenith_proxy_upstream_host: "{{ ansible_host }}" # IP zenith_proxy_upstream_port: "{{ grafana_port }}" zenith_proxy_client_token: "{{ zenith_token_monitoring }}" - zenith_proxy_mitm_enabled: yes + zenith_proxy_mitm_enabled: true zenith_proxy_mitm_auth_inject: basic zenith_proxy_mitm_auth_basic_username: "{{ grafana_security.admin_user }}" zenith_proxy_mitm_auth_basic_password: "{{ grafana_security.admin_password }}" @@ -20,7 +21,7 @@ - hosts: openondemand tasks: - name: Deploy the Zenith client for OOD - include_role: + ansible.builtin.include_role: name: zenith_proxy vars: zenith_proxy_service_name: zenith-ood @@ -29,7 +30,7 @@ zenith_proxy_upstream_host: "{{ ansible_host }}" # IP zenith_proxy_upstream_port: 443 zenith_proxy_client_token: "{{ zenith_token_ood }}" - zenith_proxy_mitm_enabled: yes + zenith_proxy_mitm_enabled: true zenith_proxy_mitm_auth_inject: basic zenith_proxy_mitm_auth_basic_username: azimuth zenith_proxy_mitm_auth_basic_password: "{{ vault_azimuth_user_password }}" @@ -40,14 +41,15 @@ become: false gather_facts: false tasks: - - import_role: + - ansible.builtin.import_role: name: hpctests when: cluster_run_validation | default(false) | bool # Write the outputs as the final task - hosts: localhost tasks: - - debug: var=outputs + - ansible.builtin.debug: + var: outputs vars: # Ansible has a fit when there are two 'hostvars' evaluations in a resolution chain, # so we have to repeat logic here unfortunately @@ -62,4 +64,4 @@ if zenith_fqdn_ood is not defined else {} ) - }} \ No newline at end of file + }} diff --git a/environments/.caas/hooks/pre.yml b/environments/.caas/hooks/pre.yml index 8924dca..8209b39 100644 --- a/environments/.caas/hooks/pre.yml +++ b/environments/.caas/hooks/pre.yml @@ -1,8 +1,9 @@ --- - # Generate k3s token - name: Generate k3s token - # NB: Although this generates a new token on each run, the actual token set in metadata is retrieved from a set-once tofu resource, hence only the first value ever generated is relevant. + # NB: Although this generates a new token on each run, the actual token set in + # metadata is retrieved from a set-once tofu resource, hence only the first + # value ever generated is relevant. hosts: openstack tasks: - ansible.builtin.set_fact: @@ -17,44 +18,44 @@ # Ensure that the secrets are generated and persisted on the control host - name: Generate and persist secrets hosts: control - gather_facts: no - become: yes + gather_facts: false + become: true roles: - persist_openhpc_secrets # validate.yml asserts presence of a control group which doesn't exist when # destroying infra, so only validate when we're not destroying - hosts: openstack - gather_facts: no - become: no + gather_facts: false + become: false tasks: - - set_fact: + - ansible.builtin.set_fact: appliances_validate: false when: "cluster_state | default('') == 'absent'" # TODO: FIXME: maybe by doing the user move in cloud-init? 
-# The first task in the bootstrap playbook causes the home directory of the rocky user to be moved on the first run +# The first task in the bootstrap playbook causes the home directory of the rocky user to be moved on the first run # This can disrupt the SSH connection, particularly because we use the login host as a jump host # So we move the home directory on the login node and reset the connections first - hosts: login gather_facts: false tasks: - - name: Set up Ansible user - user: "{{ (appliances_local_users_default | selectattr('user.name', 'eq', appliances_local_users_ansible_user_name))[0]['user'] }}" - become_method: "sudo" + - name: Set up Ansible user # noqa: args[module] + ansible.builtin.user: "{{ (appliances_local_users_default | selectattr('user.name', 'eq', appliances_local_users_ansible_user_name))[0]['user'] }}" + become_method: ansible.builtin.sudo # Need to change working directory otherwise we try to switch back to non-existent directory. - become_flags: '-i' + become_flags: "-i" become: true - hosts: cluster - gather_facts: no + gather_facts: false tasks: - name: Reset persistent SSH connections - meta: reset_connection + ansible.builtin.meta: reset_connection - hosts: localhost - gather_facts: no - become: no + gather_facts: false + become: false tasks: - name: Add hosts to dnf_repos group to enable repofiles ansible.builtin.add_host: @@ -68,9 +69,10 @@ # https://github.com/stackhpc/ansible-slurm-appliance/blob/ba9699267449fba58cd9c04c451759a914fd7144/ansible/validate.yml#L16 # doesn't break CaaS platforms - hosts: localhost - gather_facts: no + gather_facts: false tasks: - name: Prepare requirements.yml.last for galaxy validation - copy: + ansible.builtin.copy: src: "{{ appliances_repository_root }}/requirements.yml" dest: "{{ appliances_repository_root }}/requirements.yml.last" + mode: "0644" diff --git a/environments/.caas/inventory/group_vars/all/basic_users.yml b/environments/.caas/inventory/group_vars/all/basic_users.yml index 0e38148..ef19398 100644 --- a/environments/.caas/inventory/group_vars/all/basic_users.yml +++ b/environments/.caas/inventory/group_vars/all/basic_users.yml @@ -1,3 +1,4 @@ +--- basic_users_users: - name: azimuth # Hash the password with a salt that is different for each host diff --git a/environments/.caas/inventory/group_vars/all/cluster.yml b/environments/.caas/inventory/group_vars/all/cluster.yml index 14633c8..ea38e9f 100644 --- a/environments/.caas/inventory/group_vars/all/cluster.yml +++ b/environments/.caas/inventory/group_vars/all/cluster.yml @@ -1,3 +1,4 @@ +--- # Account for the fact we are running outside of the expected environment system: caas_inventory: "{{ ansible_inventory_sources | last }}" # ansible_inventory_sources is absolute appliances_environment_root: "{{ caas_inventory | dirname }}" diff --git a/environments/.caas/inventory/group_vars/all/grafana.yml b/environments/.caas/inventory/group_vars/all/grafana.yml index 10fdc92..d831467 100644 --- a/environments/.caas/inventory/group_vars/all/grafana.yml +++ b/environments/.caas/inventory/group_vars/all/grafana.yml @@ -1 +1,2 @@ +--- grafana_auth_anonymous: "{{ groups['openondemand'] | count > 0 }}" diff --git a/environments/.caas/inventory/group_vars/all/hpctests.yml b/environments/.caas/inventory/group_vars/all/hpctests.yml index a6a2c91..f4ade94 100644 --- a/environments/.caas/inventory/group_vars/all/hpctests.yml +++ b/environments/.caas/inventory/group_vars/all/hpctests.yml @@ -1,8 +1,9 @@ +--- # Skip plotting pingpong as matplotlib not in runner environment 
hpctests_pingpong_plot: false # In Azimuth, the Ansible controller is an ephemeral pod, so all that matters is that -# this is a location that is writable by the container user +# this is a location that is writable by the container user hpctests_outdir: "{{ playbook_dir }}/.tmp/hpctests" # hpctests run by default in Azimuth but not trying to stress-test the nodes diff --git a/environments/.caas/inventory/group_vars/all/manila.yml b/environments/.caas/inventory/group_vars/all/manila.yml index 226ac21..ebd1dde 100644 --- a/environments/.caas/inventory/group_vars/all/manila.yml +++ b/environments/.caas/inventory/group_vars/all/manila.yml @@ -1,3 +1,4 @@ +--- caas_manila_home: share_name: "{{ cluster_name }}-home" mount_path: /home @@ -14,4 +15,6 @@ caas_manila_project: mount_group: root mount_mode: ugo=rwX -os_manila_mount_shares: "{{ ([caas_manila_home] if cluster_home_manila_share | bool else []) + ([caas_manila_project] if cluster_project_manila_share | bool else []) }}" +# yamllint disable-line rule:line-length +os_manila_mount_shares: "{{ ([caas_manila_home] if cluster_home_manila_share | bool else []) + ([caas_manila_project] if cluster_project_manila_share | bool else + []) }}" diff --git a/environments/.caas/inventory/group_vars/all/nfs.yml b/environments/.caas/inventory/group_vars/all/nfs.yml index 74a42cd..0eca0c8 100644 --- a/environments/.caas/inventory/group_vars/all/nfs.yml +++ b/environments/.caas/inventory/group_vars/all/nfs.yml @@ -1,10 +1,11 @@ +--- nfs_server: "{{ nfs_server_default }}" caas_nfs_home: - comment: Export /exports/home from Slurm control node as /home nfs_enable: - server: "{{ inventory_hostname in groups['control'] }}" - clients: "{{ inventory_hostname in groups['cluster'] }}" + server: "{{ inventory_hostname in groups['control'] }}" + clients: "{{ inventory_hostname in groups['cluster'] }}" nfs_export: "/exports/home" # assumes default site TF is being used nfs_client_mnt_point: "/home" diff --git a/environments/.caas/inventory/group_vars/all/openhpc.yml b/environments/.caas/inventory/group_vars/all/openhpc.yml index 74f196c..56c8b90 100644 --- a/environments/.caas/inventory/group_vars/all/openhpc.yml +++ b/environments/.caas/inventory/group_vars/all/openhpc.yml @@ -1,3 +1,4 @@ +--- openhpc_cluster_name: "{{ cluster_name }}" # Provision a single "standard" compute nodegroup using the supplied diff --git a/environments/.caas/inventory/group_vars/all/openondemand.yml b/environments/.caas/inventory/group_vars/all/openondemand.yml index 4dc0b93..83b15a2 100644 --- a/environments/.caas/inventory/group_vars/all/openondemand.yml +++ b/environments/.caas/inventory/group_vars/all/openondemand.yml @@ -6,4 +6,3 @@ openondemand_desktop_partition: "{{ openhpc_partitions[0]['name'] }}" httpd_listen_addr_port: - 80 - 443 - diff --git a/environments/.caas/inventory/group_vars/all/zenith.yml b/environments/.caas/inventory/group_vars/all/zenith.yml index 56dd0ca..652f2da 100644 --- a/environments/.caas/inventory/group_vars/all/zenith.yml +++ b/environments/.caas/inventory/group_vars/all/zenith.yml @@ -1 +1,2 @@ +--- zenith_proxy_podman_user: podman diff --git a/environments/.caas/inventory/group_vars/openstack.yml b/environments/.caas/inventory/group_vars/openstack.yml index f76c050..83dff89 100644 --- a/environments/.caas/inventory/group_vars/openstack.yml +++ b/environments/.caas/inventory/group_vars/openstack.yml @@ -1,3 +1,4 @@ +--- # The default Terraform state key for backends that support it terraform_state_key: "cluster/{{ cluster_id }}/tfstate" diff --git 
a/environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml b/environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml index ab10eff..5a105bd 100644 --- a/environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml +++ b/environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml @@ -1,8 +1,8 @@ +--- name: "slurm" label: "Slurm" -description: >- - Batch cluster running the Slurm workload manager, the Open - OnDemand web interface, and custom monitoring. +description: >- + Batch cluster running the Slurm workload manager, the Open OnDemand web interface, and custom monitoring. logo: https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Slurm_logo.svg/158px-Slurm_logo.svg.png requires_ssh_key: true @@ -49,7 +49,7 @@ parameters: count_parameter: compute_count min_ram: 2048 min_disk: 20 - + - name: home_volume_size label: Home volume size (GB) description: The size of the cloud volume to use for home directories. @@ -79,7 +79,7 @@ parameters: this volume, 10GB is set aside for cluster state and the remaining space is used to store cluster metrics. - The oldest metrics records in the [Prometheus](https://prometheus.io/) database will be + The oldest metrics records in the [Prometheus](https://prometheus.io/) database will be discarded to ensure that the database does not grow larger than this volume. kind: cloud.volume_size immutable: true @@ -98,6 +98,7 @@ parameters: options: checkboxLabel: Run post-configuration validation? +# yamllint disable rule:line-length usage_template: |- # Accessing the cluster using Open OnDemand @@ -137,6 +138,7 @@ usage_template: |- Other parts of the filesystem may be affected during a patch operation, including any packages that have been installed using `dnf`. +# yamllint enable rule:line-length services: - name: ood @@ -145,4 +147,3 @@ services: - name: monitoring label: Monitoring icon_url: https://raw.githubusercontent.com/cncf/artwork/master/projects/prometheus/icon/color/prometheus-icon-color.png - diff --git a/environments/.caas/ui-meta/slurm-infra-manila-home.yml b/environments/.caas/ui-meta/slurm-infra-manila-home.yml index 4a01bb6..6255f46 100644 --- a/environments/.caas/ui-meta/slurm-infra-manila-home.yml +++ b/environments/.caas/ui-meta/slurm-infra-manila-home.yml @@ -1,9 +1,9 @@ +--- # Exactly as for slurm-infra.yml but to allow for separate manila/non-manila home appliances name: "slurm-manila-home" label: "Slurm (CephFS home)" -description: >- - Batch cluster running the Slurm workload manager, the Open - OnDemand web interface, and custom monitoring. +description: >- + Batch cluster running the Slurm workload manager, the Open OnDemand web interface, and custom monitoring. This version uses CephFS for home directories. logo: https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Slurm_logo.svg/158px-Slurm_logo.svg.png @@ -52,7 +52,7 @@ parameters: count_parameter: compute_count min_ram: 2048 min_disk: 20 - + - name: home_volume_size label: Home share size (GB) description: The size of the share to use for home directories. @@ -69,7 +69,7 @@ parameters: this volume, 10GB is set aside for cluster state and the remaining space is used to store cluster metrics. - The oldest metrics records in the [Prometheus](https://prometheus.io/) database will be + The oldest metrics records in the [Prometheus](https://prometheus.io/) database will be discarded to ensure that the database does not grow larger than this volume. 
kind: cloud.volume_size immutable: true @@ -88,6 +88,7 @@ parameters: options: checkboxLabel: Run post-configuration validation? +# yamllint disable rule:line-length usage_template: |- # Accessing the cluster using Open OnDemand @@ -127,6 +128,7 @@ usage_template: |- Other parts of the filesystem may be affected during a patch operation, including any packages that have been installed using `dnf`. +# yamllint enable rule:line-length services: - name: ood diff --git a/environments/.caas/ui-meta/slurm-infra.yml b/environments/.caas/ui-meta/slurm-infra.yml index 36b8928..b2d4383 100644 --- a/environments/.caas/ui-meta/slurm-infra.yml +++ b/environments/.caas/ui-meta/slurm-infra.yml @@ -1,8 +1,8 @@ +--- name: "slurm" label: "Slurm" -description: >- - Batch cluster running the Slurm workload manager, the Open - OnDemand web interface, and custom monitoring. +description: >- + Batch cluster running the Slurm workload manager, the Open OnDemand web interface, and custom monitoring. logo: https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Slurm_logo.svg/158px-Slurm_logo.svg.png requires_ssh_key: true @@ -49,7 +49,7 @@ parameters: count_parameter: compute_count min_ram: 2048 min_disk: 20 - + - name: home_volume_size label: Home volume size (GB) description: The size of the cloud volume to use for home directories. @@ -66,7 +66,7 @@ parameters: this volume, 10GB is set aside for cluster state and the remaining space is used to store cluster metrics. - The oldest metrics records in the [Prometheus](https://prometheus.io/) database will be + The oldest metrics records in the [Prometheus](https://prometheus.io/) database will be discarded to ensure that the database does not grow larger than this volume. kind: cloud.volume_size immutable: true @@ -85,6 +85,7 @@ parameters: options: checkboxLabel: Run post-configuration validation? +# yamllint disable rule:line-length usage_template: |- # Accessing the cluster using Open OnDemand @@ -124,6 +125,7 @@ usage_template: |- Other parts of the filesystem may be affected during a patch operation, including any packages that have been installed using `dnf`. 
+# yamllint enable rule:line-length services: - name: ood diff --git a/environments/.stackhpc/hooks/post-bootstrap.yml b/environments/.stackhpc/hooks/post-bootstrap.yml index df39026..3e24212 100644 --- a/environments/.stackhpc/hooks/post-bootstrap.yml +++ b/environments/.stackhpc/hooks/post-bootstrap.yml @@ -1,17 +1,19 @@ +--- - hosts: podman:!builder - become: yes + become: true gather_facts: false tags: podman tasks: - name: Configure container image registry to avoid docker.io ratelimits - copy: + ansible.builtin.copy: dest: /etc/containers/registries.conf.d/003-arcus-mirror.conf content: | [[registry]] location="docker.io/library/" prefix="docker.io/library/" - + [[registry.mirror]] location = "{{ podman_registry_address }}" insecure = true + mode: "0644" when: "ci_cloud == 'ARCUS'" diff --git a/environments/.stackhpc/hooks/pre.yml b/environments/.stackhpc/hooks/pre.yml index 305713a..e810c20 100644 --- a/environments/.stackhpc/hooks/pre.yml +++ b/environments/.stackhpc/hooks/pre.yml @@ -1,17 +1,18 @@ +--- - hosts: control:!builder - become: yes + become: true gather_facts: false tasks: - name: Output OS version - command: cat /etc/redhat-release + ansible.builtin.command: cat /etc/redhat-release changed_when: false - name: Write CI-generated inventory and secrets for debugging ansible.builtin.copy: dest: /etc/ci-config/ src: "{{ item }}" - directory_mode: 0400 - mode: 0400 + directory_mode: "0400" + mode: "0400" owner: root group: root no_log: "{{ no_log | default(true) }}" diff --git a/environments/.stackhpc/inventory/group_vars/all/basic_users.yml b/environments/.stackhpc/inventory/group_vars/all/basic_users.yml index e2088ff..235814c 100644 --- a/environments/.stackhpc/inventory/group_vars/all/basic_users.yml +++ b/environments/.stackhpc/inventory/group_vars/all/basic_users.yml @@ -1,3 +1,6 @@ +--- + +# yamllint disable-line rule:line-length test_demo_user_password: "{{ lookup('env', 'DEMO_USER_PASSWORD') | default(vault_demo_user_password, true) }}" # CI uses env, debug can set vault_demo_user_password basic_users_users: diff --git a/environments/.stackhpc/inventory/group_vars/all/bastion.yml b/environments/.stackhpc/inventory/group_vars/all/bastion.yml index a1001e8..ea2ad00 100644 --- a/environments/.stackhpc/inventory/group_vars/all/bastion.yml +++ b/environments/.stackhpc/inventory/group_vars/all/bastion.yml @@ -1,3 +1,4 @@ +--- ci_cloud: "{{ lookup('env', 'CI_CLOUD') }}" bastion_config: ARCUS: diff --git a/environments/.stackhpc/inventory/group_vars/all/freeipa.yml b/environments/.stackhpc/inventory/group_vars/all/freeipa.yml index 9a979ab..a92f011 100644 --- a/environments/.stackhpc/inventory/group_vars/all/freeipa.yml +++ b/environments/.stackhpc/inventory/group_vars/all/freeipa.yml @@ -1,3 +1,4 @@ +--- # This file provides examples of using freeipa role variables. These are NOT functional in CI as freeipa_{server,client} groups are not defined. 
# NB: Users defined this way have expired passwords diff --git a/environments/.stackhpc/inventory/group_vars/all/hpctests.yml b/environments/.stackhpc/inventory/group_vars/all/hpctests.yml index e8cfcea..4724621 100644 --- a/environments/.stackhpc/inventory/group_vars/all/hpctests.yml +++ b/environments/.stackhpc/inventory/group_vars/all/hpctests.yml @@ -1 +1,2 @@ +--- hpctests_user: demo_user diff --git a/environments/.stackhpc/inventory/group_vars/all/manila.yml b/environments/.stackhpc/inventory/group_vars/all/manila.yml index 59f9358..b37a130 100644 --- a/environments/.stackhpc/inventory/group_vars/all/manila.yml +++ b/environments/.stackhpc/inventory/group_vars/all/manila.yml @@ -1,3 +1,4 @@ +--- os_manila_mount_shares_arcus: - share_name: slurm-v2-home mount_path: /project diff --git a/environments/.stackhpc/inventory/group_vars/all/openhpc.yml b/environments/.stackhpc/inventory/group_vars/all/openhpc.yml index 5aac5f8..ae1342b 100644 --- a/environments/.stackhpc/inventory/group_vars/all/openhpc.yml +++ b/environments/.stackhpc/inventory/group_vars/all/openhpc.yml @@ -1,3 +1,4 @@ +--- openhpc_config_extra: SlurmctldDebug: debug SlurmdDebug: debug diff --git a/environments/.stackhpc/inventory/group_vars/all/openondemand.yml b/environments/.stackhpc/inventory/group_vars/all/openondemand.yml index a8f88e5..9779e96 100644 --- a/environments/.stackhpc/inventory/group_vars/all/openondemand.yml +++ b/environments/.stackhpc/inventory/group_vars/all/openondemand.yml @@ -1,8 +1,9 @@ +--- openondemand_auth: basic_pam openondemand_jupyter_partition: standard openondemand_desktop_partition: standard openondemand_rstudio_partition: standard openondemand_codeserver_partition: standard -#openondemand_dashboard_support_url: -#openondemand_dashboard_docs_url: -#openondemand_filesapp_paths: +# openondemand_dashboard_support_url: +# openondemand_dashboard_docs_url: +# openondemand_filesapp_paths: diff --git a/environments/.stackhpc/inventory/group_vars/all/podman.yml b/environments/.stackhpc/inventory/group_vars/all/podman.yml index b9d4109..02d7e7f 100644 --- a/environments/.stackhpc/inventory/group_vars/all/podman.yml +++ b/environments/.stackhpc/inventory/group_vars/all/podman.yml @@ -1,2 +1,3 @@ +--- arcus_podman_registry_address: 192.168.3.95:5000 podman_registry_address: "{{ arcus_podman_registry_address if ci_cloud == 'ARCUS' else '' }}" diff --git a/environments/.stackhpc/inventory/group_vars/all/tuned.yml b/environments/.stackhpc/inventory/group_vars/all/tuned.yml index f1cb034..a8074e7 100644 --- a/environments/.stackhpc/inventory/group_vars/all/tuned.yml +++ b/environments/.stackhpc/inventory/group_vars/all/tuned.yml @@ -1,2 +1,3 @@ +--- # Set profile which is not default (on VMs) for testing tuned_profile: hpc-compute diff --git a/environments/.stackhpc/inventory/group_vars/builder.yml b/environments/.stackhpc/inventory/group_vars/builder.yml index 10b15ad..788666a 100644 --- a/environments/.stackhpc/inventory/group_vars/builder.yml +++ b/environments/.stackhpc/inventory/group_vars/builder.yml @@ -1,4 +1,5 @@ -#update_enable: false # Can uncomment for speed debugging non-update related build issues +--- +# update_enable: false # Can uncomment for speed debugging non-update related build issues sssd_install_ldap: true # include sssd-ldap package in fatimage # update_enable: false # Can uncomment for speed debugging non-update related build issues diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index 
6b294d1..2000b5e 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { - "cluster_image": { - "RL8": "openhpc-RL8-250820-0800-767addd8", - "RL9": "openhpc-RL9-250908-2047-d90ebd0e" - } + "cluster_image": { + "RL8": "openhpc-RL8-250918-0840-930223fb", + "RL9": "openhpc-RL9-250918-0840-930223fb" + } } diff --git a/environments/.stackhpc/tofu/main.tf b/environments/.stackhpc/tofu/main.tf index 82c963c..22113cd 100644 --- a/environments/.stackhpc/tofu/main.tf +++ b/environments/.stackhpc/tofu/main.tf @@ -4,99 +4,106 @@ terraform { required_version = ">= 0.14" required_providers { openstack = { - source = "terraform-provider-openstack/openstack" + source = "terraform-provider-openstack/openstack" version = "~>3.0.0" } } } variable "environment_root" { - type = string - description = "Path to environment root, automatically set by activate script" + type = string + description = "Path to environment root, automatically set by activate script" } variable "cluster_name" { - type = string - description = "Name for cluster, used as prefix for resources - set by environment var in CI" + type = string + description = "Name for cluster, used as prefix for resources - set by environment var in CI" } variable "os_version" { - type = string + type = string description = "RL8 or RL9" - default = "RL9" + default = "RL9" } variable "cluster_image" { - description = "single image for all cluster nodes, keyed by os_version - a convenience for CI" - type = map(string) + description = "single image for all cluster nodes, keyed by os_version - a convenience for CI" + type = map(string) } +# tflint-ignore: terraform_typed_variables variable "cluster_networks" {} +# tflint-ignore: terraform_typed_variables variable "vnic_types" { - default = {} + default = {} } -variable "state_volume_type"{ - default = null +# tflint-ignore: terraform_typed_variables +variable "state_volume_type" { + default = null } -variable "home_volume_type"{ - default = null +# tflint-ignore: terraform_typed_variables +variable "home_volume_type" { + default = null } +# tflint-ignore: terraform_typed_variables variable "control_node_flavor" {} +# tflint-ignore: terraform_typed_variables variable "other_node_flavor" {} +# tflint-ignore: terraform_typed_variables variable "volume_backed_instances" { - default = false + default = false } data "openstack_images_image_v2" "cluster" { - name = var.cluster_image[var.os_version] - most_recent = true + name = var.cluster_image[var.os_version] + most_recent = true } module "cluster" { - source = "../../site/tofu/" + source = "../../site/tofu/" - cluster_name = var.cluster_name - cluster_networks = var.cluster_networks - vnic_types = var.vnic_types - key_pair = "slurm-app-ci" - cluster_image_id = data.openstack_images_image_v2.cluster.id - control_node_flavor = var.control_node_flavor + cluster_name = var.cluster_name + cluster_networks = var.cluster_networks + vnic_types = var.vnic_types + key_pair = "slurm-app-ci" + cluster_image_id = data.openstack_images_image_v2.cluster.id + control_node_flavor = var.control_node_flavor + login = { login = { - login = { - nodes = ["login-0"] - flavor = var.other_node_flavor - } + nodes = ["login-0"] + flavor = var.other_node_flavor } - compute = { - standard = { # NB: can't call this default! 
- nodes = ["compute-0", "compute-1"] - flavor = var.other_node_flavor - compute_init_enable = ["compute", "chrony", "etc_hosts", "nfs", "basic_users", "eessi", "tuned", "cacerts", "nhc"] - ignore_image_changes = true - } - # Normally-empty partition for testing: - extra = { - nodes = [] - #nodes = ["extra-0", "extra-1"] - flavor = var.other_node_flavor - } + } + compute = { + standard = { # NB: can't call this default! + nodes = ["compute-0", "compute-1"] + flavor = var.other_node_flavor + compute_init_enable = ["compute", "chrony", "etc_hosts", "nfs", "basic_users", "eessi", "tuned", "cacerts", "nhc"] + ignore_image_changes = true + } + # Normally-empty partition for testing: + extra = { + nodes = [] + #nodes = ["extra-0", "extra-1"] + flavor = var.other_node_flavor } + } - volume_backed_instances = var.volume_backed_instances + volume_backed_instances = var.volume_backed_instances - environment_root = var.environment_root - # Can reduce volume size a lot for short-lived CI clusters: - state_volume_size = 10 - home_volume_size = 20 + environment_root = var.environment_root + # Can reduce volume size a lot for short-lived CI clusters: + state_volume_size = 10 + home_volume_size = 20 - state_volume_type = var.state_volume_type - home_volume_type = var.home_volume_type + state_volume_type = var.state_volume_type + home_volume_type = var.home_volume_type } diff --git a/environments/README.md b/environments/README.md index b6e2cf9..94a66e1 100644 --- a/environments/README.md +++ b/environments/README.md @@ -32,15 +32,14 @@ for usage instructions for that component. ### common Shared configuration for all environments. This is not -intended to be used as a standalone environment, hence the README does *not* detail -how to provision the infrastructure. This environment should not be edited, except as part of upstreaming new features or bug fixes. +intended to be used as a standalone environment, hence the readme does _not_ detail +how to provision the infrastructure. This environment should not be edited, except as part of upstreaming new features or bugfixes. ## site Provides the base configuration for all subsequent `cookiecutter` created environments, including OpenTofu configurations for infrastructure. In general, most local customisations should be made by adding to this environment. - ## Defining an environment To define an environment using cookiecutter: @@ -53,7 +52,7 @@ Once you have answered all questions, a new environment directory will be created. The directory will be named according to the answer you gave for `environment`. -Follow the README in the new directory to perform initial configuration. +Follow the readme in the new directory to perform initial configuration. ## Activating environments @@ -69,13 +68,12 @@ hosts from the associated group in the inventory. A pattern we use is to name th ansible inventory `group` after the name of the `role` that configures it. The playbook that runs this role targets hosts in that group. The `common` environment typically defines all groups as the empty group. You must explicly opt-in and add hosts to these these groups -to configure that service. For example, if you don't want to deploy and configure grafana, +to configure that service. For example, if you don't want to deploy and configure grafana, you simply do not add any hosts to the `grafana` group in the inventory. 
This allows us to -have a shared ansible code base as we can define playbooks to configure all things, +have a shared ansible codebase as we can define playbooks to configure all things, but these playbooks end up not being run if no host is in the associated group. -See also: - - `common/inventory/groups` for a list of all groups. +See also: - `common/inventory/groups` for a list of all groups. ## Overriding configuration diff --git a/environments/common/files/filebeat/filebeat.yml b/environments/common/files/filebeat/filebeat.yml index 0f7186b..7f19aa0 100644 --- a/environments/common/files/filebeat/filebeat.yml +++ b/environments/common/files/filebeat/filebeat.yml @@ -1,3 +1,4 @@ +--- filebeat.config: modules: path: ${path.config}/modules.d/*.yml diff --git a/environments/common/inventory/group_vars/all/alertmanager.yml b/environments/common/inventory/group_vars/all/alertmanager.yml index c677aaa..8f5ef0f 100644 --- a/environments/common/inventory/group_vars/all/alertmanager.yml +++ b/environments/common/inventory/group_vars/all/alertmanager.yml @@ -1,5 +1,5 @@ - -alertmanager_port: '9093' # defined here as required for prometheus +--- +alertmanager_port: "9093" # defined here as required for prometheus alertmanager_slack_receiver_name: slack-receiver alertmanager_slack_receiver_send_resolved: true diff --git a/environments/common/inventory/group_vars/all/ansible_init.yml b/environments/common/inventory/group_vars/all/ansible_init.yml index df4060f..0a5198b 100644 --- a/environments/common/inventory/group_vars/all/ansible_init.yml +++ b/environments/common/inventory/group_vars/all/ansible_init.yml @@ -1,3 +1,4 @@ +--- ansible_init_wait: 300 # seconds ansible_init_pip_packages: diff --git a/environments/common/inventory/group_vars/all/basic_users.yml b/environments/common/inventory/group_vars/all/basic_users.yml index d94d129..8d5f86a 100644 --- a/environments/common/inventory/group_vars/all/basic_users.yml +++ b/environments/common/inventory/group_vars/all/basic_users.yml @@ -3,7 +3,6 @@ # See ansible/roles/basic_users/README.md for variable definitions. 
basic_users_users: [] - # The following are defined for the purpose of compute-init basic_users_homedir_server: "{{ groups['control'] | first }}" -basic_users_homedir_client: "{{ groups['login'] | first }}" \ No newline at end of file +basic_users_homedir_client: "{{ groups['login'] | first }}" diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index 8d629ff..027e407 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -5,7 +5,7 @@ appliances_repository_root: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}" appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" appliances_environment_name: "{{ appliances_environment_root | basename | regex_replace('\\W+', '') }}" # [a-zA-Z0-9_] only appliances_cockpit_state: absent # RHEL cockpit installed but not enabled in genericcloud images; appliance defaults to removing it -#appliances_state_dir: # define an absolute path here to use for persistent state: NB: This is defined as /var/lib/state in inventory by the default Terraform +# appliances_state_dir: # define an absolute path here to use for persistent state: NB: This is defined as /var/lib/state in inventory by the default Terraform appliances_mode: configure appliances_pulp_url: https://ark.stackhpc.com @@ -29,53 +29,53 @@ alertmanager_address: "{{ hostvars[groups['alertmanager'].0].api_address }}" appliances_local_users_ansible_user_name: "{{ ansible_ssh_user | default(ansible_user) }}" appliances_local_users_podman_uid: 1001 # UID for podman user - normally next UID after default user appliances_local_users_podman: # also used in environments/common/inventory/group_vars/all/podman.yml:podman_users - name: podman - comment: Used for running all containers - # Would like to set subuid so that we that we know what will appear in /etc/subuid - # See: https://github.com/ansible/ansible/issues/68199 - home: /var/lib/podman - uid: "{{ appliances_local_users_podman_uid }}" + name: podman + comment: Used for running all containers + # Would like to set subuid so that we that we know what will appear in /etc/subuid + # See: https://github.com/ansible/ansible/issues/68199 + home: /var/lib/podman + uid: "{{ appliances_local_users_podman_uid }}" appliances_local_users_default: - - user: - name: "{{ appliances_local_users_ansible_user_name }}" - home: /var/lib/{{ appliances_local_users_ansible_user_name }} - move_home: true - local: true + - user: + name: "{{ appliances_local_users_ansible_user_name }}" + home: /var/lib/{{ appliances_local_users_ansible_user_name }} + move_home: true + local: true - - user: "{{ appliances_local_users_podman }}" - enable: "{{ 'podman' in group_names }}" + - user: "{{ appliances_local_users_podman }}" + enable: "{{ 'podman' in group_names }}" - - user: - name: slurm - comment: SLURM resource manager - home: /etc/slurm - shell: /sbin/nologin - uid: 202 - system: true + - user: + name: slurm + comment: SLURM resource manager + home: /etc/slurm + shell: /sbin/nologin + uid: 202 + system: true - - group: - name: prometheus - gid: 976 - user: - name: prometheus - uid: 981 - home: "{{ prometheus_db_dir }}" - shell: /usr/sbin/nologin - system: true - enable: "{{ 'prometheus' in group_names }}" + - group: + name: prometheus + gid: 976 + user: + name: prometheus + uid: 981 + home: "{{ prometheus_db_dir }}" + shell: /usr/sbin/nologin + system: true + enable: "{{ 'prometheus' in group_names }}" - - 
group: - name: grafana - gid: 979 - user: - name: grafana - comment: grafana user - uid: 984 - home: /usr/share/grafana - shell: /sbin/nologin - system: true - enable: "{{ 'grafana' in group_names }}" + - group: + name: grafana + gid: 979 + user: + name: grafana + comment: grafana user + uid: 984 + home: /usr/share/grafana + shell: /sbin/nologin + system: true + enable: "{{ 'grafana' in group_names }}" # Overide this to add extra users whilst keeping the defaults. appliances_local_users_extra: [] # see format of appliances_local_users_default above @@ -84,18 +84,17 @@ appliances_local_users: "{{ (appliances_local_users_default + appliances_local_u ################## bootstrap: extra package installs ###################################### appliances_extra_packages_default: - - htop - - nano - - screen - - tmux - - wget - - bind-utils - - net-tools - - postfix - - git - - "{{ 'python36' if ansible_distribution_version == '8.9' else 'python312' }}" - - s-nail + - htop + - nano + - screen + - tmux + - wget + - bind-utils + - net-tools + - postfix + - git + - "{{ 'python36' if ansible_distribution_version == '8.9' else 'python312' }}" + - s-nail appliances_extra_packages_other: [] - appliances_extra_packages: "{{ (appliances_extra_packages_default + appliances_extra_packages_other) | select | list }}" diff --git a/environments/common/inventory/group_vars/all/filebeat.yml b/environments/common/inventory/group_vars/all/filebeat.yml index d268af1..4b91726 100644 --- a/environments/common/inventory/group_vars/all/filebeat.yml +++ b/environments/common/inventory/group_vars/all/filebeat.yml @@ -4,4 +4,4 @@ filebeat_config_path: "{{ appliances_repository_root }}/environments/common/files/filebeat/filebeat.yml" # User that runs the filebeat container -filebeat_podman_user: podman \ No newline at end of file +filebeat_podman_user: podman diff --git a/environments/common/inventory/group_vars/all/firewalld.yml b/environments/common/inventory/group_vars/all/firewalld.yml index 569428e..498ec62 100644 --- a/environments/common/inventory/group_vars/all/firewalld.yml +++ b/environments/common/inventory/group_vars/all/firewalld.yml @@ -1,3 +1,4 @@ +--- # See ansible/roles/firewalld/README.md # for variable definitions. @@ -9,14 +10,14 @@ firewalld_configs_default: # name: An arbitrary name or description # group: An ansible group name - this rule is applied if the fail2ban node is in this group # rule: A dict of parameters passed to the `ansible.posix.firewalld` module. - # FaiBy default we rely on openstack security groups so + # FaiBy default we rely on openstack security groups so - name: Make firewalld permissive group: openhpc rule: zone: public state: enabled target: ACCEPT - permanent: yes + permanent: true firewalld_configs_extra: [] # list of dicts with parameters as for firewalld_configs_default diff --git a/environments/common/inventory/group_vars/all/freeipa_server.yml b/environments/common/inventory/group_vars/all/freeipa_server.yml index 7f0fee7..64a1f7a 100644 --- a/environments/common/inventory/group_vars/all/freeipa_server.yml +++ b/environments/common/inventory/group_vars/all/freeipa_server.yml @@ -1,3 +1,4 @@ +--- # See ansible/roles/freeipa/README.md # These vars are only used when freeipa_server is enabled. 
They are not required when enabling only freeipa_client freeipa_realm: "{{ openhpc_cluster_name | upper }}.{{ cluster_domain_suffix | upper }}" diff --git a/environments/common/inventory/group_vars/all/grafana.yml b/environments/common/inventory/group_vars/all/grafana.yml index b03d16f..b428849 100644 --- a/environments/common/inventory/group_vars/all/grafana.yml +++ b/environments/common/inventory/group_vars/all/grafana.yml @@ -2,7 +2,7 @@ # See: https://github.com/cloudalchemy/ansible-grafana # for variable definitions. -grafana_version: '10.4.18' +grafana_version: "10.4.18" # need to copy some role defaults here so we can use in inventory: grafana_port: 3000 @@ -53,9 +53,9 @@ grafana_dashboards_default: grafana_dashboards: "{{ grafana_dashboards_default + (openondemand_dashboard if groups.get('openondemand') else []) }}" grafana_security: - admin_user: grafana - admin_password: "{{ vault_grafana_admin_password }}" - allow_embedding: true + admin_user: grafana + admin_password: "{{ vault_grafana_admin_password }}" + allow_embedding: true grafana_datasources: - name: prometheus @@ -77,7 +77,7 @@ grafana_datasources: timeField: "@timestamp" # Have to set flavor and version, but ansible/roles/opensearch/templates/opensearch.yml.j2 fakes version for filebeat # so need to set to fake version here: - version: '7.10.2' + version: "7.10.2" flavor: elasticsearch editable: true # readOnly: false @@ -99,7 +99,7 @@ grafana_server: # appliance specific: serve_from_sub_path: "{{ grafana_serve_from_sub_path }}" - +# yamllint disable-line rule:line-length grafana_auth_anonymous: false # Enable anonymous View-only login - see implications: https://grafana.com/docs/grafana/latest/administration/security/#implications-of-enabling-anonymous-access-to-dashboards _grafana_auth_anon_cfg: diff --git a/environments/common/inventory/group_vars/all/k3s.yml b/environments/common/inventory/group_vars/all/k3s.yml index a7ba0a0..aa7172f 100644 --- a/environments/common/inventory/group_vars/all/k3s.yml +++ b/environments/common/inventory/group_vars/all/k3s.yml @@ -1 +1,2 @@ +--- k3s_bootstrap_token: "{{ hostvars[groups['k3s_server'] | first].k3s_bootstrap_token | default('') }}" diff --git a/environments/common/inventory/group_vars/all/manila.yml b/environments/common/inventory/group_vars/all/manila.yml index cb015f9..9bc6941 100644 --- a/environments/common/inventory/group_vars/all/manila.yml +++ b/environments/common/inventory/group_vars/all/manila.yml @@ -1,3 +1,4 @@ +--- # Default configuration for manila file shares, see # https://github.com/stackhpc/ansible-role-os-manila-mount # for all variable definitions, and override in your environment. 
diff --git a/environments/common/inventory/group_vars/all/mysql.yml b/environments/common/inventory/group_vars/all/mysql.yml index d5245fe..2c320af 100644 --- a/environments/common/inventory/group_vars/all/mysql.yml +++ b/environments/common/inventory/group_vars/all/mysql.yml @@ -17,7 +17,7 @@ mysql_datadir: "{{ appliances_state_dir | default('/var/lib') }}/mysql" mysql_databases: - name: slurm_acct_db - config_file: '' + config_file: "" login_user: root login_password: "{{ mysql_root_password }}" login_host: "{{ mysql_host }}" diff --git a/environments/common/inventory/group_vars/all/nfs.yml b/environments/common/inventory/group_vars/all/nfs.yml index 398bde7..05ecd89 100644 --- a/environments/common/inventory/group_vars/all/nfs.yml +++ b/environments/common/inventory/group_vars/all/nfs.yml @@ -14,22 +14,22 @@ nfs_export_clients: "{{ _nfs_node_ips }}" nfs_configuration_home_volume: # volume-backed home directories - comment: Export /exports/home from Slurm control node as /home nfs_enable: - server: "{{ inventory_hostname in groups['control'] }}" - # Don't mount share on control node: - clients: "{{ inventory_hostname in groups['cluster'] and inventory_hostname not in groups['control'] }}" + server: "{{ inventory_hostname in groups['control'] }}" + # Don't mount share on control node: + clients: "{{ inventory_hostname in groups['cluster'] and inventory_hostname not in groups['control'] }}" nfs_server: "{{ nfs_server_default }}" nfs_export: "/exports/home" # assumes default site TF is being used nfs_client_mnt_point: "/home" # prevent tunnelling and setuid binaries: # NB: this is stackhpc.nfs role defaults but are set here to prevent being # accidently overriden via default options - nfs_export_options: 'rw,secure,root_squash' + nfs_export_options: "rw,secure,root_squash" nfs_configuration_compute_nodes: # cluster configuration for compute_init/slurm-controlled rebuild - comment: Export /exports/cluster from Slurm control node nfs_enable: - server: "{{ inventory_hostname in groups['control'] }}" - clients: false + server: "{{ inventory_hostname in groups['control'] }}" + clients: false nfs_export: "/exports/cluster" nfs_configurations_extra: [] # site-specific nfs shares diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index 0fed1c9..bf212cb 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -9,7 +9,7 @@ openhpc_enable: database: "{{ inventory_hostname in groups['control'] }}" runtime: true openhpc_slurm_service_enabled: true -openhpc_slurm_accounting_storage_type: 'accounting_storage/slurmdbd' +openhpc_slurm_accounting_storage_type: "accounting_storage/slurmdbd" openhpc_slurmdbd_mysql_database: slurm_acct_db openhpc_slurmdbd_mysql_password: "{{ vault_mysql_slurm_password }}" openhpc_slurmdbd_mysql_username: slurm @@ -18,17 +18,18 @@ openhpc_slurmdbd_host: "{{ openhpc_slurm_control_host }}" openhpc_rebuild_partition: # not a role var - could actually add more indirection here for things we're expecting to be modified, e.g. 
groups and maxtime name: rebuild nodegroups: "{{ cluster_compute_groups | default([]) }}" - default: NO + default: false maxtime: 30 partition_params: PriorityJobFactor: 65533 - Hidden: YES - RootOnly: YES - DisableRootJobs: NO - PreemptMode: 'OFF' + Hidden: true + RootOnly: true + DisableRootJobs: false + PreemptMode: "OFF" OverSubscribe: EXCLUSIVE openhpc_nodegroups: "{{ cluster_compute_groups | map('community.general.dict_kv', 'name') }}" # create nodegroup for each compute group openhpc_user_partitions: "{{ openhpc_nodegroups }}" # create partition for each nodegroup (actually role default) - this is what we'd expect to be changed +# yamllint disable-line rule:line-length openhpc_partitions: "{{ openhpc_user_partitions + ([openhpc_rebuild_partition] if groups['rebuild'] | length > 0 else []) }}" # auto-create rebuild partition if reqd. openhpc_packages_default: # system packages diff --git a/environments/common/inventory/group_vars/all/openondemand.yml b/environments/common/inventory/group_vars/all/openondemand.yml index bd8ba76..af7554a 100644 --- a/environments/common/inventory/group_vars/all/openondemand.yml +++ b/environments/common/inventory/group_vars/all/openondemand.yml @@ -1,11 +1,11 @@ --- -# See: ansible/roles/openondemand/README.md +# See: ansible/roles/openondemand/README.md # for variable definitions. # NB: Variables prefixed ood_ are all from https://github.com/OSC/ood-ansible -ondemand_package_version: '3.1.10' # used in ansible/cleanup.yml +ondemand_package_version: "3.1.10" # used in ansible/cleanup.yml ondemand_package: ondemand-"{{ ondemand_package_version }}" # osc.ood role var controlling installed package openondemand_servername: "{{ hostvars[groups['openondemand'].0].ansible_host if groups['openondemand'] else '' }}" @@ -20,7 +20,7 @@ openondemand_codeserver_partition: "{{ openhpc_partitions[0]['name'] }}" # Regex defining hosts which openondemand can proxy; the default regex is compute nodes (for apps) and grafana host, # e.g. if the group `compute` has hosts `compute-{0,1,2,..}` this will be '(compute-\d+)|(control)'. -# The autogenerated regex may need overriding if compute node names do not contain numbers in a consistent position +# The autogenerated regex may need overriding if compute node names do not contain numbers in a consistent position # or include regex special characters. 
openondemand_host_regex: "{{ (groups['compute'] + groups['grafana']) | to_ood_regex }}" @@ -146,6 +146,7 @@ openondemand_apps_desktop_default: - <%= "--nodelist=#{node}" %> openondemand_apps_desktop: "{{ {'bc_desktop':openondemand_apps_desktop_default} if openondemand_desktop_partition | default(none) else {} }}" +# yamllint disable-line rule:line-length # See https://osc.github.io/ood-documentation/latest/app-development/tutorials-interactive-apps/add-jupyter.html#app-development-tutorials-interactive-apps-add-jupyter openondemand_apps_jupyter_default: title: Jupyter Notebook @@ -188,7 +189,7 @@ openondemand_apps_rstudio_default: description: Request a RStudio server cluster: slurm attributes: - bc_queue: + bc_queue: value: "{{ openondemand_rstudio_partition | default(none) }}" rstudio_module: label: RStudio module @@ -197,10 +198,11 @@ openondemand_apps_rstudio_default: help: Choose your RStudio module widget: select options: - - ["RStudio v{{ openondemand_rstudio_version }}", "rstudio-server/{{ openondemand_rstudio_version }}}"] + - "RStudio v{{ openondemand_rstudio_version }}" + - "rstudio-server/{{ openondemand_rstudio_version }}}" extra_modules_script: label: Extra modules script - help: If you'd like to load additional modules alongside RStudio-Server, put the 'module load ...' commands into a text file (one 'module load...' per line) and specify its path here + help: If you'd like to load additional modules alongside RStudio-Server, put the 'module load ...' commands into a text file (one 'module load...' per line) and specify its path here # noqa: yaml[line-length] widget: text_field required: false cores: @@ -217,7 +219,7 @@ openondemand_apps_rstudio_default: label: RAM in GB help: How much RAM to reserve for your session. NB Ensure this is within the maximum allowed by your chosen partition min: 4 - max: 700 + max: 700 step: 1 value: 4 cachable: true @@ -286,7 +288,8 @@ openondemand_apps_matlab_default: help: Choose your MATLAB module widget: select options: - - ["MATLAB v{{ openondemand_matlab_version }}", "matlab/{{ openondemand_matlab_version }}"] + - "MATLAB v{{ openondemand_matlab_version }}" + - "matlab/{{ openondemand_matlab_version }}" cores: label: Number of CPU cores help: How many CPU cores to reserve for your session. NB Ensure this is within the maximum allowed by your chosen partition. @@ -301,7 +304,7 @@ openondemand_apps_matlab_default: label: RAM in GB help: How much RAM to reserve for your session. NB Ensure this is within the maximum allowed by your chosen partition min: 4 - max: 700 + max: 700 step: 1 value: 4 cachable: true @@ -346,7 +349,8 @@ openondemand_apps_codeserver_default: help: Choose your Code Server module widget: select options: - - ["Code Server v{{ openondemand_code_server_version}}", "code-server/{{ openondemand_code_server_version }}"] + - "Code Server v{{ openondemand_code_server_version}}" + - "code-server/{{ openondemand_code_server_version }}" bc_queue: value: "{{ openondemand_codeserver_partition | default(none) }}" cores: @@ -363,7 +367,7 @@ openondemand_apps_codeserver_default: label: RAM in GB help: How much RAM to reserve for your session. 
NB Ensure this is within the maximum allowed by your chosen partition min: 4 - max: 700 + max: 700 step: 1 value: 4 cachable: true @@ -394,8 +398,8 @@ openondemand_apps_codeserver_default: openondemand_apps_codeserver: "{{ {'codeserver':openondemand_apps_codeserver_default} if openondemand_codeserver_partition | default(none) else {} }}" # osc.ood:ood_apps - see https://github.com/OSC/ood-ansible#ood_apps -openondemand_dashboard_support_url: '' -openondemand_dashboard_docs_url: '' +openondemand_dashboard_support_url: "" +openondemand_dashboard_docs_url: "" openondemand_apps: files: env: @@ -430,11 +434,11 @@ openondemand_scrape_configs: scrape_timeout: 20s scrape_interval: 2m static_configs: - - targets: - - "{{ openondemand_address }}:9301" - labels: - environment: "{{ appliances_environment_name }}" - service: "openondemand" + - targets: + - "{{ openondemand_address }}:9301" + labels: + environment: "{{ appliances_environment_name }}" + service: "openondemand" openondemand_dashboard: - dashboard_id: 13465 @@ -443,8 +447,12 @@ openondemand_dashboard: replacement: prometheus revision_id: 1 -_opeonondemand_unset_auth: ' RequestHeader unset Authorization' +_opeonondemand_unset_auth: " RequestHeader unset Authorization" # Fix grafana proxying for basic auth if anonymous grafana access enabled: +# yamllint disable-line rule:line-length openondemand_node_proxy_directives: "{{ _opeonondemand_unset_auth if (openondemand_auth == 'basic_pam' and 'openondemand_host_regex' and groups['grafana'] | length > 0 and hostvars[ groups['grafana'] | first]._grafana_auth_is_anonymous) else '' }}" -# Reason: OOD server forwards headers to proxied servers, so when if using basic auth Grafana gets passed the Open Ondemand user. This probably isn't a Grafana user so it throws an auth error. If anonymous access is enabled we can work around this by not forwarding auth header. +# Reason: OOD server forwards headers to proxied servers, so when if using basic auth +# Grafana gets passed the Open Ondemand user. +# This probably isn't a Grafana user so it throws an auth error. +# If anonymous access is enabled we can work around this by not forwarding auth header. 
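(Editorial aside, not part of the patch: the comment above explains why the `Authorization` header is only stripped when Grafana allows anonymous access. As a sketch, a site that handles authentication elsewhere could set the directive unconditionally instead of relying on that detection — the variable name is taken from the hunk above, but the override location and value are assumptions.)

```yaml
# environments/site/inventory/group_vars/all/openondemand.yml (sketch)
# Always strip the Authorization header before the OOD server proxies to
# Grafana / compute nodes, instead of depending on anonymous-access detection:
openondemand_node_proxy_directives: " RequestHeader unset Authorization"
```

This trades the automatic conditional in the default expression for an explicit, always-on setting.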
diff --git a/environments/common/inventory/group_vars/all/podman.yml b/environments/common/inventory/group_vars/all/podman.yml index 8ca8eb1..a6d38f5 100644 --- a/environments/common/inventory/group_vars/all/podman.yml +++ b/environments/common/inventory/group_vars/all/podman.yml @@ -1 +1,2 @@ +--- podman_users: "{{ [appliances_local_users_podman] }}" # user to use for podman diff --git a/environments/common/inventory/group_vars/all/prometheus.yml b/environments/common/inventory/group_vars/all/prometheus.yml index f4587e6..6b33ce8 100644 --- a/environments/common/inventory/group_vars/all/prometheus.yml +++ b/environments/common/inventory/group_vars/all/prometheus.yml @@ -11,8 +11,8 @@ prometheus_db_dir: "{{ appliances_state_dir | default('/var/lib') }}/prometheus" prometheus_alertmanager_config_default: - static_configs: - - targets: - - "{{ alertmanager_address }}:{{ alertmanager_port }}" + - targets: + - "{{ alertmanager_address }}:{{ alertmanager_port }}" basic_auth: username: alertmanager # cloudalchemy.prometheus/preflight checks this config so it must be @@ -44,37 +44,38 @@ prometheus_targets: other: "{{ groups.get('node_exporter', []) | difference(groups['openhpc']) | prometheus_node_exporter_targets(hostvars, 'prometheus_env', 'other') }}" prometheus_scrape_configs_default: -- job_name: "prometheus" - metrics_path: "/metrics" - static_configs: - - targets: - - "{{ prometheus_address }}:9090" -- job_name: "grafana" - static_configs: - - targets: - - "{{ grafana_api_address }}:{{ grafana_port }}" -- job_name: "node" - file_sd_configs: - - files: - - /etc/prometheus/file_sd/control.yml - - /etc/prometheus/file_sd/login.yml - - /etc/prometheus/file_sd/compute.yml - - /etc/prometheus/file_sd/other.yml - relabel_configs: - # strip off port - - source_labels: ['__address__'] - separator: ':' - regex: '(.*):.*' - target_label: 'instance' - replacement: '${1}' - scrape_interval: 30s - scrape_timeout: 20s + - job_name: "prometheus" + metrics_path: "/metrics" + static_configs: + - targets: + - "{{ prometheus_address }}:9090" + - job_name: "grafana" + static_configs: + - targets: + - "{{ grafana_api_address }}:{{ grafana_port }}" + - job_name: "node" + file_sd_configs: + - files: + - /etc/prometheus/file_sd/control.yml + - /etc/prometheus/file_sd/login.yml + - /etc/prometheus/file_sd/compute.yml + - /etc/prometheus/file_sd/other.yml + relabel_configs: + # strip off port + - source_labels: + - '__address__' + separator: ':' + regex: '(.*):.*' + target_label: 'instance' + replacement: '${1}' + scrape_interval: 30s + scrape_timeout: 20s -- job_name: "slurm_exporter" - scrape_interval: 30s - scrape_timeout: 30s - static_configs: - - targets: - - "{{ openhpc_slurm_control_host }}:{{ slurm_exporter_port }}" + - job_name: "slurm_exporter" + scrape_interval: 30s + scrape_timeout: 30s + static_configs: + - targets: + - "{{ openhpc_slurm_control_host }}:{{ slurm_exporter_port }}" prometheus_scrape_configs: "{{ prometheus_scrape_configs_default + (openondemand_scrape_configs if groups['openondemand'] | count > 0 else []) }}" diff --git a/environments/common/inventory/group_vars/all/proxy.yml b/environments/common/inventory/group_vars/all/proxy.yml index 266ae45..1a0bdb4 100644 --- a/environments/common/inventory/group_vars/all/proxy.yml +++ b/environments/common/inventory/group_vars/all/proxy.yml @@ -1,3 +1,4 @@ +--- # If squid group is non-empty, default the proxy address to the hostname of # the first squid host, port 3128. 
Else empty string to avoid breaking hostvars diff --git a/environments/common/inventory/group_vars/all/pulp.yml b/environments/common/inventory/group_vars/all/pulp.yml index 22bb832..492a84a 100644 --- a/environments/common/inventory/group_vars/all/pulp.yml +++ b/environments/common/inventory/group_vars/all/pulp.yml @@ -1,3 +1,4 @@ +--- pulp_site_port: 8080 # If using Ark directly (no local Pulp server), override the following with Ark creds diff --git a/environments/common/inventory/group_vars/all/slurm_exporter.yml b/environments/common/inventory/group_vars/all/slurm_exporter.yml index 4902310..072c436 100644 --- a/environments/common/inventory/group_vars/all/slurm_exporter.yml +++ b/environments/common/inventory/group_vars/all/slurm_exporter.yml @@ -1,3 +1,4 @@ +--- slurm_exporter_port: 9341 # as defined by [1] and implemented in [2] -#[1]: https://github.com/prometheus/prometheus/wiki/Default-port-allocations -#[2]: https://github.com/stackhpc/prometheus-slurm-exporter/blob/master/lib/systemd/prometheus-slurm-exporter.service +# [1]: https://github.com/prometheus/prometheus/wiki/Default-port-allocations +# [2]: https://github.com/stackhpc/prometheus-slurm-exporter/blob/master/lib/systemd/prometheus-slurm-exporter.service diff --git a/environments/common/inventory/group_vars/all/squid.yml b/environments/common/inventory/group_vars/all/squid.yml index 5955729..4218c5c 100644 --- a/environments/common/inventory/group_vars/all/squid.yml +++ b/environments/common/inventory/group_vars/all/squid.yml @@ -1 +1,2 @@ +--- squid_http_port: 3128 # defined here for proxy role diff --git a/environments/common/inventory/group_vars/all/sshd.yaml b/environments/common/inventory/group_vars/all/sshd.yaml index 5d4ed22..cf22b12 100644 --- a/environments/common/inventory/group_vars/all/sshd.yaml +++ b/environments/common/inventory/group_vars/all/sshd.yaml @@ -1 +1,2 @@ +--- sshd_password_authentication: "{{ sssd_install_ldap | default(false) | bool }}" diff --git a/environments/common/inventory/group_vars/all/systemd.yml b/environments/common/inventory/group_vars/all/systemd.yml index ae72a78..bc267d5 100644 --- a/environments/common/inventory/group_vars/all/systemd.yml +++ b/environments/common/inventory/group_vars/all/systemd.yml @@ -1,3 +1,4 @@ +--- _systemd_requiresmount_statedir: | {% if appliances_state_dir is defined %} [Unit] diff --git a/environments/common/inventory/group_vars/all/update.yml b/environments/common/inventory/group_vars/all/update.yml index 715d418..a0b10ce 100644 --- a/environments/common/inventory/group_vars/all/update.yml +++ b/environments/common/inventory/group_vars/all/update.yml @@ -1,12 +1,13 @@ --- update_enable: false -# These variables define the packages updates and are passed to ansible's yum module parameters with the same names: https://docs.ansible.com/ansible/latest/collections/ansible/builtin/yum_module.html -update_name: '*' +# These variables define the packages updates and are passed to ansible's yum module parameters +# with the same names: https://docs.ansible.com/ansible/latest/collections/ansible/builtin/yum_module.html +update_name: "*" update_state: latest update_exclude: - grafana - apptainer # see https://github.com/stackhpc/ansible-slurm-appliance/pull/245 update_disablerepo: omit # Log changes during update here on localhost: -update_log_path: "{{ appliances_environment_root }}/logs/{{ inventory_hostname }}-updates.log" +update_log_path: "{{ appliances_environment_root }}/logs/{{ inventory_hostname }}-updates.log" diff --git 
a/environments/site/inventory/group_vars/all/grafana.yml b/environments/site/inventory/group_vars/all/grafana.yml index 521616a..3c49fd5 100644 --- a/environments/site/inventory/group_vars/all/grafana.yml +++ b/environments/site/inventory/group_vars/all/grafana.yml @@ -1 +1,2 @@ -grafana_auth_anonymous: true \ No newline at end of file +--- +grafana_auth_anonymous: true diff --git a/environments/site/inventory/group_vars/all/vault_alertmanager.yml b/environments/site/inventory/group_vars/all/vault_alertmanager.yml index 4375ed7..02abb00 100644 --- a/environments/site/inventory/group_vars/all/vault_alertmanager.yml +++ b/environments/site/inventory/group_vars/all/vault_alertmanager.yml @@ -1,3 +1,3 @@ # Add a bot token here THEN VAULT-ENCRYPT this file! -#vault_alertmanager_slack_integration_app_creds: '' +# vault_alertmanager_slack_integration_app_creds: '' diff --git a/environments/site/tofu/additional.tf b/environments/site/tofu/additional.tf index 872f957..1079ef9 100644 --- a/environments/site/tofu/additional.tf +++ b/environments/site/tofu/additional.tf @@ -4,41 +4,41 @@ module "additional" { for_each = var.additional_nodegroups # must be set for group: - nodes = each.value.nodes + nodes = each.value.nodes flavor = each.value.flavor # always taken from top-level value: - cluster_name = var.cluster_name + cluster_name = var.cluster_name cluster_domain_suffix = var.cluster_domain_suffix - key_pair = var.key_pair - environment_root = var.environment_root - config_drive = var.config_drive - + key_pair = var.key_pair + environment_root = var.environment_root + config_drive = var.config_drive + # can be set for group, defaults to top-level value: - image_id = lookup(each.value, "image_id", var.cluster_image_id) - vnic_types = lookup(each.value, "vnic_types", var.vnic_types) + image_id = lookup(each.value, "image_id", var.cluster_image_id) + vnic_types = lookup(each.value, "vnic_types", var.vnic_types) volume_backed_instances = lookup(each.value, "volume_backed_instances", var.volume_backed_instances) - root_volume_size = lookup(each.value, "root_volume_size", var.root_volume_size) - root_volume_type = lookup(each.value, "root_volume_type", var.root_volume_type) - gateway_ip = lookup(each.value, "gateway_ip", var.gateway_ip) - nodename_template = lookup(each.value, "nodename_template", var.cluster_nodename_template) - + root_volume_size = lookup(each.value, "root_volume_size", var.root_volume_size) + root_volume_type = lookup(each.value, "root_volume_type", var.root_volume_type) + gateway_ip = lookup(each.value, "gateway_ip", var.gateway_ip) + nodename_template = lookup(each.value, "nodename_template", var.cluster_nodename_template) + # optionally set for group: networks = concat(var.cluster_networks, lookup(each.value, "extra_networks", [])) # here null means "use module var default" - extra_volumes = lookup(each.value, "extra_volumes", null) - fip_addresses = lookup(each.value, "fip_addresses", null) - fip_network = lookup(each.value, "fip_network", null) - match_ironic_node = lookup(each.value, "match_ironic_node", null) - availability_zone = lookup(each.value, "availability_zone", null) - ip_addresses = lookup(each.value, "ip_addresses", null) - security_group_ids = lookup(each.value, "security_group_ids", [for o in data.openstack_networking_secgroup_v2.nonlogin: o.id]) - additional_cloud_config = lookup(each.value, "additional_cloud_config", var.additional_cloud_config) + extra_volumes = lookup(each.value, "extra_volumes", null) + fip_addresses = lookup(each.value, "fip_addresses", 
null) + fip_network = lookup(each.value, "fip_network", null) + match_ironic_node = lookup(each.value, "match_ironic_node", null) + availability_zone = lookup(each.value, "availability_zone", null) + ip_addresses = lookup(each.value, "ip_addresses", null) + security_group_ids = lookup(each.value, "security_group_ids", [for o in data.openstack_networking_secgroup_v2.nonlogin : o.id]) + additional_cloud_config = lookup(each.value, "additional_cloud_config", var.additional_cloud_config) additional_cloud_config_vars = lookup(each.value, "additional_cloud_config_vars", var.additional_cloud_config_vars) - server_group_id = lookup(each.value, "server_group_id", null) + server_group_id = lookup(each.value, "server_group_id", null) # can't be set for additional nodes - compute_init_enable = [] + compute_init_enable = [] ignore_image_changes = false # computed diff --git a/environments/site/tofu/baremetal-node-list.py b/environments/site/tofu/baremetal-node-list.py index 14bc3ce..c1747ec 100755 --- a/environments/site/tofu/baremetal-node-list.py +++ b/environments/site/tofu/baremetal-node-list.py @@ -1,32 +1,34 @@ #!/usr/bin/env python -""" opentofu external data program to list baremetal nodes +# pylint: disable=invalid-name +"""opentofu external data program to list baremetal nodes - Example usage: +Example usage: - data "external" "example" { - program = [this_file] - } + data "external" "example" { + program = [this_file] + } - The external data resource's result attribute then contains a mapping of - Ironic node names to their UUIDs. +The external data resource's result attribute then contains a mapping of +Ironic node names to their UUIDs. - An empty list is returned if: - - There are no baremetal nodes - - The listing fails for any reason, e.g. - - there is no baremetal service - - admin credentials are required and are not provided +An empty list is returned if: +- There are no baremetal nodes +- The listing fails for any reason, e.g. 
+ - there is no baremetal service + - admin credentials are required and are not provided """ -import openstack import json +import openstack # pylint: disable=import-error + nodes = [] -proxy = None +proxy = None # pylint: disable=invalid-name output = {} conn = openstack.connection.from_config() try: - proxy = getattr(conn, 'baremetal', None) -except Exception: + proxy = getattr(conn, "baremetal", None) +except Exception: # pylint: disable=broad-exception-caught pass if proxy is not None: nodes = proxy.nodes() diff --git a/environments/site/tofu/compute.tf b/environments/site/tofu/compute.tf index 35d62c6..54d3871 100644 --- a/environments/site/tofu/compute.tf +++ b/environments/site/tofu/compute.tf @@ -4,45 +4,45 @@ module "compute" { for_each = var.compute # must be set for group: - nodes = each.value.nodes + nodes = each.value.nodes flavor = each.value.flavor # always taken from top-level value: - cluster_name = var.cluster_name + cluster_name = var.cluster_name cluster_domain_suffix = var.cluster_domain_suffix - key_pair = var.key_pair - environment_root = var.environment_root - config_drive = var.config_drive - + key_pair = var.key_pair + environment_root = var.environment_root + config_drive = var.config_drive + # can be set for group, defaults to top-level value: - image_id = lookup(each.value, "image_id", var.cluster_image_id) - vnic_types = lookup(each.value, "vnic_types", var.vnic_types) - volume_backed_instances = lookup(each.value, "volume_backed_instances", var.volume_backed_instances) - root_volume_size = lookup(each.value, "root_volume_size", var.root_volume_size) - root_volume_type = lookup(each.value, "root_volume_type", var.root_volume_type) - gateway_ip = lookup(each.value, "gateway_ip", var.gateway_ip) - nodename_template = lookup(each.value, "nodename_template", var.cluster_nodename_template) - additional_cloud_config = lookup(each.value, "additional_cloud_config", var.additional_cloud_config) + image_id = lookup(each.value, "image_id", var.cluster_image_id) + vnic_types = lookup(each.value, "vnic_types", var.vnic_types) + volume_backed_instances = lookup(each.value, "volume_backed_instances", var.volume_backed_instances) + root_volume_size = lookup(each.value, "root_volume_size", var.root_volume_size) + root_volume_type = lookup(each.value, "root_volume_type", var.root_volume_type) + gateway_ip = lookup(each.value, "gateway_ip", var.gateway_ip) + nodename_template = lookup(each.value, "nodename_template", var.cluster_nodename_template) + additional_cloud_config = lookup(each.value, "additional_cloud_config", var.additional_cloud_config) additional_cloud_config_vars = lookup(each.value, "additional_cloud_config_vars", var.additional_cloud_config_vars) # optionally set for group: networks = concat(var.cluster_networks, lookup(each.value, "extra_networks", [])) # here null means "use module var default" - extra_volumes = lookup(each.value, "extra_volumes", null) - compute_init_enable = lookup(each.value, "compute_init_enable", null) + extra_volumes = lookup(each.value, "extra_volumes", null) + compute_init_enable = lookup(each.value, "compute_init_enable", null) ignore_image_changes = lookup(each.value, "ignore_image_changes", null) - match_ironic_node = lookup(each.value, "match_ironic_node", null) - availability_zone = lookup(each.value, "availability_zone", null) - ip_addresses = lookup(each.value, "ip_addresses", null) - server_group_id = lookup(each.value, "server_group_id", null) + match_ironic_node = lookup(each.value, "match_ironic_node", null) + 
availability_zone = lookup(each.value, "availability_zone", null) + ip_addresses = lookup(each.value, "ip_addresses", null) + server_group_id = lookup(each.value, "server_group_id", null) # computed # not using openstack_compute_instance_v2.control.access_ip_v4 to avoid # updates to node metadata on deletion/recreation of the control node: - control_address = openstack_networking_port_v2.control[var.cluster_networks[0].network].all_fixed_ips[0] - security_group_ids = [for o in data.openstack_networking_secgroup_v2.nonlogin: o.id] - baremetal_nodes = data.external.baremetal_nodes.result - + control_address = openstack_networking_port_v2.control[var.cluster_networks[0].network].all_fixed_ips[0] + security_group_ids = [for o in data.openstack_networking_secgroup_v2.nonlogin : o.id] + baremetal_nodes = data.external.baremetal_nodes.result + # input dict validation: group_name = each.key group_keys = keys(each.value) @@ -67,5 +67,5 @@ module "compute" { "additional_cloud_config_vars", "server_group_id" ] - + } diff --git a/environments/site/tofu/control.tf b/environments/site/tofu/control.tf index 19a41ae..7bfa13f 100644 --- a/environments/site/tofu/control.tf +++ b/environments/site/tofu/control.tf @@ -1,26 +1,26 @@ locals { control_volumes = concat( # convert maps to lists with zero or one entries: - [for v in data.openstack_blockstorage_volume_v3.state: v], - [for v in data.openstack_blockstorage_volume_v3.home: v] + [for v in data.openstack_blockstorage_volume_v3.state : v], + [for v in data.openstack_blockstorage_volume_v3.home : v] ) control_fqdn = templatestring( var.cluster_nodename_template, { - node = "control", - cluster_name = var.cluster_name, + node = "control", + cluster_name = var.cluster_name, cluster_domain_suffix = var.cluster_domain_suffix, - environment_name = basename(var.environment_root) + environment_name = basename(var.environment_root) } ) } resource "openstack_networking_port_v2" "control" { - for_each = {for net in var.cluster_networks: net.network => net} + for_each = { for net in var.cluster_networks : net.network => net } - name = "${var.cluster_name}-control-${each.key}" - network_id = data.openstack_networking_network_v2.cluster_net[each.key].id + name = "${var.cluster_name}-control-${each.key}" + network_id = data.openstack_networking_network_v2.cluster_net[each.key].id admin_state_up = "true" fixed_ip { @@ -29,7 +29,7 @@ resource "openstack_networking_port_v2" "control" { } no_security_groups = lookup(each.value, "no_security_groups", false) - security_group_ids = lookup(each.value, "no_security_groups", false) ? [] : [for o in data.openstack_networking_secgroup_v2.nonlogin: o.id] + security_group_ids = lookup(each.value, "no_security_groups", false) ? [] : [for o in data.openstack_networking_secgroup_v2.nonlogin : o.id] binding { vnic_type = lookup(var.vnic_types, each.key, "normal") @@ -37,37 +37,37 @@ resource "openstack_networking_port_v2" "control" { } resource "openstack_compute_instance_v2" "control" { - - name = split(".", local.control_fqdn)[0] - image_id = var.cluster_image_id + + name = split(".", local.control_fqdn)[0] + image_id = var.cluster_image_id flavor_name = var.control_node_flavor - key_pair = var.key_pair - + key_pair = var.key_pair + # root device: block_device { - uuid = var.cluster_image_id - source_type = "image" - destination_type = var.volume_backed_instances ? "volume" : "local" - volume_size = var.volume_backed_instances ? var.root_volume_size : null - volume_type = var.volume_backed_instances ? 
var.root_volume_type : null - boot_index = 0 - delete_on_termination = true + uuid = var.cluster_image_id + source_type = "image" + destination_type = var.volume_backed_instances ? "volume" : "local" + volume_size = var.volume_backed_instances ? var.root_volume_size : null + volume_type = var.volume_backed_instances ? var.root_volume_type : null + boot_index = 0 + delete_on_termination = true } dynamic "block_device" { for_each = local.control_volumes content { destination_type = "volume" - source_type = "volume" - boot_index = -1 - uuid = block_device.value.id # actually openstack_blockstorage_volume_v3 id + source_type = "volume" + boot_index = -1 + uuid = block_device.value.id # actually openstack_blockstorage_volume_v3 id } } dynamic "network" { - for_each = {for net in var.cluster_networks: net.network => net} + for_each = { for net in var.cluster_networks : net.network => net } content { - port = openstack_networking_port_v2.control[network.key].id + port = openstack_networking_port_v2.control[network.key].id access_network = network.key == var.cluster_networks[0].network } } @@ -81,8 +81,8 @@ resource "openstack_compute_instance_v2" "control" { metadata = { environment_root = var.environment_root - access_ip = openstack_networking_port_v2.control[var.cluster_networks[0].network].all_fixed_ips[0] - gateway_ip = var.gateway_ip + access_ip = openstack_networking_port_v2.control[var.cluster_networks[0].network].all_fixed_ips[0] + gateway_ip = var.gateway_ip } user_data = <<-EOF diff --git a/environments/site/tofu/data.tf b/environments/site/tofu/data.tf index 443c522..f90f2f0 100644 --- a/environments/site/tofu/data.tf +++ b/environments/site/tofu/data.tf @@ -1,5 +1,6 @@ +# tflint-ignore: terraform_required_providers data "external" "baremetal_nodes" { # returns an empty map if cannot list baremetal nodes program = ["${path.module}/baremetal-node-list.py"] - query = {} + query = {} } diff --git a/environments/site/tofu/inventory.tf b/environments/site/tofu/inventory.tf index 0e23323..fa7108b 100644 --- a/environments/site/tofu/inventory.tf +++ b/environments/site/tofu/inventory.tf @@ -1,16 +1,17 @@ +# tflint-ignore: terraform_required_providers resource "local_file" "hosts" { - content = templatefile("${path.module}/inventory.tpl", - { - "cluster_name": var.cluster_name, - "cluster_domain_suffix": var.cluster_domain_suffix - "control": openstack_compute_instance_v2.control - "control_fqdn": local.control_fqdn - "login_groups": module.login - "compute_groups": module.compute - "additional_groups": module.additional - "state_dir": var.state_dir - "cluster_home_volume": var.home_volume_provisioning != "none" - }, - ) + content = templatefile("${path.module}/inventory.tpl", + { + "cluster_name" : var.cluster_name, + "cluster_domain_suffix" : var.cluster_domain_suffix + "control" : openstack_compute_instance_v2.control + "control_fqdn" : local.control_fqdn + "login_groups" : module.login + "compute_groups" : module.compute + "additional_groups" : module.additional + "state_dir" : var.state_dir + "cluster_home_volume" : var.home_volume_provisioning != "none" + }, + ) filename = "../inventory/hosts.yml" } diff --git a/environments/site/tofu/login.tf b/environments/site/tofu/login.tf index 5ecc033..0f9bc83 100644 --- a/environments/site/tofu/login.tf +++ b/environments/site/tofu/login.tf @@ -4,48 +4,48 @@ module "login" { for_each = var.login # must be set for group: - nodes = each.value.nodes + nodes = each.value.nodes flavor = each.value.flavor # always taken from top-level value: - 
cluster_name = var.cluster_name + cluster_name = var.cluster_name cluster_domain_suffix = var.cluster_domain_suffix - key_pair = var.key_pair - environment_root = var.environment_root - config_drive = var.config_drive - + key_pair = var.key_pair + environment_root = var.environment_root + config_drive = var.config_drive + # can be set for group, defaults to top-level value: - image_id = lookup(each.value, "image_id", var.cluster_image_id) - vnic_types = lookup(each.value, "vnic_types", var.vnic_types) - volume_backed_instances = lookup(each.value, "volume_backed_instances", var.volume_backed_instances) - root_volume_size = lookup(each.value, "root_volume_size", var.root_volume_size) - root_volume_type = lookup(each.value, "root_volume_type", var.root_volume_type) - gateway_ip = lookup(each.value, "gateway_ip", var.gateway_ip) - nodename_template = lookup(each.value, "nodename_template", var.cluster_nodename_template) - additional_cloud_config = lookup(each.value, "additional_cloud_config", var.additional_cloud_config) + image_id = lookup(each.value, "image_id", var.cluster_image_id) + vnic_types = lookup(each.value, "vnic_types", var.vnic_types) + volume_backed_instances = lookup(each.value, "volume_backed_instances", var.volume_backed_instances) + root_volume_size = lookup(each.value, "root_volume_size", var.root_volume_size) + root_volume_type = lookup(each.value, "root_volume_type", var.root_volume_type) + gateway_ip = lookup(each.value, "gateway_ip", var.gateway_ip) + nodename_template = lookup(each.value, "nodename_template", var.cluster_nodename_template) + additional_cloud_config = lookup(each.value, "additional_cloud_config", var.additional_cloud_config) additional_cloud_config_vars = lookup(each.value, "additional_cloud_config_vars", var.additional_cloud_config_vars) - + # optionally set for group: networks = concat(var.cluster_networks, lookup(each.value, "extra_networks", [])) # here null means "use module var default" - extra_volumes = lookup(each.value, "extra_volumes", null) - fip_addresses = lookup(each.value, "fip_addresses", null) - fip_network = lookup(each.value, "fip_network", null) + extra_volumes = lookup(each.value, "extra_volumes", null) + fip_addresses = lookup(each.value, "fip_addresses", null) + fip_network = lookup(each.value, "fip_network", null) match_ironic_node = lookup(each.value, "match_ironic_node", null) availability_zone = lookup(each.value, "availability_zone", null) - ip_addresses = lookup(each.value, "ip_addresses", null) - server_group_id = lookup(each.value, "server_group_id", null) + ip_addresses = lookup(each.value, "ip_addresses", null) + server_group_id = lookup(each.value, "server_group_id", null) # can't be set for login - compute_init_enable = [] + compute_init_enable = [] ignore_image_changes = false # computed # not using openstack_compute_instance_v2.control.access_ip_v4 to avoid # updates to node metadata on deletion/recreation of the control node: - control_address = openstack_networking_port_v2.control[var.cluster_networks[0].network].all_fixed_ips[0] - security_group_ids = lookup(each.value, "security_group_ids", [for o in data.openstack_networking_secgroup_v2.login: o.id]) - baremetal_nodes = data.external.baremetal_nodes.result + control_address = openstack_networking_port_v2.control[var.cluster_networks[0].network].all_fixed_ips[0] + security_group_ids = lookup(each.value, "security_group_ids", [for o in data.openstack_networking_secgroup_v2.login : o.id]) + baremetal_nodes = data.external.baremetal_nodes.result # input dict 
validation: group_name = each.key @@ -72,5 +72,5 @@ module "login" { "security_group_ids", "server_group_id" ] - + } diff --git a/environments/site/tofu/main.tf b/environments/site/tofu/main.tf index dc639f7..e88ac1a 100644 --- a/environments/site/tofu/main.tf +++ b/environments/site/tofu/main.tf @@ -2,7 +2,7 @@ terraform { required_version = ">= 1.7" # templatestring() function required_providers { openstack = { - source = "terraform-provider-openstack/openstack" + source = "terraform-provider-openstack/openstack" version = "~>3.0.0" } } diff --git a/environments/site/tofu/network.tf b/environments/site/tofu/network.tf index 0a86b8f..43c2e5d 100644 --- a/environments/site/tofu/network.tf +++ b/environments/site/tofu/network.tf @@ -1,14 +1,14 @@ data "openstack_networking_network_v2" "cluster_net" { - for_each = {for net in var.cluster_networks: net.network => net} + for_each = { for net in var.cluster_networks : net.network => net } name = each.value.network } data "openstack_networking_subnet_v2" "cluster_subnet" { - for_each = {for net in var.cluster_networks: net.network => net} + for_each = { for net in var.cluster_networks : net.network => net } name = each.value.subnet } @@ -22,13 +22,13 @@ data "openstack_identity_auth_scope_v3" "scope" { data "openstack_networking_secgroup_v2" "login" { for_each = toset(var.login_security_groups) - name = each.key + name = each.key tenant_id = data.openstack_identity_auth_scope_v3.scope.project_id } data "openstack_networking_secgroup_v2" "nonlogin" { for_each = toset(var.nonlogin_security_groups) - name = each.key + name = each.key tenant_id = data.openstack_identity_auth_scope_v3.scope.project_id } diff --git a/environments/site/tofu/node_group/main.tf b/environments/site/tofu/node_group/main.tf index f298284..03fbec4 100644 --- a/environments/site/tofu/node_group/main.tf +++ b/environments/site/tofu/node_group/main.tf @@ -2,7 +2,7 @@ terraform { required_version = ">= 0.14" required_providers { openstack = { - source = "terraform-provider-openstack/openstack" + source = "terraform-provider-openstack/openstack" version = "~>3.0.0" } } diff --git a/environments/site/tofu/node_group/network.tf b/environments/site/tofu/node_group/network.tf index f5763b9..5a66d32 100644 --- a/environments/site/tofu/node_group/network.tf +++ b/environments/site/tofu/node_group/network.tf @@ -1,14 +1,14 @@ data "openstack_networking_network_v2" "network" { - for_each = {for net in var.networks: net.network => net} + for_each = { for net in var.networks : net.network => net } name = each.value.network } data "openstack_networking_subnet_v2" "subnet" { - for_each = {for net in var.networks: net.network => net} + for_each = { for net in var.networks : net.network => net } name = each.value.subnet } diff --git a/environments/site/tofu/node_group/nodes.tf b/environments/site/tofu/node_group/nodes.tf index 45cd449..d02028f 100644 --- a/environments/site/tofu/node_group/nodes.tf +++ b/environments/site/tofu/node_group/nodes.tf @@ -1,5 +1,5 @@ locals { - all_compute_volumes = {for v in setproduct(var.nodes, keys(var.extra_volumes)): "${v[0]}-${v[1]}" => {"node" = v[0], "volume" = v[1]}} + all_compute_volumes = { for v in setproduct(var.nodes, keys(var.extra_volumes)) : "${v[0]}-${v[1]}" => { "node" = v[0], "volume" = v[1] } } # e.g. with # var.nodes = ["compute-0", "compute-1"] # var.extra_volumes = { @@ -12,16 +12,16 @@ locals { # Workaround for lifecycle meta-argument only taking static values compute_instances = var.ignore_image_changes ? 
openstack_compute_instance_v2.compute_fixed_image : openstack_compute_instance_v2.compute - + # Define fully qualified nodenames here to avoid repetition fqdns = { - for n in var.nodes: n => templatestring( + for n in var.nodes : n => templatestring( var.nodename_template, { - node = n, - cluster_name = var.cluster_name, + node = n, + cluster_name = var.cluster_name, cluster_domain_suffix = var.cluster_domain_suffix, - environment_name = basename(var.environment_root) + environment_name = basename(var.environment_root) } ) } @@ -31,40 +31,40 @@ locals { resource "openstack_blockstorage_volume_v3" "compute" { - for_each = local.all_compute_volumes + for_each = local.all_compute_volumes - name = "${var.cluster_name}-${each.key}" - description = "Compute node ${each.value.node} volume ${each.value.volume}" - size = var.extra_volumes[each.value.volume].size - volume_type = var.extra_volumes[each.value.volume].volume_type + name = "${var.cluster_name}-${each.key}" + description = "Compute node ${each.value.node} volume ${each.value.volume}" + size = var.extra_volumes[each.value.volume].size + volume_type = var.extra_volumes[each.value.volume].volume_type } resource "openstack_compute_volume_attach_v2" "compute" { for_each = local.all_compute_volumes - instance_id = local.compute_instances["${each.value.node}"].id - volume_id = openstack_blockstorage_volume_v3.compute["${each.key}"].id + instance_id = local.compute_instances[each.value.node].id + volume_id = openstack_blockstorage_volume_v3.compute[each.key].id } resource "openstack_networking_port_v2" "compute" { - for_each = {for item in setproduct(var.nodes, var.networks): + for_each = { for item in setproduct(var.nodes, var.networks) : "${item[0]}-${item[1].network}" => { - node_idx = index(var.nodes, item[0]) - net = item[1] - } + node_idx = index(var.nodes, item[0]) + net = item[1] + } } - name = "${var.cluster_name}-${each.key}" - network_id = data.openstack_networking_network_v2.network[each.value.net.network].id + name = "${var.cluster_name}-${each.key}" + network_id = data.openstack_networking_network_v2.network[each.value.net.network].id admin_state_up = "true" fixed_ip { - subnet_id = data.openstack_networking_subnet_v2.subnet[each.value.net.network].id + subnet_id = data.openstack_networking_subnet_v2.subnet[each.value.net.network].id ip_address = try(var.ip_addresses[each.value.net.network][each.value.node_idx], null) } - + no_security_groups = lookup(each.value.net, "no_security_groups", false) security_group_ids = lookup(each.value.net, "no_security_groups", false) ? [] : var.security_group_ids @@ -77,28 +77,28 @@ resource "openstack_compute_instance_v2" "compute_fixed_image" { for_each = var.ignore_image_changes ? toset(var.nodes) : [] - name = split(".", local.fqdns[each.key])[0] - image_id = var.image_id + name = split(".", local.fqdns[each.key])[0] + image_id = var.image_id flavor_name = var.flavor - key_pair = var.key_pair + key_pair = var.key_pair dynamic "block_device" { - for_each = var.volume_backed_instances ? [1]: [] + for_each = var.volume_backed_instances ? 
[1] : [] content { - uuid = var.image_id - source_type = "image" - destination_type = "volume" - volume_size = var.root_volume_size - volume_type = var.root_volume_type - boot_index = 0 + uuid = var.image_id + source_type = "image" + destination_type = "volume" + volume_size = var.root_volume_size + volume_type = var.root_volume_type + boot_index = 0 delete_on_termination = true } } dynamic "network" { - for_each = {for net in var.networks: net.network => net} + for_each = { for net in var.networks : net.network => net } content { - port = openstack_networking_port_v2.compute["${each.key}-${network.key}"].id + port = openstack_networking_port_v2.compute["${each.key}-${network.key}"].id access_network = network.key == var.networks[0].network } } @@ -112,12 +112,12 @@ resource "openstack_compute_instance_v2" "compute_fixed_image" { metadata = merge( { - environment_root = var.environment_root - control_address = var.control_address - access_ip = openstack_networking_port_v2.compute["${each.key}-${var.networks[0].network}"].all_fixed_ips[0] - gateway_ip = var.gateway_ip + environment_root = var.environment_root + control_address = var.control_address + access_ip = openstack_networking_port_v2.compute["${each.key}-${var.networks[0].network}"].all_fixed_ips[0] + gateway_ip = var.gateway_ip }, - {for e in var.compute_init_enable: e => true} + { for e in var.compute_init_enable : e => true } ) user_data = <<-EOF @@ -144,29 +144,29 @@ resource "openstack_compute_instance_v2" "compute_fixed_image" { resource "openstack_compute_instance_v2" "compute" { for_each = var.ignore_image_changes ? [] : toset(var.nodes) - - name = split(".", local.fqdns[each.key])[0] - image_id = var.image_id + + name = split(".", local.fqdns[each.key])[0] + image_id = var.image_id flavor_name = var.flavor - key_pair = var.key_pair + key_pair = var.key_pair dynamic "block_device" { - for_each = var.volume_backed_instances ? [1]: [] + for_each = var.volume_backed_instances ? 
[1] : [] content { - uuid = var.image_id - source_type = "image" - destination_type = "volume" - volume_size = var.root_volume_size - volume_type = var.root_volume_type - boot_index = 0 + uuid = var.image_id + source_type = "image" + destination_type = "volume" + volume_size = var.root_volume_size + volume_type = var.root_volume_type + boot_index = 0 delete_on_termination = true } } - + dynamic "network" { - for_each = {for net in var.networks: net.network => net} + for_each = { for net in var.networks : net.network => net } content { - port = openstack_networking_port_v2.compute["${each.key}-${network.key}"].id + port = openstack_networking_port_v2.compute["${each.key}-${network.key}"].id access_network = network.key == var.networks[0].network } } @@ -180,12 +180,12 @@ resource "openstack_compute_instance_v2" "compute" { metadata = merge( { - environment_root = var.environment_root - control_address = var.control_address - access_ip = openstack_networking_port_v2.compute["${each.key}-${var.networks[0].network}"].all_fixed_ips[0] - gateway_ip = var.gateway_ip + environment_root = var.environment_root + control_address = var.control_address + access_ip = openstack_networking_port_v2.compute["${each.key}-${var.networks[0].network}"].all_fixed_ips[0] + gateway_ip = var.gateway_ip }, - {for e in var.compute_init_enable: e => true} + { for e in var.compute_init_enable : e => true } ) user_data = <<-EOF @@ -204,7 +204,7 @@ resource "openstack_compute_instance_v2" "compute" { } resource "openstack_networking_floatingip_associate_v2" "fip" { - for_each = {for idx in range(length(var.fip_addresses)): var.nodes[idx] => var.fip_addresses[idx]} # zip, fip_addresses can be shorter + for_each = { for idx in range(length(var.fip_addresses)) : var.nodes[idx] => var.fip_addresses[idx] } # zip, fip_addresses can be shorter floating_ip = each.value port_id = openstack_networking_port_v2.compute["${each.key}-${length(var.networks) == 1 ? 
var.networks[0].network : var.fip_network}"].id diff --git a/environments/site/tofu/node_group/variables.tf b/environments/site/tofu/node_group/variables.tf index 0a129ab..352b577 100644 --- a/environments/site/tofu/node_group/variables.tf +++ b/environments/site/tofu/node_group/variables.tf @@ -1,61 +1,61 @@ variable "nodes" { - type = list(string) - description = "List of node names for node group" + type = list(string) + description = "List of node names for node group" } variable "flavor" { - type = string - description = "Name of flavor for node group" + type = string + description = "Name of flavor for node group" } variable "cluster_name" { - type = string + type = string } variable "cluster_domain_suffix" { - type = string - default = "invalid" + type = string + default = "invalid" } variable "key_pair" { - type = string - description = "Name of an existing keypair in OpenStack" + type = string + description = "Name of an existing keypair in OpenStack" } variable "image_id" { - type = string - description = "ID of image for the node group" + type = string + description = "ID of image for the node group" } variable "environment_root" { - type = string - description = "Path to environment root, automatically set by activate script" + type = string + description = "Path to environment root, automatically set by activate script" } variable "vnic_types" { - type = map(string) - default = {} + type = map(string) + default = {} } variable "volume_backed_instances" { - description = "Whether to use volumes for root disks" - type = bool - default = false + description = "Whether to use volumes for root disks" + type = bool + default = false } variable "root_volume_size" { - description = "Size of volume for root volumes if using volume backed instances, in Gb" - type = number - default = 40 + description = "Size of volume for root volumes if using volume backed instances, in Gb" + type = number + default = 40 } variable "root_volume_type" { - type = string - default = null + type = string + default = null } variable "extra_volumes" { - description = <<-EOF + description = <<-EOF Mapping defining additional volumes to create and attach. Keys are unique volume name. Values are a mapping with: @@ -63,153 +63,154 @@ variable "extra_volumes" { volume_type: Optional. 
Type of volume, or cloud default **NB**: The order in /dev is not guaranteed to match the mapping EOF - type = map( - object({ - size = number - volume_type = optional(string) - }) - ) - default = {} - nullable = false + type = map( + object({ + size = number + volume_type = optional(string) + }) + ) + default = {} + nullable = false } variable "security_group_ids" { - type = list(string) - nullable = false + type = list(string) + nullable = false } variable "control_address" { - description = "Name/address of control node" - type = string + description = "Name/address of control node" + type = string } variable "compute_init_enable" { - type = list(string) - description = "Groups to activate for ansible-init compute rebuilds" - default = [] - nullable = false + type = list(string) + description = "Groups to activate for ansible-init compute rebuilds" + default = [] + nullable = false } variable "ignore_image_changes" { - type = bool - description = "Whether to ignore changes to the image_id parameter" - default = false - nullable = false + type = bool + description = "Whether to ignore changes to the image_id parameter" + default = false + nullable = false } variable "networks" { - type = list(map(string)) + type = list(map(string)) } variable "fip_addresses" { - type = list(string) - description = <<-EOT + type = list(string) + description = <<-EOT List of addresses of floating IPs to associate with nodes, in same order as nodes parameter. The floating IPs must already be allocated to the project. EOT - default = [] - nullable = false + default = [] + nullable = false } variable "fip_network" { - type = string - description = <<-EOT + type = string + description = <<-EOT Name of network containing ports to attach FIPs to. Only required if multiple networks are defined. EOT - default = "" - nullable = false + default = "" + nullable = false } variable "ip_addresses" { - type = map(list(string)) - description = <<-EOT + type = map(list(string)) + description = <<-EOT Mapping of list of fixed IP addresses for nodes, keyed by network name, in same order as nodes parameter. For any networks not specified here the cloud will select addresses. NB: Changing IP addresses after deployment may hit terraform provider bugs. 
EOT - default = {} - nullable = false - validation { - condition = length(setsubtract(keys(var.ip_addresses), var.networks[*].network)) == 0 - error_message = "Keys in ip_addresses for nodegroup \"${var.group_name}\" must match network names in var.cluster_networks" - } - validation { - condition = alltrue([for v in values(var.ip_addresses): length(v) == length(var.nodes)]) - error_message = "Values in ip_addresses for nodegroup \"${var.group_name}\" must be a list of the same length as var.nodes" - } + default = {} + nullable = false + validation { + condition = length(setsubtract(keys(var.ip_addresses), var.networks[*].network)) == 0 + error_message = "Keys in ip_addresses for nodegroup \"${var.group_name}\" must match network names in var.cluster_networks" + } + validation { + condition = alltrue([for v in values(var.ip_addresses) : length(v) == length(var.nodes)]) + error_message = "Values in ip_addresses for nodegroup \"${var.group_name}\" must be a list of the same length as var.nodes" + } } variable "match_ironic_node" { - type = bool - description = "Whether to launch instances on the Ironic node of the same name as each cluster node" - default = false - nullable = false + type = bool + description = "Whether to launch instances on the Ironic node of the same name as each cluster node" + default = false + nullable = false } variable "availability_zone" { - type = string - description = "Name of availability zone. If undefined, defaults to 'nova' if match_ironic_node is true, deferred to OpenStack otherwise" - default = null + type = string + description = "Name of availability zone. If undefined, defaults to 'nova' if match_ironic_node is true, deferred to OpenStack otherwise" + default = null } variable "baremetal_nodes" { - type = map(string) - default = {} + type = map(string) + default = {} } variable "gateway_ip" { - type = string - default = "" + type = string + default = "" } variable "nodename_template" { - type = string - default = "" + type = string + default = "" } variable "group_name" { - type = string + type = string } +# tflint-ignore: terraform_unused_declarations variable "group_keys" { - type = list - validation { - condition = length(setsubtract(var.group_keys, var.allowed_keys)) == 0 - error_message = <<-EOT + type = list(any) + validation { + condition = length(setsubtract(var.group_keys, var.allowed_keys)) == 0 + error_message = <<-EOT Node group '${var.group_name}' contains invalid key(s) ${ - join(", ", setsubtract(var.group_keys, var.allowed_keys))}. + join(", ", setsubtract(var.group_keys, var.allowed_keys))}. Valid keys are ${join(", ", var.allowed_keys)}. 
EOT - } +} } variable "allowed_keys" { - type = list - # don't provide a default here as allowed keys may depend on module use + type = list(any) + # don't provide a default here as allowed keys may depend on module use } variable "config_drive" { - type = bool + type = bool } variable "additional_cloud_config" { - type = string - default = "" - nullable = false + type = string + default = "" + nullable = false } variable "additional_cloud_config_vars" { - type = map(any) - default = {} - nullable = false + type = map(any) + default = {} + nullable = false } variable "server_group_id" { - type = string - default = null + type = string + default = null } diff --git a/environments/site/tofu/read-inventory-secrets.py b/environments/site/tofu/read-inventory-secrets.py index e3de2f4..85ac0a9 100755 --- a/environments/site/tofu/read-inventory-secrets.py +++ b/environments/site/tofu/read-inventory-secrets.py @@ -1,36 +1,43 @@ #!/usr/bin/env python -""" opentofu external data program to load inventory string variables from - a (possibly vault-encrypted) secrets file. +# pylint: disable=invalid-name +"""opentofu external data program to load inventory string variables from +a (possibly vault-encrypted) secrets file. - Example usage: +Example usage: - data "external" "example" { - program = [this_file] + data "external" "example" { + program = [this_file] - query = { - path = "${path.module}/../inventory/group_vars/all/secrets.yml" - } + query = { + path = "${path.module}/../inventory/group_vars/all/secrets.yml" } + } - The external data resource's result attribute then contains a mapping of - variable names to values. +The external data resource's result attribute then contains a mapping of +variable names to values. - NB: Only keys/values where values are strings are returned, in line with - the external program protocol. +NB: Only keys/values where values are strings are returned, in line with +the external program protocol. - NB: This approach is better than e.g. templating inventory vars as the - inventory doesn't need to be valid, which is helpful when opentofu will - template out hosts/groups. +NB: This approach is better than e.g. templating inventory vars as the +inventory doesn't need to be valid, which is helpful when opentofu will +template out hosts/groups. 
""" -import sys, json, subprocess, yaml -input = sys.stdin.read() -secrets_path = json.loads(input)['path'] +import json +import subprocess +import sys -with open(secrets_path) as f: +import yaml # pylint: disable=import-error + +input = sys.stdin.read() # pylint: disable=redefined-builtin +secrets_path = json.loads(input)["path"] + +with open(secrets_path) as f: # pylint: disable=unspecified-encoding header = f.readline() - if header.startswith('$ANSIBLE_VAULT'): - cmd = ['ansible-vault', 'view', secrets_path] + if header.startswith("$ANSIBLE_VAULT"): + cmd = ["ansible-vault", "view", secrets_path] + # pylint: disable-next=subprocess-run-check ansible = subprocess.run(cmd, capture_output=True, text=True) contents = ansible.stdout else: diff --git a/environments/site/tofu/variables.tf b/environments/site/tofu/variables.tf index 3402c3a..98f364a 100644 --- a/environments/site/tofu/variables.tf +++ b/environments/site/tofu/variables.tf @@ -1,350 +1,351 @@ variable "cluster_name" { - type = string - description = "Name of cluster, used as part of domain name" + type = string + description = "Name of cluster, used as part of domain name" } variable "cluster_domain_suffix" { - type = string - description = "Domain suffix for cluster" - default = "internal" + type = string + description = "Domain suffix for cluster" + default = "internal" } variable "cluster_networks" { - type = list(map(string)) - description = <<-EOT - List of mappings defining networks. Mapping key/values: - network: Required. Name of existing network - subnet: Required. Name of existing subnet - no_security_groups: Optional. Bool (default: false). Disable security groups - EOT + type = list(map(string)) + description = <<-EOT + List of mappings defining networks. Mapping key/values: + network: Required. Name of existing network + subnet: Required. Name of existing subnet + no_security_groups: Optional. Bool (default: false). Disable security groups + EOT } variable "key_pair" { - type = string - description = "Name of an existing keypair in OpenStack" + type = string + description = "Name of an existing keypair in OpenStack" } variable "control_ip_addresses" { - type = map(string) - description = <<-EOT - Mapping of fixed IP addresses for control node, keyed by network name. - For any networks not specified here the cloud will select an address. - - NB: Changing IP addresses after deployment may hit terraform provider bugs. - EOT - default = {} - validation { - # check all keys are network names in cluster_networks - condition = length(setsubtract(keys(var.control_ip_addresses), var.cluster_networks[*].network)) == 0 - error_message = "Keys in var.control_ip_addresses must match network names in var.cluster_networks" - } + type = map(string) + description = <<-EOT + Mapping of fixed IP addresses for control node, keyed by network name. + For any networks not specified here the cloud will select an address. + + NB: Changing IP addresses after deployment may hit terraform provider bugs. 
+ EOT + default = {} + validation { + # check all keys are network names in cluster_networks + condition = length(setsubtract(keys(var.control_ip_addresses), var.cluster_networks[*].network)) == 0 + error_message = "Keys in var.control_ip_addresses must match network names in var.cluster_networks" + } } variable "control_node_flavor" { - type = string - description = "Flavor name for control node" + type = string + description = "Flavor name for control node" } variable "login" { - default = {} - description = <<-EOF - Mapping defining homogenous groups of login nodes. Multiple groups may - be useful for e.g. separating nodes for ssh and Open Ondemand usage, or - to define login nodes with different capabilities such as high-memory. - - Keys are names of groups. - Values are a mapping as follows: - - Required: - nodes: List of node names - flavor: String flavor name - Optional: - image_id: Overrides variable cluster_image_id - extra_networks: List of mappings in same format as cluster_networks - vnic_types: Overrides variable vnic_types - volume_backed_instances: Overrides variable volume_backed_instances - root_volume_size: Overrides variable root_volume_size - extra_volumes: Mapping defining additional volumes to create and attach - Keys are unique volume name. - Values are a mapping with: - size: Size of volume in GB - volume_type: Optional. Type of volume, or cloud default - **NB**: The order in /dev is not guaranteed to match the mapping - fip_addresses: List of addresses of floating IPs to associate with - nodes, in the same order as nodes parameter. The - floating IPs must already be allocated to the project. - fip_network: Name of network containing ports to attach FIPs to. Only - required if multiple networks are defined. - ip_addresses: Mapping of list of fixed IP addresses for nodes, keyed - by network name, in same order as nodes parameter. - For any networks not specified here the cloud will - select addresses. - match_ironic_node: Set true to launch instances on the Ironic node of the same name as each cluster node - availability_zone: Name of availability zone. If undefined, defaults to 'nova' - if match_ironic_node is true, defered to OpenStack otherwise - gateway_ip: Address to add default route via - nodename_template: Overrides variable cluster_nodename_template - server_group_id: String ID of server group to use for scheduler hint - EOF - - type = any + default = {} + description = <<-EOF + Mapping defining homogenous groups of login nodes. Multiple groups may + be useful for e.g. separating nodes for ssh and Open Ondemand usage, or + to define login nodes with different capabilities such as high-memory. + + Keys are names of groups. + Values are a mapping as follows: + + Required: + nodes: List of node names + flavor: String flavor name + Optional: + image_id: Overrides variable cluster_image_id + extra_networks: List of mappings in same format as cluster_networks + vnic_types: Overrides variable vnic_types + volume_backed_instances: Overrides variable volume_backed_instances + root_volume_size: Overrides variable root_volume_size + extra_volumes: Mapping defining additional volumes to create and attach + Keys are unique volume name. + Values are a mapping with: + size: Size of volume in GB + volume_type: Optional. Type of volume, or cloud default + **NB**: The order in /dev is not guaranteed to match the mapping + fip_addresses: List of addresses of floating IPs to associate with + nodes, in the same order as nodes parameter. 
The + floating IPs must already be allocated to the project. + fip_network: Name of network containing ports to attach FIPs to. Only + required if multiple networks are defined. + ip_addresses: Mapping of list of fixed IP addresses for nodes, keyed + by network name, in same order as nodes parameter. + For any networks not specified here the cloud will + select addresses. + match_ironic_node: Set true to launch instances on the Ironic node of the same name as each cluster node + availability_zone: Name of availability zone. If undefined, defaults to 'nova' + if match_ironic_node is true, defered to OpenStack otherwise + gateway_ip: Address to add default route via + nodename_template: Overrides variable cluster_nodename_template + server_group_id: String ID of server group to use for scheduler hint + EOF + + type = any } variable "cluster_image_id" { - type = string - description = "ID of default image for the cluster" + type = string + description = "ID of default image for the cluster" } variable "compute" { - default = {} - description = <<-EOF - Mapping defining homogenous groups of compute nodes. Groups are used - in Slurm partition definitions. - - Keys are names of groups. - Values are a mapping as follows: - - Required: - nodes: List of node names - flavor: String flavor name - Optional: - image_id: Overrides variable cluster_image_id - extra_networks: List of mappings in same format as cluster_networks - vnic_types: Overrides variable vnic_types - compute_init_enable: Toggles compute-init rebuild (see compute-init role docs) - ignore_image_changes: Ignore changes to the image_id parameter (see docs/experimental/compute-init.md) - volume_backed_instances: Overrides variable volume_backed_instances - root_volume_size: Overrides variable root_volume_size - extra_volumes: Mapping defining additional volumes to create and attach - Keys are unique volume name. - Values are a mapping with: - size: Size of volume in GB - volume_type: Optional. Type of volume, or cloud default - **NB**: The order in /dev is not guaranteed to match the mapping - ip_addresses: Mapping of list of fixed IP addresses for nodes, keyed - by network name, in same order as nodes parameter. - For any networks not specified here the cloud will - select addresses. - match_ironic_node: Set true to launch instances on the Ironic node of the same name as each cluster node - availability_zone: Name of availability zone. If undefined, defaults to 'nova' - if match_ironic_node is true, defered to OpenStack otherwise - gateway_ip: Address to add default route via - nodename_template: Overrides variable cluster_nodename_template - server_group_id: String ID of server group to use for scheduler hint - - Nodes are added to the following inventory groups: - - $group_name - - $cluster_name + '_' + $group_name - this is used for the stackhpc.openhpc role - - 'compute' - EOF - - type = any # can't do any better; TF type constraints can't cope with heterogeneous inner mappings + default = {} + description = <<-EOF + Mapping defining homogenous groups of compute nodes. Groups are used + in Slurm partition definitions. + + Keys are names of groups. 
+ Values are a mapping as follows: + + Required: + nodes: List of node names + flavor: String flavor name + Optional: + image_id: Overrides variable cluster_image_id + extra_networks: List of mappings in same format as cluster_networks + vnic_types: Overrides variable vnic_types + compute_init_enable: Toggles compute-init rebuild (see compute-init role docs) + ignore_image_changes: Ignore changes to the image_id parameter (see docs/experimental/compute-init.md) + volume_backed_instances: Overrides variable volume_backed_instances + root_volume_size: Overrides variable root_volume_size + extra_volumes: Mapping defining additional volumes to create and attach + Keys are unique volume name. + Values are a mapping with: + size: Size of volume in GB + volume_type: Optional. Type of volume, or cloud default + **NB**: The order in /dev is not guaranteed to match the mapping + ip_addresses: Mapping of list of fixed IP addresses for nodes, keyed + by network name, in same order as nodes parameter. + For any networks not specified here the cloud will + select addresses. + match_ironic_node: Set true to launch instances on the Ironic node of the same name as each cluster node + availability_zone: Name of availability zone. If undefined, defaults to 'nova' + if match_ironic_node is true, defered to OpenStack otherwise + gateway_ip: Address to add default route via + nodename_template: Overrides variable cluster_nodename_template + server_group_id: String ID of server group to use for scheduler hint + + Nodes are added to the following inventory groups: + - $group_name + - $cluster_name + '_' + $group_name - this is used for the stackhpc.openhpc role + - 'compute' + EOF + + type = any # can't do any better; TF type constraints can't cope with heterogeneous inner mappings } +# tflint-ignore: terraform_typed_variables variable "additional_nodegroups" { - default = {} - description = <<-EOF - Mapping defining homogenous groups of nodes for arbitrary purposes. - These nodes are not in the compute or login inventory groups so they - will not run slurmd. - - Keys are names of groups. - Values are a mapping as for the "login" variable, with the addition of - the optional entry: + default = {} + description = <<-EOF + Mapping defining homogenous groups of nodes for arbitrary purposes. + These nodes are not in the compute or login inventory groups so they + will not run slurmd. + + Keys are names of groups. + Values are a mapping as for the "login" variable, with the addition of + the optional entry: - security_group_ids: List of strings giving IDs of security groups - to apply. If not specified the groups from the - variable nonlogin_security_groups are applied. - - Nodes are added to the following inventory groups: - - $group_name - - $cluster_name + '_' + $group_name - - 'additional' - EOF + security_group_ids: List of strings giving IDs of security groups + to apply. If not specified the groups from the + variable nonlogin_security_groups are applied. 
+ + Nodes are added to the following inventory groups: + - $group_name + - $cluster_name + '_' + $group_name + - 'additional' + EOF } variable "environment_root" { - type = string - description = "Path to environment root, automatically set by activate script" + type = string + description = "Path to environment root, automatically set by activate script" } variable "state_dir" { - type = string - description = "Path to state directory on control node" - default = "/var/lib/state" + type = string + description = "Path to state directory on control node" + default = "/var/lib/state" } variable "state_volume_size" { - type = number - description = "Size of state volume on control node, in GB" - default = 150 # GB + type = number + description = "Size of state volume on control node, in GB" + default = 150 # GB } variable "state_volume_type" { - type = string - description = "Type of state volume, if not default type" - default = null + type = string + description = "Type of state volume, if not default type" + default = null } variable "state_volume_provisioning" { - type = string - default = "manage" - description = <<-EOT - How to manage the state volume. Valid values are: - "manage": (Default) OpenTofu will create a volume "$cluster_name-state" - and delete it when the cluster is destroyed. A volume - with this name must not already exist. Use for demo and - dev environments. - "attach": A single volume named "$cluster_name-state" must already - exist. It is not managed by OpenTofu so e.g. is left - intact if the cluster is destroyed. Use for production - environments. - EOT - validation { - condition = contains(["manage", "attach"], var.state_volume_provisioning) - error_message = <<-EOT - state_volume_provisioning must be "manage" or "attach" + type = string + default = "manage" + description = <<-EOT + How to manage the state volume. Valid values are: + "manage": (Default) OpenTofu will create a volume "$cluster_name-state" + and delete it when the cluster is destroyed. A volume + with this name must not already exist. Use for demo and + dev environments. + "attach": A single volume named "$cluster_name-state" must already + exist. It is not managed by OpenTofu so e.g. is left + intact if the cluster is destroyed. Use for production + environments. + EOT + validation { + condition = contains(["manage", "attach"], var.state_volume_provisioning) + error_message = <<-EOT + state_volume_provisioning must be "manage" or "attach" EOT - } + } } variable "home_volume_size" { - type = number - description = "Size of state volume on control node, in GB." - default = 100 - validation { - condition = var.home_volume_provisioning == "manage" ? var.home_volume_size > 0 : true - error_message = <<-EOT - home_volume_size must be > 0 when var.home_volume_provisioning == "manage" - EOT - } + type = number + description = "Size of state volume on control node, in GB." + default = 100 + validation { + condition = var.home_volume_provisioning == "manage" ? var.home_volume_size > 0 : true + error_message = <<-EOT + home_volume_size must be > 0 when var.home_volume_provisioning == "manage" + EOT + } } variable "home_volume_type" { - type = string - default = null - description = "Type of home volume, if not default type" + type = string + default = null + description = "Type of home volume, if not default type" } variable "home_volume_provisioning" { - type = string - default = "manage" - description = <<-EOT - How to manage the home volume. 
Valid values are: - "manage": (Default) OpenTofu will create a volume "$cluster_name-home" - and delete it when the cluster is destroyed. A volume - with this name must not already exist. Use for demo and - dev environments. - "attach": A single volume named "$cluster_name-home" must already - exist. It is not managed by OpenTofu so e.g. is left - intact if the cluster is destroyed. Use for production - environments. - "none": No home volume is used. Use if /home is provided by - a parallel filesystem, e.g. manila. - EOT - validation { - condition = contains(["manage", "attach", "none"], var.home_volume_provisioning) - error_message = <<-EOT - home_volume_provisioning must be one of "manage", "attach" or "none" + type = string + default = "manage" + description = <<-EOT + How to manage the home volume. Valid values are: + "manage": (Default) OpenTofu will create a volume "$cluster_name-home" + and delete it when the cluster is destroyed. A volume + with this name must not already exist. Use for demo and + dev environments. + "attach": A single volume named "$cluster_name-home" must already + exist. It is not managed by OpenTofu so e.g. is left + intact if the cluster is destroyed. Use for production + environments. + "none": No home volume is used. Use if /home is provided by + a parallel filesystem, e.g. manila. + EOT + validation { + condition = contains(["manage", "attach", "none"], var.home_volume_provisioning) + error_message = <<-EOT + home_volume_provisioning must be one of "manage", "attach" or "none" EOT - } + } } variable "vnic_types" { - type = map(string) - description = <<-EOT - Default VNIC types, keyed by network name. See https://registry.terraform.io/providers/terraform-provider-openstack/openstack/latest/docs/resources/networking_port_v2#vnic_type - If not given this defaults to the "normal" type. - EOT - default = {} + type = map(string) + description = <<-EOT + Default VNIC types, keyed by network name. See https://registry.terraform.io/providers/terraform-provider-openstack/openstack/latest/docs/resources/networking_port_v2#vnic_type + If not given this defaults to the "normal" type. 
+ EOT + default = {} } variable "login_security_groups" { - type = list(string) - description = "Name of preexisting security groups to apply to login nodes" - default = [ - "default", # allow all in-cluster services - "SSH", # access via ssh - "HTTPS", # access OpenOndemand - ] + type = list(string) + description = "Name of preexisting security groups to apply to login nodes" + default = [ + "default", # allow all in-cluster services + "SSH", # access via ssh + "HTTPS", # access OpenOndemand + ] } variable "nonlogin_security_groups" { - type = list(string) - description = "Name of preexisting security groups to apply to non-login nodes" - default = [ - "default", # allow all in-cluster services - ] + type = list(string) + description = "Name of preexisting security groups to apply to non-login nodes" + default = [ + "default", # allow all in-cluster services + ] } variable "volume_backed_instances" { - description = "Whether to use volumes for root disks" - type = bool - default = false + description = "Whether to use volumes for root disks" + type = bool + default = false } variable "root_volume_size" { - description = "Size of volume for root volumes if using volume backed instances, in Gb" - type = number - default = 40 + description = "Size of volume for root volumes if using volume backed instances, in Gb" + type = number + default = 40 } variable "root_volume_type" { - description = "Type of root volume, if using volume backed instances. If unset, the target cloud default volume type is used." - type = string - default = null + description = "Type of root volume, if using volume backed instances. If unset, the target cloud default volume type is used." + type = string + default = null } variable "gateway_ip" { - description = "Address to add default route via" - type = string - default = "" + description = "Address to add default route via" + type = string + default = "" } variable "cluster_nodename_template" { - description = <<-EOT - Template for node fully-qualified names. The following interpolations - can be used: - $${cluster_name}: From var.cluster_name - $${cluster_domain_suffix}: From var.cluster_domain_suffix - $${node}: The current entry in the "nodes" parameter for nodes - defined by var.compute and var.login, or "control" for the control - node - $${environment_name}: The last element of the current environment's path - EOT - type = string - default = "$${cluster_name}-$${node}.$${cluster_name}.$${cluster_domain_suffix}" + description = <<-EOT + Template for node fully-qualified names. The following interpolations + can be used: + $${cluster_name}: From var.cluster_name + $${cluster_domain_suffix}: From var.cluster_domain_suffix + $${node}: The current entry in the "nodes" parameter for nodes + defined by var.compute and var.login, or "control" for the control + node + $${environment_name}: The last element of the current environment's path + EOT + type = string + default = "$${cluster_name}-$${node}.$${cluster_name}.$${cluster_domain_suffix}" } variable "config_drive" { - description = <<-EOT - Whether to enable Nova config drives on all nodes, which will attach a drive containing - information usually provided through the metadata service. - EOT - type = bool - default = null + description = <<-EOT + Whether to enable Nova config drives on all nodes, which will attach a drive containing + information usually provided through the metadata service. 
+ EOT + type = bool + default = null } variable "additional_cloud_config" { - description = <<-EOT - Multiline string to be appended to the node's cloud-init cloud-config user-data. - Must be in yaml format and not include the #cloud-config or any other user-data headers. - See https://cloudinit.readthedocs.io/en/latest/explanation/format.html#cloud-config-data. - Can be a templatestring parameterised by `additional_cloud_config_vars`. - The `boot-cmd`, `fqdn` and `mounts` modules must not be specified. - EOT - type = string - default = "" + description = <<-EOT + Multiline string to be appended to the node's cloud-init cloud-config user-data. + Must be in yaml format and not include the #cloud-config or any other user-data headers. + See https://cloudinit.readthedocs.io/en/latest/explanation/format.html#cloud-config-data. + Can be a templatestring parameterised by `additional_cloud_config_vars`. + The `boot-cmd`, `fqdn` and `mounts` modules must not be specified. + EOT + type = string + default = "" } variable "additional_cloud_config_vars" { - description = "Map of values passed to the `additional_cloud_config` templatestring" - type = map(any) - default = {} + description = "Map of values passed to the `additional_cloud_config` templatestring" + type = map(any) + default = {} } variable "control_server_group_id" { - description = "ID of server group to use for control node scheduler hint" - type = string - default = null + description = "ID of server group to use for control node scheduler hint" + type = string + default = null } diff --git a/environments/site/tofu/volumes.tf b/environments/site/tofu/volumes.tf index 18a6a09..46b63eb 100644 --- a/environments/site/tofu/volumes.tf +++ b/environments/site/tofu/volumes.tf @@ -1,59 +1,59 @@ resource "openstack_blockstorage_volume_v3" "state" { - # NB: Changes to this resource's "address" i.e. (label or for_each key) - # may lose state data for existing clusters using this volume + # NB: Changes to this resource's "address" i.e. (label or for_each key) + # may lose state data for existing clusters using this volume - count = var.state_volume_provisioning == "manage" ? 1 : 0 + count = var.state_volume_provisioning == "manage" ? 1 : 0 - name = "${var.cluster_name}-state" # last word used to label filesystem - description = "State for control node" - size = var.state_volume_size - volume_type = var.state_volume_type + name = "${var.cluster_name}-state" # last word used to label filesystem + description = "State for control node" + size = var.state_volume_size + volume_type = var.state_volume_type } data "openstack_blockstorage_volume_v3" "state" { -/* We use a data resource whether or not TF is managing the volume, so the + /* We use a data resource whether or not TF is managing the volume, so the logic is all in one place. But that means this needs a dependency on the actual resource to avoid a race. Because there may be no volume, this has to use for_each. */ - for_each = toset( - (var.state_volume_provisioning == "manage") ? - [for v in openstack_blockstorage_volume_v3.state: v.name] : - ["${var.cluster_name}-state"] - ) + for_each = toset( + (var.state_volume_provisioning == "manage") ? + [for v in openstack_blockstorage_volume_v3.state : v.name] : + ["${var.cluster_name}-state"] + ) - name = each.key + name = each.key } resource "openstack_blockstorage_volume_v3" "home" { - # NB: Changes to this resource's "address" i.e. 
(label or for_each key) - # may lose user data for existing clusters using this volume + # NB: Changes to this resource's "address" i.e. (label or for_each key) + # may lose user data for existing clusters using this volume - count = var.home_volume_provisioning == "manage" ? 1 : 0 + count = var.home_volume_provisioning == "manage" ? 1 : 0 - name = "${var.cluster_name}-home" # last word used to label filesystem - description = "Home for control node" - size = var.home_volume_size - volume_type = var.home_volume_type + name = "${var.cluster_name}-home" # last word used to label filesystem + description = "Home for control node" + size = var.home_volume_size + volume_type = var.home_volume_type } data "openstack_blockstorage_volume_v3" "home" { -/* Comments as for the state volume. */ + /* Comments as for the state volume. */ - for_each = toset( - (var.home_volume_provisioning == "manage") ? - [for v in openstack_blockstorage_volume_v3.home: v.name] : - (var.home_volume_provisioning == "attach") ? - ["${var.cluster_name}-home"] : - [] - ) + for_each = toset( + (var.home_volume_provisioning == "manage") ? + [for v in openstack_blockstorage_volume_v3.home : v.name] : + (var.home_volume_provisioning == "attach") ? + ["${var.cluster_name}-home"] : + [] + ) - name = each.key + name = each.key } diff --git a/packer/openhpc_extravars.yml b/packer/openhpc_extravars.yml index 66f6686..8bfcd59 100644 --- a/packer/openhpc_extravars.yml +++ b/packer/openhpc_extravars.yml @@ -1 +1,2 @@ -workaround_ansible_issue_61497: yes # extravars files can't be empty +--- +workaround_ansible_issue_61497: true # extravars files can't be empty diff --git a/requirements.yml b/requirements.yml index 8850c16..e577dd3 100644 --- a/requirements.yml +++ b/requirements.yml @@ -57,4 +57,3 @@ collections: version: 0.5.5 - name: stackhpc.linux version: 1.5.0 -... diff --git a/super-linter.env b/super-linter.env new file mode 100644 index 0000000..5362c92 --- /dev/null +++ b/super-linter.env @@ -0,0 +1,27 @@ +# Detect that default branch is devel when running locally +DEFAULT_BRANCH=main + +# Don't validate JSCPD +VALIDATE_JSCPD=false + +# Don't validate JS standard because it conflicts with JS prettier +VALIDATE_JAVASCRIPT_STANDARD=false + +# Don't validate Ansible because ansible-lint is more flexible +VALIDATE_ANSIBLE=false + +# Don't validate YAML prettier because yamllint is sufficient +VALIDATE_YAML_PRETTIER=false + +# Getting false positives with terrascan that seemingly can't be masked +VALIDATE_TERRAFORM_TERRASCAN=false + +# Doesn't seem possible to exclude files with terragrunt +VALIDATE_TERRAGRUNT=false + +# TODO: address the following. +# Temporarily disable these linters, +# there are select issues remaining with each that can be addressed individually +VALIDATE_GITHUB_ACTIONS=false +VALIDATE_SHELL_SHFMT=false +VALIDATE_YAML=false From fad0ff4a3d2f76eb4ae7bc010872ff9e9cb0636c Mon Sep 17 00:00:00 2001 From: Pierre Riteau Date: Thu, 18 Sep 2025 15:57:40 +0200 Subject: [PATCH 18/50] Define login subgroups in Ansible inventory (#727) It resolves some limitations with login subgroups, such as difficulty to bind the Open OnDemand service to a specific node when naming of the nodes is not predictable. This replicates what is already done for compute subgroups. 
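For illustration, a minimal sketch of the inventory fragment the added template block is expected to render, assuming a cluster named `mycluster` with a single login group `interactive` (both names are hypothetical):

```yaml
# Sketch of the rendered environments/site/inventory/hosts.yml fragment (assumed names):
mycluster_interactive:        # existing per-group inventory group with the actual hosts
  hosts:
    mycluster-interactive-0:  # hypothetical node name
interactive:                  # new subgroup added by this change
  children:
    mycluster_interactive:
```

Site configuration can then target the `interactive` group directly (for example to bind the Open OnDemand server) without needing to know the generated node names.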
--- environments/site/tofu/inventory.tpl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/environments/site/tofu/inventory.tpl b/environments/site/tofu/inventory.tpl index 6c11b32..9920f9e 100644 --- a/environments/site/tofu/inventory.tpl +++ b/environments/site/tofu/inventory.tpl @@ -27,6 +27,11 @@ ${cluster_name}_${group_name}: networks: ${jsonencode({for n in node.network: n.name => {"fixed_ip_v4": n.fixed_ip_v4, "fixed_ip_v6": n.fixed_ip_v6}})} node_fqdn: ${login_groups[group_name]["fqdns"][nodename]} %{ endfor ~} + +${group_name}: + children: + ${cluster_name}_${group_name}: + %{ endfor ~} login: From eb1fb2dbcc51fc9df208d7401a1956715f507301 Mon Sep 17 00:00:00 2001 From: Pierre Riteau Date: Thu, 18 Sep 2025 18:17:48 +0200 Subject: [PATCH 19/50] Fix label in Jupyter Notebook form (#787) --- environments/common/inventory/group_vars/all/openondemand.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/common/inventory/group_vars/all/openondemand.yml b/environments/common/inventory/group_vars/all/openondemand.yml index af7554a..7727052 100644 --- a/environments/common/inventory/group_vars/all/openondemand.yml +++ b/environments/common/inventory/group_vars/all/openondemand.yml @@ -161,7 +161,7 @@ openondemand_apps_jupyter_default: - node attributes: # TODO num_cores: - label: Number of cores FOO + label: Number of cores value: 1 modules: "" extra_jupyter_args: "" From 0da4041e148c5a043885a0cbaee8a23f99daea9e Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Thu, 18 Sep 2025 17:19:46 +0100 Subject: [PATCH 20/50] Ignore changes to port binding and dhcp options (#778) * ignore port binding info; fixes tf when admin * ignore port dhcp changes to fix networking-mlxn * ignore port binding/dhcp options for caas * fix TF linter errors --- .../cluster_infra/templates/resources.tf.j2 | 47 +++++++++++++++++++ environments/site/tofu/control.tf | 7 +++ environments/site/tofu/node_group/nodes.tf | 7 +++ 3 files changed, 61 insertions(+) diff --git a/ansible/roles/cluster_infra/templates/resources.tf.j2 b/ansible/roles/cluster_infra/templates/resources.tf.j2 index f342371..f46192c 100644 --- a/ansible/roles/cluster_infra/templates/resources.tf.j2 +++ b/ansible/roles/cluster_infra/templates/resources.tf.j2 @@ -219,6 +219,14 @@ resource "openstack_networking_port_v2" "login" { binding { vnic_type = "{{ cluster_vnic_type | default('normal') }}" } + + lifecycle { + ignore_changes = [ + binding, # fixes running as admin + extra_dhcp_option # required for networking-mlnx neutron plugin + ] + } + } # Storage network @@ -235,6 +243,14 @@ resource "openstack_networking_port_v2" "login_storage" { binding { vnic_type = "{{ cluster_storage_vnic_type | default('normal') }}" } + + lifecycle { + ignore_changes = [ + binding, # fixes running as admin + extra_dhcp_option # required for networking-mlnx neutron plugin + ] + } + } {% endif %} @@ -258,8 +274,15 @@ resource "openstack_networking_port_v2" "control" { binding { vnic_type = "{{ cluster_vnic_type | default('normal') }}" + } + lifecycle { + ignore_changes = [ + binding, # fixes running as admin + extra_dhcp_option # required for networking-mlnx neutron plugin + ] } + } # Storage network @@ -276,6 +299,14 @@ resource "openstack_networking_port_v2" "control_storage" { binding { vnic_type = "{{ cluster_storage_vnic_type | default('normal') }}" } + + lifecycle { + ignore_changes = [ + binding, # fixes running as admin + extra_dhcp_option # required for networking-mlnx neutron plugin + ] + } 
+ } {% endif %} @@ -301,6 +332,14 @@ resource "openstack_networking_port_v2" "{{ nodegroup.name }}" { binding { vnic_type = "{{ cluster_vnic_type | default('normal') }}" } + + lifecycle { + ignore_changes = [ + binding, # fixes running as admin + extra_dhcp_option # required for networking-mlnx neutron plugin + ] + } + } # Storage network @@ -318,6 +357,14 @@ resource "openstack_networking_port_v2" "{{ nodegroup.name }}_storage" { binding { vnic_type = "{{ cluster_storage_vnic_type | default('normal') }}" } + + lifecycle { + ignore_changes = [ + binding, # fixes running as admin + extra_dhcp_option # required for networking-mlnx neutron plugin + ] + } + } {% endif %} diff --git a/environments/site/tofu/control.tf b/environments/site/tofu/control.tf index 7bfa13f..87da2f7 100644 --- a/environments/site/tofu/control.tf +++ b/environments/site/tofu/control.tf @@ -34,6 +34,13 @@ resource "openstack_networking_port_v2" "control" { binding { vnic_type = lookup(var.vnic_types, each.key, "normal") } + + lifecycle { + ignore_changes = [ + binding, # fixes running as admin + extra_dhcp_option # required for networking-mlnx neutron plugin + ] + } } resource "openstack_compute_instance_v2" "control" { diff --git a/environments/site/tofu/node_group/nodes.tf b/environments/site/tofu/node_group/nodes.tf index d02028f..4d874d1 100644 --- a/environments/site/tofu/node_group/nodes.tf +++ b/environments/site/tofu/node_group/nodes.tf @@ -71,6 +71,13 @@ resource "openstack_networking_port_v2" "compute" { binding { vnic_type = lookup(var.vnic_types, each.value.net.network, "normal") } + + lifecycle { + ignore_changes = [ + binding, # fixes running as admin + extra_dhcp_option # required for networking-mlnx neutron plugin + ] + } } resource "openstack_compute_instance_v2" "compute_fixed_image" { From 06857df9ae7cc1ea8efe80decfb63aab7c272744 Mon Sep 17 00:00:00 2001 From: Pierre Riteau Date: Thu, 18 Sep 2025 19:47:46 +0200 Subject: [PATCH 21/50] Expose vgpu group in site inventory (#786) * Fix various comments in Ansible group files * Expose vgpu group in site inventory --- environments/common/inventory/groups | 13 ++++++------- environments/site/inventory/groups | 15 +++++++++------ 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 6926355..c02226b 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -91,22 +91,22 @@ fail2ban # Add `openhpc` group to add slurm users via creation of users on each node. 
[openondemand] -# Host to run Open Ondemand server on - subset of login +# Host to run Open OnDemand server on - subset of login [openondemand_desktop] -# Subset of compute to run a interactive desktops on via Open Ondemand +# Subset of compute to run a interactive desktops on via Open OnDemand [openondemand_jupyter] -# Subset of compute to run a Jupyter Notebook servers on via Open Ondemand +# Subset of compute to run a Jupyter Notebook servers on via Open OnDemand [openondemand_rstudio] -# Subset of compute to run RStudio servers on via Open Ondemand +# Subset of compute to run RStudio servers on via Open OnDemand [openondemand_matlab] -# Subset of compute to run RStudio servers on via Open Ondemand +# Subset of compute to run a MATLAB interactive desktop on via Open OnDemand [openondemand_codeserver] -# Subset of compute to run a Codeserver VSCode instance on via Open Ondemand +# Subset of compute to run a Codeserver VSCode instance on via Open OnDemand [etc_hosts] # Hosts to manage /etc/hosts e.g. if no internal DNS. See ansible/roles/etc_hosts/README.md @@ -217,4 +217,3 @@ extra_packages # separately from the appliance. e.g # pulp_host ansible_host= # Note the host name can't conflict with group names i.e can't be called `pulp` or `pulp_server` - diff --git a/environments/site/inventory/groups b/environments/site/inventory/groups index b78197d..930cf93 100644 --- a/environments/site/inventory/groups +++ b/environments/site/inventory/groups @@ -44,27 +44,27 @@ login openhpc [openondemand:children] -# Host to run Open Ondemand server on - subset of login +# Host to run Open OnDemand server on - subset of login login [openondemand_desktop:children] -# Subset of compute to run a interactive desktops on via Open Ondemand +# Subset of compute to run a interactive desktops on via Open OnDemand compute [openondemand_jupyter:children] -# Subset of compute to run a Jupyter Notebook servers on via Open Ondemand +# Subset of compute to run a Jupyter Notebook servers on via Open OnDemand compute [openondemand_rstudio:children] -# Subset of compute to run RStudio servers on via Open Ondemand +# Subset of compute to run RStudio servers on via Open OnDemand compute [openondemand_matlab:children] -# Subset of compute to run a MATLAB interactive desktop on via Open Ondemand +# Subset of compute to run a MATLAB interactive desktop on via Open OnDemand compute [openondemand_codeserver:children] -# Subset of compute to run a Codeserver VSCode instance on via Open Ondemand +# Subset of compute to run a Codeserver VSCode instance on via Open OnDemand compute [etc_hosts:children] @@ -81,6 +81,9 @@ cluster # Hosts to recompile Slurm for - allows supporting Slurm autodetection method 'nvml' cuda +[vgpu] +# Hosts where vGPU/MIG should be configured - see docs/mig.md + [eessi:children] # Hosts on which EESSI stack should be configured openhpc From 535528fb0089743d67a6332f559d672f4494f850 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Fri, 19 Sep 2025 09:57:53 +0100 Subject: [PATCH 22/50] Add documentation for OpenTofu remote state (#784) * wip: add TF remote state docs * wip s3 remote state * improve gitlab backend configuration * automate s3 creds * make s3 buckets clearer * fix linting * try to allow same headings at different levels in markdown * fix tf lint errors * fix prettier errors --- .markdownlint.json | 5 + docs/opentofu-remote-state.md | 184 ++++++++++++++++++ docs/production.md | 3 + .../site/tofu/example-backends/gitlab.tf | 42 ++++ 
environments/site/tofu/example-backends/s3.tf | 25 +++ 5 files changed, 259 insertions(+) create mode 100644 .markdownlint.json create mode 100644 docs/opentofu-remote-state.md create mode 100644 environments/site/tofu/example-backends/gitlab.tf create mode 100644 environments/site/tofu/example-backends/s3.tf diff --git a/.markdownlint.json b/.markdownlint.json new file mode 100644 index 0000000..fba9b6e --- /dev/null +++ b/.markdownlint.json @@ -0,0 +1,5 @@ +{ + "no-duplicate-heading": { + "siblings_only": true + } +} diff --git a/docs/opentofu-remote-state.md b/docs/opentofu-remote-state.md new file mode 100644 index 0000000..c70a53f --- /dev/null +++ b/docs/opentofu-remote-state.md @@ -0,0 +1,184 @@ +# OpenTofu remote state + +OpenTofu supports a number of [remote state backends](https://opentofu.org/docs/language/state/remote/) +which can be used to persist state independently of where a deployment is run. +This allows deployments to be made from anywhere that can access the state +without corrupting or conflicting with any existing resources from previous +deployments. + +Using remote state is therefore strongly recommended for environments which +should only be instantiated once, e.g. `production` and `staging`. + +This page provides guidance for configuring remote states using backends +commonly available on OpenStack deployments. + +> [!IMPORTANT] +> In the below replace `$ENV` with the relevant environment name. + +## GitLab + +GitLab can be used with the [http backend](https://opentofu.org/docs/language/settings/backends/http/) +to store separate states for each environment within the GitLab project. +Access is protected by GitLab access tokens, which in the approach below are +persisted to local files. Therefore each repository checkout will need to +authenticate separately, using either a separate token or a shared token from +some external secret store. + +The below is based on the [official docs](https://docs.gitlab.com/user/infrastructure/iac/terraform_state/) +but includes some missing details and is modified for common appliance workflows. + +### Initial setup + +1. Create the backend file: + + ```shell + cp environments/site/tofu/example-backends/gitlab.tf environments/$ENV/tofu + ``` + +2. Modify `environments/$ENV/tofu/gitlab.tf` to set the default for the + project ID. This can be found by clicking the 3-dot menu at the top right of + the GitLab project page. + + ```terraform + # environments/$ENV/tofu/backend.tf: + terraform { + backend "http" {} + } + ``` + +3. Commit it. + +4. Follow the per-checkout steps below. + +### Per-checkout configuration + +1. Create an access token in the GitLab UI, using either: + + a. If project access tokens are available, create one via + Project > Settings > Access tokens. + The token must have `Maintainer` role and `api` scope. + + b. Otherwise create a personal access token via + User profile > Preferences > Access tokens. + The token must have `api` scope. + + Copy the generated secret and set an environment variable: + + ```shell + export TF_VAR_gitlab_access_token=$secret + ``` + +2. If using a personal access token, set the GitLab username as an environment variable: + + ```shell + export TF_VAR_gitlab_username=$your_username + ``` + +3. With the environment activated, initialise OpenTofu. + + If no local state exists run: + + ```shell + cd environments/$ENV/tofu/ + tofu init + ``` + + otherwise append `-migrate-state` to the `init` command to attempt to copy + local state to the new backend. 
+ +OpenTofu is now configured to use GitLab to store state for this environment. + +Repeat for each environment needing remote state. + +> [!CAUTION] +> The GitLab credentials are [persisted](https://opentofu.org/docs/language/settings/backends/configuration/#credentials-and-sensitive-data) +> into a file `environments/$ENV/tofu/.terraform/terraform.tfstate` and any +> plan files. These should therefore not be committed. + +### Token expiry + +If the project token expires repeat the per-checkout configuration, but using +`opentofu init -reconfigure` instead. + +## S3 + +For clouds with S3-compatible object storage (e.g. Ceph with [radosgw](https://docs.ceph.com/en/latest/radosgw/)) +the S3 backend can be used. This approach uses a bucket per environment and +derives credentials from OpenStack credentials, meaning no backend-specific +per-checkout configuration is required. + +### Initial setup + +1. Create an S3 bucket with a name `${cluster_name}-${environment_name}-tfstate` + where: + + - `CLUSTER_NAME` is defined in `environments/$ENV/tofu/main.tf` + - `$ENVIRONMENT_NAME` is the name of the environment directory + + e.g. + + ```shell + openstack container create research-staging-tfstate + ``` + +2. Create `ec2` credentials: + + ```shell + openstack ec2 credentials create + ``` + + and make a note of the `access` field returned. + +3. Create the backend file: + + ```shell + cp environments/site/tofu/example-backends/s3.tf environments/$ENV/tofu + ``` + +4. Modify `environments/$ENV/tofu/s3.tf` to set the default for `s3_backend_endpoint`. + This is the radosgw address. If not known it can be determined by creating a + public bucket, and then getting the URL using + Project > Containers > (your public bucket) > Link + which provides a URL of the form `https://$ENDPOINT/swift/...`. + +5. Add the following to `environments/$ENV/activate`: + + ```bash + export AWS_ACCESS_KEY_ID=$EC2_CREDENTIALS_ACCESS + export AWS_SECRET_ACCESS_KEY=$(openstack ec2 credentials show $AWS_ACCESS_KEY_ID -f value -c secret) + ``` + + replacing `$EC2_CREDENTIALS_ACCESS` with the `access` field of the created + credentials. + + This avoids these credentials being persisted in local files. + +6. Copy the lines above into your shell to set them for your current shell. + +7. With the environment activated, initialise OpenTofu. + + If no local state exists run: + + ```shell + cd environments/$ENV/tofu/ + tofu init + ``` + + otherwise append `-migrate-state` to the `init` command to attempt to copy + local state to the new backend. + +8. If this fails, try setting `use_path_style = true` in `environments/$ENV/tofu/s3.tf`. + +9. Once it works, commit `environments/$ENV/tofu/s3.tf` and `environments/$ENV/activate`. + +OpenTofu is now configured to use the cloud's S3-compatible storage to store +state for this environment. + +Repeat for each environment needing remote state. + +For more configuration options, see the OpenTofu [s3 backend docs](https://opentofu.org/docs/language/settings/backends/s3/). + +### Per-checkout configuration + +The ec2 credentials will automatically be loaded when activating the environment. +For a new checkout simply initialise OpenTofu as normal as described in step 7 above. diff --git a/docs/production.md b/docs/production.md index 83587f9..abebf4f 100644 --- a/docs/production.md +++ b/docs/production.md @@ -316,6 +316,9 @@ The value chosen should be the highest value demonstrated during testing. 
 Note that any time spent blocked due to this parallelism limit does not count
 against the (un-overridable) internal OpenTofu timeout of 30 minutes
 
+Consider configuring [OpenTofu remote state](./opentofu-remote-state.md) for any
+environments which should be unique, e.g. production and staging.
+
 ## Configure appliance
 
 ### Production configuration to consider
diff --git a/environments/site/tofu/example-backends/gitlab.tf b/environments/site/tofu/example-backends/gitlab.tf
new file mode 100644
index 0000000..722744a
--- /dev/null
+++ b/environments/site/tofu/example-backends/gitlab.tf
@@ -0,0 +1,42 @@
+variable "gitlab_username" {
+  type        = string
+  description = <<-EOF
+    Username of actual GitLab user, for personal access token only.
+    Default uses bot account name, for project access token.
+  EOF
+  default     = null
+}
+
+variable "gitlab_access_token" {
+  type        = string
+  description = <<-EOF
+    GitLab Project or Personal access token.
+    Must have Maintainer role (for Project token) and API scope
+  EOF
+}
+
+variable "gitlab_project_id" {
+  type        = string
+  description = "GitLab project ID - click 3-dot menu at the top right of project page"
+  #default = # add here
+}
+
+locals {
+  gitlab_username      = coalesce(var.gitlab_username, "project_${var.gitlab_project_id}_bot")
+  gitlab_state_name    = basename(var.environment_root)
+  gitlab_state_address = "https://gitlab.com/api/v4/projects/${var.gitlab_project_id}/terraform/state/${local.gitlab_state_name}"
+}
+
+# tflint-ignore: terraform_required_version
+terraform {
+  backend "http" {
+    address        = local.gitlab_state_address
+    lock_address   = "${local.gitlab_state_address}/lock"
+    unlock_address = "${local.gitlab_state_address}/lock"
+    username       = local.gitlab_username
+    password       = var.gitlab_access_token
+    lock_method    = "POST"
+    unlock_method  = "DELETE"
+    retry_wait_min = 5
+  }
+}
diff --git a/environments/site/tofu/example-backends/s3.tf b/environments/site/tofu/example-backends/s3.tf
new file mode 100644
index 0000000..d471135
--- /dev/null
+++ b/environments/site/tofu/example-backends/s3.tf
@@ -0,0 +1,25 @@
+variable "s3_backend_endpoint" {
+  type        = string
+  description = "radosgw address without protocol or path e.g. leafcloud.store"
+  #default = # add here
+}
+
+# tflint-ignore: terraform_required_version
+terraform {
+  backend "s3" {
+    endpoint = var.s3_backend_endpoint
+    bucket   = "${var.cluster_name}-${basename(var.environment_root)}-tfstate"
+    key      = "environment.tfstate"
+
+    # Region is required but not used in radosgw:
+    region                 = "dummy"
+    skip_region_validation = true
+
+    # Normally STS is not configured in radosgw:
+    skip_credentials_validation = true
+
+    # Enable path-style S3 URLs (https://<endpoint>/<bucket> instead of https://<bucket>.<endpoint>)
+ # may or may not be required depending on radosgw configuration + use_path_style = true + } +} From 5bedf73c63c3de474190863809fc50e35d1a4988 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Fri, 19 Sep 2025 10:45:46 +0100 Subject: [PATCH 23/50] Remove unused cloudalchemy alertmanager role (is in-repo role instead) (#781) --- requirements.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/requirements.yml b/requirements.yml index e577dd3..98785d7 100644 --- a/requirements.yml +++ b/requirements.yml @@ -12,8 +12,6 @@ roles: - src: https://github.com/cloudalchemy/ansible-prometheus.git version: 4d2c8d742de39e50387e0aa6d5510b21c7451343 # need fix in preceeding commit for rocky name: cloudalchemy.prometheus - - src: cloudalchemy.alertmanager - version: 0.19.1 - src: https://github.com/stackhpc/ansible-grafana.git name: cloudalchemy.grafana version: stackhpc-0.19.0 # fix grafana install From 3b4be853a17156d9ce2802dda5e8bed0ef6dcb44 Mon Sep 17 00:00:00 2001 From: Pierre Riteau Date: Tue, 23 Sep 2025 11:23:36 +0200 Subject: [PATCH 24/50] Fix various typos --- ansible/roles/firewalld/handlers/main.yml | 2 +- ansible/roles/firewalld/tasks/runtime.yml | 6 +++--- ansible/roles/nhc/README.md | 2 +- docs/sequence.md | 2 +- environments/common/inventory/groups | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/ansible/roles/firewalld/handlers/main.yml b/ansible/roles/firewalld/handlers/main.yml index 0e8c3df..c498f70 100644 --- a/ansible/roles/firewalld/handlers/main.yml +++ b/ansible/roles/firewalld/handlers/main.yml @@ -1,5 +1,5 @@ --- -- name: Restart filewalld +- name: Restart firewalld ansible.builtin.service: name: firewalld state: restarted diff --git a/ansible/roles/firewalld/tasks/runtime.yml b/ansible/roles/firewalld/tasks/runtime.yml index 03a5356..4c3b8ec 100644 --- a/ansible/roles/firewalld/tasks/runtime.yml +++ b/ansible/roles/firewalld/tasks/runtime.yml @@ -1,11 +1,11 @@ --- -- name: Apply filewalld configs # noqa: args[module] +- name: Apply firewalld configs # noqa: args[module] ansible.posix.firewalld: "{{ item }}" - notify: Restart filewalld + notify: Restart firewalld loop: "{{ firewalld_configs }}" - ansible.builtin.meta: flush_handlers -- name: Ensure filewalld state +- name: Ensure firewalld state ansible.builtin.systemd: name: firewalld state: "{{ firewalld_state }}" diff --git a/ansible/roles/nhc/README.md b/ansible/roles/nhc/README.md index a826932..689f054 100644 --- a/ansible/roles/nhc/README.md +++ b/ansible/roles/nhc/README.md @@ -21,7 +21,7 @@ To enable node health checks, ensure the `nhc` group contains the `compute` grou compute ``` -When the `anisble/site.yml` playbook is run this will automatically: +When the `ansible/site.yml` playbook is run this will automatically: 1. Add NHC-related configuration to the `slurm.conf` Slurm configuration file. The default configuration is defined in `openhpc_config_nhc` diff --git a/docs/sequence.md b/docs/sequence.md index 8149290..6f3b779 100644 --- a/docs/sequence.md +++ b/docs/sequence.md @@ -8,7 +8,7 @@ This sequence applies to both: control,login,compute inventory groups to install all packages, e.g. StackHPC CI builds - "extra" builds, starting from StackHPC images and using selected inventory - groups to add specfic features for a site-specific image. + groups to add specific features for a site-specific image. Note that a generic Pulp server is shown in the below diagram. This may be StackHPC's Ark server or a local Pulp mirroring Ark. 
It is assumed a local Pulp diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index c02226b..ef24952 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -84,7 +84,7 @@ cluster # https://www.fail2ban.org/wiki/index.php/Main_Page [firewalld:children] -# Hosts to install firewalld on - see ansible/roles/filewalld +# Hosts to install firewalld on - see ansible/roles/firewalld fail2ban [basic_users] From 67b2658d75b389e8273fd426984e7e38aaf541b8 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Wed, 24 Sep 2025 11:01:22 +0100 Subject: [PATCH 25/50] Update dnf repo snapshots (+ source repos, removes RL8 Lustre build CI) (#792) * update dnf_repos_timestamps.yml * bump Ark timestamps * update again * make it possible NOT to clean up packer builds * fixup source repo path typo * add missing RL8 PowerTools source repo * correct RL8 source repo files * update timestamps * bump CI image * disable Lustre for RL8 extrabuild tests due to kernel mismatch --------- Co-authored-by: bertiethorpe --- .github/workflows/extra.yml | 2 +- .github/workflows/fatimage.yml | 3 +- .../tofu/cluster_image.auto.tfvars.json | 4 +- .../group_vars/all/dnf_repo_timestamps.yml | 66 ++++++++++++++++--- 4 files changed, 61 insertions(+), 14 deletions(-) diff --git a/.github/workflows/extra.yml b/.github/workflows/extra.yml index 1941064..b8531c1 100644 --- a/.github/workflows/extra.yml +++ b/.github/workflows/extra.yml @@ -25,7 +25,7 @@ jobs: build: - image_name: openhpc-extra-RL8 source_image_name_key: RL8 # key into environments/.stackhpc/tofu/cluster_image.auto.tfvars.json - inventory_groups: doca,cuda,lustre + inventory_groups: doca,cuda # lustre disabled due to https://github.com/stackhpc/ansible-slurm-appliance/pull/759 volume_size: 35 # needed for cuda - image_name: openhpc-extra-RL9 source_image_name_key: RL9 diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index 407bd44..d9884ca 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -54,6 +54,7 @@ jobs: - name: Record settings for CI cloud run: | echo CI_CLOUD: ${{ env.CI_CLOUD }} + echo cleanup_on_failure: ${{ github.event.inputs.cleanup_on_failure }} - name: Setup ssh run: | @@ -91,7 +92,7 @@ jobs: packer init . 
PACKER_LOG=1 packer build \ - -on-error=${{ github.event.inputs.cleanup_on_failure && 'cleanup' || 'abort' }} \ + -on-error=${{ github.event.inputs.cleanup_on_failure == 'true' && 'cleanup' || 'abort' }} \ -var-file="$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl" \ -var "source_image_name=${{ matrix.build.source_image_name }}" \ -var "image_name=${{ matrix.build.image_name }}" \ diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index 2000b5e..b0d1022 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250918-0840-930223fb", - "RL9": "openhpc-RL9-250918-0840-930223fb" + "RL8": "openhpc-RL8-250923-1321-5fcc36b0", + "RL9": "openhpc-RL9-250923-1321-5fcc36b0" } } diff --git a/environments/common/inventory/group_vars/all/dnf_repo_timestamps.yml b/environments/common/inventory/group_vars/all/dnf_repo_timestamps.yml index c80a85a..ef2e2d4 100644 --- a/environments/common/inventory/group_vars/all/dnf_repo_timestamps.yml +++ b/environments/common/inventory/group_vars/all/dnf_repo_timestamps.yml @@ -29,7 +29,7 @@ dnf_repos_default: appstream: '8.10': pulp_path: rocky/8.10/AppStream/x86_64/os - pulp_timestamp: 20250614T013846 + pulp_timestamp: 20250923T022841 repo_file: Rocky-AppStream '9.4': pulp_path: rocky/9.4/AppStream/x86_64/os @@ -41,12 +41,21 @@ dnf_repos_default: repo_file: rocky '9.6': pulp_path: rocky/9.6/AppStream/x86_64/os - pulp_timestamp: 20250902T060015 + pulp_timestamp: 20250923T031638 + repo_file: rocky + appstream-source: + '8.10': + pulp_path: rocky/8.10/AppStream/source/os + pulp_timestamp: 20250923T024945 + repo_file: Rocky-Sources + '9.6': + pulp_path: rocky/9.6/AppStream/source/os + pulp_timestamp: 20250923T043546 repo_file: rocky baseos: '8.10': pulp_path: rocky/8.10/BaseOS/x86_64/os - pulp_timestamp: 20250614T013846 + pulp_timestamp: 20250918T034501 repo_file: Rocky-BaseOS '9.4': pulp_path: rocky/9.4/BaseOS/x86_64/os @@ -58,12 +67,21 @@ dnf_repos_default: repo_file: rocky '9.6': pulp_path: rocky/9.6/BaseOS/x86_64/os - pulp_timestamp: 20250902T094855 + pulp_timestamp: 20250923T045903 + repo_file: rocky + baseos-source: + '8.10': + pulp_path: rocky/8.10/BaseOS/source/os + pulp_timestamp: 20250918T040529 + repo_file: Rocky-Sources + '9.6': + pulp_path: rocky/9.6/BaseOS/source/os + pulp_timestamp: 20250923T043546 repo_file: rocky crb: '8.10': pulp_path: rocky/8.10/PowerTools/x86_64/os - pulp_timestamp: 20250614T013846 + pulp_timestamp: 20250918T034501 repo_file: Rocky-PowerTools repo_name: powertools '9.4': @@ -76,16 +94,35 @@ dnf_repos_default: repo_file: rocky '9.6': pulp_path: rocky/9.6/CRB/x86_64/os - pulp_timestamp: 20250902T060015 + pulp_timestamp: 20250923T031638 + repo_file: rocky + crb-source: + '8.10': + pulp_path: rocky/8.10/PowerTools/source/tree + pulp_timestamp: 20250923T125600 + repo_file: Rocky-Sources + repo_name: powertools-source + '9.6': + pulp_path: rocky/9.6/CRB/source/os + pulp_timestamp: 20250923T043546 repo_file: rocky epel: '8': pulp_path: epel/8/Everything/x86_64 - pulp_timestamp: 20250615T234151 + pulp_timestamp: 20250923T001717 repo_file: epel '9': pulp_path: epel/9/Everything/x86_64 - pulp_timestamp: 20250908T001730 + pulp_timestamp: 20250923T001717 + repo_file: epel + epel-source: + '8': + pulp_path: epel/8/Everything/source + pulp_timestamp: 20250923T001717 + repo_file: epel + '9': + pulp_path: 
epel/9/Everything/source + pulp_timestamp: 20250923T001717 repo_file: epel extras: '8.10': @@ -104,13 +141,22 @@ dnf_repos_default: pulp_path: rocky/9.6/extras/x86_64/os pulp_timestamp: 20250726T040613 repo_file: rocky-extras + extras-source: + '8.10': + pulp_path: rocky/8.10/extras/source/os + pulp_timestamp: 20250828T161842 + repo_file: Rocky-Sources + '9.6': + pulp_path: rocky/9.6/extras/source/os + pulp_timestamp: 20250828T161842 + repo_file: rocky-extras grafana: '8': pulp_path: grafana/oss/rpm - pulp_timestamp: 20250730T011314 + pulp_timestamp: 20250917T024714 repo_file: grafana timestamp: 20250615T005738 '9': pulp_path: grafana/oss/rpm - pulp_timestamp: 20250906T025340 + pulp_timestamp: 20250917T024714 repo_file: grafana From dbf142245aaeefc8a7b07754cf380396bf612368 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Wed, 24 Sep 2025 15:06:43 +0100 Subject: [PATCH 26/50] Validate nodegroup names (#793) * validate nodename groups * add validation for nodegroup name clashes * add validation for nodegroup name clashes * fix linter whinges * extend validation to cover additional_nodegroups * fix TF linting * fixup logic * fix logic * fix linter --- environments/.stackhpc/tofu/ARCUS.tfvars | 6 +-- .../.stackhpc/tofu/LEAFCLOUD-dev.tfvars | 14 +++---- environments/.stackhpc/tofu/LEAFCLOUD.tfvars | 14 +++---- environments/.stackhpc/tofu/SMS.tfvars | 10 ++--- environments/.stackhpc/tofu/main.tf | 2 +- environments/site/tofu/variables.tf | 41 +++++++++++++++++-- 6 files changed, 61 insertions(+), 26 deletions(-) diff --git a/environments/.stackhpc/tofu/ARCUS.tfvars b/environments/.stackhpc/tofu/ARCUS.tfvars index 6aec599..40daa51 100644 --- a/environments/.stackhpc/tofu/ARCUS.tfvars +++ b/environments/.stackhpc/tofu/ARCUS.tfvars @@ -1,4 +1,4 @@ -cluster_net = "portal-internal" -cluster_subnet = "portal-internal" +cluster_net = "portal-internal" +cluster_subnet = "portal-internal" control_node_flavor = "vm.ska.cpu.general.eighth" -other_node_flavor = "vm.ska.cpu.general.small" +other_node_flavor = "vm.ska.cpu.general.small" diff --git a/environments/.stackhpc/tofu/LEAFCLOUD-dev.tfvars b/environments/.stackhpc/tofu/LEAFCLOUD-dev.tfvars index 82e336d..b45a961 100644 --- a/environments/.stackhpc/tofu/LEAFCLOUD-dev.tfvars +++ b/environments/.stackhpc/tofu/LEAFCLOUD-dev.tfvars @@ -1,10 +1,10 @@ cluster_networks = [ - { - network = "stackhpc-dev" - subnet = "stackhpc-dev" - } + { + network = "stackhpc-dev" + subnet = "stackhpc-dev" + } ] control_node_flavor = "ec1.medium" # small ran out of memory, medium gets down to ~100Mi mem free on deployment -other_node_flavor = "en1.xsmall" -state_volume_type = "unencrypted" -home_volume_type = "unencrypted" +other_node_flavor = "en1.xsmall" +state_volume_type = "unencrypted" +home_volume_type = "unencrypted" diff --git a/environments/.stackhpc/tofu/LEAFCLOUD.tfvars b/environments/.stackhpc/tofu/LEAFCLOUD.tfvars index 135aadc..601910a 100644 --- a/environments/.stackhpc/tofu/LEAFCLOUD.tfvars +++ b/environments/.stackhpc/tofu/LEAFCLOUD.tfvars @@ -1,10 +1,10 @@ cluster_networks = [ - { - network = "slurmapp-ci" - subnet = "slurmapp-ci" - } + { + network = "slurmapp-ci" + subnet = "slurmapp-ci" + } ] control_node_flavor = "ec1.medium" # small ran out of memory, medium gets down to ~100Mi mem free on deployment -other_node_flavor = "en1.xsmall" -state_volume_type = "unencrypted" -home_volume_type = "unencrypted" +other_node_flavor = "en1.xsmall" +state_volume_type = "unencrypted" +home_volume_type = "unencrypted" diff --git 
a/environments/.stackhpc/tofu/SMS.tfvars b/environments/.stackhpc/tofu/SMS.tfvars index 808821b..6d14fc2 100644 --- a/environments/.stackhpc/tofu/SMS.tfvars +++ b/environments/.stackhpc/tofu/SMS.tfvars @@ -1,8 +1,8 @@ cluster_networks = [ - { - network = "stackhpc-ipv4-geneve" - subnet = "stackhpc-ipv4-geneve-subnet" - } + { + network = "stackhpc-ipv4-geneve" + subnet = "stackhpc-ipv4-geneve-subnet" + } ] control_node_flavor = "general.v1.small" -other_node_flavor = "general.v1.small" \ No newline at end of file +other_node_flavor = "general.v1.small" \ No newline at end of file diff --git a/environments/.stackhpc/tofu/main.tf b/environments/.stackhpc/tofu/main.tf index 22113cd..649f2f7 100644 --- a/environments/.stackhpc/tofu/main.tf +++ b/environments/.stackhpc/tofu/main.tf @@ -76,7 +76,7 @@ module "cluster" { control_node_flavor = var.control_node_flavor login = { - login = { + head = { nodes = ["login-0"] flavor = var.other_node_flavor } diff --git a/environments/site/tofu/variables.tf b/environments/site/tofu/variables.tf index 98f364a..82358b2 100644 --- a/environments/site/tofu/variables.tf +++ b/environments/site/tofu/variables.tf @@ -52,7 +52,8 @@ variable "login" { be useful for e.g. separating nodes for ssh and Open Ondemand usage, or to define login nodes with different capabilities such as high-memory. - Keys are names of groups. + Keys are names of groups, and cannot be 'login', 'compute', 'control', or + keys in the compute or additional_nodegroups variables. Values are a mapping as follows: Required: @@ -88,6 +89,25 @@ variable "login" { EOF type = any + validation { + condition = length(setintersection(keys(var.login), ["login", "compute", "control"])) == 0 + error_message = <<-EOF + Login nodegroup names cannot be 'login', 'compute' or 'control'. Invalid var.login key(s): ${join(", ", setintersection(keys(var.login), ["login", "compute", "control"]))}. + EOF + } + validation { + condition = length(distinct(concat(keys(var.login), keys(var.compute), keys(var.additional_nodegroups)))) == length(concat(keys(var.login), keys(var.compute), keys(var.additional_nodegroups))) + error_message = <<-EOF + Nodegroup names must be unique. Shared key(s) found in variables login, compute and/or additional_nodegroups: ${ + join(", ", setunion( + setintersection(keys(var.login), keys(var.compute)), + setintersection(keys(var.compute), keys(var.additional_nodegroups)), + setintersection(keys(var.additional_nodegroups), keys(var.login)) + )) + } + EOF + +} } variable "cluster_image_id" { @@ -101,7 +121,8 @@ variable "compute" { Mapping defining homogenous groups of compute nodes. Groups are used in Slurm partition definitions. - Keys are names of groups. + Keys are names of groups, and cannot be 'compute', 'login', 'control', 'default' + or keys in the login or additional_nodegroups variables. Values are a mapping as follows: Required: @@ -139,6 +160,12 @@ variable "compute" { EOF type = any # can't do any better; TF type constraints can't cope with heterogeneous inner mappings + validation { + condition = length(setintersection(keys(var.compute), ["login", "compute", "control", "default"])) == 0 + error_message = <<-EOF + Compute nodegroup names cannot be 'compute', 'default', 'login' or 'control'. Invalid var.compute key(s): ${join(", ", setintersection(keys(var.compute), ["login", "compute", "control", "default"]))}. 
+ EOF + } } # tflint-ignore: terraform_typed_variables @@ -149,7 +176,8 @@ variable "additional_nodegroups" { These nodes are not in the compute or login inventory groups so they will not run slurmd. - Keys are names of groups. + Keys are names of groups and cannot be 'login', 'compute, 'control', or + keys in the login or additional_nodegroups variables. Values are a mapping as for the "login" variable, with the addition of the optional entry: @@ -162,6 +190,13 @@ variable "additional_nodegroups" { - $cluster_name + '_' + $group_name - 'additional' EOF + type = any # can't do any better; TF type constraints can't cope with heterogeneous inner mappings + validation { + condition = length(setintersection(keys(var.additional_nodegroups), ["login", "compute", "control"])) == 0 + error_message = <<-EOF + Additional nodegroup names cannot be 'compute', 'login' or 'control'. Invalid var.additional_nodegroups key(s): ${join(", ", setintersection(keys(var.additional_nodegroups), ["login", "compute", "control"]))}. + EOF + } } variable "environment_root" { From 4548b9b5962660121523a7e3c6d4a57ef87f76bb Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Thu, 25 Sep 2025 12:40:06 +0100 Subject: [PATCH 27/50] Bump Open OnDemand to v4 & install apps in fatimage (#782) * bump OSC's OOD v4.0.1 * pin ondemand 4.0.7 in common env * install ood app packages in fatimage.yml * make packer volume 20 GB to manage ood app packages * fix typo * bump images * update ood cleanup paths triggering trivy errors * bump fatimages * noqa yaml[brackets] for OOD options * fix linter warnings about flow-style * remove wrong comment * Add module FQDN * pickup task name fixes from PR#794 * bump CI image --------- Co-authored-by: Steve Brasier Co-authored-by: Steve Brasier <33413598+sjpb@users.noreply.github.com> --- .yamllint.yml | 2 +- ansible/cleanup.yml | 4 ++-- ansible/fatimage.yml | 23 ++++++++++++++----- .../tofu/cluster_image.auto.tfvars.json | 4 ++-- .../inventory/group_vars/all/openondemand.yml | 11 ++++----- packer/openstack.pkr.hcl | 2 +- requirements.yml | 2 +- 7 files changed, 28 insertions(+), 20 deletions(-) diff --git a/.yamllint.yml b/.yamllint.yml index 3220260..650a27a 100644 --- a/.yamllint.yml +++ b/.yamllint.yml @@ -3,7 +3,7 @@ extends: default rules: brackets: - forbid: non-empty + forbid: false comments: # https://github.com/prettier/prettier/issues/6780 min-spaces-from-content: 1 diff --git a/ansible/cleanup.yml b/ansible/cleanup.yml index 6b495d7..b9a0d72 100644 --- a/ansible/cleanup.yml +++ b/ansible/cleanup.yml @@ -47,8 +47,8 @@ loop: # NB: items here MUST have a justification! 
# ondemand install: raised at https://github.com/OSC/ondemand/security/advisories/GHSA-f7j8-ppqm-m5vw # All declared not to be an issue by Open Ondemand as relevant packages not installed - - "/opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/{{ ondemand_package_version }}-1/gems/bootstrap_form-2.7.0/test/dummy/Gemfile.lock" - - "/opt/ood/ondemand/root/usr/share/gems/3.1/ondemand/{{ ondemand_package_version }}-1/gems/bootstrap_form-4.5.0/demo/yarn.lock" + - "/opt/ood/ondemand/root/usr/share/gems/3.3/ondemand/{{ ondemand_package_version }}-1/gems/bootstrap_form-2.7.0/test/dummy/Gemfile.lock" + - "/opt/ood/ondemand/root/usr/share/gems/3.3/ondemand/{{ ondemand_package_version }}-1/gems/bootstrap_form-5.0.0/demo/yarn.lock" - /var/www/ood/apps/sys/dashboard/node_modules/data-confirm-modal/Gemfile.lock # chrony role: only used for role dev, venv never created on disk - /etc/ansible-init/playbooks/roles/mrlesmithjr.chrony/poetry.lock diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 8e8e58a..7565af6 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -121,33 +121,44 @@ when: "'openhpc' in group_names" # - import_playbook: portal.yml - - name: Open Ondemand server (packages) + - name: Open OnDemand server (packages) ansible.builtin.include_role: name: osc.ood tasks_from: install-package.yml vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml" when: "'openondemand' in group_names" - # # FUTURE: install-apps.yml - this is git clones - - name: Open Ondemand server (apps) + - name: Open OnDemand server (apps) ansible.builtin.include_role: name: osc.ood tasks_from: install-apps.yml vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml" when: "'openondemand' in group_names" - - name: Open Ondemand remote desktop + - name: Open OnDemand remote desktop # Used for plain desktop and matlab ansible.builtin.import_role: name: openondemand tasks_from: vnc_compute.yml - when: "'openondemand_desktop' in group_names" + when: "'openondemand_desktop' or 'openondemand_matlab' in group_names" - - name: Open Ondemand jupyter node + - name: Open OnDemand Jupyter node ansible.builtin.import_role: name: openondemand tasks_from: jupyter_compute.yml when: "'openondemand_jupyter' in group_names" + - name: Open OnDemand RStudio node + ansible.builtin.import_role: + name: openondemand + tasks_from: rstudio_compute.yml + when: "'openondemand_rstudio' in group_names" + + - name: Open OnDemand Code Server node + ansible.builtin.import_role: + name: openondemand + tasks_from: codeserver_compute.yml + when: "'openondemand_codeserver' in group_names" + - name: Install Apache PAM module # Extracted from start of roles/openondemand/tasks/pam_auth.yml to ensure only installed during build ansible.builtin.dnf: name: mod_authnz_pam diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index b0d1022..bcb56cf 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250923-1321-5fcc36b0", - "RL9": "openhpc-RL9-250923-1321-5fcc36b0" + "RL8": "openhpc-RL8-250924-1502-e9afbfe5", + "RL9": "openhpc-RL9-250924-1536-e9afbfe5" } } diff --git a/environments/common/inventory/group_vars/all/openondemand.yml b/environments/common/inventory/group_vars/all/openondemand.yml index 7727052..ea88b08 100644 --- a/environments/common/inventory/group_vars/all/openondemand.yml +++ 
b/environments/common/inventory/group_vars/all/openondemand.yml @@ -5,7 +5,7 @@ # NB: Variables prefixed ood_ are all from https://github.com/OSC/ood-ansible -ondemand_package_version: "3.1.10" # used in ansible/cleanup.yml +ondemand_package_version: "4.0.7" # used in ansible/cleanup.yml ondemand_package: ondemand-"{{ ondemand_package_version }}" # osc.ood role var controlling installed package openondemand_servername: "{{ hostvars[groups['openondemand'].0].ansible_host if groups['openondemand'] else '' }}" @@ -198,8 +198,7 @@ openondemand_apps_rstudio_default: help: Choose your RStudio module widget: select options: - - "RStudio v{{ openondemand_rstudio_version }}" - - "rstudio-server/{{ openondemand_rstudio_version }}}" + - ["RStudio v{{ openondemand_rstudio_version }}", "rstudio-server/{{ openondemand_rstudio_version }}"] extra_modules_script: label: Extra modules script help: If you'd like to load additional modules alongside RStudio-Server, put the 'module load ...' commands into a text file (one 'module load...' per line) and specify its path here # noqa: yaml[line-length] @@ -288,8 +287,7 @@ openondemand_apps_matlab_default: help: Choose your MATLAB module widget: select options: - - "MATLAB v{{ openondemand_matlab_version }}" - - "matlab/{{ openondemand_matlab_version }}" + - ["MATLAB v{{ openondemand_matlab_version }}", "matlab/{{ openondemand_matlab_version }}"] cores: label: Number of CPU cores help: How many CPU cores to reserve for your session. NB Ensure this is within the maximum allowed by your chosen partition. @@ -349,8 +347,7 @@ openondemand_apps_codeserver_default: help: Choose your Code Server module widget: select options: - - "Code Server v{{ openondemand_code_server_version}}" - - "code-server/{{ openondemand_code_server_version }}" + - ["Code Server v{{ openondemand_code_server_version}}", "code-server/{{ openondemand_code_server_version }}"] bc_queue: value: "{{ openondemand_codeserver_partition | default(none) }}" cores: diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index 3f93d50..9faf4bb 100644 --- a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -125,7 +125,7 @@ variable "volume_type" { variable "volume_size" { type = number - default = 15 + default = 20 } variable "image_disk_format" { diff --git a/requirements.yml b/requirements.yml index 98785d7..27dbcbe 100644 --- a/requirements.yml +++ b/requirements.yml @@ -17,7 +17,7 @@ roles: version: stackhpc-0.19.0 # fix grafana install - src: https://github.com/OSC/ood-ansible.git name: osc.ood - version: v3.1.5 + version: v4.0.1 - src: https://github.com/stackhpc/ansible-role-os-manila-mount.git name: stackhpc.os-manila-mount version: v25.3.1 From ab4a5ae62c0d90b606b7be20f832f70f09d82e35 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Thu, 25 Sep 2025 20:48:15 +0100 Subject: [PATCH 28/50] Support software raid root disks in stackhpc images (#785) * support raid root disks in stackhpc-built images * clarify image requirements * bump CI image * fixup grub for RL8 * fix linter issues * fix raid kernel commandline configuration for RL8 [no ci] * bump CI image * fix handler ansible-lint errors * bump CI image --- ansible/.gitignore | 2 ++ ansible/extras.yml | 9 ++++++++- ansible/roles/raid/README.md | 17 +++++++++++++++++ ansible/roles/raid/handlers/main.yml | 3 +++ ansible/roles/raid/tasks/main.yml | 17 +++++++++++++++++ environments/.stackhpc/inventory/extra_groups | 4 ++++ .../tofu/cluster_image.auto.tfvars.json | 4 ++-- 
 environments/common/inventory/groups          | 3 +++
 environments/site/inventory/groups            | 3 +++
 9 files changed, 59 insertions(+), 3 deletions(-)
 create mode 100644 ansible/roles/raid/README.md
 create mode 100644 ansible/roles/raid/handlers/main.yml
 create mode 100644 ansible/roles/raid/tasks/main.yml

diff --git a/ansible/.gitignore b/ansible/.gitignore
index 62c9a54..b5b3572 100644
--- a/ansible/.gitignore
+++ b/ansible/.gitignore
@@ -98,3 +98,5 @@ roles/*
 !roles/eessi/**
 !roles/topology/
 !roles/topology/**
+!roles/raid/
+!roles/raid/**
diff --git a/ansible/extras.yml b/ansible/extras.yml
index 02b0d40..3c790be 100644
--- a/ansible/extras.yml
+++ b/ansible/extras.yml
@@ -1,4 +1,11 @@
----
+- hosts: raid
+  become: true
+  tags: raid
+  gather_facts: true
+  tasks:
+    - ansible.builtin.include_role:
+        name: raid
+
 - hosts: k3s_server:!builder
   become: true
   tags: k3s
diff --git a/ansible/roles/raid/README.md b/ansible/roles/raid/README.md
new file mode 100644
index 0000000..4774a46
--- /dev/null
+++ b/ansible/roles/raid/README.md
@@ -0,0 +1,17 @@
+# raid
+
+Configure an image to support software raid (via [mdadm](https://github.com/md-raid-utilities/mdadm)).
+
+RockyLinux genericcloud images already have the necessary `mdraid` dracut
+module installed, as well as kernel modules for `raid0`, `raid1`, `raid10` and
+`raid456` [^1]. This covers all raid modes [supported by Ironic](https://docs.openstack.org/ironic/latest/admin/raid.html#software-raid)
+hence this role does not support extending this.
+
+This role changes the command line for the current kernel. It does not reboot
+the instance so generally is only useful during image builds.
+
+Note that the `rootfs_uuid` image property described in the [Ironic raid documentation](https://docs.openstack.org/ironic/latest/admin/raid.html#image-requirements)
+is not required; the root partition is the first (non-boot) partition and this
+is sufficient for Ironic to find the root file system.
+ +[^1]: As shown by `lsinitrd /boot/initramfs-$(uname -r).img | grep raid` diff --git a/ansible/roles/raid/handlers/main.yml b/ansible/roles/raid/handlers/main.yml new file mode 100644 index 0000000..02867f7 --- /dev/null +++ b/ansible/roles/raid/handlers/main.yml @@ -0,0 +1,3 @@ +- name: Update GRUB configuration file + ansible.builtin.command: "grub2-mkconfig -o /boot/grub2/grub.cfg {{ '--update-bls-cmdline' if ansible_distribution_major_version == '9' else '' }}" + changed_when: true diff --git a/ansible/roles/raid/tasks/main.yml b/ansible/roles/raid/tasks/main.yml new file mode 100644 index 0000000..3ea61db --- /dev/null +++ b/ansible/roles/raid/tasks/main.yml @@ -0,0 +1,17 @@ +- name: Enable autoassembly of mdraid devices + # adds rd.auto=1 - see `man dracut.cmdline` + ansible.builtin.lineinfile: + path: /etc/default/grub + regexp: > + ^{{ grub_cmdline_var[ansible_distribution_major_version] }}="((?:(?!rd.auto=1).)*?)"$ + line: > + {{ grub_cmdline_var[ansible_distribution_major_version] }}="\1 rd.auto=1" + backup: true + backrefs: true + register: update_grub + notify: + - Update GRUB configuration file + vars: + grub_cmdline_var: + '8': GRUB_CMDLINE_LINUX + '9': GRUB_CMDLINE_LINUX_DEFAULT diff --git a/environments/.stackhpc/inventory/extra_groups b/environments/.stackhpc/inventory/extra_groups index f3a9964..29d9d93 100644 --- a/environments/.stackhpc/inventory/extra_groups +++ b/environments/.stackhpc/inventory/extra_groups @@ -50,3 +50,7 @@ cluster [compute_init:children] compute + +[raid:children] +# Configure fatimage for raid +builder diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index bcb56cf..9650ccd 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250924-1502-e9afbfe5", - "RL9": "openhpc-RL9-250924-1536-e9afbfe5" + "RL8": "openhpc-RL8-250925-1639-62d67ae3", + "RL9": "openhpc-RL9-250925-1639-62d67ae3" } } diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index ef24952..2c67c4a 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -217,3 +217,6 @@ extra_packages # separately from the appliance. e.g # pulp_host ansible_host= # Note the host name can't conflict with group names i.e can't be called `pulp` or `pulp_server` + +[raid] +# Add `builder` to configure image for software raid diff --git a/environments/site/inventory/groups b/environments/site/inventory/groups index 930cf93..85d7e36 100644 --- a/environments/site/inventory/groups +++ b/environments/site/inventory/groups @@ -169,3 +169,6 @@ compute # separately from the appliance. e.g # pulp_host ansible_host= # Note inventory host name cannot conflict with group names i.e can't be called `pulp` or `pulp_server`. 
+ +[raid] +# Add `builder` to configure image for software raid From 00c044ff56045110c6d89ec78b61ca8939d9b4c4 Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Mon, 29 Sep 2025 15:16:16 +0100 Subject: [PATCH 29/50] Fix .caas secrets not persisting post-reimage + skip failing validation check for .caas --- ansible/roles/persist_openhpc_secrets/tasks/main.yml | 8 ++++++++ ansible/validate.yml | 1 + environments/.caas/inventory/group_vars/all/defaults.yml | 1 + environments/common/inventory/group_vars/all/defaults.yml | 1 + 4 files changed, 11 insertions(+) create mode 100644 environments/.caas/inventory/group_vars/all/defaults.yml diff --git a/ansible/roles/persist_openhpc_secrets/tasks/main.yml b/ansible/roles/persist_openhpc_secrets/tasks/main.yml index dc12e2a..35fd045 100644 --- a/ansible/roles/persist_openhpc_secrets/tasks/main.yml +++ b/ansible/roles/persist_openhpc_secrets/tasks/main.yml @@ -15,6 +15,14 @@ - "{{ appliances_state_dir }}/ansible.facts.d" - "/etc/ansible/facts.d" +- name: Symlink to persisted facts if present + ansible.builtin.file: + state: link + src: "{{ appliances_state_dir }}/ansible.facts.d/openhpc_secrets.fact" + dest: /etc/ansible/facts.d/openhpc_secrets.fact + owner: root + when: openhpc_secrets_stat.stat.exists + - name: Load existing OpenHPC secrets if present ansible.builtin.setup: filter: ansible_local diff --git a/ansible/validate.yml b/ansible/validate.yml index 2352fff..e1d03a2 100644 --- a/ansible/validate.yml +++ b/ansible/validate.yml @@ -77,6 +77,7 @@ - cluster_home_volume is defined - cluster_compute_groups is defined fail_msg: "One or more expected variables are missing: is OpenTofu inventory template up to date?" + when: not appliances_caas_skip_validate_vars - name: Ensure control node is in inventory hosts: all diff --git a/environments/.caas/inventory/group_vars/all/defaults.yml b/environments/.caas/inventory/group_vars/all/defaults.yml new file mode 100644 index 0000000..7ec96c0 --- /dev/null +++ b/environments/.caas/inventory/group_vars/all/defaults.yml @@ -0,0 +1 @@ +appliances_caas_skip_validate_vars: true diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index 027e407..c11cc44 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -8,6 +8,7 @@ appliances_cockpit_state: absent # RHEL cockpit installed but not enabled in gen # appliances_state_dir: # define an absolute path here to use for persistent state: NB: This is defined as /var/lib/state in inventory by the default Terraform appliances_mode: configure appliances_pulp_url: https://ark.stackhpc.com +appliances_caas_skip_validate_vars: false # Address(ip/dns) for internal communication between services. This is # normally traffic you do no want to expose to users. 
From 9fb0bf81ed743e58bc445aa8ce9dae7b47b70f44 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Tue, 30 Sep 2025 12:38:42 +0100 Subject: [PATCH 30/50] Pin bcrypt to 4.3.0 to avoid passlib bug (#801) --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 7e596f4..4e816af 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ python-openstackclient==8.0.0 python-manilaclient python-ironicclient jmespath +bcrypt==4.3.0 passlib[bcrypt]==1.7.4 cookiecutter selinux # this is a shim to avoid having to use --system-site-packages, you still need sudo yum install libselinux-python3 From d8b3cf6480883c54f182c524e9af6a766486e23c Mon Sep 17 00:00:00 2001 From: wtripp180901 Date: Tue, 30 Sep 2025 12:46:37 +0100 Subject: [PATCH 31/50] changed variable name --- ansible/validate.yml | 2 +- environments/.caas/inventory/group_vars/all/defaults.yml | 2 +- environments/common/inventory/group_vars/all/defaults.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ansible/validate.yml b/ansible/validate.yml index e1d03a2..b43eb7e 100644 --- a/ansible/validate.yml +++ b/ansible/validate.yml @@ -77,7 +77,7 @@ - cluster_home_volume is defined - cluster_compute_groups is defined fail_msg: "One or more expected variables are missing: is OpenTofu inventory template up to date?" - when: not appliances_caas_skip_validate_vars + when: appliances_validate_tofu_vars | bool - name: Ensure control node is in inventory hosts: all diff --git a/environments/.caas/inventory/group_vars/all/defaults.yml b/environments/.caas/inventory/group_vars/all/defaults.yml index 7ec96c0..ded58fe 100644 --- a/environments/.caas/inventory/group_vars/all/defaults.yml +++ b/environments/.caas/inventory/group_vars/all/defaults.yml @@ -1 +1 @@ -appliances_caas_skip_validate_vars: true +appliances_validate_tofu_vars: false diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index c11cc44..e9852af 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -8,7 +8,7 @@ appliances_cockpit_state: absent # RHEL cockpit installed but not enabled in gen # appliances_state_dir: # define an absolute path here to use for persistent state: NB: This is defined as /var/lib/state in inventory by the default Terraform appliances_mode: configure appliances_pulp_url: https://ark.stackhpc.com -appliances_caas_skip_validate_vars: false +appliances_validate_tofu_vars: true # Address(ip/dns) for internal communication between services. This is # normally traffic you do no want to expose to users. 
From 7e88f5a5b31472ff3654c788a7dd5ce7097691c7 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 30 Sep 2025 19:18:24 +0000 Subject: [PATCH 32/50] move image download/conversion to runner's /mnt [no ci] --- .github/workflows/s3-image-sync.yml | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/.github/workflows/s3-image-sync.yml b/.github/workflows/s3-image-sync.yml index 43adf50..f73885c 100644 --- a/.github/workflows/s3-image-sync.yml +++ b/.github/workflows/s3-image-sync.yml @@ -93,33 +93,24 @@ jobs: echo "TARGET_IMAGE=${TARGET_IMAGE}" >> "$GITHUB_ENV" shell: bash - - name: Clear up some space on runner - run: | - df -h - sudo rm -rf /usr/share/dotnet - sudo rm -rf /opt/ghc - sudo rm -rf "/usr/local/share/boost" - sudo rm -rf "$AGENT_TOOLSDIRECTORY" - sudo apt-get clean - df -h - - name: Download image to runner run: | . venv/bin/activate - openstack image save --file "${{ env.TARGET_IMAGE }}.raw" "${{ env.TARGET_IMAGE }}" + df -h + openstack image save --file "/mnt/${{ env.TARGET_IMAGE }}.raw" "${{ env.TARGET_IMAGE }}" df -h shell: bash - name: Convert image to QCOW2 run: | . venv/bin/activate - qemu-img convert -f raw -O qcow2 -c "${{ env.TARGET_IMAGE }}.raw" "${{ env.TARGET_IMAGE }}" + qemu-img convert -f raw -O qcow2 -c "/mnt/${{ env.TARGET_IMAGE }}.raw" "${{ env.TARGET_IMAGE }}" shell: bash - name: Upload Image to S3 run: | echo "Uploading Image: ${{ env.TARGET_IMAGE }} to S3..." - s3cmd --multipart-chunk-size-mb=150 put ${{ env.TARGET_IMAGE }} s3://${{ env.S3_BUCKET }} + s3cmd --multipart-chunk-size-mb=150 put "/mnt/${{ env.TARGET_IMAGE }}" s3://${{ env.S3_BUCKET }} shell: bash image_sync: From 4c36537ff3775429ece8c791f5c3be8c03296ff6 Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Tue, 30 Sep 2025 20:10:40 +0000 Subject: [PATCH 33/50] make image dir --- .github/workflows/s3-image-sync.yml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.github/workflows/s3-image-sync.yml b/.github/workflows/s3-image-sync.yml index f73885c..b8629eb 100644 --- a/.github/workflows/s3-image-sync.yml +++ b/.github/workflows/s3-image-sync.yml @@ -9,6 +9,7 @@ on: env: S3_BUCKET: openhpc-images-prerelease IMAGE_PATH: environments/.stackhpc/tofu/cluster_image.auto.tfvars.json + RUNNER_IMAGE_DIR: /mnt/images permissions: contents: read @@ -97,20 +98,22 @@ jobs: run: | . venv/bin/activate df -h - openstack image save --file "/mnt/${{ env.TARGET_IMAGE }}.raw" "${{ env.TARGET_IMAGE }}" + sudo mkdir ${{ env.RUNNER_IMAGE_DIR }} + sudo chmod ugo=rwX ${{ env.RUNNER_IMAGE_DIR }} + openstack image save --file "${{ env.RUNNER_IMAGE_DIR }}/${{ env.TARGET_IMAGE }}.raw" "${{ env.TARGET_IMAGE }}" df -h shell: bash - name: Convert image to QCOW2 run: | . venv/bin/activate - qemu-img convert -f raw -O qcow2 -c "/mnt/${{ env.TARGET_IMAGE }}.raw" "${{ env.TARGET_IMAGE }}" + qemu-img convert -f raw -O qcow2 -c "${{ env.RUNNER_IMAGE_DIR }}/${{ env.TARGET_IMAGE }}.raw" "${{ env.TARGET_IMAGE }}" shell: bash - name: Upload Image to S3 run: | echo "Uploading Image: ${{ env.TARGET_IMAGE }} to S3..." 
- s3cmd --multipart-chunk-size-mb=150 put "/mnt/${{ env.TARGET_IMAGE }}" s3://${{ env.S3_BUCKET }} + s3cmd --multipart-chunk-size-mb=150 put "${{ env.RUNNER_IMAGE_DIR }}/${{ env.TARGET_IMAGE }}" s3://${{ env.S3_BUCKET }} shell: bash image_sync: From 63d918b7b0bc5c0ae0f534dd08c80199c24b15bf Mon Sep 17 00:00:00 2001 From: Steve Brasier Date: Wed, 1 Oct 2025 08:23:39 +0000 Subject: [PATCH 34/50] fix image upload --- .github/workflows/s3-image-sync.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/s3-image-sync.yml b/.github/workflows/s3-image-sync.yml index b8629eb..3489dc6 100644 --- a/.github/workflows/s3-image-sync.yml +++ b/.github/workflows/s3-image-sync.yml @@ -107,7 +107,7 @@ jobs: - name: Convert image to QCOW2 run: | . venv/bin/activate - qemu-img convert -f raw -O qcow2 -c "${{ env.RUNNER_IMAGE_DIR }}/${{ env.TARGET_IMAGE }}.raw" "${{ env.TARGET_IMAGE }}" + qemu-img convert -f raw -O qcow2 -c "${{ env.RUNNER_IMAGE_DIR }}/${{ env.TARGET_IMAGE }}.raw" "${{ env.RUNNER_IMAGE_DIR }}/${{ env.TARGET_IMAGE }}" shell: bash - name: Upload Image to S3 From f351b9dcdd5bc3a062873f8d678c05709cb009d2 Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Wed, 1 Oct 2025 07:17:42 -0700 Subject: [PATCH 35/50] bump codeserver app version (#806) --- environments/common/inventory/group_vars/all/openondemand.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/common/inventory/group_vars/all/openondemand.yml b/environments/common/inventory/group_vars/all/openondemand.yml index ea88b08..1f7859a 100644 --- a/environments/common/inventory/group_vars/all/openondemand.yml +++ b/environments/common/inventory/group_vars/all/openondemand.yml @@ -100,7 +100,7 @@ openondemand_install_app_matlab: openondemand_install_app_codeserver: codeserver: repo: https://github.com/stackhpc/bc_osc_codeserver.git - version: 2025.08.1 + version: 2025.09.1 # osc:ood role var (NB only active when not in configure): ood_install_apps: >- {{ From 69f71fe41024953fb57cd22a0c00f48550f1a6a1 Mon Sep 17 00:00:00 2001 From: Pierre Riteau Date: Wed, 1 Oct 2025 16:18:17 +0200 Subject: [PATCH 36/50] Use (group) syntax in access.conf (#804) From access.conf(5): The second field, the users/group field, should be a list of one or more login names, group names, or ALL (which always matches). To differentiate user entries from group entries, group entries should be written with brackets, e.g. (group). 
--- ansible/roles/compute_init/files/compute-init.yml | 2 +- ansible/slurm.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 0ff647a..81dedf8 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -346,7 +346,7 @@ ansible.builtin.blockinfile: path: /etc/security/access.conf block: | - +:adm:ALL + +:(adm):ALL -:ALL:ALL - name: Ensure slurmd service state diff --git a/ansible/slurm.yml b/ansible/slurm.yml index d6d306e..345b361 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -50,7 +50,7 @@ ansible.builtin.blockinfile: path: /etc/security/access.conf block: | - +:adm:ALL + +:(adm):ALL -:ALL:ALL # vagrant uses (deprecated) ansible_ssh_user From 81a25814b697024d2f1f37eb323f9904adeec07b Mon Sep 17 00:00:00 2001 From: Pierre Riteau Date: Wed, 1 Oct 2025 16:18:50 +0200 Subject: [PATCH 37/50] Remove extra lines in activate scripts (#803) --- cookiecutter/{{cookiecutter.environment}}/activate | 2 -- environments/.stackhpc/activate | 2 -- environments/site/activate | 2 -- 3 files changed, 6 deletions(-) diff --git a/cookiecutter/{{cookiecutter.environment}}/activate b/cookiecutter/{{cookiecutter.environment}}/activate index 2a58b40..c9bb527 100644 --- a/cookiecutter/{{cookiecutter.environment}}/activate +++ b/cookiecutter/{{cookiecutter.environment}}/activate @@ -18,5 +18,3 @@ echo "Setting PKR_VAR_repo_root to $PKR_VAR_repo_root" if [ -f "$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg" ]; then export ANSIBLE_CONFIG=$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg fi - - diff --git a/environments/.stackhpc/activate b/environments/.stackhpc/activate index 2a58b40..c9bb527 100644 --- a/environments/.stackhpc/activate +++ b/environments/.stackhpc/activate @@ -18,5 +18,3 @@ echo "Setting PKR_VAR_repo_root to $PKR_VAR_repo_root" if [ -f "$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg" ]; then export ANSIBLE_CONFIG=$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg fi - - diff --git a/environments/site/activate b/environments/site/activate index 2a58b40..c9bb527 100644 --- a/environments/site/activate +++ b/environments/site/activate @@ -18,5 +18,3 @@ echo "Setting PKR_VAR_repo_root to $PKR_VAR_repo_root" if [ -f "$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg" ]; then export ANSIBLE_CONFIG=$APPLIANCES_ENVIRONMENT_ROOT/ansible.cfg fi - - From 82c814e0283c41ab4d8ba3b298ae474ccbb626c3 Mon Sep 17 00:00:00 2001 From: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> Date: Thu, 2 Oct 2025 14:21:04 +0100 Subject: [PATCH 38/50] bump new fatimages (#808) --- environments/.stackhpc/tofu/cluster_image.auto.tfvars.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index 9650ccd..4137567 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-250925-1639-62d67ae3", - "RL9": "openhpc-RL9-250925-1639-62d67ae3" + "RL8": "openhpc-RL8-251001-1515-81a25814", + "RL9": "openhpc-RL9-251001-1424-81a25814" } } From b11696e669d47cfe220e9cb982bcc774a67163d6 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Fri, 3 Oct 2025 14:40:53 +0100 Subject: [PATCH 39/50] Improve build group definitions (#788) * support raid root disks in 
stackhpc-built images * clarify image requirements * bump CI image * remove default build groups * fixup doca/cuda inventory groups * add fatimage inventory group * update docs for image build * minor docs tweaks * fixup fatimage group definition * fix build groups * bump CI image * minor docs tweak * fix linter markdown error * fix linter markdown error * swap example site image build to normal case * fix borked merge * fixes after self-review * bump CI image --- .github/workflows/fatimage.yml | 4 +- docs/image-build.md | 165 +++++++++++++----- docs/operations.md | 43 ++--- environments/.stackhpc/inventory/extra_groups | 20 +-- .../tofu/cluster_image.auto.tfvars.json | 4 +- environments/common/inventory/groups | 20 ++- environments/site/inventory/groups | 35 +++- 7 files changed, 195 insertions(+), 96 deletions(-) diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml index d9884ca..66f2819 100644 --- a/.github/workflows/fatimage.yml +++ b/.github/workflows/fatimage.yml @@ -36,10 +36,10 @@ jobs: build: - image_name: openhpc-RL8 source_image_name: Rocky-8-GenericCloud-Base-8.10-20240528.0.x86_64.raw - inventory_groups: control,compute,login,update + inventory_groups: fatimage - image_name: openhpc-RL9 source_image_name: Rocky-9-GenericCloud-Base-9.6-20250531.0.x86_64.qcow2 - inventory_groups: control,compute,login,update + inventory_groups: fatimage env: ANSIBLE_FORCE_COLOR: True OS_CLOUD: openstack diff --git a/docs/image-build.md b/docs/image-build.md index 71be030..533bc62 100644 --- a/docs/image-build.md +++ b/docs/image-build.md @@ -1,67 +1,136 @@ # Packer-based image build -The appliance contains code and configuration to use [Packer](https://developer.hashicorp.com/packer) with the [OpenStack builder](https://www.packer.io/plugins/builders/openstack) to build images. - -The Packer configuration defined here builds "fat images" which contain packages, binaries and container images but no cluster-specific configuration. Using these: - -- Enables the image to be tested in CI before production use. -- Ensures re-deployment of the cluster or deployment of additional nodes can be completed even if packages are changed in upstream repositories (e.g. due to RockyLinux or OpenHPC updates). -- Improves deployment speed by reducing the number of package downloads to improve deployment speed. - -The fat images StackHPC builds and tests in CI are available from [GitHub releases](https://github.com/stackhpc/ansible-slurm-appliance/releases). However with some additional configuration it is also possible to: - -1. Build site-specific fat images from scratch. -2. Extend an existing fat image with additional functionality. +The appliance contains configuration to use [Packer](https://developer.hashicorp.com/packer) +with the [OpenStack builder](https://www.packer.io/plugins/builders/openstack) +to build images. Using images: + +- Enables the image to be tested in a `staging` environment before deployment + to the `production` environment. +- Ensures re-deployment of the cluster or deployment of additional nodes is + repeatable. +- Improves deployment speed by reducing the number of package installation. + +The Packer configuration here can be used to build two types of images: + +1. "Fat images" which contain packages, binaries and container images but no + cluster-specific configuration. These start from a RockyLinux GenericCloud + (or compatible) image. 
The fat images StackHPC builds and tests in CI are + available from [GitHub releases](https://github.com/stackhpc/ansible-slurm-appliance/releases). + However site-specific fat images can also be built from a different source + image e.g. if a different partition layout is required. +2. "Extra-build" images which extend a fat image to create a site-specific + image with with additional packages or functionality. For example the NVIDIA + `cuda` packages cannot be redistributed hence require an "extra" build. ## Usage -To build either a site-specific fat image from scratch, or to extend an existing StackHPC fat image: - -1. Ensure the current OpenStack credentials have sufficient authorisation to upload images (this may or may not require the `member` role for an application credential, depending on your OpenStack configuration). -2. The provided dev credentials for StackHPC's "Ark" Pulp server must be added to the target environments. This is done by overriding `dnf_repos_username` and `dnf_repos_password` with your vault encrypted credentials in `environments//inventory/group_vars/all/pulp.yml`. See the [experimental docs](experimental/pulp.md) if you wish instead wish to use a local Pulp server. -3. Create a Packer [variable definition file](https://developer.hashicorp.com/packer/docs/templates/hcl_templates/variables#assigning-values-to-input-variables) at e.g. `environments//builder.pkrvars.hcl` containing at a minimum: - -```hcl -flavor = "general.v1.small" # VM flavor to use for builder VMs -networks = ["26023e3d-bc8e-459c-8def-dbd47ab01756"] # List of network UUIDs to attach the VM to -source_image_name = "Rocky-9-GenericCloud-Base-9.4" # Name of image to create VM with, i.e. starting image -inventory_groups = "control,login,compute" # Additional inventory groups to add build VM to -``` - -Note that: - -- The network used for the Packer VM must provide outbound internet access but does not need to provide access to resources which the final cluster nodes require (e.g. Slurm control node, network filesystem servers etc.). -- The flavor used must have sufficent memory for the build tasks, but otherwise does not need to match the final cluster nodes. Usually 8GB is sufficent. By default, the build VM is volume-backed to allow control of the root disk size (and hence final image size) so the flavor disk size does not matter. -- The source image should be either a RockyLinux GenericCloud image for a site-specific image build from scratch, or a StackHPC fat image if extending an existing image. -- The `inventory_groups` variable takes a comma-separated list of Ansible inventory groups to add the build VM to. This is in addition to the `builder` group which it is always added to. This controls which Ansible roles and functionality run during build, and hence what gets added to the image. - All possible groups are listed in `environments/common/groups` but common options for this variable will be: - - `update,control,login,compute`: The resultant image has all packages in the source image updated, and then packages for all types of nodes in the cluster are added. When using a GenericCloud image for `source_image_name` this builds a site-specific fat image from scratch. - - One or more specific groups which are not enabled in the appliance by default, e.g. `lustre`. When using a StackHPC fat image for `source_image_name` this extends the image with just this additional functionality. +For either a site-specific fat-image build or an extra-build: + +1. 
Ensure the current OpenStack credentials have sufficient authorisation to + upload images (this may or may not require the `member` role for an + application credential, depending on your OpenStack configuration). +2. If package installs are required, add the provided dev credentials for + StackHPC's "Ark" Pulp server to the `site` environment: + + ```yaml + # environments/site/inventory/group_vars/all/dnf_repos.yml: + dnf_repos_username: your-ark-username + dnf_repos_password: "{{ vault_dnf_repos_password }}" + ``` + + ```yaml + # environments/site/inventory/group_vars/all/dnf_repos.yml: + dnf_repos_password: "your-ark-password" + ``` + + > [!IMPORTANT] + > The latter file should be vault-encrypted. + + Alternatively, configure a [local Pulp mirror](experimental/pulp.md). + +3. Create a Packer [variable definition file](https://developer.hashicorp.com/packer/docs/templates/hcl_templates/variables#assigning-values-to-input-variables). It must specify at least the + the following variables: + + ```hcl + # environments/site/builder.pkrvars.hcl: + flavor = "general.v1.small" # VM flavor to use for builder VMs + networks = ["26023e3d-bc8e-459c-8def-dbd47ab01756"] # List of network UUIDs to attach the VM to + source_image_name = "Rocky-9-GenericCloud-Base-9.4" # Name of image to create VM with, i.e. starting image + inventory_groups = "doca,cuda,extra_packages" # Build VM inventory groups => functionality to add to image + ``` + + See the top of [packer/openstack.pkr.hcl](../packer/openstack.pkr.hcl) + for all possible variables which can be set. + + Note that: + + - Normally the network must provide outbound internet access. However it + does not need to provide access to resources used by the actual cluster + nodes (e.g. Slurm control node, network filesystem servers etc.). + - The flavor used must have sufficent memory for the build tasks (usually + 8GB), but otherwise does not need to match the actual cluster node + flavor(s). + - By default, the build VM is volume-backed to allow control of the root + disk size (and hence final image size), so the flavor's disk size does not + matter. The default volume size is not sufficent if enabling `cuda` and/or + `doca` and should be increased: + ```terraform + volume_size = 35 # GB + ``` + - The source image should be either: + - For a site-specific fatimage build: A RockyLinux GenericCloud or + compatible image. + - For an extra-build image: Usually the appropriate StackHPC fat image, + as defined in `environments/.stackhpc/tofu/cluster_image.auto.tfvars.json` at the + checkout's current commit. See the [GitHub release page](https://github.com/stackhpc/ansible-slurm-appliance/releases) + for download links. In some cases extra builds may be chained, e.g. + one extra build adds a Lustre client, and the resulting image is used + as the source image for an extra build adding GPU support. + - The `inventory_groups` variable takes a comma-separated list of Ansible + inventory groups to add the build VM to (in addition to the `builder` + group which is it always in). This controls which Ansible roles and + functionality run during build, and hence what gets added to the image. + All possible groups are listed in `environments/common/groups` but common + options for this variable will be: + + - For a fatimage build: `fatimage`: This is defined in `enviroments/site/inventory/groups` + and results in an update of all packages in the source image, plus + installation of packages for default control, login and compute nodes. 
+ + - For an extra-built image, one or more specific groups. This extends the + source image with just this additional functionality. The example above + installs NVIDIA DOCA network drivers, NVIDIA GPU drivers/Cuda packages + and also enables installation of packages defined in the + `appliances_extra_packages_other` variable (see + [docs/operations.md](./operations.md#adding-additional-packages)). 4. Activate the venv and the relevant environment. 5. Build images using the relevant variable definition file, e.g.: -```shell -cd packer/ -PACKER_LOG=1 /usr/bin/packer build -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl -``` + ```shell + cd packer/ + PACKER_LOG=1 /usr/bin/packer build -on-error=ask -var-file=../environments/site/builder.pkrvars.hcl openstack.pkr.hcl + ``` -**NB:** If the build fails while creating the volume, check if the source image has the `signature_verified` property: + **NB:** If the build fails while creating the volume, check if the source image has the `signature_verified` property: -```shell -openstack image show $SOURCE_IMAGE -``` + ```shell + openstack image show $SOURCE_IMAGE + ``` -If it does, remove this property: + If it does, remove this property: -```shell -openstack image unset --property signature_verified $SOURCE_IMAGE -``` + ```shell + openstack image unset --property signature_verified $SOURCE_IMAGE + ``` -then delete the failed volume, select cancelling the build when Packer queries, and then retry. This is [OpenStack bug 1823445](https://bugs.launchpad.net/cinder/+bug/1823445). + then delete the failed volume, select cancelling the build when Packer asks, + and then retry. This is [OpenStack bug 1823445](https://bugs.launchpad.net/cinder/+bug/1823445). -6. The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc` and including a timestamp and a shortened Git hash. +6. The built image will be automatically uploaded to OpenStack. By default it + will have a name prefixed `openhpc` and including a timestamp and a shortened + Git hash. ## Build Process diff --git a/docs/operations.md b/docs/operations.md index 4064d44..4127300 100644 --- a/docs/operations.md +++ b/docs/operations.md @@ -83,7 +83,7 @@ disabled during runtime to prevent Ark credentials from being leaked. To enable In both cases, Ark credentials will be required. -=# Adding Additional Packages +## Adding Additional Packages By default, the following utility packages are installed during the StackHPC image build: @@ -101,22 +101,27 @@ By default, the following utility packages are installed during the StackHPC ima Additional packages can be added during image builds by: -- adding the `extra_packages` group to the build `inventory_groups` (see - [docs/image-build.md](./image-build.md)) -- defining a list of packages in `appliances_extra_packages_other` in e.g. - `environments/$SITE_ENV/inventory/group_vars/all/defaults.yml`. For example: +1. Configuring an [image build](./image-build.md) to enable the + `extra_packages` group: -```yaml -# environments/foo-base/inventory/group_vars/all/defaults.yml: -appliances_extra_packages_other: - - somepackage - - anotherpackage -``` + ```terraform + # environments/site/builder.pkrvars.hcl: + ... + inventory_groups = "extra_packages" + ... + ``` + +2. 
Defining a list of packages in `appliances_extra_packages_other`, for example: -For packages which come from repositories mirrored by StackHPC's "Ark" Pulp server -(including rocky, EPEL and OpenHPC repositories), this will require either [Ark -credentials](./image-build.md)) or a [local Pulp mirror](./experimental/pulp.md) -to be configured. This includes rocky, EPEL and OpenHPC repos. + ```yaml + # environments/site/inventory/group_vars/all/defaults.yml: + appliances_extra_packages_other: + - somepackage + - anotherpackage + ``` + +3. Either adding [Ark credentials](./image-build.md) or a [local Pulp mirror](./experimental/pulp.md) + to provide access to the required [repository snapshots](../environments/common/inventory/group_vars/all/dnf_repo_timestamps.yml). The packages available from the OpenHPC repos are described in Appendix E of the OpenHPC installation guide (linked from the @@ -125,9 +130,9 @@ the OpenHPC installation guide (linked from the corresponding `lmod` modules. Packages _may_ also be installed during the site.yml, by adding the `cluster` -group into the `extra_packages` group. An error will occur if Ark credentials -are defined in this case, as they are readable by unprivileged users in the -`.repo` files and a local Pulp mirror must be used instead. +group as a child of the `extra_packages` group. An error will occur if Ark +credential are defined in this case, as they are readable by unprivileged users +in the `.repo` files and a local Pulp mirror must be used instead. If additional repositories are required, these could be added/enabled as necessary in a play added to `environments/$SITE_ENV/hooks/{pre,post}.yml` as appropriate. Note such a play should NOT exclude the builder group, so that the repositories are also added to built images. @@ -148,8 +153,6 @@ ansible-playbook environments/$SITE_ENV/hooks/{pre,post}.yml as appropriate. -TODO: improve description about adding these to extra images. 
- ## Reconfiguring Slurm At a minimum run: diff --git a/environments/.stackhpc/inventory/extra_groups b/environments/.stackhpc/inventory/extra_groups index 29d9d93..0d7fb53 100644 --- a/environments/.stackhpc/inventory/extra_groups +++ b/environments/.stackhpc/inventory/extra_groups @@ -1,3 +1,5 @@ +# Unless noted otherwise features enabled here are tested by CI site.yml playbook + [basic_users:children] cluster @@ -20,7 +22,7 @@ cluster # --- end of FreeIPA example --- [manila:children] -# Allows demo; also installs manila client in fat image +# Not actully tested but allows demo using this environment login compute @@ -28,20 +30,8 @@ compute cluster [tuned:children] -# Install tuned into fat image -# NB: builder has tuned_enabled and tuned_started false so does not configure it -builder -# Also test tuned during site playbook cluster -[squid:children] -# Install squid into fat image -builder - -[sssd:children] -# Install sssd into fat image -builder - [rebuild:children] control @@ -50,7 +40,3 @@ cluster [compute_init:children] compute - -[raid:children] -# Configure fatimage for raid -builder diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index 4137567..585cfe2 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { "cluster_image": { - "RL8": "openhpc-RL8-251001-1515-81a25814", - "RL9": "openhpc-RL9-251001-1424-81a25814" + "RL8": "openhpc-RL8-251002-1537-1d21952c", + "RL9": "openhpc-RL9-251002-1456-1d21952c" } } diff --git a/environments/common/inventory/groups b/environments/common/inventory/groups index 2c67c4a..558bad1 100644 --- a/environments/common/inventory/groups +++ b/environments/common/inventory/groups @@ -1,3 +1,12 @@ +# This file +# 1. Ensures all groups in the appliance are always defined - even if empty +# 2. Defines dependencies between groups - child groups require & enables parent +# +# IMPORTANT +# --------- +# All groups and child groups here MUST be empty, as other environments cannot +# remove hosts/groups. + [login] # All Slurm login nodes. Combined control/login nodes are not supported. @@ -129,6 +138,9 @@ prometheus freeipa_server freeipa_client +[doca] +# Add `builder` to install NVIDIA DOCA during image build + [cuda] # Hosts to install NVIDIA CUDA on - see ansible/roles/cuda/README.md @@ -193,9 +205,10 @@ k3s_agent [dnf_repos:children] # Hosts to replace system repos with Pulp repos -# Warning: when using Ark directly rather than a local Pulp server, adding hosts other than `builder` will leak Ark creds to users -builder +# Roles/groups listed here *always* do installs: extra_packages +doca +# TODO: can't express: if cuda and builder, enable dnf_repos [pulp_site] # Add builder to this group to automatically sync pulp during image build @@ -220,3 +233,6 @@ extra_packages [raid] # Add `builder` to configure image for software raid + +[fatimage] +# Add build VM into this group to enable all features with this as child diff --git a/environments/site/inventory/groups b/environments/site/inventory/groups index 85d7e36..c40928c 100644 --- a/environments/site/inventory/groups +++ b/environments/site/inventory/groups @@ -1,3 +1,15 @@ +[login:children] +# All Slurm login nodes. Combined control/login nodes are not supported. +fatimage + +[control:children] +# A single Slurm control node. Multiple (high availability) control nodes are not supported. 
+fatimage + +[compute:children] +# All Slurm compute nodes (in all partitions). +fatimage + [nfs:children] openhpc @@ -31,6 +43,7 @@ slurm_stats # NB: [rebuild] not defined here as likely to need features not currently supported [update:children] +fatimage [fail2ban:children] # Hosts to install fail2ban on to protect SSH @@ -74,6 +87,9 @@ cluster [freeipa_client] # Hosts to be a FreeIPA client. See ansible/roles/freeipa/README.md +[doca] +# Add `builder` to install NVIDIA DOCA during image build + [cuda] # Hosts to install NVIDIA CUDA on - see ansible/roles/cuda/README.md @@ -102,18 +118,21 @@ openhpc login openondemand -[squid] +[squid:children] # Hosts to run squid proxy +fatimage [tuned:children] # Hosts to run TuneD configuration +fatimage [ansible_init:children] # Hosts to run linux-ansible-init cluster -[sssd] +[sssd:children] # Hosts to configure sssd on +fatimage [sshd] # Hosts where the OpenSSH server daemon should be configured @@ -137,8 +156,13 @@ cluster [lustre] # Hosts to run lustre client -[extra_packages:children] +[extra_packages] # Hosts to install specified additional packages on + +[dnf_repos:children] +# Hosts to replace system repos with Pulp repos +# Some roles do installs when in install mode/on build VM only: +fatimage builder [cacerts] @@ -149,7 +173,7 @@ builder [gateway:children] # Add builder to this group to install gateway ansible-init playbook into image -builder +fatimage [nhc:children] # Hosts to configure for node health checks @@ -170,5 +194,6 @@ compute # pulp_host ansible_host= # Note inventory host name cannot conflict with group names i.e can't be called `pulp` or `pulp_server`. -[raid] +[raid:children] # Add `builder` to configure image for software raid +fatimage From b504f10581c8831f520f5d84dd2176945bf42235 Mon Sep 17 00:00:00 2001 From: Claudia Watson Date: Fri, 3 Oct 2025 14:54:38 +0100 Subject: [PATCH 40/50] Expose FIPs in inventory hosts file (#807) * Expose FIPs in inventory hosts file * adding output for "fip_address" * changing 'fip_address' to 'nodegroup_fips' --- environments/site/tofu/inventory.tpl | 2 ++ environments/site/tofu/node_group/nodes.tf | 9 +++++++++ 2 files changed, 11 insertions(+) diff --git a/environments/site/tofu/inventory.tpl b/environments/site/tofu/inventory.tpl index 9920f9e..ec17711 100644 --- a/environments/site/tofu/inventory.tpl +++ b/environments/site/tofu/inventory.tpl @@ -26,6 +26,7 @@ ${cluster_name}_${group_name}: image_id: ${ node.image_id } networks: ${jsonencode({for n in node.network: n.name => {"fixed_ip_v4": n.fixed_ip_v4, "fixed_ip_v6": n.fixed_ip_v6}})} node_fqdn: ${login_groups[group_name]["fqdns"][nodename]} + node_fip: ${login_groups[group_name]["nodegroup_fips"][nodename]} %{ endfor ~} ${group_name}: @@ -77,6 +78,7 @@ ${cluster_name}_${group_name}: instance_id: ${ node.id } networks: ${jsonencode({for n in node.network: n.name => {"fixed_ip_v4": n.fixed_ip_v4, "fixed_ip_v6": n.fixed_ip_v6}})} node_fqdn: ${additional_groups[group_name]["fqdns"][nodename]} + node_fip: ${additional_groups[group_name]["nodegroup_fips"][nodename]} %{ endfor ~} ${group_name}: children: diff --git a/environments/site/tofu/node_group/nodes.tf b/environments/site/tofu/node_group/nodes.tf index 4d874d1..4ff1fd0 100644 --- a/environments/site/tofu/node_group/nodes.tf +++ b/environments/site/tofu/node_group/nodes.tf @@ -25,6 +25,11 @@ locals { } ) } + # Map node names to floating IPs from the list var.fip_addresses by index + nodegroup_fips = { + for idx, n in var.nodes : + n => length(var.fip_addresses) > idx ? 
var.fip_addresses[idx] : "" + } baremetal_az = var.availability_zone != null ? var.availability_zone : "nova" } @@ -229,3 +234,7 @@ output "image_id" { output "fqdns" { value = local.fqdns } + +output "nodegroup_fips" { + value = local.nodegroup_fips +} \ No newline at end of file From 72aff75fbd296e7d42280dea4aba8fceb27df501 Mon Sep 17 00:00:00 2001 From: Pierre Riteau Date: Sat, 4 Oct 2025 21:22:13 +0200 Subject: [PATCH 41/50] Allow VS Code Remote SSH while blocking NFS mounts (#799) --- ansible/roles/sshd/README.md | 1 + ansible/roles/sshd/defaults/main.yml | 1 + ansible/roles/sshd/templates/sshd.conf.j2 | 4 ++++ 3 files changed, 6 insertions(+) diff --git a/ansible/roles/sshd/README.md b/ansible/roles/sshd/README.md index a47f602..3b25360 100644 --- a/ansible/roles/sshd/README.md +++ b/ansible/roles/sshd/README.md @@ -6,5 +6,6 @@ Configure sshd. - `sshd_password_authentication`: Optional bool. Whether to enable password login. Default `false`. - `sshd_disable_forwarding`: Optional bool. Whether to disable all forwarding features (X11, ssh-agent, TCP and StreamLocal). Default `true`. +- `sshd_allow_local_forwarding`: Optional bool. Whether to allow limited forwarding for the Visual Studio Code Remote - SSH extension. Use together with `sshd_disable_forwarding: false`. NOTE THIS MAY BE INSECURE! Default `false`. - `sshd_conf_src`: Optional string. Path to sshd configuration template. Default is in-role template. - `sshd_conf_dest`: Optional string. Path to destination for sshd configuration file. Default is `/etc/ssh/sshd_config.d/10-ansible.conf` which overrides `50-{cloud-init,redhat}` files, if present. diff --git a/ansible/roles/sshd/defaults/main.yml b/ansible/roles/sshd/defaults/main.yml index ca2f8c7..0228dd3 100644 --- a/ansible/roles/sshd/defaults/main.yml +++ b/ansible/roles/sshd/defaults/main.yml @@ -1,5 +1,6 @@ --- sshd_password_authentication: false sshd_disable_forwarding: true +sshd_allow_local_forwarding: false sshd_conf_src: sshd.conf.j2 sshd_conf_dest: /etc/ssh/sshd_config.d/10-ansible.conf diff --git a/ansible/roles/sshd/templates/sshd.conf.j2 b/ansible/roles/sshd/templates/sshd.conf.j2 index 862e263..d409c37 100644 --- a/ansible/roles/sshd/templates/sshd.conf.j2 +++ b/ansible/roles/sshd/templates/sshd.conf.j2 @@ -1,3 +1,7 @@ # {{ ansible_managed }} PasswordAuthentication {{ 'yes' if sshd_password_authentication | bool else 'no' }} DisableForwarding {{ 'yes' if sshd_disable_forwarding | bool else 'no' }} +{% if sshd_allow_local_forwarding %} +AllowTcpForwarding local +PermitOpen 127.0.0.1:* +{% endif %} From d9c5d8f67cc72943a3bb9b4f18b4b1e2d5cbc6c7 Mon Sep 17 00:00:00 2001 From: Steve Brasier <33413598+sjpb@users.noreply.github.com> Date: Tue, 7 Oct 2025 10:41:28 +0100 Subject: [PATCH 42/50] Delete build VMs in CI nightly cleanup (#777) * delete build VMs in CI nightly cleanup * name build volumes and include in nightly cleanup * simplify cleanup of volumes and include fatimage build VMs --------- Co-authored-by: bertiethorpe Co-authored-by: bertiethorpe <84867280+bertiethorpe@users.noreply.github.com> --- .github/workflows/nightly-cleanup.yml | 53 +++++---------------------- packer/openstack.pkr.hcl | 6 +++ 2 files changed, 16 insertions(+), 43 deletions(-) diff --git a/.github/workflows/nightly-cleanup.yml b/.github/workflows/nightly-cleanup.yml index 5bec96d..4c2fd01 100644 --- a/.github/workflows/nightly-cleanup.yml +++ b/.github/workflows/nightly-cleanup.yml @@ -46,53 +46,20 @@ jobs: echo "${{ secrets[format('{0}_CLOUDS_YAML', env.CI_CLOUD)] }}" > 
~/.config/openstack/clouds.yaml shell: bash - - name: Find CI clusters + - name: Delete all CI clusters run: | . venv/bin/activate - CI_CLUSTERS=$(openstack server list | grep --only-matching 'slurmci-RL.-[0-9]\+' | sort | uniq || true) - echo "DEBUG: Raw CI clusters: $CI_CLUSTERS" - - if [[ -z "$CI_CLUSTERS" ]]; then - echo "No matching CI clusters found." - else - # Flatten multiline value so can be passed as env var - CI_CLUSTERS_FORMATTED=$(echo "$CI_CLUSTERS" | tr '\n' ' ' | sed 's/ $//') - echo "DEBUG: Formatted CI clusters: $CI_CLUSTERS_FORMATTED" - echo "ci_clusters=$CI_CLUSTERS_FORMATTED" >> "$GITHUB_ENV" - fi + ./dev/delete-cluster.py slurmci-RL --force shell: bash - - - name: Delete CI clusters + + - name: Delete all CI extra build VMs and volumes run: | . venv/bin/activate - if [[ -z ${ci_clusters} ]]; then - echo "No clusters to delete." - exit 0 - fi - - for cluster_prefix in ${ci_clusters} - do - echo "Processing cluster: $cluster_prefix" - - # Get all servers with the matching name for control node - CONTROL_SERVERS=$(openstack server list --name "${cluster_prefix}-control" --format json) - - # Get unique server names to avoid duplicate cleanup - UNIQUE_NAMES=$(echo "$CONTROL_SERVERS" | jq -r '.[].Name' | sort | uniq) - for name in $UNIQUE_NAMES; do - echo "Deleting cluster with control node: $name" - - # Get the first matching server ID by name - server=$(echo "$CONTROL_SERVERS" | jq -r '.[] | select(.Name=="'"$name"'") | .ID' | head -n1) - - # Make sure server still exists (wasn't deleted earlier) - if ! openstack server show "$server" &>/dev/null; then - echo "Server $server no longer exists, skipping $name." - continue - fi + ./dev/delete-cluster.py openhpc-extra-RL --force + shell: bash - echo "Deleting cluster $cluster_prefix (server $server)..." - ./dev/delete-cluster.py "$cluster_prefix" --force - done - done + - name: Delete all fatimage build VMs and volumes + run: | + . 
venv/bin/activate + ./dev/delete-cluster.py openhpc-RL --force shell: bash diff --git a/packer/openstack.pkr.hcl b/packer/openstack.pkr.hcl index 9faf4bb..b6f570e 100644 --- a/packer/openstack.pkr.hcl +++ b/packer/openstack.pkr.hcl @@ -128,6 +128,11 @@ variable "volume_size" { default = 20 } +variable "volume_name" { + type = string + default = null +} + variable "image_disk_format" { type = string default = "raw" @@ -162,6 +167,7 @@ source "openstack" "openhpc" { use_blockstorage_volume = var.use_blockstorage_volume volume_type = var.volume_type volume_size = var.volume_size + volume_name = "${var.image_name}${local.image_name_version}" metadata = var.metadata instance_metadata = { ansible_init_disable = "true" From c40a383eaab9b6f38e6371410daaeafd6dc5d200 Mon Sep 17 00:00:00 2001 From: wtripp180901 <78219569+wtripp180901@users.noreply.github.com> Date: Thu, 9 Oct 2025 09:38:09 +0100 Subject: [PATCH 43/50] Export state directory to OnDemand nodes in CaaS environment (#809) * export state directory to ondemand nodes for caas * fixed caas config --- ansible/roles/zenith_proxy/tasks/main.yml | 2 ++ environments/.caas/inventory/group_vars/all/nfs.yml | 10 +++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/ansible/roles/zenith_proxy/tasks/main.yml b/ansible/roles/zenith_proxy/tasks/main.yml index 7a4c034..360e77d 100644 --- a/ansible/roles/zenith_proxy/tasks/main.yml +++ b/ansible/roles/zenith_proxy/tasks/main.yml @@ -61,6 +61,8 @@ group: "{{ zenith_proxy_podman_user }}" mode: "0755" become: true + delegate_to: "{{ groups['control'] | first }}" + run_once: true - name: Initialise Zenith client # Use a foreground command rather than the podman_container module as I could not diff --git a/environments/.caas/inventory/group_vars/all/nfs.yml b/environments/.caas/inventory/group_vars/all/nfs.yml index 0eca0c8..7d617df 100644 --- a/environments/.caas/inventory/group_vars/all/nfs.yml +++ b/environments/.caas/inventory/group_vars/all/nfs.yml @@ -9,4 +9,12 @@ caas_nfs_home: nfs_export: "/exports/home" # assumes default site TF is being used nfs_client_mnt_point: "/home" -nfs_configurations: "{{ caas_nfs_home if not cluster_home_manila_share | bool else [] }}" +caas_ood_zenith_state_dir: + - comment: Export /var/lib/state from Slurm control node + nfs_enable: + server: "{{ inventory_hostname in groups['control'] }}" + clients: "{{ inventory_hostname in groups['openondemand'] }}" + nfs_export: "/var/lib/state" + nfs_client_mnt_point: "/var/lib/state" + +nfs_configurations: "{{ caas_ood_zenith_state_dir + ( caas_nfs_home if not cluster_home_manila_share | bool else [] ) }}" From bc4e6ee9a8e0c3e0c50809b895c9960bead7d7fa Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Fri, 7 Nov 2025 11:16:58 +0000 Subject: [PATCH 44/50] Bump image for v2.7 --- environments/site/builder.pkrvars.hcl | 2 +- environments/site/tofu/variables.tf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/site/builder.pkrvars.hcl b/environments/site/builder.pkrvars.hcl index 8163baf..9029117 100644 --- a/environments/site/builder.pkrvars.hcl +++ b/environments/site/builder.pkrvars.hcl @@ -1,6 +1,6 @@ flavor = "ec1.large" # VM flavor to use for builder VMs networks = ["84205817-e75c-47c7-a57e-0f14ee8de257"] # List of network UUIDs to attach the VM to - workshop-internal -source_image_name = "openhpc-RL9-250808-1727-faa44755" # Name of image to create VM with, i.e. 
starting image +source_image_name = "openhpc-RL9-251002-1456-1d21952c" # Name of image to create VM with, i.e. starting image volume_size = "15" # in GB volume_type = "unencrypted" inventory_groups = "extra_packages" # Additional inventory groups to add build VM to diff --git a/environments/site/tofu/variables.tf b/environments/site/tofu/variables.tf index f5e7dd0..f4679b9 100644 --- a/environments/site/tofu/variables.tf +++ b/environments/site/tofu/variables.tf @@ -126,7 +126,7 @@ variable "login" { variable "cluster_image_id" { type = string description = "ID of default image for the cluster" - default = "7ca99016-c342-4557-8a8d-9a856e934b58" # openhpc-RL9-250808-1727-faa44755 + default = "d62d93df-0a1f-473f-81ef-d89538dd6cef" # openhpc-RL9-251002-1456-1d21952c } variable "compute" { From d954cbc72e9d84977ff2e4dd1a39aab199227550 Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Mon, 10 Nov 2025 14:20:14 +0000 Subject: [PATCH 45/50] Don't use login as name for login node object --- environments/site/tofu/variables.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/site/tofu/variables.tf b/environments/site/tofu/variables.tf index f4679b9..4579a7c 100644 --- a/environments/site/tofu/variables.tf +++ b/environments/site/tofu/variables.tf @@ -55,7 +55,7 @@ variable "control_node_flavor" { variable "login" { default = { - login = { + head = { nodes = ["login-0"] flavor = "en1.xsmall" } From cd96d9d203791199470f73d5908652cfcf38658c Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Mon, 10 Nov 2025 16:01:12 +0000 Subject: [PATCH 46/50] Fix structure of ansible-ssh --- dev/ansible-ssh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/dev/ansible-ssh b/dev/ansible-ssh index b2e13ff..6a1839b 100755 --- a/dev/ansible-ssh +++ b/dev/ansible-ssh @@ -21,8 +21,7 @@ def _optional_arg(prototype, *values): if __name__ == "__main__": if len(sys.argv) < 2: msg = ( - f"Usage: { - sys.argv[0]} [args to pass to ssh]") + f"Usage: {sys.argv[0]} [args to pass to ssh]") print(msg, file=sys.stderr) sys.exit(-1) From 5812791956c1d766126fe224a83b2d286fdc5ed6 Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Mon, 10 Nov 2025 19:45:55 +0000 Subject: [PATCH 47/50] Add script to do all env related config at once --- dev/activate-env.sh | 5 +++++ 1 file changed, 5 insertions(+) create mode 100755 dev/activate-env.sh diff --git a/dev/activate-env.sh b/dev/activate-env.sh new file mode 100755 index 0000000..0982709 --- /dev/null +++ b/dev/activate-env.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash +. venv/bin/activate +. 
environments/staging/activate +export OS_CLOUD=openstack +export ANSIBLE_VAULT_PASSWORD_FILE=/home/lab/vault.password From 06dede48fe76fa6c4c5c13e94786594ae726f560 Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Mon, 10 Nov 2025 20:22:45 +0000 Subject: [PATCH 48/50] Change image to use id and volume size for v2.7 image build (packer) --- environments/site/builder.pkrvars.hcl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/environments/site/builder.pkrvars.hcl b/environments/site/builder.pkrvars.hcl index 9029117..7ce3795 100644 --- a/environments/site/builder.pkrvars.hcl +++ b/environments/site/builder.pkrvars.hcl @@ -1,7 +1,8 @@ flavor = "ec1.large" # VM flavor to use for builder VMs networks = ["84205817-e75c-47c7-a57e-0f14ee8de257"] # List of network UUIDs to attach the VM to - workshop-internal -source_image_name = "openhpc-RL9-251002-1456-1d21952c" # Name of image to create VM with, i.e. starting image -volume_size = "15" # in GB +# source_image_name = "openhpc-RL9-251002-1456-1d21952c" # Name of image to create VM with, i.e. starting image +source_image = "d62d93df-0a1f-473f-81ef-d89538dd6cef" # Use image ID instead of name to avoid ambiguity +volume_size = "25" # in GB volume_type = "unencrypted" inventory_groups = "extra_packages" # Additional inventory groups to add build VM to From 079dd05fefe5472ce18babebf77ce169a11a309b Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Tue, 11 Nov 2025 14:32:28 +0000 Subject: [PATCH 49/50] Change volume size for workshop --- environments/site/tofu/variables.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environments/site/tofu/variables.tf b/environments/site/tofu/variables.tf index 4579a7c..a5c5050 100644 --- a/environments/site/tofu/variables.tf +++ b/environments/site/tofu/variables.tf @@ -227,7 +227,7 @@ variable "state_dir" { variable "state_volume_size" { type = number description = "Size of state volume on control node, in GB" - default = 150 # GB + default = 75 # GB } variable "state_volume_type" { @@ -261,7 +261,7 @@ variable "state_volume_provisioning" { variable "home_volume_size" { type = number description = "Size of state volume on control node, in GB." - default = 100 + default = 75 validation { condition = var.home_volume_provisioning == "manage" ? var.home_volume_size > 0 : true error_message = <<-EOT From ea2aea212129f1b6a3912c9716340a81533b2ed8 Mon Sep 17 00:00:00 2001 From: technowhizz <7688823+technowhizz@users.noreply.github.com> Date: Wed, 12 Nov 2025 14:15:32 +0000 Subject: [PATCH 50/50] Change volume size for packer --- environments/site/builder.pkrvars.hcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environments/site/builder.pkrvars.hcl b/environments/site/builder.pkrvars.hcl index 7ce3795..ea65759 100644 --- a/environments/site/builder.pkrvars.hcl +++ b/environments/site/builder.pkrvars.hcl @@ -2,7 +2,7 @@ flavor = "ec1.large" # VM flavor to use for builder VM networks = ["84205817-e75c-47c7-a57e-0f14ee8de257"] # List of network UUIDs to attach the VM to - workshop-internal # source_image_name = "openhpc-RL9-251002-1456-1d21952c" # Name of image to create VM with, i.e. starting image source_image = "d62d93df-0a1f-473f-81ef-d89538dd6cef" # Use image ID instead of name to avoid ambiguity -volume_size = "25" # in GB +volume_size = "20" # in GB volume_type = "unencrypted" inventory_groups = "extra_packages" # Additional inventory groups to add build VM to
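The final commits pin the workshop builder to the v2.7 fat image by ID and settle the build volume at 20 GB. A minimal sketch of an extra-packages image build with these settings — assuming `dev/activate-env.sh` is sourced from the repository root (it hard-codes the staging environment, `OS_CLOUD=openstack` and a lab-specific vault password path) and that `packer` is on the PATH, with the invocation otherwise following `docs/image-build.md`:

```bash
# Sketch: site image build using the updated builder variables.
# Adjust or skip the activation helper for non-workshop setups.
. dev/activate-env.sh    # venv + environments/staging/activate + OS_CLOUD + vault password file
cd packer/
PACKER_LOG=1 packer build -on-error=ask \
  -var-file=../environments/site/builder.pkrvars.hcl openstack.pkr.hcl
```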