From b8a502f825ec249083437c5d1eaf3af89fbba12f Mon Sep 17 00:00:00 2001
From: Max Norton
Date: Mon, 16 Jun 2025 14:20:29 +0100
Subject: [PATCH 01/37] Add GitHub Actions for running code linters

---
 .ansible-lint.yml | 9 +++++++
 .editorconfig | 8 ++++++
 .github/linters/actionlint.yml | 1 +
 .github/workflows/lint.yml | 47 ++++++++++++++++++++++++++++++++++
 .yamllint.yml | 24 +++++++++++++++++
 README.md | 17 ++++++++++++
 actionlint.yml | 1 +
 super-linter.env | 14 ++++++++++
 8 files changed, 121 insertions(+)
 create mode 100644 .ansible-lint.yml
 create mode 100644 .editorconfig
 create mode 120000 .github/linters/actionlint.yml
 create mode 100644 .github/workflows/lint.yml
 create mode 100644 .yamllint.yml
 create mode 100644 actionlint.yml
 create mode 100644 super-linter.env

diff --git a/.ansible-lint.yml b/.ansible-lint.yml
new file mode 100644
index 000000000..846d42b86
--- /dev/null
+++ b/.ansible-lint.yml
@@ -0,0 +1,9 @@
+---
+skip_list:
+  - var-naming[no-role-prefix]
+  - galaxy[no-changelog]
+  - galaxy[version-incorrect]
+  - meta-runtime[unsupported-version]
+exclude_paths:
+  - actionlint.yml
+  - .github/
diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 000000000..984b0d9d7
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,8 @@
+# This is primarily used to alter the behaviour of linters executed by super-linter.
+# See https://editorconfig.org/
+
+# shfmt will default to indenting shell scripts with tabs,
+# define the indent as 2 spaces
+[bin/*]
+indent_style = space
+indent_size = 2
diff --git a/.github/linters/actionlint.yml b/.github/linters/actionlint.yml
new file mode 120000
index 000000000..766b4e9ba
--- /dev/null
+++ b/.github/linters/actionlint.yml
@@ -0,0 +1 @@
+../../actionlint.yml
\ No newline at end of file
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
new file mode 100644
index 000000000..e51c31b8d
--- /dev/null
+++ b/.github/workflows/lint.yml
@@ -0,0 +1,47 @@
+---
+name: Lint
+
+on: # yamllint disable-line rule:truthy
+  workflow_call:
+
+permissions:
+  contents: read
+  packages: read
+  # To report GitHub Actions status checks
+  statuses: write
+
+jobs:
+  lint:
+    name: Lint
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: read
+      # To report GitHub Actions status checks
+      statuses: write
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          # super-linter needs the full git history to get the
+          # list of files that changed across commits
+          fetch-depth: 0
+          submodules: true
+
+      - name: Run ansible-lint
+        uses: ansible/ansible-lint@v25.4.0
+
+      - name: Load super-linter configuration
+        # Use grep inverse matching to exclude eventual comments in the .env file
+        # because the GitHub Actions command to set environment variables doesn't
+        # support comments.
+        # yamllint disable-line rule:line-length
+        # Ref: https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#setting-an-environment-variable
+        run: grep -v '^#' super-linter.env >> "$GITHUB_ENV"
+        if: always()
+
+      - name: Run super-linter
+        uses: super-linter/super-linter@v7.3.0
+        if: always()
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.yamllint.yml b/.yamllint.yml
new file mode 100644
index 000000000..32202604f
--- /dev/null
+++ b/.yamllint.yml
@@ -0,0 +1,24 @@
+---
+extends: default
+
+rules:
+  brackets:
+    forbid: non-empty
+  comments:
+    # https://github.com/prettier/prettier/issues/6780
+    min-spaces-from-content: 1
+  # https://github.com/adrienverge/yamllint/issues/384
+  comments-indentation: false
+  document-start: disable
+  # 160 chars was the default used by old E204 rule, but
+  # you can easily change it or disable in your .yamllint file.
+  line-length:
+    max: 160
+  # We are adding an extra space inside braces as that's how prettier does it
+  # and we are trying not to fight other linters.
+  braces:
+    min-spaces-inside: 0 # yamllint defaults to 0
+    max-spaces-inside: 1 # yamllint defaults to 0
+  octal-values:
+    forbid-implicit-octal: true # yamllint defaults to false
+    forbid-explicit-octal: true # yamllint defaults to false
diff --git a/README.md b/README.md
index a47afd4e4..6ee964fef 100644
--- a/README.md
+++ b/README.md
@@ -138,3 +138,20 @@ where the IP of the login node is given in `environments/$ENV/inventory/hosts.yml`
 - `dev/`: Contains development tools.
 
 For further information see the [docs](docs/) directory.
+
+## Developing locally
+
+To run the GitHub Actions linters locally, use:
+
+```sh
+docker run --rm \
+  -e RUN_LOCAL=true \
+  --env-file "super-linter.env" \
+  -v "$(pwd)":/tmp/lint \
+  ghcr.io/super-linter/super-linter:v7.3.0
+```
+
+```sh
+ansible-lint -c .ansible-lint.yml ansible/
+```
+
diff --git a/actionlint.yml b/actionlint.yml
new file mode 100644
index 000000000..ed97d539c
--- /dev/null
+++ b/actionlint.yml
@@ -0,0 +1 @@
+---
diff --git a/super-linter.env b/super-linter.env
new file mode 100644
index 000000000..df2f160b5
--- /dev/null
+++ b/super-linter.env
@@ -0,0 +1,14 @@
+# Detect that default branch is devel when running locally
+DEFAULT_BRANCH=main
+
+# Don't validate JSCPD
+VALIDATE_JSCPD=false
+
+# Don't validate JS standard because it conflicts with JS prettier
+VALIDATE_JAVASCRIPT_STANDARD=false
+
+# Don't validate Ansible because ansible-lint is more flexible
+VALIDATE_ANSIBLE=false
+
+# Don't validate YAML prettier because yamllint is sufficient
+VALIDATE_YAML_PRETTIER=false

From 9939b235fbc1334f8b4f07e642813977ef2998ef Mon Sep 17 00:00:00 2001
From: Max Norton
Date: Thu, 3 Jul 2025 18:31:42 +0100
Subject: [PATCH 02/37] Fix linting issues.

The super-linter.env currently has the following additions that are to
be addressed in the future:

  VALIDATE_GITHUB_ACTIONS=false
  VALIDATE_SHELL_SHFMT=false
  VALIDATE_YAML=false

Most of the linting for the above has been addressed with just a single
issue remaining that blocks the linter from being enabled.
---
 .ansible-lint.yml | 17 +-
 .checkov.yaml | 4 +
 .editorconfig | 2 +-
 .github/bin/create-merge-branch.sh | 10 +-
 .github/bin/get-s3-image.sh | 14 +-
 .github/linters/.checkov.yaml | 1 +
 .github/linters/.python-lint | 1 +
 .github/linters/.shellcheckrc | 1 +
 .github/linters/.yamllint.yml | 1 +
 .github/workflows/extra.yml | 20 +-
 .github/workflows/fatimage.yml | 19 +-
 .github/workflows/nightly-cleanup.yml | 16 +-
 .github/workflows/nightlybuild.yml | 21 +-
 .github/workflows/release-image.yml | 9 +-
 .github/workflows/s3-image-sync.yml | 18 +-
 .github/workflows/stackhpc.yml | 41 +--
 .github/workflows/trivyscan.yml | 14 +-
 .github/workflows/upgrade-check.yml.sample | 7 +
 .../workflows/upload-release-image.yml.sample | 6 +
 .gitignore | 1 +
 .python-lint | 6 +
 .shellcheckrc | 7 +
 README.md | 115 ++++----
 ansible.cfg | 2 +-
 ansible/adhoc/backup-keytabs.yml | 7 +-
 ansible/adhoc/cudatests.yml | 7 +-
 ansible/adhoc/deploy-pulp.yml | 33 +--
 ansible/adhoc/generate-passwords.yml | 5 +-
 ansible/adhoc/hpctests.yml | 3 +-
 ansible/adhoc/rebuild-via-slurm.yml | 5 +-
 ansible/adhoc/rebuild.yml | 17 +-
 ansible/adhoc/restart-slurm.yml | 19 +-
 ansible/adhoc/sync-pulp.yml | 1 +
 ansible/adhoc/update-packages.yml | 10 +-
 ansible/bootstrap.yml | 213 +++++++-------
 ansible/ci/check_eessi.yml | 22 +-
 ansible/ci/check_grafana.yml | 10 +-
 ansible/ci/check_sacct_hpctests.yml | 5 +-
 ansible/ci/check_slurm.yml | 5 +-
 ansible/ci/delete_images.yml | 14 +-
 ansible/ci/get_image_ids.yml | 5 +-
 ansible/ci/library/grafana_elasticsearch_query.py | 98 ++++---
 ansible/ci/output_vars.yml | 5 +-
 ansible/ci/retrieve_inventory.yml | 13 +-
 ansible/ci/update_timestamps.yml | 3 +-
 ansible/cleanup.yml | 38 +--
 ansible/disable-repos.yml | 3 +-
 ansible/extras.yml | 46 +--
 ansible/fatimage.yml | 138 ++++-----
 ansible/filesystems.yml | 11 +-
 ansible/filter_plugins/utils.py | 88 +++---
 ansible/iam.yml | 27 +-
 ansible/library/latest_timestamps.py | 99 ++++---
 ansible/library/user_namespace_facts.py | 69 +++--
 ansible/monitoring.yml | 35 +--
 ansible/noop.yml | 1 -
 ansible/portal.yml | 19 +-
 ansible/roles/alertmanager/README.md | 37 ++-
 ansible/roles/alertmanager/defaults/main.yml | 18 +-
 ansible/roles/alertmanager/handlers/main.yml | 3 +-
 .../roles/alertmanager/tasks/configure.yml | 8 +-
 ansible/roles/alertmanager/tasks/install.yml | 5 +-
 ansible/roles/basic_users/README.md | 65 ++---
 ansible/roles/basic_users/defaults/main.yml | 5 +-
 .../basic_users/filter_plugins/filter_keys.py | 29 +-
 .../library/terminate_user_sessions.py | 69 +++--
 ansible/roles/basic_users/tasks/main.yml | 31 +-
 ansible/roles/block_devices/README.md | 26 +-
 ansible/roles/block_devices/defaults/main.yml | 6 +-
 .../block_devices/library/block_devices.py | 32 ++-
 ansible/roles/block_devices/tasks/main.yml | 27 +-
 ansible/roles/cacerts/defaults/main.yml | 3 +-
 ansible/roles/cacerts/tasks/configure.yml | 9 +-
 ansible/roles/cacerts/tasks/export.yml | 5 +-
 ansible/roles/cacerts/tasks/main.yml | 3 +-
 ansible/roles/cluster_infra/defaults/main.yml | 1 +
 ansible/roles/cluster_infra/tasks/main.yml | 36 +--
 ansible/roles/compute_init/README.md | 226 ++++++++-------
 .../roles/compute_init/files/compute-init.yml | 127 ++++----
 ansible/roles/compute_init/tasks/export.yml | 29 +-
 ansible/roles/compute_init/tasks/install.yml | 25 +-
 ansible/roles/cuda/defaults/main.yml | 6 +-
 ansible/roles/cuda/tasks/install.yml | 19 +-
 ansible/roles/cuda/tasks/runtime.yml | 3 +-
 ansible/roles/cuda/tasks/samples.yml | 13 +-
 ansible/roles/dnf_repos/defaults/main.yml | 73 ++---
 ansible/roles/dnf_repos/tasks/set_repos.yml | 1 -
 ansible/roles/doca/defaults/main.yml | 3 +-
 .../roles/doca/tasks/install-kernel-devel.yml | 14 +-
 ansible/roles/doca/tasks/install.yml | 18 +-
 ansible/roles/doca/tasks/main.yml | 3 +-
 ansible/roles/eessi/README.md | 19 +-
 ansible/roles/eessi/defaults/main.yaml | 1 -
 ansible/roles/eessi/tasks/main.yaml | 23 +-
 ansible/roles/etc_hosts/README.md | 5 +-
 ansible/roles/etc_hosts/defaults/main.yml | 3 +-
 ansible/roles/etc_hosts/tasks/main.yml | 7 +-
 ansible/roles/fail2ban/README.md | 22 +-
 ansible/roles/fail2ban/handlers/main.yml | 3 +-
 ansible/roles/fail2ban/meta/main.yml | 10 +-
 ansible/roles/fail2ban/tasks/main.yml | 14 +-
 ansible/roles/filebeat/defaults/main.yml | 2 +-
 ansible/roles/filebeat/handlers/main.yml | 7 +-
 ansible/roles/filebeat/tasks/install.yml | 11 +-
 ansible/roles/filebeat/tasks/main.yml | 5 +-
 ansible/roles/filebeat/tasks/runtime.yml | 24 +-
 ansible/roles/filebeat/tasks/validate.yml | 4 +-
 ansible/roles/firewalld/README.md | 42 ++-
 ansible/roles/firewalld/defaults/main.yml | 3 +-
 ansible/roles/firewalld/handlers/main.yml | 2 +-
 ansible/roles/firewalld/meta/main.yml | 11 +-
 ansible/roles/firewalld/tasks/install.yml | 3 +-
 ansible/roles/firewalld/tasks/main.yml | 4 +-
 ansible/roles/firewalld/tasks/runtime.yml | 6 +-
 ansible/roles/freeipa/README.md | 33 ++-
 ansible/roles/freeipa/defaults/main.yml | 9 +-
 ansible/roles/freeipa/tasks/addhost.yml | 7 +-
 .../roles/freeipa/tasks/backup-keytabs.yml | 6 +-
 .../roles/freeipa/tasks/client-install.yml | 4 +-
 ansible/roles/freeipa/tasks/enrol.yml | 19 +-
 ansible/roles/freeipa/tasks/server.yml | 40 +--
 ansible/roles/freeipa/tasks/users.yml | 10 +-
 ansible/roles/freeipa/tasks/validate.yml | 17 +-
 ansible/roles/gateway/README.md | 2 +
 ansible/roles/gateway/files/gateway-init.yml | 29 +-
 ansible/roles/gateway/tasks/main.yml | 5 +-
 .../files/openhpc-slurm.json | 2 +-
 .../roles/grafana-dashboards/tasks/main.yml | 25 +-
 ansible/roles/hpctests/README.md | 46 ++-
 ansible/roles/hpctests/defaults/main.yml | 22 +-
 .../roles/hpctests/files/.clang-format-ignore | 1 +
 ansible/roles/hpctests/files/CPPLINT.cfg | 1 +
 .../roles/hpctests/files/plot_imb_pingpong.py | 111 ++++---
 ansible/roles/hpctests/library/hpl_pq.py | 41 +--
 .../roles/hpctests/library/plot_nxnlatbw.py | 184 ++++++++----
 .../hpctests/library/read_imb_pingpong.py | 45 +--
 .../roles/hpctests/library/slurm_node_info.py | 48 ++--
 ansible/roles/hpctests/meta/main.yml | 4 +-
 ansible/roles/hpctests/tasks/build-hpl.yml | 43 +--
 ansible/roles/hpctests/tasks/hpl-solo.yml | 61 ++--
 ansible/roles/hpctests/tasks/main.yml | 25 +-
 ansible/roles/hpctests/tasks/pingmatrix.yml | 45 +--
 ansible/roles/hpctests/tasks/pingpong.yml | 39 +--
 ansible/roles/hpctests/tasks/setup.yml | 18 +-
 .../roles/hpctests/templates/hpl-build.sh.j2 | 0
 .../roles/hpctests/templates/hpl-solo.sh.j2 | 0
 .../roles/hpctests/templates/pingmatrix.sh.j2 | 0
 .../roles/hpctests/templates/pingpong.sh.j2 | 0
 ansible/roles/k3s/README.md | 10 +-
 ansible/roles/k3s/defaults/main.yml | 3 +-
 ansible/roles/k3s/tasks/agent-runtime.yml | 11 +-
 ansible/roles/k3s/tasks/install.yml | 93 +++---
 ansible/roles/k3s/tasks/server-runtime.yml | 22 +-
 .../k3s/templates/k3s-agent.service.env.j2 | 6 +-
 .../roles/k3s/templates/k3s.service.env.j2 | 2 +-
 ansible/roles/k9s/tasks/main.yml | 19 +-
 ansible/roles/lustre/README.md | 17 +-
 ansible/roles/lustre/defaults/main.yml | 7 +-
 ansible/roles/lustre/tasks/configure.yml | 12 +-
 ansible/roles/lustre/tasks/install.yml | 18 +-
 ansible/roles/lustre/tasks/validate.yml | 9 +-
 ansible/roles/mysql/README.md | 28 +-
 ansible/roles/mysql/defaults/main.yml | 5 +-
 ansible/roles/mysql/tasks/configure.yml | 41 +--
 ansible/roles/mysql/tasks/install.yml | 10 +-
 ansible/roles/mysql/tasks/main.yml | 5 +-
 ansible/roles/ofed/README.md | 7 +-
 ansible/roles/ofed/defaults/main.yml | 7 +-
 ansible/roles/ofed/tasks/install.yml | 37 +--
 ansible/roles/ofed/tasks/main.yml | 3 +-
 ansible/roles/openondemand/README.md | 42 ++-
 ansible/roles/openondemand/defaults/main.yml | 35 ++-
 .../files/missing_home_directory.html | 99 ++++---
 .../openondemand/tasks/config_changes.yml | 3 +-
 ansible/roles/openondemand/tasks/exporter.yml | 5 +-
 .../openondemand/tasks/jupyter_compute.yml | 14 +-
 ansible/roles/openondemand/tasks/main.yml | 51 ++--
 ansible/roles/openondemand/tasks/pam_auth.yml | 14 +-
 ansible/roles/openondemand/tasks/validate.yml | 3 +-
 .../roles/openondemand/tasks/vnc_compute.yml | 29 +-
 ansible/roles/opensearch/defaults/main.yml | 4 +-
 ansible/roles/opensearch/handlers/main.yml | 3 +-
 .../roles/opensearch/tasks/archive_data.yml | 4 +-
 ansible/roles/opensearch/tasks/certs.yml | 3 +-
 ansible/roles/opensearch/tasks/install.yml | 16 +-
 .../opensearch/tasks/migrate-opendistro.yml | 3 +-
 ansible/roles/opensearch/tasks/runtime.yml | 44 ++-
 ansible/roles/passwords/defaults/main.yml | 3 +
 ansible/roles/passwords/tasks/main.yml | 4 +-
 ansible/roles/passwords/tasks/validate.yml | 3 +-
 .../roles/persist_hostkeys/defaults/main.yml | 1 +
 ansible/roles/persist_hostkeys/tasks/main.yml | 66 ++---
 .../persist_openhpc_secrets/tasks/main.yml | 17 +-
 ansible/roles/podman/defaults/main.yml | 1 +
 ansible/roles/podman/tasks/config.yml | 18 +-
 ansible/roles/podman/tasks/prereqs.yml | 4 +-
 ansible/roles/proxy/README.md | 2 +-
 ansible/roles/proxy/defaults/main.yml | 1 +
 ansible/roles/proxy/tasks/main.yml | 24 +-
 ansible/roles/pulp_site/defaults/main.yml | 38 +--
 .../filter_plugins/pulp-list-filters.py | 62 ++--
 ansible/roles/pulp_site/tasks/install.yml | 27 +-
 ansible/roles/pulp_site/tasks/sync.yml | 40 +--
 ansible/roles/rebuild/README.md | 11 +-
 ansible/roles/rebuild/defaults/main.yml | 6 +-
 ansible/roles/rebuild/tasks/main.yml | 7 +-
 ansible/roles/rebuild/tasks/rebuild.yml | 6 +-
 .../roles/rebuild/tasks/rebuild_partition.yml | 7 +-
 ansible/roles/resolv_conf/README.md | 2 +
 ansible/roles/resolv_conf/defaults/main.yml | 1 +
 ansible/roles/resolv_conf/tasks/main.yml | 3 +-
 ansible/roles/slurm_exporter/README.md | 41 ++-
 .../roles/slurm_exporter/defaults/main.yml | 4 +-
 .../roles/slurm_exporter/handlers/main.yml | 2 +-
 .../roles/slurm_exporter/tasks/install.yml | 14 +-
 ansible/roles/slurm_exporter/tasks/main.yml | 2 +-
 ansible/roles/slurm_stats/README.md | 24 +-
 ansible/roles/slurm_stats/tasks/main.yml | 11 +-
 ansible/roles/slurm_tools/README.md | 8 +-
 ansible/roles/slurm_tools/tasks/main.yml | 32 +--
 ansible/roles/squid/README.md | 2 +-
 ansible/roles/squid/defaults/main.yml | 5 +-
 ansible/roles/squid/handlers/main.yml | 3 +-
 ansible/roles/squid/tasks/configure.yml | 10 +-
 ansible/roles/squid/tasks/install.yml | 3 +-
 ansible/roles/squid/tasks/main.yml | 5 +-
 ansible/roles/sshd/defaults/main.yml | 1 +
 ansible/roles/sshd/handlers/main.yml | 3 +-
 ansible/roles/sshd/tasks/configure.yml | 11 +-
 ansible/roles/sshd/tasks/export.yml | 3 +-
 ansible/roles/sshd/tasks/main.yml | 3 +-
 ansible/roles/sssd/README.md | 1 -
 ansible/roles/sssd/defaults/main.yml | 1 +
 ansible/roles/sssd/handlers/main.yml | 3 +-
 ansible/roles/sssd/tasks/configure.yml | 20 +-
 ansible/roles/sssd/tasks/export.yml | 5 +-
 ansible/roles/sssd/tasks/install.yml | 7 +-
 ansible/roles/sssd/tasks/main.yml | 5 +-
 ansible/roles/systemd/README.md | 19 +-
 ansible/roles/systemd/defaults/main.yml | 3 +-
 ansible/roles/systemd/tasks/main.yml | 13 +-
 ansible/roles/tuned/README.md | 7 +-
 ansible/roles/tuned/defaults/main.yml | 2 +-
 ansible/roles/tuned/tasks/configure.yml | 2 +-
 ansible/roles/tuned/tasks/install.yml | 3 +-
 ansible/roles/tuned/tasks/main.yml | 4 +-
 ansible/roles/zenith_proxy/defaults/main.yml | 8 +-
 .../files/podman-pod-infra-attach.sh | 2 +-
 ansible/roles/zenith_proxy/tasks/main.yml | 43 +--
 ansible/site.yml | 9 +-
 ansible/slurm.yml | 31 +-
 ansible/validate.yml | 23 +-
 dev/ansible-ssh | 24 +-
 dev/delete-cluster.py | 48 ++--
 dev/extract_logs.py | 74 +++--
 dev/image-share.sh | 12 +-
 dev/output_manifest.py | 26 +-
 dev/setup-env.sh | 42 +--
 docs/README.md | 8 +-
 docs/adding-functionality.md | 3 +-
 docs/alerting.md | 74 ++---
 docs/chrony.md | 3 +-
 docs/ci.md | 3 +-
 docs/environments.md | 15 +-
 docs/experimental/compute-init.md | 12 +-
 docs/experimental/pulp.md | 5 +-
 docs/experimental/slurm-controlled-rebuild.md | 272 ++++++++++--------
 docs/image-build.md | 76 ++---
 docs/k3s.README.md | 8 +-
 docs/monitoring-and-logging.md | 67 +++--
 docs/networks.md | 11 +-
 docs/openondemand.md | 25 +-
 docs/operations.md | 86 +++---
 docs/persistent-state.md | 2 +
 docs/production.md | 176 ++++++------
 docs/sequence.md | 11 +-
 docs/site/README.md | 3 +-
 docs/upgrades.md | 84 +++---
 environments/.caas/README.md | 11 +-
 environments/.caas/hooks/post.yml | 16 +-
 environments/.caas/hooks/pre.yml | 31 +-
 .../inventory/group_vars/all/basic_users.yml | 1 +
 .../inventory/group_vars/all/cluster.yml | 1 +
 .../inventory/group_vars/all/grafana.yml | 1 +
 .../inventory/group_vars/all/hpctests.yml | 3 +-
 .../.caas/inventory/group_vars/all/manila.yml | 5 +-
 .../.caas/inventory/group_vars/all/nfs.yml | 5 +-
 .../inventory/group_vars/all/openhpc.yml | 1 +
 .../inventory/group_vars/all/openondemand.yml | 1 -
 .../.caas/inventory/group_vars/all/zenith.yml | 1 +
 .../.caas/inventory/group_vars/openstack.yml | 1 +
 .../ui-meta/slurm-infra-fast-volume-type.yml | 13 +-
 .../.caas/ui-meta/slurm-infra-manila-home.yml | 12 +-
 environments/.caas/ui-meta/slurm-infra.yml | 12 +-
 .../.stackhpc/hooks/post-bootstrap.yml | 8 +-
 environments/.stackhpc/hooks/pre.yml | 9 +-
 .../inventory/group_vars/all/basic_users.yml | 3 +
 .../inventory/group_vars/all/bastion.yml | 1 +
 .../inventory/group_vars/all/freeipa.yml | 1 +
 .../inventory/group_vars/all/grafana.yml | 1 +
 .../inventory/group_vars/all/hpctests.yml | 1 +
 .../inventory/group_vars/all/manila.yml | 1 +
 .../inventory/group_vars/all/openhpc.yml | 1 +
 .../inventory/group_vars/all/openondemand.yml | 7 +-
 .../inventory/group_vars/all/podman.yml | 1 +
 .../inventory/group_vars/all/tuned.yml | 1 +
 .../inventory/group_vars/builder.yml | 3 +-
 .../tofu/cluster_image.auto.tfvars.json | 8 +-
 environments/.stackhpc/tofu/main.tf | 103 +++----
 environments/README.md | 13 +-
 .../common/files/filebeat/filebeat.yml | 1 +
 .../inventory/group_vars/all/alertmanager.yml | 4 +-
 .../inventory/group_vars/all/ansible_init.yml | 1 +
 .../inventory/group_vars/all/basic_users.yml | 3 +-
 .../inventory/group_vars/all/defaults.yml | 105 ++++---
 .../inventory/group_vars/all/filebeat.yml | 2 +-
 .../inventory/group_vars/all/firewalld.yml | 5 +-
 .../group_vars/all/freeipa_server.yml | 1 +
 .../inventory/group_vars/all/grafana.yml | 12 +-
 .../common/inventory/group_vars/all/k3s.yml | 1 +
 .../inventory/group_vars/all/manila.yml | 1 +
 .../common/inventory/group_vars/all/mysql.yml | 2 +-
 .../common/inventory/group_vars/all/nfs.yml | 12 +-
 .../inventory/group_vars/all/openhpc.yml | 16 +-
 .../inventory/group_vars/all/openondemand.yml | 31 +-
 .../group_vars/all/os-manila-mount.yml | 1 +
 .../inventory/group_vars/all/podman.yml | 1 +
 .../inventory/group_vars/all/prometheus.yml | 67 ++---
 .../common/inventory/group_vars/all/proxy.yml | 1 +
 .../common/inventory/group_vars/all/pulp.yml | 1 +
 .../group_vars/all/slurm_exporter.yml | 5 +-
 .../common/inventory/group_vars/all/squid.yml | 1 +
 .../common/inventory/group_vars/all/sshd.yaml | 1 +
 .../inventory/group_vars/all/systemd.yml | 1 +
 .../inventory/group_vars/all/timestamps.yml | 45 +--
 .../inventory/group_vars/all/update.yml | 7 +-
 environments/common/layouts/README.md | 2 +-
 environments/skeleton/cookiecutter.json | 4 +-
 .../{{cookiecutter.environment}}/README.md | 2 +-
 .../inventory/group_vars/all/basic_users.yml | 1 +
 .../inventory/group_vars/all/grafana.yml | 3 +-
 .../inventory/group_vars/all/hpctests.yml | 1 +
 .../group_vars/all/vault_alertmanager.yml | 2 +-
 .../tofu/baremetal-node-list.py | 35 +--
 .../tofu/compute.tf | 40 +--
 .../tofu/control.tf | 56 ++--
 .../{{cookiecutter.environment}}/tofu/data.tf | 3 +-
 .../tofu/inventory.tf | 23 +-
 .../tofu/login.tf | 40 +--
 .../{{cookiecutter.environment}}/tofu/main.tf | 2 +-
 .../tofu/network.tf | 8 +-
 .../tofu/node_group/main.tf | 2 +-
 .../tofu/node_group/network.tf | 4 +-
 .../tofu/node_group/nodes.tf | 116 ++++----
 .../tofu/node_group/variables.tf | 173 +++++------
 .../tofu/read-inventory-secrets.py | 50 ++--
 .../tofu/variables.tf | 184 ++++++------
 .../tofu/volumes.tf | 60 ++--
 packer/openhpc_extravars.yml | 3 +-
 requirements.yml | 1 -
 super-linter.env | 13 +
 361 files changed, 4198 insertions(+), 3479 deletions(-)
 create mode 100644 .checkov.yaml
 mode change 100644 => 100755 .github/bin/create-merge-branch.sh
 mode change 100644 => 100755 .github/bin/get-s3-image.sh
 create mode 120000 .github/linters/.checkov.yaml
 create mode 120000 .github/linters/.python-lint
 create mode 120000 .github/linters/.shellcheckrc
 create mode 120000 .github/linters/.yamllint.yml
 create mode 100644 .python-lint
 create mode 100644 .shellcheckrc
 create mode 100644 ansible/roles/hpctests/files/.clang-format-ignore
 create mode 100644 ansible/roles/hpctests/files/CPPLINT.cfg
 mode change 100644 => 100755 ansible/roles/hpctests/templates/hpl-build.sh.j2
 mode change 100644 => 100755 ansible/roles/hpctests/templates/hpl-solo.sh.j2
 mode change 100644 => 100755 ansible/roles/hpctests/templates/pingmatrix.sh.j2
 mode change 100644 => 100755 ansible/roles/hpctests/templates/pingpong.sh.j2
 mode change 100644 => 100755 ansible/roles/zenith_proxy/files/podman-pod-infra-attach.sh

diff --git a/.ansible-lint.yml b/.ansible-lint.yml
index 846d42b86..477f37f91 100644
--- a/.ansible-lint.yml
+++ b/.ansible-lint.yml
@@ -1,9 +1,22 @@
 ---
 skip_list:
-  - var-naming[no-role-prefix]
+  - role-name
+  # Unresolved issues with parsing jinja in multiline strings
+  # https://github.com/ansible/ansible-lint/issues/3935
+  - jinja[spacing]
   - galaxy[no-changelog]
-  - galaxy[version-incorrect]
   - meta-runtime[unsupported-version]
+
+warn_list:
+  - name[missing]
+  - name[play]
+  - var-naming
+
 exclude_paths:
   - actionlint.yml
+  - .ansible/
   - .github/
+  # The following are files with syntax errors.
+  # Rule 'syntax-check' is unskippable, you cannot use it in 'skip_list' or 'warn_list'. Still, you could exclude the file.
+  - ansible/roles/filebeat/tasks/runtime.yml
+  - environments/common/files/filebeat/filebeat.yml
diff --git a/.checkov.yaml b/.checkov.yaml
new file mode 100644
index 000000000..ef0fb8bfd
--- /dev/null
+++ b/.checkov.yaml
@@ -0,0 +1,4 @@
+---
+skip-check:
+  # Requires all blocks to have rescue: - not considered appropriate
+  - CKV2_ANSIBLE_3
diff --git a/.editorconfig b/.editorconfig
index 984b0d9d7..ab1e65780 100644
--- a/.editorconfig
+++ b/.editorconfig
@@ -3,6 +3,6 @@
 
 # shfmt will default to indenting shell scripts with tabs,
 # define the indent as 2 spaces
-[bin/*]
+[{.github/bin,dev}/*.sh]
 indent_style = space
 indent_size = 2
diff --git a/.github/bin/create-merge-branch.sh b/.github/bin/create-merge-branch.sh
old mode 100644
new mode 100755
index d76fe45de..af1684d13
--- a/.github/bin/create-merge-branch.sh
+++ b/.github/bin/create-merge-branch.sh
@@ -44,7 +44,7 @@ if git show-branch "remotes/origin/$BRANCH_NAME" >/dev/null 2>&1; then
 fi
 
 echo "[INFO] Merging release tag - $RELEASE_TAG"
-git merge --strategy recursive -X theirs --no-commit $RELEASE_TAG
+git merge --strategy recursive -X theirs --no-commit "$RELEASE_TAG"
 
 # Check if the merge resulted in any changes being staged
 if [ -n "$(git status --short)" ]; then
@@ -54,7 +54,7 @@ if [ -n "$(git status --short)" ]; then
   # NOTE(scott): The GitHub create-pull-request action does
   # the commiting for us, so we only need to make branches
   # and commits if running outside of GitHub actions.
-  if [ ! $GITHUB_ACTIONS ]; then
+  if [ ! "$GITHUB_ACTIONS" ]; then
     echo "[INFO] Checking out temporary branch '$BRANCH_NAME'..."
     git checkout -b "$BRANCH_NAME"
 
@@ -74,8 +74,8 @@ if [ -n "$(git status --short)" ]; then
 
   # Write a file containing the branch name and tag
   # for automatic PR or MR creation that follows
-  echo "BRANCH_NAME=\"$BRANCH_NAME\"" > .mergeenv
-  echo "RELEASE_TAG=\"$RELEASE_TAG\"" >> .mergeenv
+  echo "BRANCH_NAME=\"$BRANCH_NAME\"" >.mergeenv
+  echo "RELEASE_TAG=\"$RELEASE_TAG\"" >>.mergeenv
 else
   echo "[INFO] Merge resulted in no changes"
-fi
\ No newline at end of file
+fi
diff --git a/.github/bin/get-s3-image.sh b/.github/bin/get-s3-image.sh
old mode 100644
new mode 100755
index d20838aca..ca8d9b5f3
--- a/.github/bin/get-s3-image.sh
+++ b/.github/bin/get-s3-image.sh
@@ -13,14 +13,14 @@ echo "Checking if image $image_name exists in OpenStack"
 image_exists=$(openstack image list --name "$image_name" -f value -c Name)
 
 if [ -n "$image_exists" ]; then
-    echo "Image $image_name already exists in OpenStack."
+  echo "Image $image_name already exists in OpenStack."
 else
-    echo "Image $image_name not found in OpenStack. Getting it from S3."
+  echo "Image $image_name not found in OpenStack. Getting it from S3."
 
-    wget https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/$bucket_name/$image_name --progress=dot:giga
+  wget "https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/$bucket_name/$image_name --progress=dot:giga"
 
-    echo "Uploading image $image_name to OpenStack..."
-    openstack image create --file $image_name --disk-format qcow2 $image_name --progress
+  echo "Uploading image $image_name to OpenStack..."
+  openstack image create --file "$image_name" --disk-format qcow2 "$image_name" --progress
 
-    echo "Image $image_name has been uploaded to OpenStack."
-fi
\ No newline at end of file
+  echo "Image $image_name has been uploaded to OpenStack."
+fi
diff --git a/.github/linters/.checkov.yaml b/.github/linters/.checkov.yaml
new file mode 120000
index 000000000..2cc8ad8e9
--- /dev/null
+++ b/.github/linters/.checkov.yaml
@@ -0,0 +1 @@
+../../.checkov.yaml
\ No newline at end of file
diff --git a/.github/linters/.python-lint b/.github/linters/.python-lint
new file mode 120000
index 000000000..d0b74712e
--- /dev/null
+++ b/.github/linters/.python-lint
@@ -0,0 +1 @@
+../../.python-lint
\ No newline at end of file
diff --git a/.github/linters/.shellcheckrc b/.github/linters/.shellcheckrc
new file mode 120000
index 000000000..3f3450147
--- /dev/null
+++ b/.github/linters/.shellcheckrc
@@ -0,0 +1 @@
+../../.shellcheckrc
\ No newline at end of file
diff --git a/.github/linters/.yamllint.yml b/.github/linters/.yamllint.yml
new file mode 120000
index 000000000..54a3654b9
--- /dev/null
+++ b/.github/linters/.yamllint.yml
@@ -0,0 +1 @@
+../../.yamllint.yml
\ No newline at end of file
diff --git a/.github/workflows/extra.yml b/.github/workflows/extra.yml
index 6a075ae16..a076a6980 100644
--- a/.github/workflows/extra.yml
+++ b/.github/workflows/extra.yml
@@ -18,6 +18,12 @@ on:
       - 'ansible/roles/lustre/**'
       - '.github/workflows/extra.yml'
 
+permissions:
+  contents: read
+  packages: write
+  # To report GitHub Actions status checks
+  statuses: write
+
 jobs:
   doca:
     name: extra-build
@@ -44,7 +50,7 @@ jobs:
       ARK_PASSWORD: ${{ secrets.ARK_PASSWORD }}
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
 
       - name: Load current fat images into GITHUB_ENV
         # see https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#example-of-a-multiline-string
@@ -58,7 +64,7 @@
       - name: Record settings
         run: |
           echo CI_CLOUD: ${{ env.CI_CLOUD }}
-          echo FAT_IMAGES: ${FAT_IMAGES}
+          echo "FAT_IMAGES: ${FAT_IMAGES}"
 
       - name: Setup ssh
         run: |
@@ -97,7 +103,7 @@
 
           PACKER_LOG=1 packer build \
             -on-error=${{ vars.PACKER_ON_ERROR }} \
-            -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \
+            -var-file="$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl" \
             -var "source_image_name=${{ fromJSON(env.FAT_IMAGES)['cluster_image'][matrix.build.source_image_name_key] }}" \
             -var "image_name=${{ matrix.build.image_name }}" \
             -var "inventory_groups=${{ matrix.build.inventory_groups }}" \
@@ -109,14 +115,14 @@
         run: |
           . venv/bin/activate
           IMAGE_ID=$(jq --raw-output '.builds[-1].artifact_id' packer/packer-manifest.json)
-          while ! openstack image show -f value -c name $IMAGE_ID; do
+          while ! openstack image show -f value -c name "$IMAGE_ID"; do
            sleep 5
           done
-          IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID)
+          IMAGE_NAME=$(openstack image show -f value -c name "$IMAGE_ID")
           echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT"
           echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT"
-          echo $IMAGE_ID > image-id.txt
-          echo $IMAGE_NAME > image-name.txt
+          echo "$IMAGE_ID" > image-id.txt
+          echo "$IMAGE_NAME" > image-name.txt
 
       - name: Make image usable for further builds
         run: |
diff --git a/.github/workflows/fatimage.yml b/.github/workflows/fatimage.yml
index 8b5eebfc8..61cf264e2 100644
--- a/.github/workflows/fatimage.yml
+++ b/.github/workflows/fatimage.yml
@@ -1,6 +1,7 @@
 name: Build fat image
 on:
   workflow_dispatch:
+    # checkov:skip=CKV_GHA_7: "The build output cannot be affected by user parameters other than the build entry point and the top-level source location. GitHub Actions workflow_dispatch inputs MUST be empty. "
     inputs:
       ci_cloud:
         description: 'Select the CI_CLOUD'
@@ -16,6 +17,12 @@ on:
         required: true
         default: true
 
+permissions:
+  contents: read
+  packages: write
+  # To report GitHub Actions status checks
+  statuses: write
+
 jobs:
   openstack:
     name: openstack-imagebuild
@@ -41,7 +48,7 @@ jobs:
       LEAFCLOUD_PULP_PASSWORD: ${{ secrets.LEAFCLOUD_PULP_PASSWORD }}
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
 
      - name: Record settings for CI cloud
        run: |
@@ -84,7 +91,7 @@
 
           PACKER_LOG=1 packer build \
             -on-error=${{ github.event.inputs.cleanup_on_failure && 'cleanup' || 'abort' }} \
-            -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \
+            -var-file="$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl" \
             -var "source_image_name=${{ matrix.build.source_image_name }}" \
             -var "image_name=${{ matrix.build.image_name }}" \
             -var "inventory_groups=${{ matrix.build.inventory_groups }}" \
@@ -95,14 +102,14 @@
         run: |
           . venv/bin/activate
           IMAGE_ID=$(jq --raw-output '.builds[-1].artifact_id' packer/packer-manifest.json)
-          while ! openstack image show -f value -c name $IMAGE_ID; do
+          while ! openstack image show -f value -c name "$IMAGE_ID"; do
            sleep 5
           done
-          IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID)
+          IMAGE_NAME=$(openstack image show -f value -c name "$IMAGE_ID")
           echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT"
           echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT"
-          echo $IMAGE_ID > image-id.txt
-          echo $IMAGE_NAME > image-name.txt
+          echo "$IMAGE_ID" > image-id.txt
+          echo "$IMAGE_NAME" > image-name.txt
 
       - name: Make image usable for further builds
         run: |
diff --git a/.github/workflows/nightly-cleanup.yml b/.github/workflows/nightly-cleanup.yml
index 897d3572d..5bec96d48 100644
--- a/.github/workflows/nightly-cleanup.yml
+++ b/.github/workflows/nightly-cleanup.yml
@@ -4,6 +4,12 @@ on:
   schedule:
     - cron: '0 21 * * *' # Run at 9PM - image sync runs at midnight
 
+permissions:
+  contents: read
+  packages: write
+  # To report GitHub Actions status checks
+  statuses: write
+
 jobs:
   ci_cleanup:
     name: ci-cleanup
@@ -20,7 +26,7 @@ jobs:
       OS_CLOUD: openstack
       CI_CLOUD: ${{ matrix.cloud }}
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
 
       - name: Record which cloud CI is running on
         run: |
@@ -31,7 +37,7 @@
           python3 -m venv venv
           . venv/bin/activate
           pip install -U pip
-          pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt)
+          pip install "$(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt)"
         shell: bash
 
       - name: Write clouds.yaml
@@ -52,7 +58,7 @@
           # Flatten multiline value so can be passed as env var
           CI_CLUSTERS_FORMATTED=$(echo "$CI_CLUSTERS" | tr '\n' ' ' | sed 's/ $//')
           echo "DEBUG: Formatted CI clusters: $CI_CLUSTERS_FORMATTED"
-          echo "ci_clusters=$CI_CLUSTERS_FORMATTED" >> $GITHUB_ENV
+          echo "ci_clusters=$CI_CLUSTERS_FORMATTED" >> "$GITHUB_ENV"
         fi
         shell: bash
 
@@ -69,7 +75,7 @@
           echo "Processing cluster: $cluster_prefix"
 
           # Get all servers with the matching name for control node
-          CONTROL_SERVERS=$(openstack server list --name ${cluster_prefix}-control --format json)
+          CONTROL_SERVERS=$(openstack server list --name "${cluster_prefix}-control" --format json)
 
           # Get unique server names to avoid duplicate cleanup
           UNIQUE_NAMES=$(echo "$CONTROL_SERVERS" | jq -r '.[].Name' | sort | uniq)
@@ -86,7 +92,7 @@
             fi
 
             echo "Deleting cluster $cluster_prefix (server $server)..."
-            ./dev/delete-cluster.py $cluster_prefix --force
+            ./dev/delete-cluster.py "$cluster_prefix" --force
           done
         done
         shell: bash
diff --git a/.github/workflows/nightlybuild.yml b/.github/workflows/nightlybuild.yml
index f0d4b941f..0b61d4f8b 100644
--- a/.github/workflows/nightlybuild.yml
+++ b/.github/workflows/nightlybuild.yml
@@ -1,6 +1,7 @@
 name: Build nightly image
 on:
   workflow_dispatch:
+    # checkov:skip=CKV_GHA_7: "The build output cannot be affected by user parameters other than the build entry point and the top-level source location. GitHub Actions workflow_dispatch inputs MUST be empty. "
     inputs:
       ci_cloud:
         description: 'Select the CI_CLOUD'
@@ -13,6 +14,12 @@ on:
   # schedule:
   #   - cron: '0 0 * * *' # Run at midnight on default branch
 
+permissions:
+  contents: read
+  packages: write
+  # To report GitHub Actions status checks
+  statuses: write
+
 jobs:
   openstack:
     name: openstack-imagebuild
@@ -38,7 +45,7 @@ jobs:
       LEAFCLOUD_PULP_PASSWORD: ${{ secrets.LEAFCLOUD_PULP_PASSWORD }}
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
 
       - name: Record settings for CI cloud
         run: |
@@ -80,8 +87,8 @@
           packer init .
 
           PACKER_LOG=1 packer build \
-            -on-error=${{ vars.PACKER_ON_ERROR }} \
-            -var-file=$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl \
+            -on-error="${{ vars.PACKER_ON_ERROR }}" \
+            -var-file="$PKR_VAR_environment_root/${{ env.CI_CLOUD }}.pkrvars.hcl" \
             -var "source_image_name=${{ matrix.build.source_image_name }}" \
             -var "image_name=${{ matrix.build.image_name }}" \
             -var "image_name_version=" \
@@ -93,10 +100,10 @@
         run: |
           . venv/bin/activate
           IMAGE_ID=$(jq --raw-output '.builds[-1].artifact_id' packer/packer-manifest.json)
-          while ! openstack image show -f value -c name $IMAGE_ID; do
+          while ! openstack image show -f value -c name "$IMAGE_ID"; do
            sleep 5
           done
-          IMAGE_NAME=$(openstack image show -f value -c name $IMAGE_ID)
+          IMAGE_NAME=$(openstack image show -f value -c name "$IMAGE_ID")
           echo "image-name=${IMAGE_NAME}" >> "$GITHUB_OUTPUT"
           echo "image-id=$IMAGE_ID" >> "$GITHUB_OUTPUT"
 
@@ -141,7 +148,7 @@
       SOURCE_CLOUD: ${{ github.event.inputs.ci_cloud || vars.CI_CLOUD }}
       TARGET_CLOUD: ${{ matrix.target_cloud }}
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
 
       - name: Record settings for CI cloud
         run: |
@@ -153,7 +160,7 @@
           python3 -m venv venv
           . venv/bin/activate
           pip install -U pip
-          pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt)
+          pip install "$(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt)"
 
       - name: Write clouds.yaml
         run: |
diff --git a/.github/workflows/release-image.yml b/.github/workflows/release-image.yml
index cb5553b9c..4a9546436 100644
--- a/.github/workflows/release-image.yml
+++ b/.github/workflows/release-image.yml
@@ -6,6 +6,13 @@ on:
       - published # should work for both pre-releases and releases
 env:
   IMAGE_PATH: environments/.stackhpc/tofu/cluster_image.auto.tfvars.json
+
+permissions:
+  contents: read
+  packages: write
+  # To report GitHub Actions status checks
+  statuses: write
+
 jobs:
   ci-image-release:
     name: ci-image-release
@@ -18,7 +25,7 @@ jobs:
       - RL8
       - RL9
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
 
       - name: Write s3cmd configuration
         run: echo "${{ secrets.ARCUS_S3_CFG }}" > ~/.s3cfg
diff --git a/.github/workflows/s3-image-sync.yml b/.github/workflows/s3-image-sync.yml
index d5ba6afb9..17844e94e 100644
--- a/.github/workflows/s3-image-sync.yml
+++ b/.github/workflows/s3-image-sync.yml
@@ -10,6 +10,12 @@ env:
   S3_BUCKET: openhpc-images-prerelease
   IMAGE_PATH: environments/.stackhpc/tofu/cluster_image.auto.tfvars.json
 
+permissions:
+  contents: read
+  packages: write
+  # To report GitHub Actions status checks
+  statuses: write
+
 jobs:
   s3_cleanup:
     runs-on: ubuntu-22.04
@@ -17,7 +23,7 @@ jobs:
     strategy:
       fail-fast: false
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
 
       - name: Write s3cmd configuration
         run: |
@@ -50,7 +56,7 @@ jobs:
     outputs:
       ci_cloud: ${{ steps.ci.outputs.CI_CLOUD }}
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
 
       - name: Record which cloud CI is running on
         id: ci
@@ -62,7 +68,7 @@
           python3 -m venv venv
           . venv/bin/activate
           pip install -U pip
-          pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt)
+          pip install "$(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt)"
         shell: bash
 
       - name: Write clouds.yaml
@@ -138,7 +144,7 @@
       OS_CLOUD: openstack
       CI_CLOUD: ${{ matrix.cloud }}
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
 
       - name: Record which cloud CI is running on
         run: |
@@ -149,7 +155,7 @@
           python3 -m venv venv
           . venv/bin/activate
           pip install -U pip
-          pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt)
+          pip install "$(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt)"
         shell: bash
 
       - name: Write clouds.yaml
@@ -175,7 +181,7 @@
           image_hanging=$(openstack image list --name ${{ env.TARGET_IMAGE }} -f value -c ID -c Status | grep -v ' active$' | awk '{print $1}')
           if [ -n "$image_hanging" ]; then
             echo "Cleaning up OpenStack image with ID: $image_hanging"
-            openstack image delete $image_hanging
+            openstack image delete "$image_hanging"
           else
             echo "No image ID found, skipping cleanup."
           fi
diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml
index 173b4e797..ef81ae4fe 100644
--- a/.github/workflows/stackhpc.yml
+++ b/.github/workflows/stackhpc.yml
@@ -24,6 +24,13 @@ on:
       - '!.gitignore'
       - '!.github/workflows/'
       - '.github/workflows/stackhpc'
+
+permissions:
+  contents: read
+  packages: write
+  # To report GitHub Actions status checks
+  statuses: write
+
 jobs:
   openstack:
     name: openstack-ci
@@ -48,7 +55,7 @@
 
       - name: Find the latest release
         run: |
-          echo LATEST_RELEASE_TAG=$(curl -s https://api.github.com/repos/stackhpc/ansible-slurm-appliance/releases/latest | jq -r .tag_name) >> "$GITHUB_ENV"
+          echo "LATEST_RELEASE_TAG=$(curl -s https://api.github.com/repos/stackhpc/ansible-slurm-appliance/releases/latest | jq -r .tag_name)" >> "$GITHUB_ENV"
 
       - name: Checkout latest release
         uses: actions/checkout@v4
@@ -61,19 +68,19 @@
         run: |
           # Iterate over the labels
           labels=$(echo '${{ toJSON(github.event.pull_request.labels) }}' | jq -r '.[].name')
-          echo $labels
+          echo "$labels"
           for label in $labels; do
             if [[ $label == CI_CLOUD=* ]]; then
               # Extract the value after 'CI_CLOUD='
               CI_CLOUD_OVERRIDE=${label#CI_CLOUD=}
-              echo "CI_CLOUD=${CI_CLOUD_OVERRIDE}" >> $GITHUB_ENV
+              echo "CI_CLOUD=${CI_CLOUD_OVERRIDE}" >> "$GITHUB_ENV"
             fi
           done
 
       - name: Record debug info
         run: |
-          echo LATEST_RELEASE_TAG: $LATEST_RELEASE_TAG
-          echo CI_CLOUD: $CI_CLOUD
+          echo "LATEST_RELEASE_TAG: $LATEST_RELEASE_TAG"
+          echo "CI_CLOUD: $CI_CLOUD"
 
       - name: Setup ssh
         run: |
@@ -109,7 +116,7 @@
         run: |
           . venv/bin/activate
           . environments/.stackhpc/activate
-          echo vault_demo_user_password: "$DEMO_USER_PASSWORD" > $APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml
+          echo "vault_demo_user_password: $DEMO_USER_PASSWORD" > "$APPLIANCES_ENVIRONMENT_ROOT/inventory/group_vars/all/test_user.yml"
         env:
           DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
 
@@ -118,14 +125,14 @@
         run: |
           . venv/bin/activate
           . environments/.stackhpc/activate
-          cd $STACKHPC_TF_DIR
+          cd "$STACKHPC_TF_DIR"
           tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
 
       - name: Delete infrastructure if provisioning failed
         run: |
           . venv/bin/activate
           . environments/.stackhpc/activate
-          cd $STACKHPC_TF_DIR
+          cd "$STACKHPC_TF_DIR"
           tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
         if: failure() && steps.provision_servers.outcome == 'failure'
 
@@ -161,7 +168,7 @@
         run: |
           . venv/bin/activate
           . environments/.stackhpc/activate
-          cd $STACKHPC_TF_DIR
+          cd "$STACKHPC_TF_DIR"
           tofu init
           tofu apply -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars"
 
@@ -207,14 +214,14 @@
           # load ansible variables into shell:
           ansible-playbook ansible/ci/output_vars.yml \
             -e output_vars_hosts=openondemand \
-            -e output_vars_path=$APPLIANCES_ENVIRONMENT_ROOT/vars.txt \
+            -e output_vars_path="$APPLIANCES_ENVIRONMENT_ROOT/vars.txt" \
             -e output_vars_items=bastion_ip,bastion_user,openondemand_servername
-          source $APPLIANCES_ENVIRONMENT_ROOT/vars.txt
+          source "$APPLIANCES_ENVIRONMENT_ROOT/vars.txt"
 
           # setup ssh proxying:
           sudo apt-get --yes install proxychains
           echo proxychains installed
-          ssh -v -fN -D 9050 ${bastion_user}@${bastion_ip}
+          ssh -v -fN -D 9050 "${bastion_user}@${bastion_ip}"
           echo port 9050 forwarded
 
           # check OOD server returns 200:
@@ -224,9 +231,9 @@
             --server-response \
             --no-check-certificate \
             --http-user=demo_user \
-            --http-password=${DEMO_USER_PASSWORD} https://${openondemand_servername} \
+            --http-password="${DEMO_USER_PASSWORD}" "https://${openondemand_servername}" \
             2>&1)
-          (echo $statuscode | grep "200 OK") || (echo $statuscode && exit 1)
+          (echo "$statuscode" | grep "200 OK") || (echo "$statuscode" && exit 1)
         env:
           DEMO_USER_PASSWORD: ${{ secrets.TEST_USER_PASSWORD }}
 
@@ -236,14 +243,14 @@
           . environments/.stackhpc/activate
           if [ -n "$SNAPSHOT" ]
           then
-            echo Deleting $SNAPSHOT
-            openstack volume snapshot delete $SNAPSHOT
+            echo "Deleting $SNAPSHOT"
+            openstack volume snapshot delete "$SNAPSHOT"
           fi
 
       - name: Delete infrastructure
         run: |
           . venv/bin/activate
           . environments/.stackhpc/activate
-          cd $STACKHPC_TF_DIR
+          cd "$STACKHPC_TF_DIR"
           tofu destroy -auto-approve -var-file="${{ env.CI_CLOUD }}.tfvars" || echo "tofu failed in $STACKHPC_TF_DIR"
         if: ${{ success() || cancelled() }}
diff --git a/.github/workflows/trivyscan.yml b/.github/workflows/trivyscan.yml
index fe049e60d..5980a0b17 100644
--- a/.github/workflows/trivyscan.yml
+++ b/.github/workflows/trivyscan.yml
@@ -7,6 +7,12 @@ on:
     paths:
       - 'environments/.stackhpc/tofu/cluster_image.auto.tfvars.json'
 
+permissions:
+  contents: read
+  packages: write
+  # To report GitHub Actions status checks
+  statuses: write
+
 jobs:
   scan:
     concurrency:
@@ -23,19 +29,19 @@
       CI_CLOUD: ${{ vars.CI_CLOUD }}
 
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
 
       - name: Override CI_CLOUD if PR label is present
         if: ${{ github.event_name == 'pull_request' }}
         run: |
           # Iterate over the labels
           labels=$(echo '${{ toJSON(github.event.pull_request.labels) }}' | jq -r '.[].name')
-          echo $labels
+          echo "$labels"
           for label in $labels; do
             if [[ $label == CI_CLOUD=* ]]; then
               # Extract the value after 'CI_CLOUD='
               CI_CLOUD_OVERRIDE=${label#CI_CLOUD=}
-              echo "CI_CLOUD=${CI_CLOUD_OVERRIDE}" >> $GITHUB_ENV
+              echo "CI_CLOUD=${CI_CLOUD_OVERRIDE}" >> "$GITHUB_ENV"
             fi
           done
 
@@ -60,7 +66,7 @@
           python3 -m venv venv
           . venv/bin/activate
           pip install -U pip
-          pip install $(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt)
+          pip install "$(grep -o 'python-openstackclient[><=0-9\.]*' requirements.txt)"
         shell: bash
 
      - name: Write clouds.yaml
diff --git a/.github/workflows/upgrade-check.yml.sample b/.github/workflows/upgrade-check.yml.sample
index 39efcd8fe..eabe973f3 100644
--- a/.github/workflows/upgrade-check.yml.sample
+++ b/.github/workflows/upgrade-check.yml.sample
@@ -28,6 +28,13 @@ on:
   schedule:
     - cron: "0 9 * * *"
   workflow_dispatch:
+
+permissions:
+  contents: read
+  packages: write
+  # To report GitHub Actions status checks
+  statuses: write
+
 jobs:
   check_for_update:
     runs-on: ubuntu-22.04
diff --git a/.github/workflows/upload-release-image.yml.sample b/.github/workflows/upload-release-image.yml.sample
index 0b123bcf4..ec6f7009a 100644
--- a/.github/workflows/upload-release-image.yml.sample
+++ b/.github/workflows/upload-release-image.yml.sample
@@ -29,6 +29,12 @@ on:
       - openhpc-images
       # - openhpc-images-prerelease
 
+permissions:
+  contents: read
+  packages: write
+  # To report GitHub Actions status checks
+  statuses: write
+
 jobs:
   image_upload:
     runs-on: ubuntu-22.04
diff --git a/.gitignore b/.gitignore
index f2bf5d59c..e1720675a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@ venv
 *.pyc
 packer/openhpc2
 .vscode
+.ansible
diff --git a/.python-lint b/.python-lint
new file mode 100644
index 000000000..7fe8d51b8
--- /dev/null
+++ b/.python-lint
@@ -0,0 +1,6 @@
+[MESSAGES CONTROL]
+
+# There seems to be an issue with the check
+# https://github.com/pylint-dev/pylint/issues/214
+disable=
+  duplicate-code,
diff --git a/.shellcheckrc b/.shellcheckrc
new file mode 100644
index 000000000..454b8efab
--- /dev/null
+++ b/.shellcheckrc
@@ -0,0 +1,7 @@
+# Configuration file for shellcheck
+# https://github.com/koalaman/shellcheck/blob/master/shellcheck.1.md#rc-files
+
+# Unable to exclude *.sh.j2 files and the ansible parentheses upset shellcheck a lot.
+# Lines can be addressed individually with # shellcheck disable=SCxxxx but this gets quite prolific.
+# Disabling globally as we have more sh.j2 files than .sh
+disable=SC1009,SC1054,SC1064,SC1065,SC1072,SC1073,SC1083
diff --git a/README.md b/README.md
index 6ee964fef..0e75196d1 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,9 @@
-[![Test deployment and image build on OpenStack](https://github.com/stackhpc/ansible-slurm-appliance/actions/workflows/stackhpc.yml/badge.svg)](https://github.com/stackhpc/ansible-slurm-appliance/actions/workflows/stackhpc.yml)
-
 # StackHPC Slurm Appliance
 
+[![Test deployment and image build on OpenStack](https://github.com/stackhpc/ansible-slurm-appliance/actions/workflows/stackhpc.yml/badge.svg)](https://github.com/stackhpc/ansible-slurm-appliance/actions/workflows/stackhpc.yml)
+
 This repository contains playbooks and configuration to define a Slurm-based HPC environment. This includes:
+
 - [Rocky Linux](https://rockylinux.org/)-based hosts.
 - [OpenTofu](https://opentofu.org/) configurations to define the cluster's infrastructure-as-code.
 - Packages for Slurm and MPI software stacks from [OpenHPC](https://openhpc.community/).
@@ -22,18 +23,20 @@ While it is tested on OpenStack it should work on any cloud with appropriate Ope
 ## Demonstration Deployment
 
 The default configuration in this repository may be used to create a cluster to explore use of the appliance. It provides:
+
 - Persistent state backed by an OpenStack volume.
 - NFS-based shared file system backed by another OpenStack volume.
 
 It requires an OpenStack cloud, and an Ansible "deploy host" with access to that cloud. Before starting ensure that:
+
 - You have root access on the deploy host.
 - You can create instances from the [latest Slurm appliance image](https://github.com/stackhpc/ansible-slurm-appliance/releases), which already contains the required packages. This is built and tested in StackHPC's CI.
 - You have an SSH keypair defined in OpenStack, with the private key available on the deploy host.
 - Created instances have access to internet (note proxies can be setup through the appliance if necessary).
 - Created instances have accurate/synchronised time (for VM instances this is usually provided by the hypervisor; if not or for bare metal instances it may be necessary to configure a time service via the appliance).
-- Three security groups are present: ``default`` allowing intra-cluster communication, ``SSH`` allowing external access via SSH and ``HTTPS`` allowing access for Open OnDemand.
+- Three security groups are present: `default` allowing intra-cluster communication, `SSH` allowing external access via SSH and `HTTPS` allowing access for Open OnDemand.
 
 ### Setup deploy host
 
@@ -44,11 +47,13 @@ The following operating systems are supported for the deploy host:
 
 These instructions assume the deployment host is running Rocky Linux 8:
 
-    sudo yum install -y git python38
-    git clone https://github.com/stackhpc/ansible-slurm-appliance
-    cd ansible-slurm-appliance
-    git checkout ${latest-release-tag}
-    ./dev/setup-env.sh
+```shell
+sudo yum install -y git python38
+git clone https://github.com/stackhpc/ansible-slurm-appliance
+cd ansible-slurm-appliance
+git checkout ${latest-release-tag}
+./dev/setup-env.sh
+```
 
 You will also need to install [OpenTofu](https://opentofu.org/docs/intro/install/rpm/).
 
@@ -56,12 +61,16 @@
 Run the following from the repository root to activate the venv:
 
-    . venv/bin/activate
+```shell
+. venv/bin/activate
+```
 
 Use the `cookiecutter` template to create a new environment to hold your configuration:
 
-    cd environments
-    cookiecutter skeleton
+```shell
+cd environments
+cookiecutter skeleton
+```
 
 and follow the prompts to complete the environment name and description.
 
@@ -69,52 +78,59 @@
 
 Go back to the root folder and activate the new environment:
 
-    cd ..
-    . environments/$ENV/activate
+```shell
+cd ..
+. environments/$ENV/activate
+```
 
 And generate secrets for it:
 
-    ansible-playbook ansible/adhoc/generate-passwords.yml
+```shell
+ansible-playbook ansible/adhoc/generate-passwords.yml
+```
 
 ### Define and deploy infrastructure
 
 Create an OpenTofu variables file to define the required infrastructure, e.g.:
 
-    # environments/$ENV/tofu/terraform.tfvars:
-
-    cluster_name = "mycluster"
-    cluster_networks = [
-      {
-        network = "some_network" # *
-        subnet = "some_subnet" # *
-      }
-    ]
-    key_pair = "my_key" # *
-    control_node_flavor = "some_flavor_name"
-    login = {
-      # Arbitrary group name for these login nodes
-      interactive = {
-        nodes: ["login-0"]
-        flavor: "login_flavor_name" # *
-      }
-    }
-    cluster_image_id = "rocky_linux_9_image_uuid"
-    compute = {
-      # Group name used for compute node partition definition
-      general = {
-        nodes: ["compute-0", "compute-1"]
-        flavor: "compute_flavor_name" # *
-      }
-    }
+```text
+# environments/$ENV/tofu/terraform.tfvars:
+cluster_name = "mycluster"
+cluster_networks = [
+  {
+    network = "some_network" # *
+    subnet = "some_subnet" # *
+  }
+]
+key_pair = "my_key" # *
+control_node_flavor = "some_flavor_name"
+login = {
+  # Arbitrary group name for these login nodes
+  interactive = {
+    nodes: ["login-0"]
+    flavor: "login_flavor_name" # *
+  }
+}
+cluster_image_id = "rocky_linux_9_image_uuid"
+compute = {
+  # Group name used for compute node partition definition
+  general = {
+    nodes: ["compute-0", "compute-1"]
+    flavor: "compute_flavor_name" # *
+  }
+}
+```
 
 Variables marked `*` refer to OpenStack resources which must already exist. The above is a minimal configuration - for all variables and descriptions see `environments/$ENV/tofu/variables.tf`.
 
 To deploy this infrastructure, ensure the venv and the environment are [activated](#create-a-new-environment) and run:
 
-    export OS_CLOUD=openstack
-    cd environments/$ENV/tofu/
-    tofu init
-    tofu apply
+```shell
+export OS_CLOUD=openstack
+cd environments/$ENV/tofu/
+tofu init
+tofu apply
+```
 
 and follow the prompts. Note the OS_CLOUD environment variable assumes that OpenStack credentials are defined using a [clouds.yaml](https://docs.openstack.org/python-openstackclient/latest/configuration/index.html#clouds-yaml) file in a default location with the default cloud name of `openstack`.
@@ -122,11 +138,15 @@
 
 To configure the appliance, ensure the venv and the environment are [activated](#create-a-new-environment) and run:
 
-    ansible-playbook ansible/site.yml
+```shell
+ansible-playbook ansible/site.yml
+```
 
 Once it completes you can log in to the cluster using:
 
-    ssh rocky@$login_ip
+```shell
+ssh rocky@$login_ip
+```
 
 where the IP of the login node is given in `environments/$ENV/inventory/hosts.yml`
@@ -134,7 +154,7 @@
 
 - `environments/`: See [docs/environments.md](docs/environments.md).
 - `ansible/`: Contains the ansible playbooks to configure the infrastructure.
-- `packer/`: Contains automation to use Packer to build machine images for an environment - see the README in this directory for further information.
+- `packer/`: Contains automation to use Packer to build machine images for an environment - see the readme in this directory for further information.
 - `dev/`: Contains development tools.
 
 For further information see the [docs](docs/) directory.
To run the GitHub Actions linters locally, use: -```sh +```shell docker run --rm \ -e RUN_LOCAL=true \ --env-file "super-linter.env" \ @@ -151,7 +171,6 @@ docker run --rm \ ghcr.io/super-linter/super-linter:v7.3.0 ``` -```sh +```shell ansible-lint -c .ansible-lint.yml ansible/ ``` - diff --git a/ansible.cfg b/ansible.cfg index 09c5b9fb9..00efd536d 100644 --- a/ansible.cfg +++ b/ansible.cfg @@ -5,7 +5,7 @@ gathering = smart forks = 30 host_key_checking = False remote_tmp = /tmp -collections_path = ansible/collections +collections_path = .ansible/collections roles_path = ansible/roles filter_plugins = ansible/filter_plugins callbacks_enabled = ansible.posix.profile_tasks diff --git a/ansible/adhoc/backup-keytabs.yml b/ansible/adhoc/backup-keytabs.yml index 5566e48ac..a88daf71c 100644 --- a/ansible/adhoc/backup-keytabs.yml +++ b/ansible/adhoc/backup-keytabs.yml @@ -1,11 +1,12 @@ +--- # Use ONE of the following tags on this playbook: # - retrieve: copies keytabs out of the state volume to the environment # - deploy: copies keytabs from the environment to the state volume - hosts: freeipa_client - become: yes - gather_facts: no + become: true + gather_facts: false tasks: - - import_role: + - ansible.builtin.import_role: name: freeipa tasks_from: backup-keytabs.yml diff --git a/ansible/adhoc/cudatests.yml b/ansible/adhoc/cudatests.yml index 59af8568a..f571f8a89 100644 --- a/ansible/adhoc/cudatests.yml +++ b/ansible/adhoc/cudatests.yml @@ -1,8 +1,9 @@ +--- - hosts: cuda - become: yes - gather_facts: yes + become: true + gather_facts: true tags: cuda_samples tasks: - - import_role: + - ansible.builtin.import_role: name: cuda tasks_from: samples.yml diff --git a/ansible/adhoc/deploy-pulp.yml b/ansible/adhoc/deploy-pulp.yml index 2858d032b..a3cb402dd 100644 --- a/ansible/adhoc/deploy-pulp.yml +++ b/ansible/adhoc/deploy-pulp.yml @@ -1,26 +1,27 @@ +--- # Usage: ansible-playbook ansible/adhoc/deploy-pulp.yml -e "pulp_server=" - name: Add temporary pulp server host hosts: localhost tasks: - - ansible.builtin.add_host: - name: "{{ pulp_server }}" - group: "_pulp_host" + - ansible.builtin.add_host: + name: "{{ pulp_server }}" + group: "_pulp_host" - name: Install pulp on server and add to config - become: yes + become: true hosts: _pulp_host tasks: - - name: Install pulp - ansible.builtin.include_role: - name: pulp_site - tasks_from: install.yml - public: true + - name: Install pulp + ansible.builtin.include_role: + name: pulp_site + tasks_from: install.yml + public: true - - name: Print Pulp endpoint - become: no - debug: - msg: | - Server configured, override 'appliances_pulp_url' with - appliances_pulp_url: "http://{{ pulp_server }}:{{ pulp_site_port }}" - in your environments + - name: Print Pulp endpoint + become: false + ansible.builtin.debug: + msg: | + Server configured, override 'appliances_pulp_url' with + appliances_pulp_url: "http://{{ pulp_server }}:{{ pulp_site_port }}" + in your environments diff --git a/ansible/adhoc/generate-passwords.yml b/ansible/adhoc/generate-passwords.yml index 89c08f0ed..f9354f285 100644 --- a/ansible/adhoc/generate-passwords.yml +++ b/ansible/adhoc/generate-passwords.yml @@ -1,9 +1,8 @@ --- - - name: Generate passwords.yml hosts: localhost gather_facts: false tasks: - name: Include password generation role - include_role: - name: passwords \ No newline at end of file + ansible.builtin.include_role: + name: passwords diff --git a/ansible/adhoc/hpctests.yml b/ansible/adhoc/hpctests.yml index 6e733d340..5747e7c92 100644 --- a/ansible/adhoc/hpctests.yml +++ 
b/ansible/adhoc/hpctests.yml @@ -3,10 +3,9 @@ # Relies on installed packages in appliance defaults - see openhpc variables. --- - - hosts: hpctests[0] # TODO: might want to make which node is used selectable? become: false gather_facts: false tasks: - - import_role: + - ansible.builtin.import_role: name: hpctests diff --git a/ansible/adhoc/rebuild-via-slurm.yml b/ansible/adhoc/rebuild-via-slurm.yml index 4f7b5a576..33cbe5cc7 100644 --- a/ansible/adhoc/rebuild-via-slurm.yml +++ b/ansible/adhoc/rebuild-via-slurm.yml @@ -1,3 +1,4 @@ +--- # Rebuild compute nodes via slurm. # Nodes will be rebuilt if `image_id` in inventory is different to the # currently-provisioned image. Otherwise they are rebooted. @@ -9,9 +10,9 @@ - hosts: login run_once: true - gather_facts: no + gather_facts: false tasks: - name: Run slurm-controlled rebuild - import_role: + ansible.builtin.import_role: name: rebuild tasks_from: rebuild.yml diff --git a/ansible/adhoc/rebuild.yml b/ansible/adhoc/rebuild.yml index 9e7a3a770..b6033e43c 100644 --- a/ansible/adhoc/rebuild.yml +++ b/ansible/adhoc/rebuild.yml @@ -1,21 +1,24 @@ +--- # Rebuild hosts with a specified image from OpenStack. -# +# # Use ansible's -v output to see output. # Use --limit to control which hosts to rebuild (either specific hosts or the _ groups defining partitions). # Optionally, supply `-e rebuild_image=` to define a specific image, otherwise the current image is reused. # -# NOTE: If a hostvar `instance_id` is defined this is used to select hosts. Otherwise the hostname is used and this must be unique, which may not be the case e.g. if using identically-named staging and production hosts. +# NOTE: If a hostvar `instance_id` is defined this is used to select hosts. +# Otherwise the hostname is used and this must be unique, which may not be the case e.g. if using identically-named staging and production hosts. # # Example: # ansible-playbook -v --limit ohpc_compute ansible/adhoc/rebuild.yml -e rebuild_image=openhpc_v2.3 - hosts: cluster - become: no - gather_facts: no + become: false + gather_facts: false tasks: - - command: "openstack server rebuild {{ instance_id | default(inventory_hostname) }}{% if rebuild_image is defined %} --image {{ rebuild_image }}{% endif %}" + # yamllint disable-line rule:line-length + - ansible.builtin.command: "openstack server rebuild {{ instance_id | default(inventory_hostname) }}{% if rebuild_image is defined %} --image {{ rebuild_image }}{% endif %}" delegate_to: localhost - - wait_for_connection: + changed_when: false + - ansible.builtin.wait_for_connection: delay: 60 timeout: 600 - diff --git a/ansible/adhoc/restart-slurm.yml b/ansible/adhoc/restart-slurm.yml index 41b9dcb50..de837f5d3 100644 --- a/ansible/adhoc/restart-slurm.yml +++ b/ansible/adhoc/restart-slurm.yml @@ -1,3 +1,4 @@ +--- # Restart all slurm daemons e.g. after changing configuration. Note that: # - `scontrol reconfigure` will handle most reconfiguration - see https://slurm.schedmd.com/scontrol.html#OPT_reconfigure # for which options need a restart @@ -5,25 +6,25 @@ # restart daemons as required. 
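# For reference, a hedged usage sketch (playbook path from this repo; see the
# scontrol link above for which options actually require a restart):
#   ansible-playbook ansible/adhoc/restart-slurm.yml
# For most other configuration changes, running 'scontrol reconfigure' on the
# control node avoids a full daemon restart.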
- hosts: compute,login - become: yes - gather_facts: no + become: true + gather_facts: false tasks: - - service: + - ansible.builtin.service: name: slurmd state: stopped - hosts: control - become: yes - gather_facts: no + become: true + gather_facts: false tasks: - - service: + - ansible.builtin.service: name: slurmctld state: restarted - hosts: compute,login - become: yes - gather_facts: no + become: true + gather_facts: false tasks: - - service: + - ansible.builtin.service: name: slurmd state: started diff --git a/ansible/adhoc/sync-pulp.yml b/ansible/adhoc/sync-pulp.yml index b2cd9a8c4..a7c35bf63 100644 --- a/ansible/adhoc/sync-pulp.yml +++ b/ansible/adhoc/sync-pulp.yml @@ -1,3 +1,4 @@ +--- - hosts: localhost tasks: - ansible.builtin.include_role: diff --git a/ansible/adhoc/update-packages.yml b/ansible/adhoc/update-packages.yml index ae970ba0f..929b0da77 100644 --- a/ansible/adhoc/update-packages.yml +++ b/ansible/adhoc/update-packages.yml @@ -1,18 +1,20 @@ +--- - hosts: update - become: yes + become: true gather_facts: false tasks: - name: Update selected packages - yum: + ansible.builtin.dnf: name: "{{ update_name }}" state: "{{ update_state }}" exclude: "{{ update_exclude }}" disablerepo: "{{ update_disablerepo }}" register: updates - name: Log updated packages - copy: + ansible.builtin.copy: content: "{{ updates.results | join('\n') }}" dest: "{{ update_log_path }}" + mode: "0644" delegate_to: localhost - - debug: + - ansible.builtin.debug: msg: "{{ updates.results | length }} changes to packages - see {{ update_log_path }} for details" diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 3fd7f267d..cbf13bc6e 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -1,16 +1,15 @@ --- - - hosts: cluster gather_facts: false - become: yes + become: true tasks: - name: Check if ansible-init is installed - stat: + ansible.builtin.stat: path: /etc/systemd/system/ansible-init.service register: _stat_ansible_init_unitfile - + - name: Wait for ansible-init to finish - wait_for: + ansible.builtin.wait_for: path: /var/lib/ansible-init.done timeout: "{{ ansible_init_wait }}" # seconds when: _stat_ansible_init_unitfile.stat.exists @@ -21,7 +20,7 @@ tags: - deprecated tasks: - - fail: + - ansible.builtin.fail: msg: | Variables prefixed secrets_openhpc_* are deprecated - run: $ ansible-playbook ansible/adhoc/generate-passwords.yml @@ -29,34 +28,34 @@ when: "'secrets_openhpc_' in (hostvars[inventory_hostname] | join)" - hosts: resolv_conf - become: yes + become: true gather_facts: false tags: resolv_conf tasks: - - import_role: + - ansible.builtin.import_role: name: resolv_conf - hosts: etc_hosts gather_facts: false tags: etc_hosts - become: yes + become: true tasks: - - import_role: + - ansible.builtin.import_role: name: etc_hosts - hosts: proxy gather_facts: false tags: proxy - become: yes + become: true tasks: - - import_role: + - ansible.builtin.import_role: name: proxy - hosts: chrony tags: chrony - become: yes + become: true tasks: - - import_role: + - ansible.builtin.import_role: name: mrlesmithjr.chrony # skip install tasks as might not have network yet tasks_from: config_chrony.yml @@ -67,53 +66,55 @@ - hosts: cluster gather_facts: false - become: yes + become: true tasks: - name: Fix incorrect permissions on /etc in Rocky-9-GenericCloud-Base-9.4-20240523.0.x86_64.qcow2 # breaks munge - file: + ansible.builtin.file: path: /etc state: directory owner: root group: root mode: u=rwx,go=rx # has g=rwx - name: Prevent ssh hanging if shared home is unavailable - lineinfile: 
+ ansible.builtin.lineinfile: path: /etc/profile search_string: HOSTNAME=$(/usr/bin/hostnamectl --transient 2>/dev/null) || \ state: absent - name: Add system user groups - ansible.builtin.group: "{{ item.group }}" + ansible.builtin.group: + name: "{{ item.group }}" loop: "{{ appliances_local_users }}" when: - item.enable | default(true) | bool - "'group' in item" - become_method: "sudo" + become_method: "ansible.builtin.sudo" # Need to change working directory otherwise we try to switch back to non-existent directory. - become_flags: '-i' + become_flags: "-i" - name: Add system users - ansible.builtin.user: "{{ item.user }}" + ansible.builtin.user: + name: "{{ item.user }}" loop: "{{ appliances_local_users }}" when: item.enable | default(true) | bool - become_method: "sudo" + become_method: "ansible.builtin.sudo" # Need to change working directory otherwise we try to switch back to non-existent directory. - become_flags: '-i' + become_flags: "-i" - name: Reset ssh connection to allow user changes to affect ansible_user - meta: reset_connection - become: no + ansible.builtin.meta: reset_connection + become: false - hosts: systemd - become: yes + become: true gather_facts: false tags: systemd tasks: - name: Make systemd unit modifications - import_role: + ansible.builtin.import_role: name: systemd - hosts: selinux gather_facts: false - become: yes + become: true tags: - selinux tasks: @@ -125,36 +126,36 @@ - hosts: sshd tags: sshd - gather_facts: no - become: yes + gather_facts: false + become: true tasks: - name: Configure sshd - import_role: + ansible.builtin.import_role: name: sshd - hosts: dnf_repos - become: yes + become: true tasks: - - name: Check that creds won't be leaked to users - ansible.builtin.assert: - that: dnf_repos_password is undefined - fail_msg: Passwords should not be templated into repofiles during configure, unset 'dnf_repos_password' - when: - - appliances_mode == 'configure' - - not (dnf_repos_allow_insecure_creds | default(false)) # useful for development + - name: Check that creds won't be leaked to users + ansible.builtin.assert: + that: dnf_repos_password is undefined + fail_msg: Passwords should not be templated into repofiles during configure, unset 'dnf_repos_password' + when: + - appliances_mode == 'configure' + - not (dnf_repos_allow_insecure_creds | default(false)) # useful for development - hosts: cacerts tags: cacerts gather_facts: false tasks: - name: Install custom cacerts - import_role: + ansible.builtin.import_role: name: cacerts - hosts: squid tags: squid - gather_facts: yes - become: yes + gather_facts: true + become: true tasks: # - Installing squid requires working dnf repos # - Configuring dnf_repos itself requires working dnf repos to install epel @@ -165,27 +166,27 @@ tasks_from: set_repos.yml when: "'dnf_repos' in group_names" - name: Configure squid proxy - import_role: + ansible.builtin.import_role: name: squid - hosts: dnf_repos tags: dnf_repos - gather_facts: yes - become: yes + gather_facts: true + become: true tasks: - - name: Replace system repos with pulp repos - ansible.builtin.include_role: - name: dnf_repos - tasks_from: set_repos.yml + - name: Replace system repos with pulp repos + ansible.builtin.include_role: + name: dnf_repos + tasks_from: set_repos.yml # --- tasks after here require general access to package repos --- - hosts: tuned tags: tuned - gather_facts: yes - become: yes + gather_facts: true + become: true tasks: - name: Install and configure tuneD - import_role: + ansible.builtin.import_role: name: tuned - hosts: 
freeipa_server @@ -193,38 +194,38 @@ tags: - freeipa - freeipa_server - gather_facts: yes - become: yes + gather_facts: true + become: true tasks: - name: Install FreeIPA server - import_role: + ansible.builtin.import_role: name: freeipa tasks_from: server.yml - hosts: cluster gather_facts: false - become: yes + become: true tags: cockpit tasks: - - name: Remove RHEL cockpit - command: dnf -y remove cockpit-ws # N.B. using ansible dnf module is very slow + - name: Remove RHEL cockpit # noqa: no-changed-when + ansible.builtin.command: dnf -y remove cockpit-ws register: dnf_remove_output - ignore_errors: true # Avoid failing if a lock or other error happens + ignore_errors: true # Avoid failing if a lock or other error happens - hosts: firewalld gather_facts: false - become: yes + become: true tags: firewalld tasks: - - import_role: + - ansible.builtin.import_role: name: firewalld - hosts: fail2ban gather_facts: false - become: yes + become: true tags: fail2ban tasks: - - import_role: + - ansible.builtin.import_role: name: fail2ban - name: Setup podman @@ -232,95 +233,97 @@ hosts: podman tags: podman tasks: - - import_role: + - ansible.builtin.import_role: name: podman tasks_from: prereqs.yml tags: prereqs - - import_role: + - ansible.builtin.import_role: name: podman tasks_from: config.yml tags: config - hosts: update gather_facts: false - become: yes + become: true tags: - update tasks: - - block: - - name: Update selected packages - yum: - name: "{{ update_name }}" - state: "{{ update_state }}" - exclude: "{{ update_exclude }}" - disablerepo: "{{ update_disablerepo }}" - async: "{{ 30 * 60 }}" # wait for up to 30 minutes - poll: 15 # check every 15 seconds - register: updates - - name: Ensure update log directory on localhost exists - file: - path: "{{ update_log_path | dirname }}" - state: directory - become: false - delegate_to: localhost - run_once: true - - name: Log updated packages - copy: - content: "{{ updates.results | join('\n') }}" - dest: "{{ update_log_path }}" - delegate_to: localhost - become: no - - debug: - msg: "{{ updates.results | length }} changes to packages - see {{ update_log_path }} for details" - when: "update_enable | default('false') | bool" + - when: "update_enable | default('false') | bool" + block: + - name: Update selected packages + ansible.builtin.dnf: + name: "{{ update_name }}" + state: "{{ update_state }}" + exclude: "{{ update_exclude }}" + disablerepo: "{{ update_disablerepo }}" + async: "{{ 30 * 60 }}" # wait for up to 30 minutes + poll: 15 # check every 15 seconds + register: updates + - name: Ensure update log directory on localhost exists + ansible.builtin.file: + path: "{{ update_log_path | dirname }}" + state: directory + mode: "0755" + become: false + delegate_to: localhost + run_once: true # noqa: run-once[task] + - name: Log updated packages + ansible.builtin.copy: + content: "{{ updates.results | join('\n') }}" + dest: "{{ update_log_path }}" + mode: "0644" + delegate_to: localhost + become: false + - ansible.builtin.debug: + msg: "{{ updates.results | length }} changes to packages - see {{ update_log_path }} for details" - hosts: - selinux - update gather_facts: false - become: yes + become: true tags: - reboot - selinux - update tasks: - name: Check for pending reboot from package updates - command: + ansible.builtin.command: cmd: dnf needs-restarting -r register: update_reboot_required failed_when: "update_reboot_required.rc not in [0, 1]" changed_when: false - name: Reboot to cover SELinux state change or package upgrades - reboot: + 
ansible.builtin.reboot: post_reboot_delay: 30 when: (sestatus['reboot_required'] | default(false)) or (update_reboot_required.rc == 1) - name: Wait for hosts to be reachable - wait_for_connection: + ansible.builtin.wait_for_connection: sleep: 15 - name: Clear facts - meta: clear_facts + ansible.builtin.meta: clear_facts - name: Update facts - setup: + ansible.builtin.setup: - hosts: ofed - gather_facts: yes - become: yes + gather_facts: true + become: true tags: ofed tasks: - - include_role: + - ansible.builtin.include_role: name: ofed - hosts: ansible_init - gather_facts: yes - become: yes + gather_facts: true + become: true tags: linux_ansible_init tasks: - - include_role: + - ansible.builtin.include_role: name: azimuth_cloud.image_utils.linux_ansible_init - hosts: k3s:&builder - become: yes + become: true tags: k3s tasks: - name: Install k3s diff --git a/ansible/ci/check_eessi.yml b/ansible/ci/check_eessi.yml index 280f8658b..a72bd916a 100644 --- a/ansible/ci/check_eessi.yml +++ b/ansible/ci/check_eessi.yml @@ -5,20 +5,21 @@ eessi_test_rootdir: /home/eessi_test tasks: - name: Create test root directory - file: + ansible.builtin.file: path: "{{ eessi_test_rootdir }}" state: directory owner: "{{ ansible_user }}" group: "{{ ansible_user }}" + mode: "0755" become: true - - - name: Clone eessi-demo repo + + - name: Clone eessi-demo repo # noqa: latest[git] ansible.builtin.git: repo: "https://github.com/eessi/eessi-demo.git" dest: "{{ eessi_test_rootdir }}/eessi-demo" - name: Create batch script - copy: + ansible.builtin.copy: dest: "{{ eessi_test_rootdir }}/eessi-demo/TensorFlow/tensorflow.sh" content: | #!/usr/bin/env bash @@ -26,25 +27,26 @@ #SBATCH --error=%x.out source /cvmfs/pilot.eessi-hpc.org/latest/init/bash srun ./run.sh + mode: "0644" - - name: Run test job - ansible.builtin.shell: + - name: Run test job # noqa: no-changed-when + ansible.builtin.command: cmd: sbatch --wait tensorflow.sh chdir: "{{ eessi_test_rootdir }}/eessi-demo/TensorFlow" register: job_output - name: Retrieve job output - slurp: + ansible.builtin.slurp: src: "{{ eessi_test_rootdir }}/eessi-demo/TensorFlow/tensorflow.sh.out" register: _tensorflow_out no_log: true # as it's base64 encoded so useless - name: Show job output - debug: + ansible.builtin.debug: msg: "{{ _tensorflow_out.content | b64decode }}" - name: Fail if job output contains error - fail: + ansible.builtin.fail: # Note: Job prints live progress bar to terminal, so use regex filter to remove this from stdout - msg: "Test job using EESSI modules failed. Job output was: {{ job_output.stdout | regex_replace('\b', '') }}" + msg: "Test job using EESSI modules failed. Job output was: {{ job_output.stdout | regex_replace('\b', '') }}" when: '"Epoch 5/5" not in _tensorflow_out.content | b64decode' diff --git a/ansible/ci/check_grafana.yml b/ansible/ci/check_grafana.yml index 36fb78b72..0764b654d 100644 --- a/ansible/ci/check_grafana.yml +++ b/ansible/ci/check_grafana.yml @@ -1,15 +1,16 @@ +--- # Checks that Slurm jobs from hpctests are shown in Grafana. # Can't actually check the dashboard programmatically so this queries the datasource used by the dashboard instead.
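# For reference, the grafana_elasticsearch_query module used below (source under
# ansible/ci/library/) roughly amounts to two HTTP calls, sketched here with
# illustrative placeholders:
#   GET http://{{ grafana_api_address }}:{{ grafana_port }}/api/datasources                    # look up datasource id
#   GET http://{{ grafana_api_address }}:{{ grafana_port }}/api/datasources/proxy/<id>/filebeat-*/_search
# i.e. an Elasticsearch search proxied through Grafana's datasource API.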
- hosts: control # so proxying etc is irrelevant - gather_facts: no - become: no + gather_facts: false + become: false tasks: - name: Wait for slurm-stats file to exist (run by cron) ansible.builtin.wait_for: path: /var/log/slurm-stats/finished_jobs.json timeout: 315 # slurm stats cron job runs every 5 mins - + - name: Query grafana for expected hpctests jobs grafana_elasticsearch_query: grafana_url: http://{{ grafana_api_address }}:{{ grafana_port }} @@ -23,4 +24,5 @@ delay: 5 vars: _found_jobs: "{{ _slurm_stats_jobs.docs | map(attribute='JobName', default='(json error in slurmstats data)') }}" - _expected_jobs: ['pingpong.sh'] + _expected_jobs: + - "pingpong.sh" diff --git a/ansible/ci/check_sacct_hpctests.yml b/ansible/ci/check_sacct_hpctests.yml index 1ebbf2171..362860905 100644 --- a/ansible/ci/check_sacct_hpctests.yml +++ b/ansible/ci/check_sacct_hpctests.yml @@ -1,3 +1,4 @@ +--- - hosts: control gather_facts: false become: true @@ -7,13 +8,13 @@ 1,pingpong.sh,COMPLETED tasks: - name: Get info for ended jobs - shell: + ansible.builtin.command: cmd: sacct --format=jobid,jobname,state --allocations --parsable2 --delimiter=, --starttime=now-1days --endtime=now # by default start/end time is midnight/now which is not robust changed_when: false register: sacct - name: Check info for ended jobs - assert: + ansible.builtin.assert: that: sacct_stdout_expected in sacct.stdout fail_msg: | Expected: diff --git a/ansible/ci/check_slurm.yml b/ansible/ci/check_slurm.yml index ff527da06..45cda6c5b 100644 --- a/ansible/ci/check_slurm.yml +++ b/ansible/ci/check_slurm.yml @@ -1,9 +1,10 @@ +--- - hosts: login:!builder # won't have a slurm control daemon when in build - become: no + become: false gather_facts: false tasks: - name: Run sinfo - shell: 'sinfo --noheader --format="%N %P %a %l %D %t" | sort' # using --format ensures we control whitespace: Partition,partition_state,max_jobtime,num_nodes,node_state,node_name + ansible.builtin.shell: 'sinfo --noheader --format="%N %P %a %l %D %t" | sort' # noqa: risky-shell-pipe register: sinfo changed_when: false until: sinfo.stdout_lines == expected_sinfo diff --git a/ansible/ci/delete_images.yml b/ansible/ci/delete_images.yml index 78b5742fc..992fb8ecf 100644 --- a/ansible/ci/delete_images.yml +++ b/ansible/ci/delete_images.yml @@ -1,12 +1,12 @@ +--- - hosts: login:!builder - become: no - gather_facts: no + become: false + gather_facts: false tasks: - - import_tasks: get_image_ids.yml - - - name: Delete images - shell: + - ansible.builtin.import_tasks: get_image_ids.yml + - name: Delete images # noqa: no-changed-when + ansible.builtin.shell: cmd: | openstack image delete {{ item.artifact_id }} delegate_to: localhost - loop: "{{ manifest['builds'] }}" + loop: "{{ manifest['builds'] }}" # noqa: no-changed-when diff --git a/ansible/ci/get_image_ids.yml b/ansible/ci/get_image_ids.yml index 4a53b15dd..ede3a729c 100644 --- a/ansible/ci/get_image_ids.yml +++ b/ansible/ci/get_image_ids.yml @@ -1,12 +1,13 @@ +--- - name: Read packer build manifest - set_fact: + ansible.builtin.set_fact: manifest: "{{ lookup('file', manifest_path) | from_json }}" vars: manifest_path: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}/packer/packer-manifest.json" delegate_to: localhost - name: Get latest image builds - set_fact: + ansible.builtin.set_fact: login_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'login'}) | last }}" compute_build: "{{ manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'compute'}) | last }}" control_build: "{{ 
manifest['builds'] | selectattr('custom_data', 'eq', {'source': 'control'}) | last }}" diff --git a/ansible/ci/library/grafana_elasticsearch_query.py b/ansible/ci/library/grafana_elasticsearch_query.py index 3809565db..39fab5420 100644 --- a/ansible/ci/library/grafana_elasticsearch_query.py +++ b/ansible/ci/library/grafana_elasticsearch_query.py @@ -1,10 +1,16 @@ -#!/usr/bin/python +#!/usr/bin/python # pylint: disable=missing-module-docstring # Copyright: (c) 2022 Steve Brasier steve@stackhpc.com -from __future__ import (absolute_import, division, print_function) -__metaclass__ = type +from __future__ import absolute_import, division, print_function -DOCUMENTATION = r''' +import json + +import requests # pylint: disable=import-error +from ansible.module_utils.basic import AnsibleModule # pylint: disable=import-error + +__metaclass__ = type # pylint: disable=invalid-name + +DOCUMENTATION = r""" --- module: grafana_elasticsearch_query @@ -16,9 +22,9 @@ author: - Steve Brasier -''' +""" -EXAMPLES = r''' +EXAMPLES = r""" - name: Get elasticsearch hits grafana_elasticsearch_query: grafana_url: http://{{ grafana_api_address }}:{{ grafana_port }} @@ -26,63 +32,83 @@ grafana_password: "{{ vault_grafana_admin_password }}" datasource: slurmstats index_pattern: 'filebeat-*' -''' +""" -RETURN = r''' +RETURN = r""" # These are examples of possible return values, and in general should use other names for return values. docs: description: List of dicts with the original json in each document. returned: always type: list -''' - -from ansible.module_utils.basic import AnsibleModule -import requests -import json - -def run_module(): - module_args = dict( - grafana_url=dict(type="str", required=True), - grafana_username=dict(type="str", required=True), - grafana_password=dict(type="str", required=True), - datasource=dict(type="str", required=True), - index_pattern=dict(type="str", required=True), - ) +""" + + +def run_module(): # pylint: disable=missing-function-docstring + module_args = { + "grafana_url": { + "type": "str", + "required": True, + }, + "grafana_username": { + "type": "str", + "required": True, + }, + "grafana_password": { + "type": "str", + "required": True, + }, + "datasource": { + "type": "str", + "required": True, + }, + "index_pattern": { + "type": "str", + "required": True, + }, + } - result = dict( - changed=False, - jobs=[] - ) + result = { + "changed": False, + "jobs": [], + } module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) - auth=(module.params['grafana_username'], module.params['grafana_password']) - + auth = (module.params["grafana_username"], module.params["grafana_password"]) + # list datasources: - datasources_api_url = module.params["grafana_url"] + '/api/datasources' + datasources_api_url = module.params["grafana_url"] + "/api/datasources" r = requests.get(datasources_api_url, auth=auth) datasources = json.loads(r.text) # select required datasource: - ds = [s for s in datasources if s['name'] == module.params["datasource"]][0] + ds = [s for s in datasources if s["name"] == module.params["datasource"]][0] # get documents: - datasource_proxy_url = module.params["grafana_url"] + '/api/datasources/proxy/' + str(ds['id']) + '/' + module.params['index_pattern'] + '/_search' + datasource_proxy_url = ( + module.params["grafana_url"] + + "/api/datasources/proxy/" + + str(ds["id"]) + + "/" + + module.params["index_pattern"] + + "/_search" + ) r = requests.get(datasource_proxy_url, auth=auth) search = json.loads(r.text) - # see 
https://www.elastic.co/guide/en/elasticsearch/reference/current/search-search.html#search-api-response-body: - docs = [h['_source']['json'] for h in search['hits']['hits']] + # see + # https://www.elastic.co/guide/en/elasticsearch/reference/current/search-search.html#search-api-response-body: + docs = [h["_source"]["json"] for h in search["hits"]["hits"]] result = { - 'docs': docs, + "docs": docs, } module.exit_json(**result) -def main(): +def main(): # pylint: disable=missing-function-docstring run_module() -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/ansible/ci/output_vars.yml b/ansible/ci/output_vars.yml index 0e2bc4c61..2963a5848 100644 --- a/ansible/ci/output_vars.yml +++ b/ansible/ci/output_vars.yml @@ -1,7 +1,8 @@ +--- # Output specific hostvars to a file in a form which can be sourced by bash # NB: obviously the keys and values for the hostvars need to be suitable bash variables -- hosts: "{{ output_vars_hosts }}" - gather_facts: no +- hosts: "{{ output_vars_hosts }}" # noqa: syntax-check[specific] + gather_facts: false tasks: - copy: dest: "{{ output_vars_path }}" diff --git a/ansible/ci/retrieve_inventory.yml b/ansible/ci/retrieve_inventory.yml index d5f61bbd2..6e395ef37 100644 --- a/ansible/ci/retrieve_inventory.yml +++ b/ansible/ci/retrieve_inventory.yml @@ -1,27 +1,28 @@ +--- # Retrieve inventory from a deployed CI arcus environment by reversing arcus/inventory/hooks/pre.yml # Usage example: # ansible-playbook ansible/ci/retrieve_inventory.yml -e cluster_prefix=ci4005969475 # - hosts: localhost - become: no - gather_facts: no + become: false + gather_facts: false vars: cluster_prefix: "{{ undef(hint='cluster_prefix must be defined') }}" # e.g. ci4005969475 ci_vars_file: "{{ appliances_environment_root + '/tofu/' + lookup('env', 'CI_CLOUD') }}.tfvars" cluster_network: "{{ lookup('ansible.builtin.ini', 'cluster_net', file=ci_vars_file, type='properties') | trim('\"') }}" tasks: - name: Get control host IP - set_fact: + ansible.builtin.set_fact: control_ip: "{{ (lookup('pipe', 'openstack server show -f json ' + cluster_prefix + '-control') | from_json)['addresses'][cluster_network][0] }}" - name: Add host into in-memory inventory - add_host: + ansible.builtin.add_host: name: cluster_control groups: control ansible_host: "{{ control_ip }}" - hosts: control - become: yes - gather_facts: no + become: true + gather_facts: false tasks: - ansible.builtin.fetch: src: "/etc/ci-config/{{ item | basename }}" diff --git a/ansible/ci/update_timestamps.yml b/ansible/ci/update_timestamps.yml index e9a455a1e..88978e09a 100644 --- a/ansible/ci/update_timestamps.yml +++ b/ansible/ci/update_timestamps.yml @@ -1,6 +1,7 @@ +--- - hosts: localhost tasks: - - name: Get latest timestamps from sources + - name: Get latest timestamps from sources # noqa: syntax-check[unknown-module] # ansible/library/latest_timestamps.py latest_timestamps: repos_dict: "{{ appliances_pulp_repos }}" content_url: "https://ark.stackhpc.com/pulp/content" diff --git a/ansible/cleanup.yml b/ansible/cleanup.yml index 744f9b657..7543aa7ac 100644 --- a/ansible/cleanup.yml +++ b/ansible/cleanup.yml @@ -1,16 +1,17 @@ +--- # Clean up a Packer build VM -- meta: flush_handlers +- ansible.builtin.meta: flush_handlers -- name: Remove dnf caches - command: dnf clean all +- name: Remove dnf caches # noqa: no-changed-when + ansible.builtin.command: dnf clean all # If image build happens on a Neutron subnet with property dns_nameservers defined, then cloud-init # disables NetworkManager's control of
/etc/resolv.conf and appends nameservers itself. # We don't want network configuration during instance boot to depend on the configuration # of the network the builder was on, so we reset these aspects. - name: Delete /etc/resolv.conf - file: + ansible.builtin.file: path: /etc/resolv.conf state: absent when: "'resolv_conf' not in group_names" # if it's been overridden, deleting it is the wrong thing to do @@ -19,25 +20,25 @@ # NB: This *doesn't* delete the 90-dns-none.conf file created by the resolv_conf role # as if nameservers are explicitly being set by that role we don't want to allow NM # to override it again. - file: + ansible.builtin.file: path: /etc/NetworkManager/conf.d/99-cloud-init.conf state: absent - name: Get remote environment for ansible_user - setup: + ansible.builtin.setup: gather_subset: env - become: no + become: false - name: Delete any injected ssh config for ansible_user - file: + ansible.builtin.file: path: "{{ ansible_env.HOME }}/.ssh/" state: absent -- name: Run cloud-init cleanup - command: cloud-init clean --logs --seed +- name: Run cloud-init cleanup # noqa: no-changed-when + ansible.builtin.command: cloud-init clean --logs --seed -- name: Cleanup /tmp - command : rm -rf /tmp/* +- name: Cleanup /tmp # noqa: no-changed-when + ansible.builtin.command: rm -rf /tmp/* - name: Delete files triggering vulnerability scans ansible.builtin.file: @@ -54,10 +55,10 @@ - /etc/ansible-init/playbooks/roles/mrlesmithjr.chrony/requirements.txt - name: Get package facts - package_facts: + ansible.builtin.package_facts: - name: Ensure image summary directory exists - file: + ansible.builtin.file: path: /var/lib/image/ state: directory owner: root @@ -65,9 +66,10 @@ mode: u=rwX,go=rX - name: Write image summary - copy: + ansible.builtin.copy: content: "{{ image_info | to_nice_json }}" dest: /var/lib/image/image.json + mode: "0644" vars: image_info: branch: "{{ lookup('pipe', 'git rev-parse --abbrev-ref HEAD') }}" @@ -76,8 +78,8 @@ kernel: "{{ ansible_kernel }}" ofed: "{{ ansible_facts.packages['mlnx-ofa_kernel'].0.version | default('-') }}" doca: "{{ ansible_facts.packages[doca_profile | default('doca-ofed') ].0.version | default('-') }}" - cuda: "{{ ansible_facts.packages['cuda'].0.version | default('-') }}" + cuda: "{{ ansible_facts.packages['cuda'].0.version | default('-') }}" slurm-ohpc: "{{ ansible_facts.packages['slurm-ohpc'].0.version | default('-') }}" -- name: Show image summary - command: cat /var/lib/image/image.json +- name: Show image summary # noqa: no-changed-when + ansible.builtin.command: cat /var/lib/image/image.json diff --git a/ansible/disable-repos.yml b/ansible/disable-repos.yml index 3b68aee68..1a3223354 100644 --- a/ansible/disable-repos.yml +++ b/ansible/disable-repos.yml @@ -1,5 +1,6 @@ +--- - hosts: dnf_repos - become: yes + become: true tasks: - name: Disable pulp repos ansible.builtin.include_role: diff --git a/ansible/extras.yml b/ansible/extras.yml index c7cacb877..901c6da7d 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -1,5 +1,6 @@ +--- - hosts: k3s_server:!builder - become: yes + become: true tags: k3s tasks: - name: Start k3s server @@ -10,7 +11,7 @@ # technically should be part of bootstrap.yml but hangs waiting on failed mounts # if runs before filesystems.yml after the control node has been reimaged - hosts: k3s_agent:!builder - become: yes + become: true tags: k3s tasks: - name: Start k3s agents @@ -19,13 +20,13 @@ tasks_from: agent-runtime.yml - hosts: basic_users:!builder - become: yes + become: true tags: - basic_users - users -
gather_facts: yes + gather_facts: true tasks: - - import_role: + - ansible.builtin.import_role: name: basic_users - name: Setup EESSI @@ -35,16 +36,16 @@ gather_facts: false tasks: - name: Install and configure EESSI - import_role: + ansible.builtin.import_role: name: eessi - name: Setup CUDA hosts: cuda - become: yes - gather_facts: yes + become: true + gather_facts: true tags: cuda tasks: - - include_role: + - ansible.builtin.include_role: name: cuda tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'install.yml' }}" @@ -52,36 +53,35 @@ # Must be after filesystems.yml (for storage) # and before portal.yml (where OOD login node hostkeys are scanned) hosts: persist_hostkeys:!builder - become: yes - gather_facts: no + become: true + gather_facts: false tasks: - - import_role: + - ansible.builtin.import_role: name: persist_hostkeys - - name: Setup NFS export for compute node configuration hosts: compute_init:!builder # NB: has to be after eessi and os-manila-mount tags: compute_init - become: yes + become: true tasks: - - include_role: + - ansible.builtin.include_role: name: compute_init tasks_from: export.yml - name: Install k9s - become: yes + become: true hosts: k9s tags: k9s tasks: - - import_role: - name: k9s + - ansible.builtin.import_role: + name: k9s - hosts: extra_packages - become: yes + become: true tags: - - extra_packages + - extra_packages tasks: - - name: Install additional packages - dnf: - name: "{{ appliances_extra_packages }}" + - name: Install additional packages + ansible.builtin.dnf: + name: "{{ appliances_extra_packages }}" diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 0b4335b14..df74f29ba 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -1,13 +1,14 @@ +--- # Builder version of site.yml just installing binaries - hosts: builder - become: no - gather_facts: no + become: false + gather_facts: false tasks: - name: Report hostname (= final image name) - command: hostname + ansible.builtin.command: hostname # noqa: no-changed-when - name: Report inventory groups - debug: + ansible.builtin.debug: var: group_names - name: Run pre.yml hook @@ -20,21 +21,21 @@ - name: Sync pulp repos with upstream hosts: pulp tasks: - - ansible.builtin.include_role: - name: pulp_site - tasks_from: sync.yml - apply: - delegate_to: localhost - when: appliances_mode != 'configure' + - ansible.builtin.include_role: + name: pulp_site + tasks_from: sync.yml + apply: + delegate_to: localhost + when: appliances_mode != 'configure' - import_playbook: bootstrap.yml - hosts: doca - become: yes - gather_facts: yes + become: true + gather_facts: true tasks: - name: Install NVIDIA DOCA - import_role: + ansible.builtin.import_role: name: doca - name: Run post-bootstrap.yml hook @@ -45,33 +46,33 @@ when: hook_path | exists - hosts: builder - become: yes - gather_facts: yes + become: true + gather_facts: true tasks: # - import_playbook: iam.yml - name: Install FreeIPA client - import_role: + ansible.builtin.import_role: name: freeipa tasks_from: client-install.yml when: "'freeipa_client' in group_names" - name: Install sssd - import_role: + ansible.builtin.import_role: name: sssd tasks_from: install.yml when: "'sssd' in group_names" # - import_playbook: filesystems.yml: - name: Install nfs packages - dnf: + ansible.builtin.dnf: name: nfs-utils when: "'nfs' in group_names" - name: Install Manila client packages - include_role: + ansible.builtin.include_role: name: stackhpc.os-manila-mount tasks_from: install.yml when: "'manila' in group_names" - name: Install
Lustre packages - include_role: + ansible.builtin.include_role: name: lustre tasks_from: install.yml when: "'lustre' in group_names" @@ -82,41 +83,41 @@ - name: Install compute_init playbook hosts: compute_init tags: compute_init # tagged to allow running on cluster instances for dev - become: yes + become: true tasks: - - include_role: + - ansible.builtin.include_role: name: compute_init tasks_from: install.yml - name: Install gateway playbook hosts: gateway tags: gateway - become: yes - gather_facts: no + become: true + gather_facts: false tasks: - - include_role: + - ansible.builtin.include_role: name: gateway - hosts: builder - become: yes - gather_facts: yes + become: true + gather_facts: true tasks: # - import_playbook: slurm.yml: - name: Setup DB - include_role: + ansible.builtin.include_role: name: mysql tasks_from: install.yml when: "'mysql' in group_names" - name: OpenHPC - import_role: + ansible.builtin.import_role: name: stackhpc.openhpc tasks_from: install.yml when: "'openhpc' in group_names" # - import_playbook: portal.yml - name: Open Ondemand server (packages) - include_role: + ansible.builtin.include_role: name: osc.ood tasks_from: install-package.yml vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml" @@ -124,54 +125,54 @@ # # FUTURE: install-apps.yml - this is git clones - name: Open Ondemand server (apps) - include_role: + ansible.builtin.include_role: name: osc.ood tasks_from: install-apps.yml vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml" when: "'openondemand' in group_names" - name: Open Ondemand remote desktop - import_role: + ansible.builtin.import_role: name: openondemand tasks_from: vnc_compute.yml when: "'openondemand_desktop' in group_names" - name: Open Ondemand jupyter node - import_role: + ansible.builtin.import_role: name: openondemand tasks_from: jupyter_compute.yml when: "'openondemand_jupyter' in group_names" - name: Install Apache PAM module # Extracted from start of roles/openondemand/tasks/pam_auth.yml to ensure only installed during build - yum: + ansible.builtin.dnf: name: mod_authnz_pam # - import_playbook: monitoring.yml: - - import_role: + - ansible.builtin.import_role: name: opensearch tasks_from: install.yml when: "'opensearch' in group_names" # slurm_stats - nothing to do - - import_role: + - ansible.builtin.import_role: name: filebeat tasks_from: install.yml when: "'filebeat' in group_names" - - import_role: - # can't only run cloudalchemy.node_exporter/tasks/install.yml as needs vars from preflight.yml and triggers service start - # however starting node exporter is ok + - ansible.builtin.import_role: + # can't only run cloudalchemy.node_exporter/tasks/install.yml as needs vars from preflight.yml and triggers service start + # however starting node exporter is ok name: cloudalchemy.node_exporter when: "'node_exporter' in group_names" - - name: openondemand exporter - dnf: + - name: Openondemand exporter + ansible.builtin.dnf: name: ondemand_exporter when: "'openondemand' in group_names" - - name: slurm exporter - import_role: + - name: Slurm exporter + ansible.builtin.import_role: name: slurm_exporter tasks_from: install vars: @@ -179,29 +180,29 @@ when: "'slurm_exporter' in group_names" - name: Install alertmanager - include_role: + ansible.builtin.include_role: name: alertmanager tasks_from: install.yml when: "'alertmanager' in group_names" - hosts: prometheus - become: yes - gather_facts: yes + become: true + gather_facts: true tasks: - - import_role: + - ansible.builtin.import_role: name: 
cloudalchemy.prometheus tasks_from: preflight.yml # can't run cloudalchemy.prometheus/tasks/install.yml as it triggers a unit start # so below is a partial extraction of this: - - name: create prometheus system group - group: + - name: Create prometheus system group + ansible.builtin.group: name: prometheus system: true state: present - - name: create prometheus system user - user: + - name: Create prometheus system user + ansible.builtin.user: name: prometheus system: true shell: "/usr/sbin/nologin" @@ -209,31 +210,33 @@ createhome: false home: "{{ prometheus_db_dir }}" - - name: download prometheus binary to local folder + - name: Download prometheus binary to local folder become: false - get_url: + ansible.builtin.get_url: + # yamllint disable-line rule:line-length url: "https://github.com/prometheus/prometheus/releases/download/v{{ prometheus_version }}/prometheus-{{ prometheus_version }}.linux-{{ go_arch }}.tar.gz" dest: "/tmp/prometheus-{{ prometheus_version }}.linux-{{ go_arch }}.tar.gz" checksum: "sha256:{{ __prometheus_checksum }}" + mode: "0644" register: _download_archive until: _download_archive is succeeded retries: 5 delay: 2 - - name: unpack prometheus binaries + - name: Unpack prometheus binaries become: false - unarchive: - remote_src: yes + ansible.builtin.unarchive: + remote_src: true src: "/tmp/prometheus-{{ prometheus_version }}.linux-{{ go_arch }}.tar.gz" dest: "/tmp" creates: "/tmp/prometheus-{{ prometheus_version }}.linux-{{ go_arch }}/prometheus" - - name: propagate official prometheus and promtool binaries - copy: - remote_src: yes + - name: Propagate official prometheus and promtool binaries + ansible.builtin.copy: + remote_src: true src: "/tmp/prometheus-{{ prometheus_version }}.linux-{{ go_arch }}/{{ item }}" dest: "{{ _prometheus_binary_install_dir }}/{{ item }}" - mode: 0755 + mode: "0755" owner: root group: root with_items: @@ -241,12 +244,12 @@ - promtool - hosts: grafana - become: yes - gather_facts: yes + become: true + gather_facts: true tasks: - name: Include distribution variables for cloudalchemy.grafana - include_vars: "{{ appliances_repository_root }}/ansible/roles/cloudalchemy.grafana/vars/redhat.yml" - - import_role: + ansible.builtin.include_vars: "{{ appliances_repository_root }}/ansible/roles/cloudalchemy.grafana/vars/redhat.yml" + - ansible.builtin.import_role: name: cloudalchemy.grafana tasks_from: install.yml @@ -260,12 +263,11 @@ - import_playbook: disable-repos.yml - hosts: builder - become: yes - gather_facts: yes + become: true + gather_facts: true tags: finalise tasks: - name: Cleanup image - import_tasks: cleanup.yml - + ansible.builtin.import_tasks: cleanup.yml - name: Shutdown Packer VM community.general.shutdown: diff --git a/ansible/filesystems.yml b/ansible/filesystems.yml index 4665c0f8f..e9c0f689e 100644 --- a/ansible/filesystems.yml +++ b/ansible/filesystems.yml @@ -1,11 +1,10 @@ --- - - name: Setup block devices hosts: block_devices - become: yes + become: true tags: block_devices tasks: - - include_role: + - ansible.builtin.include_role: name: block_devices - name: Setup NFS @@ -14,7 +13,7 @@ tags: - nfs tasks: - - include_role: + - ansible.builtin.include_role: name: stackhpc.nfs - name: Setup Manila share mounts @@ -22,7 +21,7 @@ become: true tags: manila tasks: - - include_role: + - ansible.builtin.include_role: name: stackhpc.os-manila-mount - name: Setup Lustre clients @@ -30,7 +29,7 @@ become: true tags: lustre tasks: - - include_role: + - ansible.builtin.include_role: name: lustre # NB install is ONLY run in builder 
tasks_from: configure.yml diff --git a/ansible/filter_plugins/utils.py b/ansible/filter_plugins/utils.py index b5b92ed7e..d07d0b9fb 100644 --- a/ansible/filter_plugins/utils.py +++ b/ansible/filter_plugins/utils.py @@ -1,85 +1,89 @@ -#!/usr/bin/python +#!/usr/bin/python # pylint: disable=missing-module-docstring # Copyright: (c) 2020, StackHPC # Apache 2 License -from ansible.errors import AnsibleError, AnsibleFilterError -from ansible.utils.display import Display -from collections import defaultdict -import jinja2 -from ansible.module_utils.six import string_types import os.path import re +from collections import defaultdict + +from ansible.utils.display import Display # pylint: disable=import-error + def prometheus_node_exporter_targets(hosts, hostvars, env_key, group): - """ Return a mapping in cloudalchemy.nodeexporter prometheus_targets - format. + """Return a mapping in cloudalchemy.nodeexporter prometheus_targets + format. - hosts: list of inventory_hostnames - hostvars: Ansible hostvars variable - env_key: key to lookup in each host's hostvars to add as label 'env' (default: 'ungrouped') - group: string to add as label 'group' + hosts: list of inventory_hostnames + hostvars: Ansible hostvars variable + env_key: key to lookup in each host's hostvars to add as label 'env' (default: 'ungrouped') + group: string to add as label 'group' """ result = [] per_env = defaultdict(list) for host in hosts: - host_env = hostvars[host].get(env_key, 'ungrouped') + host_env = hostvars[host].get(env_key, "ungrouped") per_env[host_env].append(host) - for env, hosts in per_env.items(): + for env, hosts in per_env.items(): # pylint: disable=redefined-argument-from-local target = { "targets": [f"{target}:9100" for target in hosts], - "labels": { - 'env': env, - 'group': group - } + "labels": {"env": env, "group": group}, } result.append(target) return result -def readfile(fpath): + +def readfile(fpath): # pylint: disable=missing-function-docstring if not os.path.isfile(fpath): return "" - with open(fpath) as f: + with open(fpath) as f: # pylint: disable=unspecified-encoding return f.read() -def exists(fpath): + +def exists(fpath): # pylint: disable=missing-function-docstring return os.path.isfile(fpath) + def to_ood_regex(items): - """ Convert a list of strings possibly containing digits into a regex containing \d+ - - eg {{ [compute-001, compute-002, control] | to_regex }} -> '(compute-\d+)|(control)' + """Convert a list of strings possibly containing digits into a regex containing \\d+ + + eg {{ [compute-001, compute-002, control] | to_ood_regex }} -> '(compute-\\d+)|(control)' """ - + # NB: for python3.12+ the \d in this function & docstring - # need to be raw strings. See https://docs.python.org/3/reference/lexical_analysis.html + # need to be raw strings. See + # https://docs.python.org/3/reference/lexical_analysis.html # There's a python bug which means re.sub() can't use '\d' in the replacement so # have to do replacement in two stages: - r = [re.sub(r"\d+", 'XBACKSLASHX', v) for v in items] - r = [v.replace('XBACKSLASHX', '\d+') for v in set(r)] - r = ['(%s)' % v for v in r] - return '|'.join(r) + r = [re.sub(r"\d+", "XBACKSLASHX", v) for v in items] + r = [v.replace("XBACKSLASHX", "\\d+") for v in set(r)] + r = ["(%s)" % v for v in r] # pylint: disable=consider-using-f-string + return "|".join(r) + def appliances_repo_to_subpath(repo_entry): - """ Take an element from appliances_pulp_repos and convert it to a pulp path.
This assumes that the remote and local pulp structures are the same + """Take an element from appliances_pulp_repos and convert it to a pulp path. + This assumes that the remote and local pulp structures are the same """ - return repo_entry['path'] + '/' + repo_entry['timestamp'] + return repo_entry["path"] + "/" + repo_entry["timestamp"] + -class FilterModule(object): - ''' Ansible core jinja2 filters ''' +class FilterModule(object): # pylint: disable=useless-object-inheritance + """Ansible core jinja2 filters""" - def warn(self, message, **kwargs): + # pylint: disable=missing-function-docstring + def warn(self, message, **kwargs): # pylint: disable=unused-argument Display().warning(message) return message - def filters(self): + def filters(self): # pylint: disable=missing-function-docstring return { # jinja2 overrides - 'readfile': readfile, - 'prometheus_node_exporter_targets': prometheus_node_exporter_targets, - 'exists': exists, - 'warn': self.warn, - 'to_ood_regex': to_ood_regex, - 'appliances_repo_to_subpath': appliances_repo_to_subpath + "readfile": readfile, + "prometheus_node_exporter_targets": prometheus_node_exporter_targets, + "exists": exists, + "warn": self.warn, + "to_ood_regex": to_ood_regex, + "appliances_repo_to_subpath": appliances_repo_to_subpath, } diff --git a/ansible/iam.yml b/ansible/iam.yml index 857b8f840..a99bbfb1b 100644 --- a/ansible/iam.yml +++ b/ansible/iam.yml @@ -1,13 +1,14 @@ +--- - hosts: freeipa_client tags: - freeipa - freeipa_server # as this is only relevant if using freeipa_server - freeipa_host - gather_facts: no - become: yes + gather_facts: false + become: true tasks: - name: Ensure FreeIPA client hosts are added to the FreeIPA server - import_role: + ansible.builtin.import_role: name: freeipa tasks_from: addhost.yml when: groups['freeipa_server'] | length > 0 @@ -16,15 +17,15 @@ tags: - freeipa - freeipa_client - gather_facts: yes - become: yes + gather_facts: true + become: true tasks: - name: Install FreeIPA client - import_role: + ansible.builtin.import_role: name: freeipa tasks_from: client-install.yml - name: Enrol FreeIPA client - import_role: + ansible.builtin.import_role: name: freeipa tasks_from: enrol.yml @@ -33,19 +34,19 @@ - freeipa - freeipa_server - users - gather_facts: yes - become: yes + gather_facts: true + become: true tasks: - name: Add FreeIPA users - import_role: + ansible.builtin.import_role: name: freeipa tasks_from: users.yml - hosts: sssd - become: yes - gather_facts: no + become: true + gather_facts: false tags: sssd tasks: - name: Configure sssd - import_role: + ansible.builtin.import_role: name: sssd diff --git a/ansible/library/latest_timestamps.py b/ansible/library/latest_timestamps.py index 6407ef049..6da8445d2 100644 --- a/ansible/library/latest_timestamps.py +++ b/ansible/library/latest_timestamps.py @@ -1,25 +1,32 @@ -__metaclass__ = type +# pylint: disable=missing-module-docstring +import requests # pylint: disable=import-error +from ansible.module_utils.basic import AnsibleModule # pylint: disable=import-error +from bs4 import BeautifulSoup # pylint: disable=import-error, wrong-import-order -DOCUMENTATION = r''' +__metaclass__ = type # pylint: disable=invalid-name + +DOCUMENTATION = r""" --- module: latest_timestamps short_description: Gets the latest set of snapshots from Pulp version_added: "1.0.0" -description: Gets the latest set of snapshots from given source URLs and returns dictionary to replace 'appliances_repo_timestamps' with +description: > + Gets the latest set of snapshots from given source URLs 
+ and returns dictionary to replace 'appliances_repo_timestamps' with author: - William Tripp - Steve Brasier -''' +""" -EXAMPLES = r''' +EXAMPLES = r""" - name: Get latest timestamps latest_timestamps: repos_dict: "{{ appliances_repo_timestamp_sources }}" content_url: "https://ark.stackhpc.com/pulp/content" register: result -''' +""" -RETURN = r''' +RETURN = r""" latest_dict: description: Dictionary with updated timestamps type: dict @@ -28,49 +35,59 @@ description: List of repos that have updated timestamps type: str[] returned: always -''' - -from ansible.module_utils.basic import AnsibleModule -import requests -from bs4 import BeautifulSoup - -def run_module(): - module_args = dict( - repos_dict=dict(type='dict', required=True), - content_url=dict(type='str', required=True) - ) - - result = dict( - changed=False, - original_message='', - message='' - ) - - module = AnsibleModule( - argument_spec=module_args, - supports_check_mode=True - ) - - timestamps = dict(module.params['repos_dict']) +""" + + +def run_module(): # pylint: disable=missing-function-docstring + module_args = { + "repos_dict": { + "type": "dict", + "required": True, + }, + "content_url": { + "type": "str", + "required": True, + }, + } + + result = { + "changed": False, + "original_message": "", + "message": "", + } + + module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) + + timestamps = dict(module.params["repos_dict"]) for repo in timestamps: for version in timestamps[repo]: html_txt = requests.get( - url= module.params['content_url'] + '/' + timestamps[repo][version]['path'] - ).text - timestamp_link_list = BeautifulSoup(html_txt,features="html.parser").body.find('pre').find_all() # getting raw list of timestamps from html - timestamp_link_list = map(lambda x: x.string,timestamp_link_list) # stripping xml tags - latest_timestamp = list(timestamp_link_list)[-1][:-1] # last timestamp in list with trailing / removed - timestamps[repo][version]['timestamp'] = latest_timestamp - - result['timestamps'] = dict(sorted(timestamps.items())) + url=module.params["content_url"] + + "/" + + timestamps[repo][version]["path"] + ).text + timestamp_link_list = ( + BeautifulSoup(html_txt, features="html.parser") + .body.find("pre") + .find_all() + ) # getting raw list of timestamps from html + timestamp_link_list = map( + lambda x: x.string, timestamp_link_list + ) # stripping xml tags + latest_timestamp = list(timestamp_link_list)[-1][ + :-1 + ] # last timestamp in list with trailing / removed + timestamps[repo][version]["timestamp"] = latest_timestamp + + result["timestamps"] = dict(sorted(timestamps.items())) module.exit_json(**result) -def main(): +def main(): # pylint: disable=missing-function-docstring run_module() -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/ansible/library/user_namespace_facts.py b/ansible/library/user_namespace_facts.py index 022f63fb4..acedc6210 100644 --- a/ansible/library/user_namespace_facts.py +++ b/ansible/library/user_namespace_facts.py @@ -1,11 +1,18 @@ -#!/usr/bin/python +#!/usr/bin/python # pylint: disable=missing-module-docstring # Copyright: (c) 2020, Will Szumski -# GNU General Public License v3.0+ (see COPYING or https://www.gnu.org/licenses/gpl-3.0.txt) -from __future__ import (absolute_import, division, print_function) -__metaclass__ = type +# GNU General Public License v3.0+ (see COPYING or +# https://www.gnu.org/licenses/gpl-3.0.txt) +from __future__ import absolute_import, division, print_function -DOCUMENTATION = r''' +import csv 
+import os + +from ansible.module_utils.basic import AnsibleModule # pylint: disable=import-error + +__metaclass__ = type # pylint: disable=invalid-name + +DOCUMENTATION = r""" --- module: user_namespace_facts @@ -17,14 +24,14 @@ author: - Will Szumski (@jovial) -''' +""" -EXAMPLES = r''' +EXAMPLES = r""" - name: Return ansible_facts user_namespace_facts: -''' +""" -RETURN = r''' +RETURN = r""" # These are examples of possible return values, and in general should use other names for return values. ansible_facts: description: Facts to add to ansible_facts. @@ -41,20 +48,17 @@ type: str returned: always, empty dict if /etc/subgid doesn't exist sample: { "foo": {"size": 123, "start": 100000 }} -''' +""" -from ansible.module_utils.basic import AnsibleModule -import csv -import os -def parse(path): +def parse(path): # pylint: disable=missing-function-docstring result = {} if not os.path.exists(path): return result - with open(path) as f: - reader = csv.reader(f, delimiter=':') + with open(path) as f: # pylint: disable=unspecified-encoding + reader = csv.reader(f, delimiter=":") for row in reader: user = row[0] entry = { @@ -65,50 +69,43 @@ def parse(path): return result -def run_module(): + +def run_module(): # pylint: disable=missing-function-docstring # define available arguments/parameters a user can pass to the module - module_args = dict() + module_args = {} # seed the result dict in the object # we primarily care about changed and state # changed is if this module effectively modified the target # state will include any data that you want your module to pass back # for consumption, for example, in a subsequent task - result = dict( - changed=False, - ansible_facts=dict(), - ) + result = { + "changed": False, + "ansible_facts": {}, + } # the AnsibleModule object will be our abstraction working with Ansible # this includes instantiation, a couple of common attr would be the # args/params passed to the execution, as well as if the module # supports check mode - module = AnsibleModule( - argument_spec=module_args, - supports_check_mode=True - ) + module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) # manipulate or modify the state as needed (this is going to be the # part where your module will do what it needs to do) - result = { - 'ansible_facts': { - 'subuid': {}, - 'subgid': {} - } - } + result = {"ansible_facts": {"subuid": {}, "subgid": {}}} - result['ansible_facts']['subuid'] = parse('/etc/subuid') - result['ansible_facts']['subgid'] = parse('/etc/subgid') + result["ansible_facts"]["subuid"] = parse("/etc/subuid") + result["ansible_facts"]["subgid"] = parse("/etc/subgid") # in the event of a successful module execution, you will want to # simple AnsibleModule.exit_json(), passing the key/value results module.exit_json(**result) -def main(): +def main(): # pylint: disable=missing-function-docstring run_module() -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/ansible/monitoring.yml b/ansible/monitoring.yml index e97946212..dee7942e6 100644 --- a/ansible/monitoring.yml +++ b/ansible/monitoring.yml @@ -5,11 +5,11 @@ hosts: opensearch tags: opensearch tasks: - - import_role: + - ansible.builtin.import_role: name: opensearch tasks_from: install.yml become: true - - import_role: + - ansible.builtin.import_role: name: opensearch tasks_from: runtime.yml become: true @@ -18,21 +18,22 @@ hosts: slurm_stats tags: slurm_stats tasks: - - include_role: + - ansible.builtin.include_role: name: slurm_stats - name: Deploy filebeat hosts: filebeat tags:
filebeat tasks: - - import_role: + - ansible.builtin.import_role: name: filebeat - name: Deploy node_exporter hosts: node_exporter tags: node_exporter tasks: - - import_role: name=cloudalchemy.node_exporter + - ansible.builtin.import_role: + name: cloudalchemy.node_exporter - name: Deploy OpenOndemand exporter hosts: openondemand @@ -41,7 +42,7 @@ - openondemand - openondemand_server tasks: - - import_role: + - ansible.builtin.import_role: name: openondemand tasks_from: exporter.yml @@ -50,7 +51,7 @@ become: true tags: slurm_exporter tasks: - - import_role: + - ansible.builtin.import_role: name: slurm_exporter - name: Setup core monitoring software @@ -58,7 +59,7 @@ tags: prometheus tasks: - name: Check for existing prometheus binaries - stat: + ansible.builtin.stat: path: /usr/local/bin/{{ item }} register: prometheus_binaries loop: @@ -66,34 +67,34 @@ - promtool - name: Skip prometheus install if prometheus binaries exist and prometheus_version not defined # i.e. if prometheus_version isn't defined we don't care, so use what's already there - set_fact: - prometheus_skip_install: "{{ false if prometheus_version is defined else true }}" - when: "{{ (prometheus_binaries.results | map(attribute='stat') | map(attribute='exists')) + [prometheus_skip_install is not defined] }}" - - import_role: + ansible.builtin.set_fact: + prometheus_skip_install: "{{ false if prometheus_version is defined else true }}" + when: (prometheus_binaries.results | map(attribute='stat') | map(attribute='exists')) + [prometheus_skip_install is not defined] + - ansible.builtin.import_role: name: cloudalchemy.prometheus - name: Deploy grafana hosts: grafana tags: grafana tasks: - - assert: + - ansible.builtin.assert: that: vault_grafana_admin_password is defined fail_msg: "Must define vault_grafana_admin_password - use `ansible-playbook generate-passwords.yml` to generate a set of passwords" - - include_role: + - ansible.builtin.include_role: name: cloudalchemy.grafana vars: # We use internal roles to register the dashboards as the role does not support all options that we require. grafana_dashboards: [] - - import_role: # done in same play so it can use handlers from cloudalchemy.grafana + - ansible.builtin.import_role: # done in same play so it can use handlers from cloudalchemy.grafana name: grafana-dashboards - name: Deploy alertmanager hosts: alertmanager tags: alertmanager - become: yes + become: true gather_facts: false tasks: - name: Configure alertmanager - include_role: + ansible.builtin.include_role: name: alertmanager tasks_from: configure.yml diff --git a/ansible/noop.yml b/ansible/noop.yml index adad24813..4c1c5ea7f 100644 --- a/ansible/noop.yml +++ b/ansible/noop.yml @@ -1,5 +1,4 @@ --- - # This file exists so that we can conditionally import a playbook. 
The path # must exist, but we can use a when conditional so that it is not actually # run diff --git a/ansible/portal.yml b/ansible/portal.yml index 2aa646ae9..f391a2e16 100644 --- a/ansible/portal.yml +++ b/ansible/portal.yml @@ -1,11 +1,12 @@ +--- - hosts: openondemand tags: - openondemand - openondemand_server - become: yes - gather_facts: yes # TODO + become: true + gather_facts: true # TODO tasks: - - import_role: + - ansible.builtin.import_role: name: openondemand tasks_from: main.yml @@ -13,10 +14,10 @@ tags: - openondemand - openondemand_desktop - become: yes - gather_facts: yes + become: true + gather_facts: true tasks: - - import_role: + - ansible.builtin.import_role: name: openondemand tasks_from: vnc_compute.yml @@ -24,9 +25,9 @@ tags: - openondemand - openondemand_jupyter - become: yes - gather_facts: yes + become: true + gather_facts: true tasks: - - import_role: + - ansible.builtin.import_role: name: openondemand tasks_from: jupyter_compute.yml diff --git a/ansible/roles/alertmanager/README.md b/ansible/roles/alertmanager/README.md index 612761731..0e1c0aba4 100644 --- a/ansible/roles/alertmanager/README.md +++ b/ansible/roles/alertmanager/README.md @@ -5,6 +5,7 @@ to route Prometheus alerts to a receiver. Currently Slack is the only supported receiver. Note that: + - HA configuration is not supported - Alertmanager state is not preserved when the node it runs on (by default, control node) is reimaged, so any alerts silenced via the GUI will reoccur. @@ -15,6 +16,7 @@ Alertmanager is enabled by default on the `control` node in the `cookiecutter` uses for a new environment's `inventory/groups` file. In general usage may only require: + - Adding the `control` node into the `alertmanager` group in `environments/site/groups` if upgrading an existing environment. - Enabling the Slack integration (see section below). @@ -28,6 +30,7 @@ All variables are optional. See [defaults/main.yml](defaults/main.yml) for all default values. General variables: + - `alertmanager_version`: String, version (no leading 'v') - `alertmanager_download_checksum`: String, checksum for relevant version from [prometheus.io download page](https://prometheus.io/download/), in format @@ -46,14 +49,14 @@ The following variables are equivalent to similarly-named arguments to the `alertmanager` binary. See `man alertmanager` for more info: - `alertmanager_config_file`: String, path the main alertmanager config file - will be written to. Parent directory will be created if necessary. + will be written to. Parent directory will be created if necessary. - `alertmanager_web_config_file`: String, path alertmanager web config file - will be written to. Parent directory will be created if necessary. + will be written to. Parent directory will be created if necessary. - `alertmanager_storage_path`: String, base path for data storage. - `alertmanager_web_listen_addresses`: List of strings, defining addresses to listeen on. - `alertmanager_web_external_url`: String, the URL under which Alertmanager is - externally reachable - defaults to host IP address and `alertmanager_port`. - See man page for more details if proxying alertmanager. + externally reachable - defaults to host IP address and `alertmanager_port`. + See man page for more details if proxying alertmanager. - `alertmanager_data_retention`: String, how long to keep data for - `alertmanager_data_maintenance_interval`: String, interval between garbage collection and snapshotting to disk of the silences and the notification logs. 
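To make the role variables above concrete, here is a minimal sketch of an override file for a proxied Alertmanager, assuming a hypothetical `environments/site/inventory/group_vars/alertmanager.yml` (every value shown is illustrative, not an appliance default):

```yaml
# Illustrative overrides only; actual defaults are in
# ansible/roles/alertmanager/defaults/main.yml
alertmanager_web_external_url: "https://monitoring.example.com/alertmanager/"
alertmanager_web_listen_addresses:
  - ":{{ alertmanager_port }}"
alertmanager_data_retention: "240h" # keep silences/notification logs for 10 days
alertmanager_data_maintenance_interval: "30m"
```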
@@ -62,6 +65,7 @@ The following variables are equivalent to similarly-named arguments to the - `alertmanager_default_receivers`: The following variables are templated into the alertmanager [main configuration](https://prometheus.io/docs/alerting/latest/configuration/): + - `alertmanager_config_template`: String, path to configuration template. The default is to template in `alertmanager_config_default` and `alertmanager_config_extra`. - `alertmanager_config_default`: Mapping with default configuration for the @@ -73,24 +77,27 @@ The following variables are templated into the alertmanager [main configuration] - `alertmanager_extra_receivers`: A list of additional [receiver](https://prometheus.io/docs/alerting/), mappings to add, by default empty. - `alertmanager_slack_receiver`: Mapping defining the [Slack receiver](https://prometheus.io/docs/alerting/latest/configuration/#slack_config). Note the default configuration for this is in -`environments/common/inventory/group_vars/all/alertmanager.yml`. + `environments/common/inventory/group_vars/all/alertmanager.yml`. - `alertmanager_slack_receiver_name`: String, name for the above Slack reciever. - `alertmanager_slack_receiver_send_resolved`: Bool, whether to send resolved alerts via the above Slack reciever. -- `alertmanager_null_receiver`: Mapping defining a `null` [receiver](https://prometheus.io/docs/alerting/latest/configuration/#receiver) so a receiver is always defined. +- `alertmanager_null_receiver`: Mapping defining a `null` [receiver](https://prometheus.io/docs/alerting/latest/configuration/#receiver) so a receiver is always defined. - `alertmanager_config_extra`: Mapping with additional configuration. Keys in this become top-level keys in the configuration. E.g this might be: - ```yaml - alertmanager_config_extra: - global: - smtp_from: smtp.example.org:587 - time_intervals: - - name: monday-to-friday - time_intervals: - - weekdays: ['monday:friday'] - ``` + + ```yaml + alertmanager_config_extra: + global: + smtp_from: smtp.example.org:587 + time_intervals: + - name: monday-to-friday + time_intervals: + - weekdays: ['monday:friday'] + ``` + Note that `route` and `receivers` keys should not be added here. The following variables are templated into the alertmanager [web configuration](https://prometheus.io/docs/alerting/latest/https/): + - `alertmanager_web_config_default`: Mapping with default configuration for `basic_auth_users` providing the default web user. 
- `alertmanager_alertmanager_web_config_extra`: Mapping with additional web diff --git a/ansible/roles/alertmanager/defaults/main.yml b/ansible/roles/alertmanager/defaults/main.yml index b30301739..4b9099404 100644 --- a/ansible/roles/alertmanager/defaults/main.yml +++ b/ansible/roles/alertmanager/defaults/main.yml @@ -1,5 +1,6 @@ -alertmanager_version: '0.28.1' -alertmanager_download_checksum: 'sha256:5ac7ab5e4b8ee5ce4d8fb0988f9cb275efcc3f181b4b408179fafee121693311' +--- +alertmanager_version: "0.28.1" +alertmanager_download_checksum: "sha256:5ac7ab5e4b8ee5ce4d8fb0988f9cb275efcc3f181b4b408179fafee121693311" alertmanager_download_dest: /tmp/alertmanager.tar.gz alertmanager_binary_dir: /usr/local/bin alertmanager_started: true @@ -11,13 +12,13 @@ alertmanager_config_file: /etc/alertmanager/alertmanager.yml alertmanager_web_config_file: /etc/alertmanager/alertmanager-web.yml alertmanager_storage_path: /var/lib/alertmanager -alertmanager_port: '9093' +alertmanager_port: "9093" alertmanager_web_listen_addresses: - ":{{ alertmanager_port }}" -alertmanager_web_external_url: '' # defined in environments/common/inventory/group_vars/all/alertmanager.yml for visibility +alertmanager_web_external_url: "" # defined in environments/common/inventory/group_vars/all/alertmanager.yml for visibility -alertmanager_data_retention: '120h' -alertmanager_data_maintenance_interval: '15m' +alertmanager_data_retention: "120h" +alertmanager_data_maintenance_interval: "15m" alertmanager_config_flags: {} # other command-line parameters as shown by `man alertmanager` alertmanager_config_template: alertmanager.yml.j2 alertmanager_web_config_template: alertmanager-web.yml.j2 @@ -35,7 +36,7 @@ alertmanager_alertmanager_web_config_extra: {} # top-level only # app_creds: alertmanager_null_receiver: - name: 'null' + name: "null" alertmanager_slack_receiver: {} # defined in environments/common/inventory/group_vars/all/alertmanager.yml as it needs prometheus_address alertmanager_extra_receivers: [] alertmanager_default_receivers: "{{ [alertmanager_null_receiver] + ([alertmanager_slack_receiver] if alertmanager_slack_integration is defined else []) }}" @@ -43,7 +44,8 @@ alertmanager_receivers: "{{ alertmanager_default_receivers + alertmanager_extra alertmanager_config_default: route: - group_by: ['...'] + group_by: + - "..." 
receiver: "{{ alertmanager_slack_receiver_name if alertmanager_slack_integration is defined else 'null' }}" receivers: "{{ alertmanager_receivers }}" diff --git a/ansible/roles/alertmanager/handlers/main.yml b/ansible/roles/alertmanager/handlers/main.yml index ee87e1e3b..6e427a6b9 100644 --- a/ansible/roles/alertmanager/handlers/main.yml +++ b/ansible/roles/alertmanager/handlers/main.yml @@ -1,5 +1,6 @@ +--- - name: Restart alertmanager - systemd: + ansible.builtin.systemd: name: alertmanager state: restarted daemon_reload: "{{ _alertmanager_service.changed | default(false) }}" diff --git a/ansible/roles/alertmanager/tasks/configure.yml b/ansible/roles/alertmanager/tasks/configure.yml index a43ec2041..15f252fbe 100644 --- a/ansible/roles/alertmanager/tasks/configure.yml +++ b/ansible/roles/alertmanager/tasks/configure.yml @@ -1,3 +1,4 @@ +--- - name: Create alertmanager directories ansible.builtin.file: path: "{{ item }}" @@ -11,7 +12,7 @@ - "{{ alertmanager_storage_path }}" - name: Create alertmanager service file with immutable options - template: + ansible.builtin.template: src: alertmanager.service.j2 dest: /usr/lib/systemd/system/alertmanager.service owner: root @@ -38,10 +39,9 @@ mode: u=rw,go= notify: Restart alertmanager -- meta: flush_handlers - +- ansible.builtin.meta: flush_handlers - name: Ensure alertmanager service state - systemd: + ansible.builtin.systemd: name: alertmanager state: "{{ 'started' if alertmanager_started | bool else 'stopped' }}" enabled: "{{ alertmanager_enabled | bool }}" diff --git a/ansible/roles/alertmanager/tasks/install.yml b/ansible/roles/alertmanager/tasks/install.yml index 0f655da3d..f1cb9cd4b 100644 --- a/ansible/roles/alertmanager/tasks/install.yml +++ b/ansible/roles/alertmanager/tasks/install.yml @@ -1,3 +1,4 @@ +--- - name: Create alertmanager system user ansible.builtin.user: name: "{{ alertmanager_system_user }}" @@ -22,4 +23,6 @@ group: root mode: u=rwx,go=rx remote_src: true - extra_opts: ['--strip-components=1', '--show-stored-names'] + extra_opts: + - "--strip-components=1" + - "--show-stored-names" diff --git a/ansible/roles/basic_users/README.md b/ansible/roles/basic_users/README.md index 70ab1545a..23bea4cf2 100644 --- a/ansible/roles/basic_users/README.md +++ b/ansible/roles/basic_users/README.md @@ -1,9 +1,8 @@ - -basic_users -=========== +# basic_users Setup users on cluster nodes using `/etc/passwd` and manipulating `$HOME`, i.e. without requiring LDAP etc. Features: + - UID/GID is consistent across cluster (and explicitly defined). - SSH key generated and propagated to all nodes to allow login between cluster nodes. @@ -12,59 +11,56 @@ without requiring LDAP etc. Features: - When deleting users, systemd user sessions are terminated first. > [!IMPORTANT] The defaults for this role assumes that `$HOME` for users -managed by this role (e.g. not `rocky` and other system users) is on a shared -filesystem. The export of this shared filesystem may be root squashed if its -server is in the `basic_user` group - see configuration examples below. +> managed by this role (e.g. not `rocky` and other system users) is on a shared +> filesystem. The export of this shared filesystem may be root squashed if its +> server is in the `basic_user` group - see configuration examples below. -Role Variables --------------- +## Role Variables - `basic_users_homedir_server`: Optional inventory hostname in the `basic_users` group defining the host to use to create home directories. 
If the home - directory export is root squashed, this host *must* be the home directory + directory export is root squashed, this host _must_ be the home directory server. Default is the `control` node which is appropriate for the default appliance configuration. Not relevant if `create_home` is false for all users. - `basic_users_homedir_server_path`: Optional path prefix for home directories on - the `basic_users_homedir_server`, i.e. on the "server side". Default is - `/exports/home` which is appropriate for the default appliance configuration. + the `basic_users_homedir_server`, i.e. on the "server-side". Default is + `/exports/home` which is appropriate for the default appliance configuration. - `basic_users_homedir_client`: Optional inventory hostname in the `basic_users` - group defining the host to use to create ssh keys etc in home directories. + group defining the host to use to create SSH keys etc in home directories. This should be a host mounting the home directories. Default is the first node in the `login` group which is appropriate for the default appliance configuration. - `basic_users_users`: Optional, default empty list. A list of mappings defining - information for each user. In general, mapping keys/values are passed through - as parameters to [ansible.builtin.user](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/user_module.html) - and default values are as given there, with the following differences: + information for each user. In general, mapping keys/values are passed through + as parameters to [ansible.builtin.user](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/user_module.html) + and default values are as given there, with the following differences: - `generate_ssh_key`: Default is `true`, and the generated key is added to - the user's authorized keys. - - `ssh_key_comment`: Default is user name. - - `home`: Set automatically based on the user name and + the user's authorized keys. + - `ssh_key_comment`: Default is username. + - `home`: Set automatically based on the username and `basic_users_homedir_server_path`. Can be overriden for users with - non-standard home directory paths. + non-standard home directory paths. - `uid`: Should be set, so that the UID/GID is consistent across the cluster (which Slurm requires). - - `shell`: If *not* set will be `/sbin/nologin` on the `control` node to - prevent users logging in to this node, and the default shell on other - nodes. Explicitly setting this defines the shell for all nodes and if the - shared home directories are mounted on the control node will allow the - user to log in to the control node. + - `shell`: If _not_ set will be `/sbin/nologin` on the `control` node to + prevent users logging in to this node, and the default shell on other + nodes. Explicitly setting this defines the shell for all nodes and if the + shared home directories are mounted on the control node will allow the + user to log in to the control node. - `public_key`: Optional, define a key to log into the cluster with. - `sudo`: Optional, a (possibly multiline) string defining sudo rules for the - user. + user. - `ssh_key_type` defaults to `ed25519` instead of the `ansible.builtin.user` - default of `rsa`. + default of `rsa`. - Any other keys may present for other purposes (i.e. not used by this role). - `basic_users_groups`: Optional, default empty list. A list of mappings defining information for each group. 
Mapping keys/values are passed through as parameters to [ansible.builtin.group](https://docs.ansible.com/ansible/latest/collections/ansible/builtin/group_module.html) and default values are as given there. - `basic_users_override_sssd`: Optional bool, default false. Whether to disable `sssd` when ensuring users/groups exist with this role. Permits creating local users/groups even if they clash with users provided via sssd (e.g. from LDAP). Ignored if host is not in group `sssd` as well. Note with this option active `sssd` will be stopped and restarted each time this role is run. -Dependencies ------------- +## Dependencies None. -Example Configurations ----------------------- +## Example Configurations With default appliance NFS configuration, create user `alice` with access to all nodes except the control node, and delete user `bob`: @@ -83,9 +79,10 @@ basic_users_users: ``` Using an external share which: - - does not root squash (so this role can create directories on it) - - is mounted to all nodes including the control node (so this role can set - authorized keys there) + +- does not root squash (so this role can create directories on it) +- is mounted to all nodes including the control node (so this role can set + authorized keys there) Create user `Carol`: @@ -99,7 +96,7 @@ basic_users_user: public_key: ssh-ed25519 ... ``` -Using an external share which *does* root squash, so home directories cannot be +Using an external share which _does_ root squash, so home directories cannot be created by this role and must already exist, create user `Dan`: ```yaml diff --git a/ansible/roles/basic_users/defaults/main.yml b/ansible/roles/basic_users/defaults/main.yml index 7b24ef784..8b4b66b13 100644 --- a/ansible/roles/basic_users/defaults/main.yml +++ b/ansible/roles/basic_users/defaults/main.yml @@ -1,9 +1,10 @@ +--- basic_users_homedir_server: "{{ groups['control'] | first }}" # no way, generally, to find the nfs_server basic_users_homedir_server_path: /exports/home -basic_users_homedir_client: "{{ groups['login'] | first }}" +basic_users_homedir_client: "{{ groups['login'] | first }}" basic_users_userdefaults: state: present # need this here so don't have to add default() everywhere - generate_ssh_key: true + generate_ssh_key: true ssh_key_comment: "{{ item.name }}" ssh_key_type: ed25519 shell: "{{'/sbin/nologin' if 'control' in group_names else omit }}" diff --git a/ansible/roles/basic_users/filter_plugins/filter_keys.py b/ansible/roles/basic_users/filter_plugins/filter_keys.py index 119a430c4..12aa079fe 100644 --- a/ansible/roles/basic_users/filter_plugins/filter_keys.py +++ b/ansible/roles/basic_users/filter_plugins/filter_keys.py @@ -1,22 +1,27 @@ -""" Filter a dict to remove specified keys """ +"""Filter a dict to remove specified keys""" import copy -USER_MODULE_PARAMS = ('append authorization comment create_home createhome expires force generate_ssh_key group ' - 'groups hidden home local login_class move_home name user non_unique password password_expire_min ' - 'password_expire_max password_lock profile remove role seuser shell skeleton ssh_key_bits ' - 'ssh_key_comment ssh_key_file ssh_key_passphrase ssh_key_type state system uid update_password').split() +USER_MODULE_PARAMS = ( + "append authorization comment create_home createhome expires force generate_ssh_key group " + "groups hidden home local login_class move_home name user non_unique password " + "password_expire_min password_expire_max password_lock profile remove role seuser shell " + "skeleton ssh_key_bits ssh_key_comment 
ssh_key_file ssh_key_passphrase ssh_key_type state " + "system uid update_password" +).split() -class FilterModule(object): - def filters(self): - return { - 'filter_user_params': self.filter_user_params - } +class FilterModule( + object +): # pylint: disable=missing-class-docstring, useless-object-inheritance + + def filters(self): # pylint: disable=missing-function-docstring + return {"filter_user_params": self.filter_user_params} def filter_user_params(self, d): - ''' Return a copy of dict `d` containing only keys which are parameters for the user module''' - + # pylint: disable-next=line-too-long + """Return a copy of dict `d` containing only keys which are parameters for the user module""" + user_dict = copy.deepcopy(d) remove_keys = set(user_dict).difference(USER_MODULE_PARAMS) for key in remove_keys: diff --git a/ansible/roles/basic_users/library/terminate_user_sessions.py b/ansible/roles/basic_users/library/terminate_user_sessions.py index 711b3732f..0a53381b1 100644 --- a/ansible/roles/basic_users/library/terminate_user_sessions.py +++ b/ansible/roles/basic_users/library/terminate_user_sessions.py @@ -1,11 +1,14 @@ -#!/usr/bin/python +#!/usr/bin/python # pylint: disable=missing-module-docstring # Copyright: (c) 2021, Steve Brasier # Apache V2 licence -from __future__ import (absolute_import, division, print_function) -__metaclass__ = type +from __future__ import absolute_import, division, print_function -DOCUMENTATION = r''' +from ansible.module_utils.basic import AnsibleModule # pylint: disable=import-error + +__metaclass__ = type # pylint: disable=invalid-name + +DOCUMENTATION = r""" --- module: terminate_user_sessions @@ -22,54 +25,60 @@ description: Name of user required: true type: str - + author: - Steve Brasier (stackhpc.com) -''' +""" -EXAMPLES = r''' +EXAMPLES = r""" - terminate_user_sessions: name: fred -''' - -RETURN = r''' -''' +""" -from ansible.module_utils.basic import AnsibleModule +RETURN = r""" +""" -def run_module(): - # define available arguments/parameters a user can pass to the module - module_args = dict( - user=dict(type='str', required=True), - ) +def run_module(): # pylint: disable=missing-function-docstring + # define available arguments/parameters a user can pass to the module] + module_args = { + "user": { + "type": "str", + "required": True, + } + } - result = dict(changed=False) + result = { + "changed": False, + } - module = AnsibleModule( - argument_spec=module_args, - supports_check_mode=True - ) + module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) if module.check_mode: module.exit_json(**result) - _, sessions_stdout, _ = module.run_command("loginctl --no-legend list-sessions", check_rc=True) + _, sessions_stdout, _ = module.run_command( + "loginctl --no-legend list-sessions", check_rc=True + ) for line in sessions_stdout.splitlines(): session_info = line.split() user = session_info[1] session_id = session_info[0] - if user == module.params['user']: - _, sessions_stdout, _ = module.run_command("loginctl terminate-session %s" % session_id, check_rc=True) - result['changed'] = True - + if user == module.params["user"]: + _, sessions_stdout, _ = module.run_command( + # pylint: disable-next=consider-using-f-string + "loginctl terminate-session %s" % session_id, + check_rc=True, + ) + result["changed"] = True + # successful module exit: module.exit_json(**result) -def main(): +def main(): # pylint: disable=missing-function-docstring run_module() -if __name__ == '__main__': - main() \ No newline at end of file +if __name__ == 
"__main__": + main() diff --git a/ansible/roles/basic_users/tasks/main.yml b/ansible/roles/basic_users/tasks/main.yml index 6abba9cc0..4df4e78ef 100644 --- a/ansible/roles/basic_users/tasks/main.yml +++ b/ansible/roles/basic_users/tasks/main.yml @@ -9,7 +9,7 @@ - "item.state | default('present') == 'absent'" - name: Stop sssd if required - systemd: + ansible.builtin.systemd: name: sssd state: stopped register: _stop_sssd @@ -18,11 +18,13 @@ - basic_users_override_sssd | bool - name: Create groups - ansible.builtin.group: "{{ item }}" - loop: "{{ basic_users_groups }}" + ansible.builtin.group: + name: "{{ item }}" + loop: "{{ basic_users_groups }}" - name: Create users - user: "{{ basic_users_userdefaults | combine(item) | filter_user_params() | combine(_disable_homedir) }}" + ansible.builtin.user: + name: "{{ basic_users_userdefaults | combine(item) | filter_user_params() | combine(_disable_homedir) }}" loop: "{{ basic_users_users }}" loop_control: label: "{{ item.name }}" @@ -32,10 +34,11 @@ generate_ssh_key: false - name: Write sudo rules - blockinfile: + ansible.builtin.blockinfile: path: /etc/sudoers.d/80-{{ item.name }}-user block: "{{ item.sudo }}" create: true + mode: "0440" loop: "{{ basic_users_users }}" loop_control: label: "{{ item.name }}" @@ -44,10 +47,10 @@ - "'sudo' in item" - name: Restart sssd if required - systemd: + ansible.builtin.systemd: name: sssd state: started - when: _stop_sssd is changed + when: _stop_sssd is changed # noqa: no-handler # This task runs only on the home directory server so it can handle # root-squashed exports @@ -56,7 +59,7 @@ ansible.builtin.copy: remote_src: true src: "{{ item.skeleton | default('/etc/skel/') }}" - dest: "{{ item.home | default( basic_users_homedir_server_path + '/' + item.name ) }}" + dest: "{{ item.home | default(basic_users_homedir_server_path + '/' + item.name) }}" owner: "{{ item.name }}" group: "{{ item.name }}" mode: u=rwX,go= @@ -72,13 +75,13 @@ # paths are easily constructed, becoming each user so that root-squash # doesn't matter - name: Create ~/.ssh directories - file: + ansible.builtin.file: state: directory path: ~/.ssh/ owner: "{{ item.name }}" group: "{{ item.name }}" mode: u=rwX,go= - become_user: "{{ item.name }}" + # become_user: "{{ item.name }}" # Commenting out as become_user does not imply become: true loop: "{{ basic_users_users }}" loop_control: label: "{{ item.name }}" @@ -89,12 +92,12 @@ - name: Generate cluster ssh key community.crypto.openssh_keypair: - path: "{{ item.ssh_key_file | default('~/.ssh/id_' + _ssh_key_type )}}" # NB: ssh_key_file is from ansible.builtin.user + path: "{{ item.ssh_key_file | default('~/.ssh/id_' + _ssh_key_type) }}" # NB: ssh_key_file is from ansible.builtin.user type: "{{ _ssh_key_type }}" comment: "{{ item.ssh_key_comment | default(item.name) }}" vars: _ssh_key_type: "{{ item.ssh_key_type | default('ed25519') }}" - become_user: "{{ item.name }}" + # become_user: "{{ item.name }}" # Commenting out as become_user does not imply become: true loop: "{{ basic_users_users }}" loop_control: label: "{{ item.name }}" @@ -111,7 +114,7 @@ manage_dir: false key: "{{ item.public_key }}" path: ~/.ssh/authorized_keys - become_user: "{{ item.item.name }}" + # become_user: "{{ item.item.name }}" # Commenting out as become_user does not imply become: true loop: "{{ _cluster_ssh_keypair.results }}" loop_control: label: "{{ item.item.name }}" @@ -128,7 +131,7 @@ manage_dir: false key: "{{ item.public_key }}" path: ~/.ssh/authorized_keys - become_user: "{{ item.name }}" + # become_user: "{{ 
item.name }}" # Commenting out as become_user does not imply become: true loop: "{{ basic_users_users }}" loop_control: label: "{{ item.name }}" diff --git a/ansible/roles/block_devices/README.md b/ansible/roles/block_devices/README.md index d3dad63bf..e3280979d 100644 --- a/ansible/roles/block_devices/README.md +++ b/ansible/roles/block_devices/README.md @@ -1,9 +1,9 @@ -block_devices -============= +# block_devices Manage filesystems on block devices (such as OpenStack volumes), including creating partitions, creating filesystems and mounting filesystems. This is a convenience wrapper around the ansible modules: + - community.general.parted - community.general.filesystem - ansible.buildin.file @@ -15,13 +15,11 @@ To avoid issues with device names changing after e.g. reboots, devices are ident [^1]: See `environments/common/inventory/group_vars/builder/defaults.yml` -Requirements ------------- +## Requirements N/A. -Role Variables --------------- +## Role Variables - `block_devices_partition_state`: Optional. Partition state, 'present' or 'absent' (as for parted) or 'skip'. Defaults to 'present'. - `block_devices_serial`: Required. Serial number of block device. For an OpenStack volume this is the volume ID. @@ -36,20 +34,18 @@ Role Variables Multiple NFS client/server configurations may be provided by defining `block_devices_configurations`. This should be a list of mappings with keys/values are as per the variables above without the `block_devices_` prefix. Omitted keys/values are filled from the corresponding variable. -Dependencies ------------- +## Dependencies See top of page. -Example Playbook ----------------- +## Example Playbook ```yaml - hosts: servers become: true tasks: - - include_role: - name: block_devices + - include_role: + name: block_devices ``` The example variables below create an `ext4` partition on `/dev/sdb1` and mount it as `/mnt/files` with the default owner/group: @@ -71,12 +67,10 @@ block_devices_configurations: path: /mnt/files ``` -License -------- +## License Apache V2 -Author Information ------------------- +## Author Information stackhpc.com diff --git a/ansible/roles/block_devices/defaults/main.yml b/ansible/roles/block_devices/defaults/main.yml index 0f997bfe8..1a9da7bda 100644 --- a/ansible/roles/block_devices/defaults/main.yml +++ b/ansible/roles/block_devices/defaults/main.yml @@ -1,9 +1,11 @@ -block_devices_configurations: [{}] +--- +block_devices_configurations: + - {} block_devices_partition_state: present # 'present', 'absent' (as for parted) or 'skip' block_devices_device: # Path to block device, e.g. '/dev/sda'. See community.general.parted:device and community.general.filesystem:dev block_devices_number: # Partition number, e.g 1 for /dev/sda1. See community.general.parted:number block_devices_fstype: # Filesystem type, e.g. e.g. 'ext4'. See community.general.filesystem:fstype -block_devices_resizefs: no # Grow filesystem into block device space (yes or no). See community.general.filesystem:resizefs +block_devices_resizefs: false # Grow filesystem into block device space (yes or no). See community.general.filesystem:resizefs block_devices_filesystem_state: present # 'present', 'absent' (as for community.general.filesystem:state) or 'skip' block_devices_path: # Path to mount point, e.g. 
'/mnt/files' block_devices_mount_state: mounted # Mount state, see ansible.posix.mount:state diff --git a/ansible/roles/block_devices/library/block_devices.py b/ansible/roles/block_devices/library/block_devices.py index ac34f2bbb..47496f2f3 100644 --- a/ansible/roles/block_devices/library/block_devices.py +++ b/ansible/roles/block_devices/library/block_devices.py @@ -1,9 +1,13 @@ -#!/usr/bin/python +#!/usr/bin/python # pylint: disable=missing-module-docstring # Copyright: (c) 2021, StackHPC # Apache 2 License -DOCUMENTATION = r''' +import json + +from ansible.module_utils.basic import AnsibleModule # pylint: disable=import-error + +DOCUMENTATION = r""" --- module: block_devices @@ -13,32 +17,30 @@ author: - Steve Brasier (@sjpb) -''' +""" -RETURN = r''' +RETURN = r""" devices: description: dict with device serial numbers as keys and full paths (e.g. /dev/sdb) as values type: dict return: always -''' +""" -import json -from ansible.module_utils.basic import AnsibleModule - -def run_module(): - module_args = dict() +def run_module(): # pylint: disable=missing-function-docstring + module_args = {} module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) result = {"changed": False} _, stdout, _ = module.run_command("lsblk --paths --json -O", check_rc=True) - - device_info = json.loads(stdout)['blockdevices'] - result['devices'] = dict((item['serial'], item['name']) for item in device_info) + + device_info = json.loads(stdout)["blockdevices"] + result["devices"] = dict((item["serial"], item["name"]) for item in device_info) module.exit_json(**result) -def main(): + +def main(): # pylint: disable=missing-function-docstring run_module() -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/ansible/roles/block_devices/tasks/main.yml b/ansible/roles/block_devices/tasks/main.yml index efaec3ca1..4ce7925f9 100644 --- a/ansible/roles/block_devices/tasks/main.yml +++ b/ansible/roles/block_devices/tasks/main.yml @@ -1,5 +1,6 @@ +--- - name: Warn role is deprecated - debug: + ansible.builtin.debug: msg: "{{ 'Role block_devices is deprecated, see ansible/roles/block_devices/README.md' | warn }}" when: block_devices_configurations | length > 0 @@ -7,18 +8,18 @@ block_devices: register: _block_devices -- name: Create partitions - parted: +- name: Create partitions + community.general.parted: device: "{{ _device }}" number: "{{ item.get('number', block_devices_number) }}" state: "{{ item.get('partition_state', block_devices_partition_state) }}" when: "item.get('partition_state', block_devices_partition_state) != 'skip'" loop: "{{ block_devices_configurations }}" vars: - _device: "{{ _block_devices.devices[ item.get('serial', block_devices_serial) ] }}" + _device: "{{ _block_devices.devices[item.get('serial', block_devices_serial)] }}" - name: Create filesystems - filesystem: + community.general.filesystem: fstype: "{{ item.get('fstype', block_devices_fstype) }}" dev: "{{ _device }}{{ item.get('number', block_devices_number) }}" resizefs: "{{ item.get('resizefs', block_devices_resizefs) }}" @@ -26,26 +27,27 @@ when: "item.get('filesystem_state', block_devices_filesystem_state) != 'skip'" loop: "{{ block_devices_configurations }}" vars: - _device: "{{ _block_devices.devices[ item.get('serial', block_devices_serial) ] }}" + _device: "{{ _block_devices.devices[item.get('serial', block_devices_serial)] }}" - name: Get filesystem UUIDs - command: + ansible.builtin.command: cmd: "lsblk {{ _device }}{{ item.get('number', block_devices_number) }} --noheadings --output UUID" 
loop: "{{ block_devices_configurations }}" vars: - _device: "{{ _block_devices.devices[ item.get('serial', block_devices_serial) ] }}" + _device: "{{ _block_devices.devices[item.get('serial', block_devices_serial)] }}" register: block_devices_uuids changed_when: false - check_mode: no + check_mode: false - name: Ensure mount point exists - file: + ansible.builtin.file: path: "{{ item.get('path', block_devices_path) }}" state: directory + mode: "0755" loop: "{{ block_devices_configurations }}" - name: Mount filesystems by UUID - mount: + ansible.posix.mount: path: "{{ item.get('path', block_devices_path) }}" src: "UUID={{ _uuid }}" fstype: "{{ item.get('fstype', block_devices_fstype) }}" @@ -57,10 +59,11 @@ index_var: block_devices_idx - name: Set owner/group for mounted directory - file: + ansible.builtin.file: path: "{{ item.get('path', block_devices_path) }}" state: directory owner: "{{ item.get('owner', block_devices_owner) | default(omit) }}" group: "{{ item.get('group', block_devices_group) | default(omit) }}" + mode: "0755" when: "item.get('owner', block_devices_owner) or item.get('group', block_devices_group)" loop: "{{ block_devices_configurations }}" diff --git a/ansible/roles/cacerts/defaults/main.yml b/ansible/roles/cacerts/defaults/main.yml index c1f940f6a..d53992a97 100644 --- a/ansible/roles/cacerts/defaults/main.yml +++ b/ansible/roles/cacerts/defaults/main.yml @@ -1,3 +1,4 @@ -#cacerts_dest_dir: /etc/pki/ca-trust/source/anchors/ +--- +# cacerts_dest_dir: /etc/pki/ca-trust/source/anchors/ cacerts_cert_dir: "{{ appliances_environment_root }}/cacerts" cacerts_update: true diff --git a/ansible/roles/cacerts/tasks/configure.yml b/ansible/roles/cacerts/tasks/configure.yml index 5001f44ef..a23f27502 100644 --- a/ansible/roles/cacerts/tasks/configure.yml +++ b/ansible/roles/cacerts/tasks/configure.yml @@ -1,16 +1,15 @@ --- - - name: Copy all certificates - copy: + ansible.builtin.copy: src: "{{ item }}" dest: /etc/pki/ca-trust/source/anchors/ owner: root group: root - mode: 0644 + mode: "0644" with_fileglob: - "{{ cacerts_cert_dir }}/*" become: true -- name: Update trust store - command: update-ca-trust extract +- name: Update trust store # noqa: no-changed-when + ansible.builtin.command: update-ca-trust extract become: true diff --git a/ansible/roles/cacerts/tasks/export.yml b/ansible/roles/cacerts/tasks/export.yml index c9c64713b..8e036a156 100644 --- a/ansible/roles/cacerts/tasks/export.yml +++ b/ansible/roles/cacerts/tasks/export.yml @@ -1,10 +1,11 @@ +--- - name: Copy cacerts from deploy host to /exports/cluster/cacerts/ - copy: + ansible.builtin.copy: src: "{{ item }}" dest: /exports/cluster/cacerts/ owner: slurm group: root - mode: 0644 + mode: "0644" with_fileglob: - "{{ cacerts_cert_dir }}/*" delegate_to: "{{ groups['control'] | first }}" diff --git a/ansible/roles/cacerts/tasks/main.yml b/ansible/roles/cacerts/tasks/main.yml index 84f493457..ec83d2b16 100644 --- a/ansible/roles/cacerts/tasks/main.yml +++ b/ansible/roles/cacerts/tasks/main.yml @@ -1 +1,2 @@ -- import_tasks: configure.yml +--- +- ansible.builtin.import_tasks: configure.yml diff --git a/ansible/roles/cluster_infra/defaults/main.yml b/ansible/roles/cluster_infra/defaults/main.yml index f2f9637b9..3b1f6c798 100644 --- a/ansible/roles/cluster_infra/defaults/main.yml +++ b/ansible/roles/cluster_infra/defaults/main.yml @@ -1,2 +1,3 @@ +--- ansible_init_collections: [] ansible_init_playbooks: [] diff --git a/ansible/roles/cluster_infra/tasks/main.yml b/ansible/roles/cluster_infra/tasks/main.yml index 
f62c2579d..874c5d18f 100644 --- a/ansible/roles/cluster_infra/tasks/main.yml +++ b/ansible/roles/cluster_infra/tasks/main.yml @@ -1,4 +1,5 @@ -- debug: +--- +- ansible.builtin.debug: msg: | terraform_backend_type: {{ terraform_backend_type }} terraform_state: {{ terraform_state }} @@ -8,55 +9,58 @@ # if we we have cluster_floating_ip, otherwise assume that we're # assigning the FIP in Terraform and that it will be available in # outputs.cluster_gateway_ip. -- block: +- when: + - cluster_floating_ip is defined + - cluster_floating_ip + + block: - name: Look up floating IP azimuth_cloud.terraform.os_floating_ip_info: - floating_ip: "{{ cluster_floating_ip }}" + floating_ip: "{{ cluster_floating_ip }}" register: cluster_floating_ip_info - name: Set floating IP address fact - set_fact: + ansible.builtin.set_fact: cluster_floating_ip_address: "{{ cluster_floating_ip_info.floating_ip.floating_ip_address }}" - when: - - cluster_floating_ip is defined - - cluster_floating_ip - - name: Install Terraform binary - include_role: + ansible.builtin.include_role: name: azimuth_cloud.terraform.install - name: Make Terraform project directory - file: + ansible.builtin.file: path: "{{ terraform_project_path }}" state: directory + mode: "0755" - name: Write backend configuration - copy: + ansible.builtin.copy: content: | terraform { backend "{{ terraform_backend_type }}" { } } dest: "{{ terraform_project_path }}/backend.tf" + mode: "0644" - name: Template Terraform files into project directory - template: + ansible.builtin.template: src: >- - {{ + {{ "{}{}.j2".format( ( - cluster_terraform_template_dir ~ "/" - if cluster_terraform_template_dir is defined + cluster_terraform_template_dir ~ "/" + if cluster_terraform_template_dir is defined else "" ), item ) }} dest: "{{ terraform_project_path }}/{{ item }}" + mode: "0644" loop: - outputs.tf - providers.tf - resources.tf - name: Provision infrastructure - include_role: + ansible.builtin.include_role: name: azimuth_cloud.terraform.infra diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md index 81a62bade..83ec1d953 100644 --- a/ansible/roles/compute_init/README.md +++ b/ansible/roles/compute_init/README.md @@ -8,6 +8,7 @@ Allow compute nodes to rejoin the cluster after a reboot without running the > required configuration may change with further development. To enable this: + 1. Add the `compute` group (or a subset) into the `compute_init` group. 2. Build an image which includes the `compute_init` group. This is the case for StackHPC-built release images. @@ -35,65 +36,65 @@ property described above. If a role is marked as requiring a custom image then it also requires an image build with the role name added to the [Packer inventory_groups variable](../../../docs/image-build.md). -| Playbook | Role (or functionality) | Support | Custom image reqd.? | -| -------------------------|-------------------------|---------------------------------|---------------------| -| hooks/pre.yml | ? 
| None at present | n/a | -| validate.yml | n/a | Not relevant during boot | n/a | -| bootstrap.yml | (wait for ansible-init) | Not relevant during boot | n/a | -| bootstrap.yml | resolv_conf | Fully supported | No | -| bootstrap.yml | etc_hosts | Fully supported | No | -| bootstrap.yml | chrony | Fully supported | No | -| bootstrap.yml | proxy | None at present | No | -| bootstrap.yml | (/etc permissions) | None required - use image build | No | -| bootstrap.yml | (ssh /home fix) | None required - use image build | No | -| bootstrap.yml | (system users) | None required - use image build | No | -| bootstrap.yml | systemd | None required - use image build | No | -| bootstrap.yml | selinux | None required - use image build | Maybe [1] | -| bootstrap.yml | sshd | Fully supported | No | -| bootstrap.yml | dnf_repos | None at present [2] | - | -| bootstrap.yml | cacerts | Supported [3] | - | -| bootstrap.yml | squid | Not relevant for compute nodes | n/a | -| bootstrap.yml | tuned | Fully supported | No | -| bootstrap.yml | freeipa_server | Not relevant for compute nodes | n/a | -| bootstrap.yml | cockpit | None required - use image build | No | -| bootstrap.yml | firewalld | Not relevant for compute nodes | n/a | -| bootstrap.yml | fail2ban | Not relevant for compute nodes | n/a | -| bootstrap.yml | podman | Not relevant for compute nodes | n/a | -| bootstrap.yml | update | Not relevant during boot | n/a | -| bootstrap.yml | reboot | Not relevant for compute nodes | n/a | -| bootstrap.yml | ofed | Not relevant during boot | Yes | -| bootstrap.yml | ansible_init (install) | Not relevant during boot | n/a | -| bootstrap.yml | k3s (install) | Not relevant during boot | n/a | -| hooks/post-bootstrap.yml | ? | None at present | n/a | -| iam.yml | freeipa_client | None at present [4] | Yes | -| iam.yml | freeipa_server | Not relevant for compute nodes | n/a | -| iam.yml | sssd | Fully supported | No | -| filesystems.yml | block_devices | None required - role deprecated | n/a | -| filesystems.yml | nfs | All client functionality | No | -| filesystems.yml | manila | All functionality | No [5] | -| filesystems.yml | lustre | All functionality | Yes | -| extras.yml | basic_users | All functionality [6] | No | -| extras.yml | eessi | All functionality [7] | No | -| extras.yml | cuda | None required - use image build | Yes [8] | -| extras.yml | persist_hostkeys | Not relevant for compute nodes | n/a | -| extras.yml | compute_init (export) | Not relevant for compute nodes | n/a | -| extras.yml | k9s (install) | Not relevant during boot | n/a | -| extras.yml | extra_packages | None at present [9] | - | -| slurm.yml | mysql | Not relevant for compute nodes | n/a | -| slurm.yml | rebuild | Not relevant for compute nodes | n/a | -| slurm.yml | openhpc [10] | All slurmd functionality | No | -| slurm.yml | (set memory limits) | Fully supported | No | -| slurm.yml | (block ssh) | Fully supported | No | -| portal.yml | (openondemand server) | Not relevant for compute nodes | n/a | -| portal.yml | (openondemand vnc desktop) | None required - use image build | No | -| portal.yml | (openondemand jupyter server) | None required - use image build | No | -| monitoring.yml | node_exporter | None required - use image build | No | -| monitoring.yml | (other monitoring) | Not relevant for compute nodes | - | -| disable-repos.yml | dnf_repos | None at present [2] | - | -| hooks/post.yml | ? | None at present | - | - +| Playbook | Role (or functionality) | Support | Custom image reqd.? 
| +| ------------------------ | ----------------------------- | ------------------------------- | ------------------- | +| hooks/pre.yml | ? | None at present | n/a | +| validate.yml | n/a | Not relevant during boot | n/a | +| bootstrap.yml | (wait for ansible-init) | Not relevant during boot | n/a | +| bootstrap.yml | resolv_conf | Fully supported | No | +| bootstrap.yml | etc_hosts | Fully supported | No | +| bootstrap.yml | chrony | Fully supported | No | +| bootstrap.yml | proxy | None at present | No | +| bootstrap.yml | (/etc permissions) | None required - use image build | No | +| bootstrap.yml | (SSH /home fix) | None required - use image build | No | +| bootstrap.yml | (system users) | None required - use image build | No | +| bootstrap.yml | systemd | None required - use image build | No | +| bootstrap.yml | selinux | None required - use image build | Maybe [1] | +| bootstrap.yml | sshd | Fully supported | No | +| bootstrap.yml | dnf_repos | None at present [2] | - | +| bootstrap.yml | cacerts | Supported [3] | - | +| bootstrap.yml | squid | Not relevant for compute nodes | n/a | +| bootstrap.yml | tuned | Fully supported | No | +| bootstrap.yml | freeipa_server | Not relevant for compute nodes | n/a | +| bootstrap.yml | cockpit | None required - use image build | No | +| bootstrap.yml | firewalld | Not relevant for compute nodes | n/a | +| bootstrap.yml | fail2ban | Not relevant for compute nodes | n/a | +| bootstrap.yml | podman | Not relevant for compute nodes | n/a | +| bootstrap.yml | update | Not relevant during boot | n/a | +| bootstrap.yml | reboot | Not relevant for compute nodes | n/a | +| bootstrap.yml | ofed | Not relevant during boot | Yes | +| bootstrap.yml | ansible_init (install) | Not relevant during boot | n/a | +| bootstrap.yml | k3s (install) | Not relevant during boot | n/a | +| hooks/post-bootstrap.yml | ? 
| None at present | n/a | +| iam.yml | freeipa_client | None at present [4] | Yes | +| iam.yml | freeipa_server | Not relevant for compute nodes | n/a | +| iam.yml | sssd | Fully supported | No | +| filesystems.yml | block_devices | None required - role deprecated | n/a | +| filesystems.yml | nfs | All client functionality | No | +| filesystems.yml | manila | All functionality | No [5] | +| filesystems.yml | lustre | All functionality | Yes | +| extras.yml | basic_users | All functionality [6] | No | +| extras.yml | eessi | All functionality [7] | No | +| extras.yml | cuda | None required - use image build | Yes [8] | +| extras.yml | persist_hostkeys | Not relevant for compute nodes | n/a | +| extras.yml | compute_init (export) | Not relevant for compute nodes | n/a | +| extras.yml | k9s (install) | Not relevant during boot | n/a | +| extras.yml | extra_packages | None at present [9] | - | +| slurm.yml | MySQL | Not relevant for compute nodes | n/a | +| slurm.yml | rebuild | Not relevant for compute nodes | n/a | +| slurm.yml | openhpc [10] | All slurmd functionality | No | +| slurm.yml | (set memory limits) | Fully supported | No | +| slurm.yml | (block SSH) | Fully supported | No | +| portal.yml | (openondemand server) | Not relevant for compute nodes | n/a | +| portal.yml | (openondemand vnc desktop) | None required - use image build | No | +| portal.yml | (openondemand jupyter server) | None required - use image build | No | +| monitoring.yml | node_exporter | None required - use image build | No | +| monitoring.yml | (other monitoring) | Not relevant for compute nodes | - | +| disable-repos.yml | dnf_repos | None at present [2] | - | +| hooks/post.yml | ? | None at present | - | Notes: + 1. `selinux` is set to disabled in StackHPC images. 2. Requirement for this functionality is TBD. 3. `cacerts_cert_dir` must be the same on all nodes. @@ -103,32 +104,32 @@ Notes: 6. Assumes home directory already exists on shared storage. 7. Assumes `cvmfs_config` is the same on control node and all compute nodes. 8. If `cuda` role was run during build, the nvidia-persistenced is enabled - and will start during boot. + and will start during boot. 9. Would require `dnf_repos`. 10. `openhpc` does not need to be added to `compute_init_enable`, this is automatically enabled by adding `compute`. ## Approach + This works as follows: + 1. During image build, an ansible-init playbook and supporting files -(e.g. templates, filters, etc) are installed. + (e.g. templates, filters, etc) are installed. 2. Cluster instances are created as usual; the above compute-init playbook does -not run. + not run. 3. The `site.yml` playbook is run as usual to configure all the instances into -a cluster. In addition, with `compute-init` enabled, a `/exports/cluster` NFS -share is created on the control node containing: - - an /etc/hosts file for the cluster - - Hostvars for each compute node + a cluster. In addition, with `compute-init` enabled, a `/exports/cluster` NFS + share is created on the control node containing: - an /etc/hosts file for the cluster - Hostvars for each compute node 4. On reboot of a compute node, ansible-init runs the compute-init playbook -which: - a. Checks whether the `enable_compute` metadata flag is set, and exits if - not. - b. Tries to mount the above `/exports/cluster` NFS share from the control - node, and exits if it cannot. - c. Configures itself using the exported hostvars, depending on the - `enable_*` flags set in metadata. - d. 
Issues an `scontrol` command to resume the node (because Slurm will - consider it as "unexpectedly rebooted"). + which: + a. Checks whether the `enable_compute` metadata flag is set, and exits if + not. + b. Tries to mount the above `/exports/cluster` NFS share from the control + node, and exits if it cannot. + c. Configures itself using the exported hostvars, depending on the + `enable_*` flags set in metadata. + d. Issues an `scontrol` command to resume the node (because Slurm will + consider it as "unexpectedly rebooted"). The check in 4b. above is what prevents the compute-init script from trying to configure the node before the services on the control node are available @@ -145,35 +146,43 @@ a new image: 2. Reimage the compute nodes: - ansible-playbook --limit compute ansible/adhoc/rebuild.yml +```shell +ansible-playbook --limit compute ansible/adhoc/rebuild.yml +``` 3. Add metadata to a compute node e.g. via Horizon to turn on compute-init playbook functionality. 4. Stop ansible-init from running - ansible all -ba "systemctl stop ansible-init" +```shell +ansible all -ba "systemctl stop ansible-init" +``` 5. Fake an image build to deploy the compute-init playbook: - ansible-playbook ansible/fatimage.yml --tags compute_init +```shell +ansible-playbook ansible/fatimage.yml --tags compute_init +``` - NB: This will also re-export the compute hostvars, as the nodes are not - in the builder group, which conveniently means any changes made to that - play also get picked up. +NB: This will also reexport the compute hostvars, as the nodes are not +in the builder group, which conveniently means any changes made to that +play also get picked up. 6. Fake a reimage of compute to run ansible-init and the updated compute-init playbook: - ansible all -ba "rm -f /var/lib/ansible-init.done && systemctl restart ansible-init" +```shell +ansible all -ba "rm -f /var/lib/ansible-init.done && systemctl restart ansible-init" +``` - Use `systemctl status ansible-init` to view stdout/stderr from Ansible. +Use `systemctl status ansible-init` to view stdout/stderr from Ansible. Steps 4/5/6 can be repeated with changes to the compute script. If required, reimage the compute node(s) first as in step 2 and/or add additional metadata as in step 3. - ## Design notes + - Duplicating code in roles into the `compute-init` script is unfortunate, but does allow developing this functionality without wider changes to the appliance. @@ -186,7 +195,6 @@ as in step 3. 1. Control node copies files resulting from role into cluster exports, compute-init copies to local disk. Only works if files are not host-specific Examples: etc_hosts, eessi config? - 2. Re-implement the role. Works if the role vars are not too complicated, (else they all need to be duplicated in compute-init). Could also only support certain subsets of role functionality or variables @@ -195,29 +203,29 @@ as in step 3. 
- Some variables are defined using hostvars from other nodes, which aren't available v the current approach: - ``` - [root@rl9-compute-0 rocky]# grep hostvars /mnt/cluster/hostvars/rl9-compute-0/hostvars.yml - "grafana_address": "{{ hostvars[groups['grafana'].0].api_address }}", - "grafana_api_address": "{{ hostvars[groups['grafana'].0].internal_address }}", - "mysql_host": "{{ hostvars[groups['mysql'] | first].api_address }}", - "nfs_server_default": "{{ hostvars[groups['control'] | first ].internal_address }}", - "openhpc_slurm_control_host": "{{ hostvars[groups['control'].0].api_address }}", - "openondemand_address": "{{ hostvars[groups['openondemand'].0].api_address if groups['openondemand'] | count > 0 else '' }}", - "openondemand_node_proxy_directives": "{{ _opeonondemand_unset_auth if (openondemand_auth == 'basic_pam' and 'openondemand_host_regex' and groups['grafana'] | length > 0 and hostvars[ groups['grafana'] | first]._grafana_auth_is_anonymous) else '' }}", - "openondemand_servername": "{{ hostvars[ groups['openondemand'] | first].ansible_host }}", - "prometheus_address": "{{ hostvars[groups['prometheus'].0].api_address }}", - "{{ hostvars[groups['freeipa_server'].0].ansible_host }}" - ``` - - More generally, there is nothing to stop any group var depending on a - "{{ hostvars[] }}" interpolation ... - - Only `nfs_server_default` and `openhpc_slurm_control_host` are of concern - for compute nodes - both of these indirect via `api_address` to - `inventory_hostname`. This has been worked around by replacing this with - "{{ groups['control'] | first }}" which does result in the control node - inventory hostname when templating. - - Note that although `groups` is defined in the templated hostvars, when - the hostvars are loaded using `include_vars:` is is ignored as it is a - "magic variable" determined by ansible itself and cannot be set. + ```text + [root@rl9-compute-0 rocky]# grep hostvars /mnt/cluster/hostvars/rl9-compute-0/hostvars.yml + "grafana_address": "{{ hostvars[groups['grafana'].0].api_address }}", + "grafana_api_address": "{{ hostvars[groups['grafana'].0].internal_address }}", + "mysql_host": "{{ hostvars[groups['mysql'] | first].api_address }}", + "nfs_server_default": "{{ hostvars[groups['control'] | first ].internal_address }}", + "openhpc_slurm_control_host": "{{ hostvars[groups['control'].0].api_address }}", + "openondemand_address": "{{ hostvars[groups['openondemand'].0].api_address if groups['openondemand'] | count > 0 else '' }}", + "openondemand_node_proxy_directives": "{{ _opeonondemand_unset_auth if (openondemand_auth == 'basic_pam' and 'openondemand_host_regex' and groups['grafana'] | length > 0 and hostvars[ groups['grafana'] | first]._grafana_auth_is_anonymous) else '' }}", + "openondemand_servername": "{{ hostvars[ groups['openondemand'] | first].ansible_host }}", + "prometheus_address": "{{ hostvars[groups['prometheus'].0].api_address }}", + "{{ hostvars[groups['freeipa_server'].0].ansible_host }}" + ``` + + More generally, there is nothing to stop any group var depending on a + "{{ hostvars[] }}" interpolation ... + + Only `nfs_server_default` and `openhpc_slurm_control_host` are of concern + for compute nodes - both of these indirect via `api_address` to + `inventory_hostname`. This has been worked around by replacing this with + "{{ groups['control'] | first }}" which does result in the control node + inventory hostname when templating. 
+ + Note that although `groups` is defined in the templated hostvars, when + the hostvars are loaded using `include_vars:` is is ignored as it is a + "magic variable" determined by ansible itself and cannot be set. diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index 086585a8d..40dda69fb 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -1,8 +1,7 @@ --- - - name: Compute node initialisation hosts: localhost - become: yes + become: true vars: os_metadata: "{{ lookup('url', 'http://169.254.169.254/openstack/latest/meta_data.json') | from_json }}" server_node_ip: "{{ os_metadata.meta.control_address }}" @@ -12,7 +11,7 @@ enable_cacerts: "{{ os_metadata.meta.cacerts | default(false) | bool }}" enable_sssd: "{{ os_metadata.meta.sssd | default(false) | bool }}" enable_sshd: "{{ os_metadata.meta.sshd | default(false) | bool }}" - enable_tuned: "{{ os_metadata.meta.tuned | default(false) | bool }}" + enable_tuned: "{{ os_metadata.meta.tuned | default(false) | bool }}" enable_nfs: "{{ os_metadata.meta.nfs | default(false) | bool }}" enable_manila: "{{ os_metadata.meta.manila | default(false) | bool }}" enable_lustre: "{{ os_metadata.meta.lustre | default(false) | bool }}" @@ -22,7 +21,6 @@ # TODO: "= role defaults" - could be moved to a vars_file: on play with similar precedence effects resolv_conf_nameservers: [] - tuned_profile_baremetal: hpc-compute tuned_profile_vm: virtual-guest tuned_profile: "{{ tuned_profile_baremetal if ansible_virtualization_role != 'guest' else tuned_profile_vm }}" @@ -45,17 +43,17 @@ - nosuid tasks: - - block: + - when: not enable_compute + + block: - name: Report skipping initialization if not compute node # meta: end_play produces no output - debug: + ansible.builtin.debug: msg: "Skipping compute initialization: Metadata enable_compute is not true" - - - meta: end_play - when: not enable_compute + - ansible.builtin.meta: end_play - name: Ensure the mount directory exists - file: + ansible.builtin.file: path: /mnt/cluster state: directory owner: slurm @@ -63,7 +61,7 @@ mode: u=rX,g=rwX,o= - name: Mount /mnt/cluster - mount: + ansible.posix.mount: path: /mnt/cluster src: "{{ server_node_ip }}:/exports/cluster" fstype: nfs @@ -74,46 +72,46 @@ # exits from playbook if this failed below, allowing ansible-init to # finish, which allows site.yml to continue on initial deploy - - block: + - when: _mount_mnt_cluster.failed + + block: - name: Report skipping initialization if cannot mount nfs # meta: end_play produces no output - debug: + ansible.builtin.debug: msg: "Skipping compute initialization: Failed to mount /exports/cluster from control node {{ server_node_ip }}" - - - meta: end_play - when: _mount_mnt_cluster.failed + - ansible.builtin.meta: end_play - name: Check if hostvars exist - become_user: slurm - stat: + # become_user: slurm # Commenting out as become_user does not imply become: true + ansible.builtin.stat: path: "/mnt/cluster/hostvars/{{ ansible_hostname }}/hostvars.yml" register: hostvars_stat - - block: + - when: not hostvars_stat.stat.exists + + block: - name: Report skipping initialization if host vars does not exist # meta: end_play produces no output - debug: + ansible.builtin.debug: msg: "Skipping compute initialization: hostvars does not exist" - - meta: end_play - when: not hostvars_stat.stat.exists - + - ansible.builtin.meta: end_play - name: Sync /mnt/cluster to /var/tmp - become_user: slurm - synchronize: + # 
become_user: slurm # Commenting out as become_user does not imply become: true + ansible.posix.synchronize: src: "/mnt/cluster/" dest: "/var/tmp/cluster/" - archive: yes - recursive: yes + archive: true + recursive: true - name: Unmount /mnt/cluster after sync - mount: + ansible.posix.mount: path: /mnt/cluster state: unmounted - name: Load hostvars # this is higher priority than vars block = normal ansible's hostvars - include_vars: + ansible.builtin.include_vars: file: "/var/tmp/cluster/hostvars/{{ ansible_hostname }}/hostvars.yml" - name: Run chrony role @@ -127,6 +125,8 @@ when: enable_chrony - name: Configure resolve.conf + when: enable_resolv_conf + block: - name: Set nameservers in /etc/resolv.conf ansible.builtin.template: @@ -149,16 +149,14 @@ ansible.builtin.systemd: name: NetworkManager state: reloaded - when: _copy_nm_config.changed | default(false) - when: enable_resolv_conf - + when: _copy_nm_config.changed | default(false) # noqa: no-handler - name: Copy cluster /etc/hosts - copy: + ansible.builtin.copy: src: /var/tmp/cluster/hosts dest: /etc/hosts owner: root group: root - mode: 0644 + mode: "0644" when: enable_etc_hosts - name: Configure cacerts @@ -176,7 +174,7 @@ when: enable_sshd - name: Configure tuned - include_tasks: tasks/tuned.yml + ansible.builtin.include_tasks: ansible/roles/tuned/tasks/main.yml when: enable_tuned - name: Configure sssd @@ -198,12 +196,16 @@ loop: "{{ nfs_configurations }}" - name: Manila mounts + when: + - enable_manila + - os_manila_mount_shares | length > 0 + block: - name: Read manila share info from nfs file - include_vars: + ansible.builtin.include_vars: file: /var/tmp/cluster/manila_share_info.yml no_log: true # contains secrets - + - name: Ensure Ceph configuration directory exists ansible.builtin.file: path: "{{ os_manila_mount_ceph_conf_path }}" @@ -265,10 +267,6 @@ loop_control: label: "{{ item.share_name }}" when: item.mount_state | default(os_manila_mount_state) in ['mounted' or 'ephemeral'] - when: - - enable_manila - - os_manila_mount_shares | length > 0 - - name: Configure lustre ansible.builtin.include_role: name: lustre @@ -276,84 +274,83 @@ when: enable_lustre - name: Basic users - ansible.builtin.include_role: + ansible.builtin.include_role: name: basic_users when: enable_basic_users - name: EESSI + when: enable_eessi + + # NB: don't need conditional block on enable_compute as have already exited + # if not the case block: - name: Copy cvmfs config - copy: + ansible.builtin.copy: src: /var/tmp/cluster/cvmfs/default.local dest: /etc/cvmfs/default.local owner: root group: root - mode: 0644 + mode: "0644" - - name: Ensure CVMFS config is setup - command: + - name: Ensure CVMFS config is setup # noqa: no-changed-when + ansible.builtin.command: cmd: "cvmfs_config setup" - when: enable_eessi - - # NB: don't need conditional block on enable_compute as have already exited - # if not the case - name: Write Munge key - copy: + ansible.builtin.copy: # NB: openhpc_munge_key is *binary* and may not survive json encoding # so do same as environments/common/inventory/group_vars/all/openhpc.yml content: "{{ vault_openhpc_mungekey | b64decode }}" dest: "/etc/munge/munge.key" owner: munge group: munge - mode: 0400 + mode: "0400" - name: Set slurmctld location for configless operation - lineinfile: + ansible.builtin.lineinfile: path: /etc/sysconfig/slurmd line: "SLURMD_OPTIONS='--conf-server {{ openhpc_slurm_control_host_address | default(openhpc_slurm_control_host) }}'" regexp: "^SLURMD_OPTIONS=" - create: yes + create: true owner: root group: 
root - mode: 0644 + mode: "0644" - name: Ensure Munge service state - service: + ansible.builtin.service: name: munge enabled: true state: started - name: Set locked memory limits on user-facing nodes - lineinfile: + ansible.builtin.lineinfile: path: /etc/security/limits.conf - regexp: '\* soft memlock unlimited' + regexp: "\\* soft memlock unlimited" line: "* soft memlock unlimited" - name: Configure sshd pam module - blockinfile: + ansible.builtin.blockinfile: path: /etc/pam.d/sshd - insertafter: 'account\s+required\s+pam_nologin.so' + insertafter: "account\\s+required\\s+pam_nologin.so" block: | account sufficient pam_access.so account required pam_slurm.so - name: Configure login access control - blockinfile: + ansible.builtin.blockinfile: path: /etc/security/access.conf block: | +:adm:ALL -:ALL:ALL - name: Ensure slurmd service state - service: + ansible.builtin.service: name: slurmd enabled: true state: started - - - name: Ensure node is resumed + - name: Ensure node is resumed # noqa: no-changed-when # TODO: consider if this is always safe for all job states? - command: scontrol update state=resume nodename={{ ansible_hostname }} + ansible.builtin.command: scontrol update state=resume nodename={{ ansible_hostname }} register: _scontrol_update failed_when: - _scontrol_update.rc > 0 diff --git a/ansible/roles/compute_init/tasks/export.yml b/ansible/roles/compute_init/tasks/export.yml index 3226e13b8..179303777 100644 --- a/ansible/roles/compute_init/tasks/export.yml +++ b/ansible/roles/compute_init/tasks/export.yml @@ -1,5 +1,6 @@ +--- - name: Ensure the /exports/cluster directory exists - file: + ansible.builtin.file: path: /exports/cluster state: directory owner: slurm @@ -9,7 +10,7 @@ delegate_to: "{{ groups['control'] | first }}" - name: Copy /etc/hosts to /exports/cluster - copy: + ansible.builtin.copy: src: /etc/hosts dest: /exports/cluster/hosts owner: slurm @@ -20,7 +21,7 @@ delegate_to: "{{ groups['control'] | first }}" - name: Create hostvars directory - file: + ansible.builtin.file: path: /exports/cluster/hostvars/{{ inventory_hostname }}/ state: directory owner: slurm @@ -29,7 +30,7 @@ delegate_to: "{{ groups['control'] | first }}" - name: Template out hostvars - template: + ansible.builtin.template: src: hostvars.yml.j2 dest: /exports/cluster/hostvars/{{ inventory_hostname }}/hostvars.yml owner: slurm @@ -38,7 +39,7 @@ delegate_to: "{{ groups['control'] | first }}" - name: Copy manila share info to /exports/cluster - copy: + ansible.builtin.copy: content: "{{ os_manila_mount_share_info_var | to_nice_yaml }}" dest: /exports/cluster/manila_share_info.yml owner: slurm @@ -52,22 +53,22 @@ os_manila_mount_share_info: "{{ os_manila_mount_share_info }}" - name: Ensure /exports/cluster/cvmfs directory exists - file: + ansible.builtin.file: path: /exports/cluster/cvmfs state: directory owner: slurm group: root - mode: 0755 + mode: "0755" run_once: true delegate_to: "{{ groups['control'] | first }}" - name: Copy EESSI CVMFS config to /exports/cluster - copy: + ansible.builtin.copy: src: /etc/cvmfs/default.local dest: /exports/cluster/cvmfs/default.local owner: slurm group: root - mode: 0644 + mode: "0644" remote_src: true run_once: true delegate_to: "{{ groups['control'] | first }}" @@ -79,7 +80,7 @@ when: "'cacerts' in group_names" - name: Create hostconfig directory - file: + ansible.builtin.file: path: "/exports/cluster/hostconfig/{{ inventory_hostname }}/" state: directory owner: slurm @@ -87,14 +88,14 @@ mode: u=rX,g=rwX,o= delegate_to: "{{ groups['control'] | first }}" -- 
name: Template sssd config - import_role: +- name: Template sssd config + ansible.builtin.import_role: name: sssd tasks_from: export.yml when: "'sssd' in group_names" -- name: Template sshd config - import_role: +- name: Template sshd config + ansible.builtin.import_role: name: sshd tasks_from: export.yml when: "'sshd' in group_names" diff --git a/ansible/roles/compute_init/tasks/install.yml b/ansible/roles/compute_init/tasks/install.yml index 0638f7011..9c8ff2ce2 100644 --- a/ansible/roles/compute_init/tasks/install.yml +++ b/ansible/roles/compute_init/tasks/install.yml @@ -1,12 +1,11 @@ --- - - name: Ensure directories exist - file: + ansible.builtin.file: path: "/etc/ansible-init/playbooks/{{ item }}" state: directory owner: root group: root - mode: 0755 + mode: "0755" loop: - templates - files @@ -16,11 +15,15 @@ - roles - name: Inject files from roles - synchronize: - src: '{{ item.src }}' - dest: '/etc/ansible-init/playbooks/{{ item.dest }}' + ansible.posix.synchronize: + src: "{{ item.src }}" + dest: "/etc/ansible-init/playbooks/{{ item.dest }}" archive: false - rsync_opts: ["-p", "--chmod=D770,F644", "--owner=root", "--group=root"] + rsync_opts: + - "-p" + - "--chmod=D770,F644" + - "--owner=root" + - "--group=root" recursive: true use_ssh_args: true become: true @@ -51,18 +54,18 @@ dest: roles/ - name: Add filter_plugins to ansible.cfg - lineinfile: + ansible.builtin.lineinfile: path: /etc/ansible-init/ansible.cfg line: "filter_plugins = /etc/ansible-init/filter_plugins" state: present owner: root group: root - mode: 0644 + mode: "0644" - name: Add compute initialisation playbook - copy: + ansible.builtin.copy: src: compute-init.yml dest: /etc/ansible-init/playbooks/10-compute-init.yml owner: root group: root - mode: 0644 + mode: "0644" diff --git a/ansible/roles/cuda/defaults/main.yml b/ansible/roles/cuda/defaults/main.yml index bc62ae843..1d136fda1 100644 --- a/ansible/roles/cuda/defaults/main.yml +++ b/ansible/roles/cuda/defaults/main.yml @@ -1,7 +1,9 @@ +--- +# yamllint disable-line rule:line-length cuda_repo_url: "https://developer.download.nvidia.com/compute/cuda/repos/rhel{{ ansible_distribution_major_version }}/{{ ansible_architecture }}/cuda-rhel{{ ansible_distribution_major_version }}.repo" -cuda_nvidia_driver_stream: '575-open' +cuda_nvidia_driver_stream: "575-open" cuda_nvidia_driver_pkg: "nvidia-open-3:575.57.08-1.el{{ ansible_distribution_major_version }}" -cuda_package_version: '12.9.1-1' +cuda_package_version: "12.9.1-1" cuda_version_short: "{{ (cuda_package_version | split('.'))[0:2] | join('.') }}" # major.minor cuda_packages: - "cuda-toolkit-{{ cuda_package_version }}" diff --git a/ansible/roles/cuda/tasks/install.yml b/ansible/roles/cuda/tasks/install.yml index 39bd20d94..91af515ea 100644 --- a/ansible/roles/cuda/tasks/install.yml +++ b/ansible/roles/cuda/tasks/install.yml @@ -1,10 +1,11 @@ - +--- # Based on https://docs.nvidia.com/datacenter/tesla/driver-installation-guide/ - name: Install cuda repo - get_url: + ansible.builtin.get_url: dest: "/etc/yum.repos.d/cuda-rhel{{ ansible_distribution_major_version }}.repo" url: "{{ cuda_repo_url }}" + mode: "0644" - name: Check if nvidia driver module is enabled ansible.builtin.command: dnf module list --enabled nvidia-driver @@ -24,7 +25,7 @@ register: _cuda_driver_install - name: Check kernel has not been modified - assert: + ansible.builtin.assert: that: "'kernel ' not in _cuda_driver_install.stdout | default('')" # space ensures we don't flag e.g. 
kernel-devel-matched fail_msg: "{{ _cuda_driver_install.stdout_lines | default([]) | select('search', 'kernel ') }}" @@ -37,13 +38,13 @@ register: cuda_package_install - name: Add cuda binaries to path - lineinfile: + ansible.builtin.lineinfile: path: /etc/profile.d/sh.local - line: 'export PATH=$PATH:$(ls -1d /usr/local/cuda-* | sort -V | tail -1)/bin' + line: "export PATH=$PATH:$(ls -1d /usr/local/cuda-* | sort -V | tail -1)/bin" when: cuda_package_version != 'none' - name: Enable NVIDIA Persistence Daemon - systemd: + ansible.builtin.systemd: name: nvidia-persistenced enabled: true state: "{{ cuda_persistenced_state }}" @@ -51,9 +52,9 @@ - name: Reboot ansible.builtin.reboot: post_reboot_delay: 30 - when: cuda_package_install.changed + when: cuda_package_install.changed # noqa: no-handler - name: Wait for hosts to be reachable - wait_for_connection: + ansible.builtin.wait_for_connection: sleep: 15 - when: cuda_package_install.changed + when: cuda_package_install.changed # noqa: no-handler diff --git a/ansible/roles/cuda/tasks/runtime.yml b/ansible/roles/cuda/tasks/runtime.yml index c16a48c6f..e2dfab3d6 100644 --- a/ansible/roles/cuda/tasks/runtime.yml +++ b/ansible/roles/cuda/tasks/runtime.yml @@ -1,5 +1,6 @@ +--- - name: Ensure NVIDIA Persistence Daemon state - systemd: + ansible.builtin.systemd: name: nvidia-persistenced enabled: true state: "{{ cuda_persistenced_state }}" diff --git a/ansible/roles/cuda/tasks/samples.yml b/ansible/roles/cuda/tasks/samples.yml index b2bccd74d..392a29561 100644 --- a/ansible/roles/cuda/tasks/samples.yml +++ b/ansible/roles/cuda/tasks/samples.yml @@ -1,13 +1,15 @@ +--- - name: Ensure cuda_samples_path exists - file: + ansible.builtin.file: state: directory path: "{{ cuda_samples_path }}" owner: "{{ ansible_user }}" group: "{{ ansible_user }}" + mode: "0755" - name: Download CUDA samples release - unarchive: - remote_src: yes + ansible.builtin.unarchive: + remote_src: true src: "{{ cuda_samples_release_url }}" dest: "{{ cuda_samples_path }}" owner: "{{ ansible_user }}" @@ -15,12 +17,13 @@ creates: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}" - name: Create CUDA samples build directory - file: + ansible.builtin.file: state: directory path: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}/build" + mode: "0755" - name: Build CUDA samples - shell: + ansible.builtin.shell: # We need to source /etc/profile.d/sh.local to add CUDA to the PATH cmd: . /etc/profile.d/sh.local && cmake .. 
&& make -j {{ ansible_processor_vcpus }} chdir: "{{ cuda_samples_path }}/cuda-samples-{{ cuda_version_short }}/build" diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml index d4bab0f22..2e649e6fe 100644 --- a/ansible/roles/dnf_repos/defaults/main.yml +++ b/ansible/roles/dnf_repos/defaults/main.yml @@ -1,51 +1,52 @@ +--- dnf_repos_pulp_content_url: "{{ appliances_pulp_url }}/pulp/content" dnf_repos_username: "{{ omit }}" dnf_repos_password: "{{ omit }}" dnf_repos_filenames: - '8': - baseos: 'Rocky-BaseOS' - appstream: 'Rocky-AppStream' - crb: 'Rocky-PowerTools' - extras: 'Rocky-Extras' - grafana: 'grafana' - '9': - baseos: 'rocky' - appstream: 'rocky' - crb: 'rocky' - extras: 'rocky-extras' - grafana: 'grafana' + "8": + baseos: "Rocky-BaseOS" + appstream: "Rocky-AppStream" + crb: "Rocky-PowerTools" + extras: "Rocky-Extras" + grafana: "grafana" + "9": + baseos: "rocky" + appstream: "rocky" + crb: "rocky" + extras: "rocky-extras" + grafana: "grafana" dnf_repos_version_filenames: "{{ dnf_repos_filenames[ansible_distribution_major_version] }}" # epel installed separately dnf_repos_default_repolist: -- file: "{{ dnf_repos_version_filenames.baseos }}" - name: baseos - base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.baseos[ansible_distribution_version] | appliances_repo_to_subpath }}" -- file: "{{ dnf_repos_version_filenames.appstream }}" - name: appstream - base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.appstream[ansible_distribution_version] | appliances_repo_to_subpath }}" -- file: "{{ dnf_repos_version_filenames.crb }}" - name: "{{ 'powertools' if ansible_distribution_major_version == '8' else 'crb' }}" - base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.crb[ansible_distribution_version] | appliances_repo_to_subpath }}" -- file: "{{ dnf_repos_version_filenames.extras }}" - name: extras - base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.extras[ansible_distribution_version] | appliances_repo_to_subpath }}" -- file: ceph - name: Ceph - base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.ceph[ansible_distribution_major_version] | appliances_repo_to_subpath }}" -- file: "{{ dnf_repos_version_filenames.grafana }}" - name: grafana - base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.grafana[ansible_distribution_major_version] | appliances_repo_to_subpath }}" + - file: "{{ dnf_repos_version_filenames.baseos }}" + name: baseos + base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.baseos[ansible_distribution_version] | appliances_repo_to_subpath }}" + - file: "{{ dnf_repos_version_filenames.appstream }}" + name: appstream + base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.appstream[ansible_distribution_version] | appliances_repo_to_subpath }}" + - file: "{{ dnf_repos_version_filenames.crb }}" + name: "{{ 'powertools' if ansible_distribution_major_version == '8' else 'crb' }}" + base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.crb[ansible_distribution_version] | appliances_repo_to_subpath }}" + - file: "{{ dnf_repos_version_filenames.extras }}" + name: extras + base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.extras[ansible_distribution_version] | appliances_repo_to_subpath }}" + - file: ceph + name: Ceph + base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.ceph[ansible_distribution_major_version] | appliances_repo_to_subpath }}" + - file: "{{ 
dnf_repos_version_filenames.grafana }}" + name: grafana + base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.grafana[ansible_distribution_major_version] | appliances_repo_to_subpath }}" dnf_repos_openhpc_repolist: -- name: OpenHPC - file: OpenHPC - base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.openhpc_base[ansible_distribution_major_version] | appliances_repo_to_subpath }}" -- name: OpenHPC-updates - file: OpenHPC - base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.openhpc_updates[ansible_distribution_major_version] | appliances_repo_to_subpath }}" + - name: OpenHPC + file: OpenHPC + base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.openhpc_base[ansible_distribution_major_version] | appliances_repo_to_subpath }}" + - name: OpenHPC-updates + file: OpenHPC + base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.openhpc_updates[ansible_distribution_major_version] | appliances_repo_to_subpath }}" dnf_repos_repolist: "{{ dnf_repos_default_repolist + (dnf_repos_openhpc_repolist if (openhpc_install_type | default('ohpc')) == 'ohpc' else []) }}" diff --git a/ansible/roles/dnf_repos/tasks/set_repos.yml b/ansible/roles/dnf_repos/tasks/set_repos.yml index c9fcb0c07..9862cfc4e 100644 --- a/ansible/roles/dnf_repos/tasks/set_repos.yml +++ b/ansible/roles/dnf_repos/tasks/set_repos.yml @@ -1,5 +1,4 @@ --- - - name: Replace system repos with Pulp repos ansible.builtin.yum_repository: file: "{{ item.file }}" diff --git a/ansible/roles/doca/defaults/main.yml b/ansible/roles/doca/defaults/main.yml index 66437cd04..f57792d35 100644 --- a/ansible/roles/doca/defaults/main.yml +++ b/ansible/roles/doca/defaults/main.yml @@ -1,3 +1,4 @@ -doca_version: '2.9.1' # 2.9 is LTS, last to support ConnectX-4, 3 years for bug fixes and CVE updates +--- +doca_version: "2.9.1" # 2.9 is LTS, last to support ConnectX-4, 3 years for bug fixes and CVE updates doca_profile: doca-ofed doca_repo_url: "https://linux.mellanox.com/public/repo/doca/{{ doca_version }}/rhel{{ ansible_distribution_version }}/{{ ansible_architecture }}/" diff --git a/ansible/roles/doca/tasks/install-kernel-devel.yml b/ansible/roles/doca/tasks/install-kernel-devel.yml index 6a1943a32..c71d4659b 100644 --- a/ansible/roles/doca/tasks/install-kernel-devel.yml +++ b/ansible/roles/doca/tasks/install-kernel-devel.yml @@ -1,24 +1,28 @@ +--- - name: Get installed kernels - command: dnf list --installed kernel + ansible.builtin.command: dnf list --installed kernel register: _ofed_dnf_kernels changed_when: false - name: Determine running kernel - command: uname -r # e.g. 4.18.0-513.18.1.el8_9.x86_64 + ansible.builtin.command: uname -r register: _ofed_loaded_kernel changed_when: false - name: Check current kernel is newest installed - assert: + ansible.builtin.assert: that: _ofed_kernel_current == _ofed_dnf_kernels_newest fail_msg: "Kernel {{ _ofed_loaded_kernel.stdout }} is loaded but newer {{ _ofed_dnf_kernels_newest }} is installed: consider rebooting?" vars: _ofed_kernel_current: >- {{ _ofed_loaded_kernel.stdout | regex_replace('\.(?:.(?!\.))+$', '') | regex_replace('\.(?:.(?!\.))+$', '') }} + # yamllint disable rule:line-length _ofed_dnf_kernels_newest: >- + # yamllint disable-line rule:line-length {{ _ofed_dnf_kernels.stdout_lines[1:] | map('split') | map(attribute=1) | map('regex_replace', '\.(?:.(?!\.))+$', '') | community.general.version_sort | last }} - # dnf line format e.g. 
"kernel.x86_64 4.18.0-513.18.1.el8_9 @baseos " + # yamllint enable rule:line-length + # dnf line format e.g. "kernel.x86_64 4.18.0-513.18.1.el8_9 @baseos " - name: Install matching kernel-devel package - dnf: + ansible.builtin.dnf: name: "kernel-devel-{{ _ofed_loaded_kernel.stdout | trim }}" diff --git a/ansible/roles/doca/tasks/install.yml b/ansible/roles/doca/tasks/install.yml index d26fda79e..e21218ecb 100644 --- a/ansible/roles/doca/tasks/install.yml +++ b/ansible/roles/doca/tasks/install.yml @@ -1,5 +1,5 @@ -- import_tasks: install-kernel-devel.yml - +--- +- ansible.builtin.import_tasks: install-kernel-devel.yml - name: Install DOCA repo ansible.builtin.yum_repository: name: doca @@ -13,21 +13,21 @@ ansible.builtin.dnf: name: doca-extra -- name: Build DOCA kernel modules - ansible.builtin.shell: +- name: Build DOCA kernel modules # noqa: no-changed-when + ansible.builtin.command: cmd: /opt/mellanox/doca/tools/doca-kernel-support register: _doca_kernel_build - - name: Find generated doca-kernel-repo - ansible.builtin.shell: 'find /tmp/DOCA.* -name doca-kernel-repo-*' + ansible.builtin.shell: "find /tmp/DOCA.* -name doca-kernel-repo-*" register: _doca_kernel_repo # e.g. /tmp/DOCA.WVMchs2QWo/doca-kernel-repo-24.10.1.1.4.0-1.kver.5.14.0.427.31.1.el9.4.x86.64.x86_64.rpm changed_when: false -- name: Create dnf cache +- name: Create dnf cache # noqa: no-changed-when ansible.builtin.command: dnf makecache - name: Install DOCA repository package + # checkov:skip=CKV2_ANSIBLE_4: "Ensure that packages with untrusted or missing GPG signatures are not used by dnf" ansible.builtin.dnf: name: "{{ _doca_kernel_repo.stdout }}" disable_gpg_check: true @@ -41,11 +41,11 @@ state: absent path: "{{ (_doca_kernel_repo.stdout | split('/'))[:3] | join('/') }}" # leading / means 1st element of split list is '' -- name: Update initramfs +- name: Update initramfs # noqa: no-changed-when ansible.builtin.command: cmd: dracut -f register: _doca_dracut failed_when: _doca_dracut.stderr != '' # appears rc is always 0 -- name: Load the new driver +- name: Load the new driver # noqa: no-changed-when ansible.builtin.command: /etc/init.d/openibd restart diff --git a/ansible/roles/doca/tasks/main.yml b/ansible/roles/doca/tasks/main.yml index e7a272f38..df97825c1 100644 --- a/ansible/roles/doca/tasks/main.yml +++ b/ansible/roles/doca/tasks/main.yml @@ -1 +1,2 @@ -- include_tasks: install.yml +--- +- ansible.builtin.include_tasks: install.yml diff --git a/ansible/roles/eessi/README.md b/ansible/roles/eessi/README.md index d48e00977..df9e8358e 100644 --- a/ansible/roles/eessi/README.md +++ b/ansible/roles/eessi/README.md @@ -1,26 +1,23 @@ -EESSI -===== +# EESSI Configure the EESSI pilot respository for use on given hosts. -Requirements ------------- +## Requirements None. -Role Variables --------------- +## Role Variables - `cvmfs_quota_limit_mb`: Optional int. Maximum size of local package cache on each node in MB. -- `cvmfs_config_overrides`: Optional dict. Set of key-value pairs for additional CernVM-FS settings see [official docs](https://cvmfs.readthedocs.io/en/stable/cpt-configure.html) for list of options. Each dict key should correspond to a valid config variable (e.g. `CVMFS_HTTP_PROXY`) and the corresponding dict value will be set as the variable value (e.g. `https://my-proxy.com`). These configuration parameters will be written to the `/etc/cvmfs/default.local` config file on each host in the form `KEY=VALUE`. +- `cvmfs_config_overrides`: Optional dict. 
Set of key-value pairs for additional CernVM-FS settings see [official docs](https://cvmfs.readthedocs.io/en/stable/cpt-configure.html) for list of options. + Each dict key should correspond to a valid config variable (e.g. `CVMFS_HTTP_PROXY`) and the corresponding dict value will be set as the variable value (e.g. `https://my-proxy.com`). + These configuration parameters will be written to the `/etc/cvmfs/default.local` config file on each host in the form `KEY=VALUE`. -Dependencies ------------- +## Dependencies None. -Example Playbook ----------------- +## Example Playbook ```yaml - name: Setup EESSI diff --git a/ansible/roles/eessi/defaults/main.yaml b/ansible/roles/eessi/defaults/main.yaml index 60e61f137..581c24f77 100644 --- a/ansible/roles/eessi/defaults/main.yaml +++ b/ansible/roles/eessi/defaults/main.yaml @@ -7,7 +7,6 @@ cvmfs_config_default: CVMFS_QUOTA_LIMIT: "{{ cvmfs_quota_limit_mb }}" cvmfs_config_overrides: {} - cvmfs_config: "{{ cvmfs_config_default | combine(cvmfs_config_overrides) }}" cvmfs_gpg_checksum: "sha256:4ac81adff957565277cfa6a4a330cdc2ce5a8fdd73b8760d1a5a32bef71c4bd6" diff --git a/ansible/roles/eessi/tasks/main.yaml b/ansible/roles/eessi/tasks/main.yaml index 91dd54887..278e8736b 100644 --- a/ansible/roles/eessi/tasks/main.yaml +++ b/ansible/roles/eessi/tasks/main.yaml @@ -1,24 +1,27 @@ --- - name: Download Cern GPG key + # checkov:skip=CKV2_ANSIBLE_2: "Ensure that HTTPS url is used with get_url" ansible.builtin.get_url: url: http://cvmrepo.web.cern.ch/cvmrepo/yum/RPM-GPG-KEY-CernVM dest: ./cvmfs-key.gpg checksum: "{{ cvmfs_gpg_checksum }}" + mode: "0644" -- name: Import downloaded GPG key - command: rpm --import cvmfs-key.gpg - +- name: Import downloaded GPG key # noqa: no-changed-when + ansible.builtin.command: rpm --import cvmfs-key.gpg # noqa: command-instead-of-module - name: Add CVMFS repo - dnf: + # checkov:skip=CKV2_ANSIBLE_4: "Ensure that packages with untrusted or missing GPG signatures are not used by dnf" + ansible.builtin.dnf: name: https://ecsft.cern.ch/dist/cvmfs/cvmfs-release/cvmfs-release-latest.noarch.rpm disable_gpg_check: true - name: Install CVMFS - dnf: + ansible.builtin.dnf: name: cvmfs - name: Install EESSI CVMFS config - dnf: + # checkov:skip=CKV2_ANSIBLE_4: "Ensure that packages with untrusted or missing GPG signatures are not used by dnf" + ansible.builtin.dnf: name: https://github.com/EESSI/filesystem-layer/releases/download/latest/cvmfs-config-eessi-latest.noarch.rpm # NOTE: Can't find any docs on obtaining gpg key - maybe downloading directly from github is ok? disable_gpg_check: true @@ -35,14 +38,14 @@ - name: Add base CVMFS config community.general.ini_file: dest: /etc/cvmfs/default.local - section: null + section: option: "{{ item.key }}" value: "{{ item.value }}" no_extra_spaces: true + mode: "0644" loop: "{{ cvmfs_config | dict2items }}" - # NOTE: Not clear how to make this idempotent -- name: Ensure CVMFS config is setup - command: +- name: Ensure CVMFS config is setup # noqa: no-changed-when + ansible.builtin.command: cmd: "cvmfs_config setup" diff --git a/ansible/roles/etc_hosts/README.md b/ansible/roles/etc_hosts/README.md index 0ad95681a..8c1c4221b 100644 --- a/ansible/roles/etc_hosts/README.md +++ b/ansible/roles/etc_hosts/README.md @@ -3,11 +3,12 @@ Hosts in the `etc_hosts` groups have `/etc/hosts` created with entries of the format `IP_address canonical_hostname [alias]`. By default, an entry is created for each host in this group as follows: + - The value of `ansible_host` is used as the IP_address. 
- If `node_fqdn` is defined then that is used as the canonical hostname and `inventory_hostname` as an alias. Otherwise `inventory_hostname` is used as the canonical hostname. -This may need overriding for multi-homed hosts or hosts with multiple aliases. + This may need overriding for multi-homed hosts or hosts with multiple aliases. -# Variables +## Variables - `etc_hosts_template`: Template file to use. Default is the in-role template. - `etc_hosts_hostvars`: A list of variable names, used (in the order supplied) to create the entry for each host. Default is described above. diff --git a/ansible/roles/etc_hosts/defaults/main.yml b/ansible/roles/etc_hosts/defaults/main.yml index c2ecbca0c..bf7dbe509 100644 --- a/ansible/roles/etc_hosts/defaults/main.yml +++ b/ansible/roles/etc_hosts/defaults/main.yml @@ -1,3 +1,4 @@ +--- etc_hosts_template: hosts.j2 etc_hosts_hostvars: "{{ ['ansible_host'] + (['node_fqdn'] if node_fqdn is defined else []) + ['inventory_hostname'] }}" -etc_hosts_extra_hosts: '' +etc_hosts_extra_hosts: "" diff --git a/ansible/roles/etc_hosts/tasks/main.yml b/ansible/roles/etc_hosts/tasks/main.yml index 6fdabf57c..452b58f23 100644 --- a/ansible/roles/etc_hosts/tasks/main.yml +++ b/ansible/roles/etc_hosts/tasks/main.yml @@ -1,8 +1,9 @@ +--- - name: Template out /etc/hosts - template: + ansible.builtin.template: src: "{{ etc_hosts_template }}" dest: /etc/hosts owner: root group: root - mode: 0644 - become: yes + mode: "0644" + become: true diff --git a/ansible/roles/fail2ban/README.md b/ansible/roles/fail2ban/README.md index 0e744fdbe..dec727e1b 100644 --- a/ansible/roles/fail2ban/README.md +++ b/ansible/roles/fail2ban/README.md @@ -1,27 +1,23 @@ -fail2ban -========= +# fail2ban Setup fail2ban to protect SSH on a host. Note that no email alerts are set up so logs (at `/var/log/fail2ban.log`) will have to be manually reviewed if required. -Requirements ------------- +## Requirements - An EL8 system. - `firewalld` running. -Role Variables --------------- +## Role Variables + None. -Dependencies ------------- +## Dependencies None. -Example Playbook ----------------- +## Example Playbook ```yaml - hosts: fail2ban @@ -34,12 +30,10 @@ Example Playbook name: fail2ban ``` -License -------- +## License Apache v2 -Author Information ------------------- +## Author Information stackhpc.com diff --git a/ansible/roles/fail2ban/handlers/main.yml b/ansible/roles/fail2ban/handlers/main.yml index d578c2914..9db9b0162 100644 --- a/ansible/roles/fail2ban/handlers/main.yml +++ b/ansible/roles/fail2ban/handlers/main.yml @@ -1,7 +1,6 @@ --- - - name: Restart fail2ban - service: + ansible.builtin.service: name: fail2ban state: restarted enabled: true diff --git a/ansible/roles/fail2ban/meta/main.yml b/ansible/roles/fail2ban/meta/main.yml index 02d6a2fe1..1005726db 100644 --- a/ansible/roles/fail2ban/meta/main.yml +++ b/ansible/roles/fail2ban/meta/main.yml @@ -1,6 +1,8 @@ +--- galaxy_info: author: Steve Brasier company: stackhpc + description: Setup fail2ban to protect SSH on a host # If the issue tracker for your role is not on github, uncomment the # next line and provide a value @@ -15,7 +17,7 @@ galaxy_info: # - CC-BY-4.0 license: Apache-2.0 - min_ansible_version: 2.1 + min_ansible_version: "2.1" # If this a Container Enabled role, provide the minimum Ansible Container version. 
# min_ansible_container_version: @@ -27,9 +29,9 @@ galaxy_info: # https://galaxy.ansible.com/api/v1/platforms/ # platforms: - - name: EL - versions: - - 8 + - name: EL + versions: + - "8" galaxy_tags: [] # List tags for your role here, one per line. A tag is a keyword that describes diff --git a/ansible/roles/fail2ban/tasks/main.yml b/ansible/roles/fail2ban/tasks/main.yml index 244a2edf9..9490b1eff 100644 --- a/ansible/roles/fail2ban/tasks/main.yml +++ b/ansible/roles/fail2ban/tasks/main.yml @@ -1,26 +1,26 @@ --- - name: Install EPEL repo - package: + ansible.builtin.package: name: epel-release - name: Install fail2ban packages - package: + ansible.builtin.package: name: - fail2ban-server - fail2ban-firewalld state: present - name: Create config - template: + ansible.builtin.template: dest: /etc/fail2ban/jail.local src: jail.local.j2 + mode: "0644" notify: Restart fail2ban -- name: flush handlers - meta: flush_handlers - +- name: Flush handlers + ansible.builtin.meta: flush_handlers - name: Ensure fail2ban running even if no config change - service: + ansible.builtin.service: name: fail2ban state: started enabled: true diff --git a/ansible/roles/filebeat/defaults/main.yml b/ansible/roles/filebeat/defaults/main.yml index 4b4220a69..5f6cce587 100644 --- a/ansible/roles/filebeat/defaults/main.yml +++ b/ansible/roles/filebeat/defaults/main.yml @@ -1,6 +1,6 @@ --- -#filebeat_config_path: undefined # REQUIRED. Path to filebeat.yml configuration file template +# filebeat_config_path: undefined # REQUIRED. Path to filebeat.yml configuration file template filebeat_podman_user: "{{ ansible_user }}" # User that runs the filebeat container filebeat_version: 7.12.1 # latest usable with opensearch - see https://opensearch.org/docs/2.4/tools/index/#compatibility-matrix-for-beats filebeat_debug: false diff --git a/ansible/roles/filebeat/handlers/main.yml b/ansible/roles/filebeat/handlers/main.yml index 77b9363e8..8fa3862e9 100644 --- a/ansible/roles/filebeat/handlers/main.yml +++ b/ansible/roles/filebeat/handlers/main.yml @@ -1,9 +1,8 @@ --- - - name: Restart filebeat container - systemd: + ansible.builtin.systemd: name: filebeat.service state: restarted - enabled: yes - daemon_reload: yes + enabled: true + daemon_reload: true become: true diff --git a/ansible/roles/filebeat/tasks/install.yml b/ansible/roles/filebeat/tasks/install.yml index 6514e3028..eaf621cc5 100644 --- a/ansible/roles/filebeat/tasks/install.yml +++ b/ansible/roles/filebeat/tasks/install.yml @@ -1,8 +1,9 @@ --- - name: Create systemd unit file - template: + ansible.builtin.template: dest: /etc/systemd/system/filebeat.service src: filebeat.service.j2 + mode: "0644" become: true register: _filebeat_unit @@ -10,9 +11,9 @@ containers.podman.podman_image: name: "docker.elastic.co/beats/filebeat-oss" tag: "{{ filebeat_version }}" - become_user: "{{ filebeat_podman_user }}" + # become_user: "{{ filebeat_podman_user }}" # Commenting out as become_user does not imply become: true -- name: Reload filebeat unit file - command: systemctl daemon-reload - when: _filebeat_unit.changed +- name: Reload filebeat unit file # noqa: no-changed-when + ansible.builtin.command: systemctl daemon-reload # noqa: command-instead-of-module + when: _filebeat_unit.changed # noqa: no-handler become: true diff --git a/ansible/roles/filebeat/tasks/main.yml b/ansible/roles/filebeat/tasks/main.yml index 849683c38..7a1e32950 100644 --- a/ansible/roles/filebeat/tasks/main.yml +++ b/ansible/roles/filebeat/tasks/main.yml @@ -1,2 +1,3 @@ -- import_tasks: install.yml -- 
import_tasks: runtime.yml +--- +- ansible.builtin.import_tasks: install.yml +- ansible.builtin.import_tasks: runtime.yml diff --git a/ansible/roles/filebeat/tasks/runtime.yml b/ansible/roles/filebeat/tasks/runtime.yml index 119745096..cc2bd9125 100644 --- a/ansible/roles/filebeat/tasks/runtime.yml +++ b/ansible/roles/filebeat/tasks/runtime.yml @@ -1,38 +1,36 @@ --- - - name: Collect usernamespace facts user_namespace_facts: - name: Set facts containing sub-ids - set_fact: + ansible.builtin.set_fact: # filebeat user is 1000 filebeat_host_user_id: "{{ ansible_facts.subuid[filebeat_podman_user]['start'] + 1000 - 1 }}" filebeat_host_group_id: "{{ ansible_facts.subgid[filebeat_podman_user]['start'] + 1000 - 1 }}" - name: Ensure parent directory exists - file: + ansible.builtin.file: state: directory path: "/etc/filebeat" owner: "{{ filebeat_host_user_id }}" group: "{{ filebeat_host_group_id }}" - mode: 0770 + mode: "0770" become: true - name: Template configuration files - template: - src: "{{ filebeat_config_path }}" - dest: /etc/filebeat/filebeat.yml - owner: "{{ filebeat_host_user_id }}" - group: "{{ filebeat_host_group_id }}" - mode: 0600 + ansible.builtin.template: + src: "{{ filebeat_config_path }}" + dest: /etc/filebeat/filebeat.yml + owner: "{{ filebeat_host_user_id }}" + group: "{{ filebeat_host_group_id }}" + mode: "0600" notify: Restart filebeat container become: true - name: Flush handlers - meta: flush_handlers - + ansible.builtin.meta: flush_handlers - name: Ensure filebeat service state - systemd: + ansible.builtin.systemd: name: filebeat.service state: started enabled: true diff --git a/ansible/roles/filebeat/tasks/validate.yml b/ansible/roles/filebeat/tasks/validate.yml index b4936200e..0787938ae 100644 --- a/ansible/roles/filebeat/tasks/validate.yml +++ b/ansible/roles/filebeat/tasks/validate.yml @@ -1,5 +1,5 @@ --- - name: Assert that filebeat_config_path is defined - assert: - that: filebeat_config_path is defined \ No newline at end of file + ansible.builtin.assert: + that: filebeat_config_path is defined diff --git a/ansible/roles/firewalld/README.md b/ansible/roles/firewalld/README.md index 2d75b6b3b..280e8280b 100644 --- a/ansible/roles/firewalld/README.md +++ b/ansible/roles/firewalld/README.md @@ -1,48 +1,44 @@ -Role Name -========= +# Role Name Install and configure the `firewalld` firewall. -Requirements ------------- +## Requirements EL8 host -Role Variables --------------- +## Role Variables - `firewalld_enabled`: Optional. Whether `firewalld` service is enabled (starts at boot). Default `yes`. - `firewalld_state`: Optional. State of `firewalld` service. Default `started`. Other values: `stopped`. - `firewalld_configs`: Optional. List of dicts giving parameters for [ansible.posix.firewalld module](https://docs.ansible.com/ansible/latest/collections/ansible/posix/firewalld_module.html). Default is an empty list. Note that the default configuration for firewalld on Rocky Linux 8.5 is as follows: + ```shell # firewall-offline-cmd --list-all public target: default icmp-block-inversion: no - interfaces: - sources: + interfaces: + sources: services: cockpit dhcpv6-client ssh - ports: - protocols: + ports: + protocols: forward: no masquerade: no - forward-ports: - source-ports: - icmp-blocks: - rich rules: + forward-ports: + source-ports: + icmp-blocks: + rich rules: ``` -Dependencies ------------- +## Dependencies None. 
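+
+For illustration, a minimal sketch of a `firewalld_configs` entry (any
+parameters accepted by the `ansible.posix.firewalld` module may be used);
+this permanently enables the `ssh` service in the `public` zone:
+
+```yaml
+firewalld_configs:
+  - service: ssh
+    zone: public
+    state: enabled
+    permanent: true
+    immediate: true
+```
+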
-Example Playbook ----------------- +## Example Playbook -``` +```yaml - hosts: firewalld gather_facts: false become: yes @@ -52,12 +48,10 @@ Example Playbook name: firewalld ``` -License -------- +## License BSD -Author Information ------------------- +## Author Information -An optional section for the role authors to include contact information, or a website (HTML is not allowed). +An optional section for the role authors to include contact information, or a site (HTML is not allowed). diff --git a/ansible/roles/firewalld/defaults/main.yml b/ansible/roles/firewalld/defaults/main.yml index d2bdac76b..272003722 100644 --- a/ansible/roles/firewalld/defaults/main.yml +++ b/ansible/roles/firewalld/defaults/main.yml @@ -1,3 +1,4 @@ -firewalld_enabled: yes +--- +firewalld_enabled: true firewalld_state: started firewalld_configs: [] diff --git a/ansible/roles/firewalld/handlers/main.yml b/ansible/roles/firewalld/handlers/main.yml index c7a008a12..0e8c3dfce 100644 --- a/ansible/roles/firewalld/handlers/main.yml +++ b/ansible/roles/firewalld/handlers/main.yml @@ -1,6 +1,6 @@ --- - name: Restart filewalld - service: + ansible.builtin.service: name: firewalld state: restarted when: firewalld_state != 'stopped' diff --git a/ansible/roles/firewalld/meta/main.yml b/ansible/roles/firewalld/meta/main.yml index c572acc9f..7e1dddb7d 100644 --- a/ansible/roles/firewalld/meta/main.yml +++ b/ansible/roles/firewalld/meta/main.yml @@ -1,7 +1,8 @@ +--- galaxy_info: - author: your name - description: your role description - company: your company (optional) + author: StackHPC Ltd + description: Install and configure the `firewalld` firewall + company: StackHPC Ltd # If the issue tracker for your role is not on github, uncomment the # next line and provide a value @@ -14,9 +15,9 @@ galaxy_info: # - GPL-3.0-only # - Apache-2.0 # - CC-BY-4.0 - license: license (GPL-2.0-or-later, MIT, etc) + license: (GPL-2.0-or-later, MIT, etc) - min_ansible_version: 2.1 + min_ansible_version: "2.1" # If this a Container Enabled role, provide the minimum Ansible Container version. 
# min_ansible_container_version: diff --git a/ansible/roles/firewalld/tasks/install.yml b/ansible/roles/firewalld/tasks/install.yml index 1709cfb43..c30c06419 100644 --- a/ansible/roles/firewalld/tasks/install.yml +++ b/ansible/roles/firewalld/tasks/install.yml @@ -1,3 +1,4 @@ +--- - name: Install firewalld package - dnf: + ansible.builtin.dnf: name: firewalld diff --git a/ansible/roles/firewalld/tasks/main.yml b/ansible/roles/firewalld/tasks/main.yml index 98a7aa732..7a1e32950 100644 --- a/ansible/roles/firewalld/tasks/main.yml +++ b/ansible/roles/firewalld/tasks/main.yml @@ -1,3 +1,3 @@ --- -- import_tasks: install.yml -- import_tasks: runtime.yml +- ansible.builtin.import_tasks: install.yml +- ansible.builtin.import_tasks: runtime.yml diff --git a/ansible/roles/firewalld/tasks/runtime.yml b/ansible/roles/firewalld/tasks/runtime.yml index 2c9ab59cc..03a535611 100644 --- a/ansible/roles/firewalld/tasks/runtime.yml +++ b/ansible/roles/firewalld/tasks/runtime.yml @@ -1,10 +1,10 @@ -- name: Apply filewalld configs +--- +- name: Apply filewalld configs # noqa: args[module] ansible.posix.firewalld: "{{ item }}" notify: Restart filewalld loop: "{{ firewalld_configs }}" -- meta: flush_handlers - +- ansible.builtin.meta: flush_handlers - name: Ensure filewalld state ansible.builtin.systemd: name: firewalld diff --git a/ansible/roles/freeipa/README.md b/ansible/roles/freeipa/README.md index ae0854395..2863c8374 100644 --- a/ansible/roles/freeipa/README.md +++ b/ansible/roles/freeipa/README.md @@ -1,15 +1,15 @@ - # freeipa Support FreeIPA in the appliance. In production use it is expected the FreeIPA server(s) will be external to the cluster, implying that hosts and users are managed outside the appliance. However for testing and development the role can also deploy an "in-appliance" FreeIPA server, add hosts to it and manage users in FreeIPA. -# FreeIPA Client +## FreeIPA Client + +### FreeIPA Client Usage -## Usage - Add hosts to the `freeipa_client` group and run (at a minimum) the `ansible/iam.yml` playbook. -- Host names must match the domain name. By default (using the skeleton OpenTofu) hostnames are of the form `nodename.cluster_name.cluster_domain_suffix` where `cluster_name` and `cluster_domain_suffix` are OpenTofu variables. +- Hostnames must match the domain name. By default (using the skeleton OpenTofu) hostnames are of the form `nodename.cluster_name.cluster_domain_suffix` where `cluster_name` and `cluster_domain_suffix` are OpenTofu variables. - Hosts discover the FreeIPA server FQDN (and their own domain) from DNS records. If DNS servers are not set this is not set from DHCP, then use the `resolv_conf` role to configure this. For example when using the in-appliance FreeIPA development server: - + ```ini # environments//groups ... @@ -21,29 +21,32 @@ Support FreeIPA in the appliance. In production use it is expected the FreeIPA s ```yaml # environments//inventory/group_vars/all/resolv_conf.yml resolv_conf_nameservers: - - "{{ hostvars[groups['freeipa_server'] | first].ansible_host }}" + - "{{ hostvars[groups['freeipa_server'] | first].ansible_host }}" ``` - -- For production use with an external FreeIPA server, a random one-time password (OTP) must be generated when adding hosts to FreeIPA (e.g. using `ipa host-add --random ...`). This password should be set as a hostvar `freeipa_host_password`. Initial host enrolment will use this OTP to enrol the host. After this it becomes irrelevant so it does not need to be committed to git. 
This approach means the appliance does not require the FreeIPA administrator password. +- For production use with an external FreeIPA server, a random one-time password (OTP) must be generated when adding hosts to FreeIPA (e.g. using `ipa host-add --random ...`). + This password should be set as a hostvar `freeipa_host_password`. + Initial host enrolment will use this OTP to enrol the host. After this it becomes irrelevant so it does not need to be committed to Git. + This approach means the appliance does not require the FreeIPA administrator password. - For development use with the in-appliance FreeIPA server, `freeipa_host_password` will be automatically generated in memory. - The `control` host must define `appliances_state_dir` (on persistent storage). This is used to back-up keytabs to allow FreeIPA clients to automatically re-enrol after e.g. reimaging. Note that: - This is implemented when using the skeleton OpenTofu; on the control node `appliances_state_dir` defaults to `/var/lib/state` which is mounted from a volume. - Nodes are not re-enroled by a [Slurm-driven reimage](../../collections/ansible_collections/stackhpc/slurm_openstack_tools/roles/rebuild/README.md) (as that does not run this role). - If both a backed-up keytab and `freeipa_host_password` exist, the former is used. - -## Role Variables for Clients +### Role Variables for Clients - `freeipa_host_password`. Required for initial enrolment only, FreeIPA host password as described above. - `freeipa_setup_dns`: Optional, whether to use the FreeIPA server as the client's nameserver. Defaults to `true` when `freeipa_server` contains a host, otherwise `false`. See also use of `appliances_state_dir` on the control node as described above. -# FreeIPA Server +## FreeIPA Server + As noted above this is only intended for development and testing. Note it cannot be run on the `openondemand` node as no other virtual servers must be defined in the Apache configuration. -## Usage +### FreeIPA Server Usage + - Add a single host to the `freeipa_server` group and run (at a minimum) the `ansible/bootstrap.yml` and `ansible/iam.yml` playbooks. - As well as configuring the FreeIPA server, the role will also: - Add ansible hosts in the group `freeipa_client` as FreeIPA hosts. @@ -51,7 +54,7 @@ As noted above this is only intended for development and testing. Note it cannot The FreeIPA GUI will be available on `https:///ipa/ui`. -## Role Variables for Server +### Role Variables for Server These role variables are only required when using `freeipa_server`: @@ -59,10 +62,10 @@ These role variables are only required when using `freeipa_server`: - `freeipa_domain`: Optional, name of domain. Default is lowercased `freeipa_realm`. - `freeipa_ds_password`: Optional, password to be used by the Directory Server for the Directory Manager user (`ipa-server-install --ds-password`). Default is generated in `environments//inventory/group_vars/all/secrets.yml` - `freeipa_admin_password`: Optional, password for the IPA `admin` user. Default is generated as for `freeipa_ds_password`. -- `freeipa_server_ip`: Optional, IP address of freeipa_server host. Default is `ansible_host` of the `freeipa_server` host. Default `false`. +- `freeipa_server_ip`: Optional, IP address of freeipa_server host. Default is `ansible_host` of the `freeipa_server` host. Default `false`. - `freeipa_setup_dns`: Optional bool, whether to configure the FreeIPA server as an integrated DNS server and define a zone and records. 
NB: This also controls whether `freeipa_client` hosts use the `freeipa_server` host for name resolution. Default `true` when `freeipa_server` contains a host. - `freeipa_client_ip`: Optional, IP address of FreeIPA client. Default is `ansible_host`. - `freeipa_users`: A list of dicts defining users to add, with keys/values as for [community.general.ipa_user](https://docs.ansible.com/ansible/latest/collections/community/general/ipa_user_module.html): Note that: - `name`, `givenname` (firstname) and `sn` (surname) are required. - `ipa_host`, `ipa_port`, `ipa_prot`, `ipa_user`, `validate_certs` are automatically provided and cannot be overridden. - - If `password` is set, the value should *not* be a hash (unlike `ansible.builtin.user` as used by the `basic_users` role), and it must be changed on first login. `krbpasswordexpiration` does not appear to be able to override this. + - If `password` is set, the value should _not_ be a hash (unlike `ansible.builtin.user` as used by the `basic_users` role), and it must be changed on first login. `krbpasswordexpiration` does not appear to be able to override this. diff --git a/ansible/roles/freeipa/defaults/main.yml b/ansible/roles/freeipa/defaults/main.yml index 03b844c8a..7a80d1a3d 100644 --- a/ansible/roles/freeipa/defaults/main.yml +++ b/ansible/roles/freeipa/defaults/main.yml @@ -1,8 +1,9 @@ -#freeipa_realm: +--- +# freeipa_realm: freeipa_domain: "{{ freeipa_realm | lower }}" -#freeipa_ds_password: -#freeipa_admin_password: -#freeipa_server_ip: +# freeipa_ds_password: +# freeipa_admin_password: +# freeipa_server_ip: freeipa_setup_dns: "{{ groups['freeipa_server'] | length > 0 }}" freeipa_client_ip: "{{ ansible_host }}" # when run on freeipa_client group! # freeipa_host_password: diff --git a/ansible/roles/freeipa/tasks/addhost.yml b/ansible/roles/freeipa/tasks/addhost.yml index 8020f803f..f01cba09a 100644 --- a/ansible/roles/freeipa/tasks/addhost.yml +++ b/ansible/roles/freeipa/tasks/addhost.yml @@ -1,3 +1,4 @@ +--- - name: Get ipa host information # This uses DNS to find the ipa server, which works as this is running on the enrolled ipa server # It doesn't fail even if the host doesn't exist @@ -10,7 +11,7 @@ validate_certs: false delegate_to: "{{ groups['freeipa_server'].0 }}" register: _ipa_host_check - check_mode: yes + check_mode: true changed_when: false - name: Add host to IPA @@ -29,6 +30,6 @@ register: _ipa_host_add - name: Set fact for ipa host password - set_fact: + ansible.builtin.set_fact: freeipa_host_password: "{{ _ipa_host_add.host.randompassword }}" - when: _ipa_host_add.changed + when: _ipa_host_add.changed # noqa: no-handler diff --git a/ansible/roles/freeipa/tasks/backup-keytabs.yml b/ansible/roles/freeipa/tasks/backup-keytabs.yml index 7fc77f9e1..1de3f7fd2 100644 --- a/ansible/roles/freeipa/tasks/backup-keytabs.yml +++ b/ansible/roles/freeipa/tasks/backup-keytabs.yml @@ -1,5 +1,6 @@ +--- - name: Retrieve keytabs to localhost - fetch: + ansible.builtin.fetch: src: "{{ _freeipa_keytab_backup_path }}" dest: "{{ appliances_environment_root }}/keytabs/{{ inventory_hostname }}/" flat: true @@ -7,8 +8,9 @@ tags: retrieve - name: Copy keytabs back to control node - copy: + ansible.builtin.copy: src: "{{ appliances_environment_root }}/keytabs/{{ inventory_hostname }}/" dest: "{{ _freeipa_keytab_backup_path | dirname }}" + mode: "0644" delegate_to: "{{ groups['control'].0 }}" tags: deploy diff --git a/ansible/roles/freeipa/tasks/client-install.yml b/ansible/roles/freeipa/tasks/client-install.yml index a164cd26e..82f7901d5 100644 --- 
a/ansible/roles/freeipa/tasks/client-install.yml +++ b/ansible/roles/freeipa/tasks/client-install.yml @@ -1,4 +1,4 @@ - +--- - name: Install FreeIPA client package - dnf: + ansible.builtin.dnf: name: ipa-client diff --git a/ansible/roles/freeipa/tasks/enrol.yml b/ansible/roles/freeipa/tasks/enrol.yml index 07436509b..96152430f 100644 --- a/ansible/roles/freeipa/tasks/enrol.yml +++ b/ansible/roles/freeipa/tasks/enrol.yml @@ -1,14 +1,16 @@ +--- +# yamllint disable-line rule:line-length # based on https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/8/html/installing_identity_management/assembly_installing-an-idm-client_installing-identity-management - name: Retrieve persisted keytab from previous enrolement - slurp: + ansible.builtin.slurp: src: "{{ _freeipa_keytab_backup_path }}" delegate_to: "{{ groups['control'] | first }}" register: _slurp_persisted_keytab failed_when: false - name: Write persisted keytab from previous enrolment - copy: + ansible.builtin.copy: content: "{{ _slurp_persisted_keytab.content | b64decode }}" dest: /tmp/krb5.keytab owner: root @@ -24,7 +26,7 @@ # 3. New SSH keys are generated # 4. ipaUniqueID is preserved # and ALSO that the keytab is changed! - command: + ansible.builtin.command: cmd: > ipa-client-install --unattended @@ -40,7 +42,7 @@ - name: Enrol with FreeIPA using random password # Note --password is overloaded - it's bulkpassword unless --principal or --force-join is used in which case it's admin password - command: + ansible.builtin.command: cmd: > ipa-client-install --unattended @@ -60,19 +62,19 @@ # This service is installed by nfs-utils, which attempts to start it. # It has ConditionPathExists=/etc/krb5.keytab which fails if host is not enroled. # This task avoids a reboot. - systemd: + ansible.builtin.systemd: name: rpc-gssd.service state: started enabled: true - name: Retrieve current keytab - slurp: + ansible.builtin.slurp: src: /etc/krb5.keytab register: _slurp_current_keytab failed_when: false - name: Ensure keytab backup directory exists - file: + ansible.builtin.file: path: "{{ _freeipa_keytab_backup_path | dirname }}" state: directory owner: root @@ -81,7 +83,8 @@ delegate_to: "{{ groups['control'] | first }}" - name: Persist keytab - copy: + ansible.builtin.copy: content: "{{ _slurp_current_keytab.content | b64decode }}" dest: "{{ _freeipa_keytab_backup_path }}" + mode: "0644" delegate_to: "{{ groups['control'] | first }}" diff --git a/ansible/roles/freeipa/tasks/server.yml b/ansible/roles/freeipa/tasks/server.yml index e555ebefd..b711998ae 100644 --- a/ansible/roles/freeipa/tasks/server.yml +++ b/ansible/roles/freeipa/tasks/server.yml @@ -1,20 +1,22 @@ +--- +# yamllint disable-line rule:line-length # Based on https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/8/html/installing_identity_management/preparing-the-system-for-ipa-server-installation_installing-identity-management#host-name-and-dns-requirements-for-ipa_preparing-the-system-for-ipa-server-installation - name: Install freeipa server packages - dnf: - name: '@idm:DL1/dns' + ansible.builtin.dnf: + name: "@idm:DL1/dns" state: present - name: Install ipa server -# TODO: make no-ui-redirect and dns configurable?? -# TODO: set file mask as per docs? Would be hard to cope with failures. Doesn't appear to be necessary actually. - command: + # TODO: make no-ui-redirect and dns configurable?? + # TODO: set file mask as per docs? Would be hard to cope with failures. Doesn't appear to be necessary actually. 
+ ansible.builtin.command: cmd: > ipa-server-install --realm {{ freeipa_realm | quote }} --domain {{ freeipa_domain | lower | quote }} --ds-password {{ freeipa_ds_password | quote }} - --admin-password {{ freeipa_admin_password | quote }} + --admin-password {{ freeipa_admin_password | quote }} --ip-address={{ freeipa_server_ip }} {% if freeipa_setup_dns | bool %}--setup-dns{% endif %} --auto-reverse @@ -32,26 +34,26 @@ - name: Disable redirects to hard-coded domain # see https://pagure.io/freeipa/issue/7479 - replace: + ansible.builtin.replace: path: /etc/httpd/conf.d/ipa-rewrite.conf - regexp: '{{ item.regexp }}' - replace: '{{ item.replace }}' + regexp: "{{ item.regexp }}" + replace: "{{ item.replace }}" loop: # RewriteRule ^/$ https://${FQDN}/ipa/ui [L,NC,R=301] - irrelevant if using --no-ui-redirect - - regexp: '^(RewriteRule \^/\$) (https://.*)(/ipa/ui.*)$' - replace: '\1 \3' + - regexp: "^(RewriteRule \\^/\\$) (https://.*)(/ipa/ui.*)$" + replace: "\\1 \\3" # RewriteRule ^/ipa/(.*) - occurs twice - - regexp: '^(RewriteRule \^\/ipa\/\(.*)$' - replace: '#\1' - - regexp: '^(RewriteCond .*)$' - replace: '#\1' + - regexp: "^(RewriteRule \\^\\/ipa\\/\\(.*)$" + replace: "#\\1" + - regexp: "^(RewriteCond .*)$" + replace: "#\\1" # RewriteRule ^/(.*) https://${FQDN}/$1 [L,R=301] - - regexp: '^(RewriteRule \^/\(\.\*\).*)$' - replace: '#\1' + - regexp: "^(RewriteRule \\^/\\(\\.\\*\\).*)$" + replace: "#\\1" register: _replace_freeipa_rewrites - name: Get freeipa server facts - setup: + ansible.builtin.setup: - name: Fix HTTP_REFERER ansible.builtin.lineinfile: @@ -60,7 +62,7 @@ register: _http_referer - name: Reload apache configuration - service: + ansible.builtin.service: name: httpd state: reloaded when: _replace_freeipa_rewrites.changed or _http_referer.changed diff --git a/ansible/roles/freeipa/tasks/users.yml b/ansible/roles/freeipa/tasks/users.yml index bd1cacad3..97068fa06 100644 --- a/ansible/roles/freeipa/tasks/users.yml +++ b/ansible/roles/freeipa/tasks/users.yml @@ -4,12 +4,12 @@ displayname: "{{ item.displayname | default(omit) }}" gidnumber: "{{ item.gidnumber | default(omit) }}" givenname: "{{ item.givenname }}" - #ipa_host + # ipa_host ipa_pass: "{{ freeipa_admin_password | quote }}" - #ipa_port - #ipa_prot + # ipa_port + # ipa_prot ipa_timeout: "{{ item.ipa_timeout | default(omit) }}" - #ipa_user + # ipa_user krbpasswordexpiration: "{{ item.krbpasswordexpiration | default(omit) }}" loginshell: "{{ item.loginshell | default(omit) }}" mail: "{{ item.mail | default(omit) }}" @@ -23,5 +23,5 @@ uidnumber: "{{ item.uidnumber | default(omit) }}" update_password: "{{ item.update_password | default(omit) }}" userauthtype: "{{ item.userauthtype | default(omit) }}" - #validate_certs + # validate_certs loop: "{{ freeipa_users }}" diff --git a/ansible/roles/freeipa/tasks/validate.yml b/ansible/roles/freeipa/tasks/validate.yml index 238f89e60..39faba3ff 100644 --- a/ansible/roles/freeipa/tasks/validate.yml +++ b/ansible/roles/freeipa/tasks/validate.yml @@ -1,12 +1,13 @@ +--- - name: Get hostname as reported by command - command: hostname + ansible.builtin.command: hostname register: _freeipa_validate_hostname changed_when: false when: "'freeipa_server' in group_names" - name: Ensure hostname is fully-qualified # see section 2.7 of redhat guide to installing identity management - assert: + ansible.builtin.assert: that: _freeipa_validate_hostname.stdout | split('.') | length >= 3 fail_msg: "freeipa_server hostname '{{ _freeipa_validate_hostname.stdout }}' is not fully-qualified (a.b.c)" when: 
"'freeipa_server' in group_names" @@ -14,23 +15,23 @@ - name: Check for virtual servers in httpd configuration of freeipa_server # e.g. fatimage with OOD config; community.general.ipa_host fails with "401 Unauthorized: No session cookie found" # https://lists.fedoraproject.org/archives/list/freeipa-users@lists.fedorahosted.org/message/7RH7XDFR35KDPYJ7AQCQI2H2EOWIZCWA/ - find: + ansible.builtin.find: path: /etc/httpd/conf.d/ - contains: '- {{ @@ -25,24 +26,24 @@ # batch takes default '' because last devices doesn't have trailing blank line - name: Examine whether device address contains gateway_ip - set_fact: + ansible.builtin.set_fact: device_is_gateway_device: "{{ nmcli_devices | map(attribute='ip4_address') | map('ansible.utils.network_in_network', gateway_ip) }}" # list of bools - false if gateway_ip == '' - name: Get name of connection containing gateway_ip # might be empty string - set_fact: + ansible.builtin.set_fact: gateway_ip_connection: >- {{ nmcli_devices | map(attribute='connection') | - zip(device_is_gateway_device) | selectattr('1') | + zip(device_is_gateway_device) | selectattr('1') | map(attribute=0) | list | first | default ('') }} - name: Show debug info - debug: + ansible.builtin.debug: msg: "gateway_ip={{ gateway_ip }} access_ip={{ access_ip }} gateway_ip_connection={{ gateway_ip_connection }}" - name: Error if device has a gateway which is not the desired one - assert: + ansible.builtin.assert: that: item.gateway == gateway_ip fail_msg: "Device {{ item | to_nice_json }} has gateway: cannot apply gateway {{ gateway_ip }}" when: @@ -51,8 +52,8 @@ - item.ip4_gateway != gateway_ip loop: "{{ nmcli_devices }}" - - name: Remove undesired gateways - shell: | + - name: Remove undesired gateways # noqa: no-changed-when + ansible.builtin.shell: | nmcli connection modify '{{ item.connection }}' \ ipv4.never-default yes \ ipv6.never-default yes @@ -62,9 +63,9 @@ - item.ip4_gateway != '' - item.connection != gateway_ip_connection loop: "{{ nmcli_devices }}" - - - name: Add desired gateways - shell: | + + - name: Add desired gateways # noqa: no-changed-when + ansible.builtin.shell: | nmcli connection modify '{{ item.connection }}' \ ipv4.address {{ item.ip4_address }} \ ipv4.gateway {{ gateway_ip }} diff --git a/ansible/roles/gateway/tasks/main.yml b/ansible/roles/gateway/tasks/main.yml index c13ba5ce9..82b481a09 100644 --- a/ansible/roles/gateway/tasks/main.yml +++ b/ansible/roles/gateway/tasks/main.yml @@ -1,7 +1,8 @@ +--- - name: Add gateway playbook - copy: + ansible.builtin.copy: src: gateway-init.yml dest: /etc/ansible-init/playbooks/05-gateway-init.yml owner: root group: root - mode: 0644 + mode: "0644" diff --git a/ansible/roles/grafana-dashboards/files/openhpc-slurm.json b/ansible/roles/grafana-dashboards/files/openhpc-slurm.json index fb4078c5e..4cc5a4676 100644 --- a/ansible/roles/grafana-dashboards/files/openhpc-slurm.json +++ b/ansible/roles/grafana-dashboards/files/openhpc-slurm.json @@ -2072,4 +2072,4 @@ "title": "OpenHPC Slurm", "uid": "openhpc-slurm", "version": 2 -} \ No newline at end of file +} diff --git a/ansible/roles/grafana-dashboards/tasks/main.yml b/ansible/roles/grafana-dashboards/tasks/main.yml index 235088f77..2292dac3b 100644 --- a/ansible/roles/grafana-dashboards/tasks/main.yml +++ b/ansible/roles/grafana-dashboards/tasks/main.yml @@ -25,7 +25,7 @@ - become: false block: - name: Create local grafana dashboard directory - tempfile: + ansible.builtin.tempfile: state: directory register: _tmp_dashboards changed_when: false @@ -52,10 +52,11 @@ tags: - 
skip_ansible_lint - - name: copy in-role grafana dashboards + - name: Copy in-role grafana dashboards ansible.builtin.copy: src: "{{ item.dashboard_file }}" dest: "{{ _tmp_dashboards.path }}" + mode: "0644" loop: "{{ grafana_dashboards }}" when: - grafana_dashboards | length > 0 @@ -109,7 +110,7 @@ - name: Create/Update dashboards file (provisioning) become: true - copy: + ansible.builtin.copy: dest: "/etc/grafana/provisioning/dashboards/ansible.yml" content: | apiVersion: 1 @@ -123,12 +124,12 @@ backup: false owner: root group: grafana - mode: 0640 + mode: "0640" notify: restart grafana - name: Register preexisting dashboards become: true - find: + ansible.builtin.find: paths: "{{ grafana_data_dir }}/dashboards" hidden: true patterns: @@ -137,15 +138,17 @@ - name: Import grafana dashboards become: true - copy: - remote_src: yes + ansible.builtin.copy: + remote_src: true src: "{{ _tmp_dashboards.path }}/" # Note trailing / to only copy contents, not directory itself dest: "{{ grafana_data_dir }}/dashboards/" + directory_mode: "0755" + mode: "0644" notify: "provisioned dashboards changed" - name: Register all installed dashboards become: true - find: + ansible.builtin.find: paths: "{{ grafana_data_dir }}/dashboards" hidden: true patterns: @@ -153,13 +156,13 @@ register: _dashboards_post - name: Get dashboard lists - set_fact: - _dashboards_pre_list: "{{ _dashboards_pre | json_query('files[*].path') | default([]) }}" + ansible.builtin.set_fact: + _dashboards_pre_list: "{{ _dashboards_pre | json_query('files[*].path') | default([]) }}" _dashboards_post_list: "{{ _dashboards_post | json_query('files[*].path') | default([]) }}" - name: Remove installed dashboards not defined through this role become: true - file: + ansible.builtin.file: path: "{{ item }}" state: absent with_items: "{{ _dashboards_pre_list | difference( _dashboards_post_list ) }}" diff --git a/ansible/roles/hpctests/README.md b/ansible/roles/hpctests/README.md index 2cb9b7663..ed3d64efe 100644 --- a/ansible/roles/hpctests/README.md +++ b/ansible/roles/hpctests/README.md @@ -1,53 +1,55 @@ -hpctests -========= +# hpctests An MPI-based test suite for Slurm appliance clusters. -This is intended as a replacement for [this test role](https://github.com/stackhpc/ansible_collection_slurm_openstack_tools/tree/main/roles/test/) but will be safe to run on clusters in production use as it does not use NFS exports for package installs. Instead it assumes the required packages are pre-installed, which is the case by default with this appliance. +This is intended as a replacement for [this test role](https://github.com/stackhpc/ansible_collection_slurm_openstack_tools/tree/main/roles/test/) but will be safe to run on clusters in production use as it does not use NFS exports for package installs. Instead it assumes the required packages are pre-installed, which is the case by default with this appliance. Tests (with corresponding tags) are: + - `pingpong`: Runs Intel MPI Benchmark's IMB-MPI1 pingpong between a pair of (scheduler-selected) nodes. Reports zero-size message latency and maximum bandwidth. - `pingmatrix`: Runs a similar pingpong test but between all pairs of nodes. Reports zero-size message latency & maximum bandwidth. - `hpl-solo`: Runs the HPL benchmark individually on all nodes. Reports Gflops. All tests use GCC 9 and OpenMPI 4 with UCX. The HPL-based tests use OpenBLAS. -Requirements ------------- +## Requirements - An OpenHPC v2.x cluster. 
- The following OpenHPC packages installed (note this is the default in the appliance, see `environments/common/inventory/group_vars/all/openhpc.yml:openhpc_default_packages`): - `ohpc-gnu9-openmpi4-perf-tools` - `openblas-gnu9-ohpc` -Role Variables --------------- +## Role Variables + - `hpctests_user`: Optional. User to run jobs as. Default is `ansible_user`. - `hpctests_rootdir`: Optional. Path to root of test directory tree. This must be a r/w filesystem shared to all cluster nodes under test. Default is `/home/{{ hpctests_user }}/hpctests`. **NB:** Do not use `~` in this path. - `hpctests_partition`: Optional. Name of partition to use, otherwise default partition is used. - `hpctests_nodes`: Optional. A Slurm node expression, e.g. `'compute-[0-15,19]'` defining the nodes to use. If not set all nodes in the selected partition are used. -- `hpctests_ucx_net_devices`: Optional. Control which network device/interface to use, e.g. `mlx5_1:0`. The default of `all` (as per UCX) may not be appropriate for multi-rail nodes with different bandwidths on each device. See [here](https://openucx.readthedocs.io/en/master/faq.html#what-is-the-default-behavior-in-a-multi-rail-environment) and [here](https://github.com/openucx/ucx/wiki/UCX-environment-parameters#setting-the-devices-to-use). Alternatively a mapping of partition name (as `hpctests_partition`) to device/interface can be used. For partitions not defined in the mapping the default of `all` is used. +- `hpctests_ucx_net_devices`: Optional. Control which network device/interface to use, e.g. `mlx5_1:0`. + The default of `all` (as per UCX) may not be appropriate for multi-rail nodes with different bandwidths on each device. See [here](https://openucx.readthedocs.io/en/master/faq.html#what-is-the-default-behavior-in-a-multi-rail-environment) and [here](https://github.com/openucx/ucx/wiki/UCX-environment-parameters#setting-the-devices-to-use). + Alternatively a mapping of partition name (as `hpctests_partition`) to device/interface can be used. For partitions not defined in the mapping the default of `all` is used. - `hpctests_outdir`: Optional. Directory to use for test output on local host. Defaults to `$HOME/hpctests` (for local user). - `hpctests_hpl_NB`: Optional, default 192. The HPL block size "NB" - for Intel CPUs see [here](https://software.intel.com/content/www/us/en/develop/documentation/onemkl-linux-developer-guide/top/intel-oneapi-math-kernel-library-benchmarks/intel-distribution-for-linpack-benchmark/configuring-parameters.html). - `hpctests_hpl_mem_frac`: Optional, default 0.3. The HPL problem size "N" will - be selected to target using this fraction of each node's memory - - **CAUTION: see note below**. + be selected to target using this fraction of each node's memory - + **CAUTION: see note below**. - `hpctests_hpl_arch`: Optional, default 'linux64'. Arbitrary architecture name for HPL build. HPL is compiled on the first compute node of those selected (see `hpctests_nodes`), so this can be used to create different builds for different types of compute node. - --- + **CAUTION** > The default of `hpctests_hpl_mem_frac=0.3` will not significantly load nodes. -Values up to ~0.8 may be appropriate for a stress test but ensure cloud -operators are aware in case this overloads e.g. power supplies or cooling. -Values > 0.8 require longer runtimes and increase the risk of out-of-memory -errors without normally significantly increasing the stress on the node. 
---- +> Values up to ~0.8 may be appropriate for a stress test but ensure cloud +> operators are aware in case this overloads e.g. power supplies or cooling. +> Values > 0.8 require longer runtimes and increase the risk of out-of-memory + +## errors without normally significantly increasing the stress on the node The following variables should not generally be changed: + - `hpctests_pre_cmd`: Optional. Command(s) to include in sbatch templates before module load commands. - `hpctests_pingmatrix_modules`: Optional. List of modules to load for pingmatrix test. Defaults are suitable for OpenHPC 2.x cluster using the required packages. - `hpctests_pingpong_modules`: As above but for pingpong test. @@ -55,13 +57,11 @@ The following variables should not generally be changed: - `hpctests_hpl_modules`: As above but for hpl tests. - `hpctests_hpl_version`: Version of HPL -Dependencies ------------- +## Dependencies None. -Example Playbook ----------------- +## Example Playbook The role should be run on a login node; @@ -76,12 +76,10 @@ The role should be run on a login node; name: hpctests ``` -License -------- +## License Apache v2 -Author Information ------------------- +## Author Information stackhpc.com diff --git a/ansible/roles/hpctests/defaults/main.yml b/ansible/roles/hpctests/defaults/main.yml index e514de5c7..fa1c3c27c 100644 --- a/ansible/roles/hpctests/defaults/main.yml +++ b/ansible/roles/hpctests/defaults/main.yml @@ -2,16 +2,24 @@ hpctests_user: "{{ ansible_user }}" hpctests_group: "{{ hpctests_user }}" hpctests_rootdir: "/home/{{ hpctests_user }}/hpctests" -hpctests_pre_cmd: '' -hpctests_pingmatrix_modules: [gnu12 openmpi4] -hpctests_pingpong_modules: [gnu12 openmpi4 imb] -hpctests_pingpong_plot: yes -hpctests_hpl_modules: [gnu12 openmpi4 openblas] +hpctests_pre_cmd: "" +hpctests_pingmatrix_modules: + - gnu12 + - openmpi4 +hpctests_pingpong_modules: + - gnu12 + - openmpi4 + - imb +hpctests_pingpong_plot: true +hpctests_hpl_modules: + - gnu12 + - openmpi4 + - openblas hpctests_outdir: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/hpctests" hpctests_ucx_net_devices: all hpctests_hpl_version: "2.3" hpctests_hpl_NB: 192 hpctests_hpl_mem_frac: 0.3 hpctests_hpl_arch: linux64 -#hpctests_nodes: -#hpctests_partition: +# hpctests_nodes: +# hpctests_partition: diff --git a/ansible/roles/hpctests/files/.clang-format-ignore b/ansible/roles/hpctests/files/.clang-format-ignore new file mode 100644 index 000000000..72e8ffc0d --- /dev/null +++ b/ansible/roles/hpctests/files/.clang-format-ignore @@ -0,0 +1 @@ +* diff --git a/ansible/roles/hpctests/files/CPPLINT.cfg b/ansible/roles/hpctests/files/CPPLINT.cfg new file mode 100644 index 000000000..88e41cded --- /dev/null +++ b/ansible/roles/hpctests/files/CPPLINT.cfg @@ -0,0 +1 @@ +exclude_files=.*.c diff --git a/ansible/roles/hpctests/files/plot_imb_pingpong.py b/ansible/roles/hpctests/files/plot_imb_pingpong.py index dbf6398e9..eb15c4ffa 100644 --- a/ansible/roles/hpctests/files/plot_imb_pingpong.py +++ b/ansible/roles/hpctests/files/plot_imb_pingpong.py @@ -1,55 +1,76 @@ -import matplotlib as mpl -import matplotlib.pyplot as plt -from matplotlib import ticker -import numpy as np +# pylint: disable=missing-module-docstring import os -def sizeof_fmt(num, suffix='B'): - """ TODO: """ +import matplotlib.pyplot as plt # pylint: disable=import-error +from matplotlib import ticker # pylint: disable=import-error + + +def sizeof_fmt(num, suffix="B"): + """TODO:""" # from https://stackoverflow.com/a/1094933/916373 - for unit in 
['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']: + for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]: if abs(num) < 1024.0: - return "%3.1f%s%s" % (num, unit, suffix) + # pylint: disable-next=consider-using-f-string + return "%3.1f%s%s" % ( + num, + unit, + suffix, + ) num /= 1024.0 - return "%.1f%s%s" % (num, 'Yi', suffix) + return "%.1f%s%s" % (num, "Yi", suffix) # pylint: disable=consider-using-f-string + def read_imb_out(path): - """ Read stdout from an IMB-MPI1 run. - - Returns a dict with: - key:= int, total number of processes involved - value:= pandas dataframe, i.e. one per results table. Columns as per table. - - If multiple results tables are present it is assumed that they are all the same benchmark, - and only differ in the number of processes. + """Read stdout from an IMB-MPI1 run. + + Returns a dict with: + key:= int, total number of processes involved + value:= pandas dataframe, i.e. one per results table. Columns as per table. + + If multiple results tables are present it is assumed that they are all the same benchmark, + and only differ in the number of processes. """ data = {} - COLTYPES = { # all benchmark names here should be lowercase - 'uniband': (int, int, float, int), # #bytes #repetitions Mbytes/sec Msg/sec - 'biband': (int, int, float, int), - 'pingpong':(int, int, float, float), # #bytes #repetitions t[usec] Mbytes/sec - 'alltoall':(int, int, float, float, float) # #bytes #repetitions t_min[usec] t_max[usec] t_avg[usec] + COLTYPES = { # all benchmark names here should be lowercase # pylint: disable=invalid-name + # #bytes #repetitions Mbytes/sec Msg/sec + "uniband": (int, int, float, int), + "biband": (int, int, float, int), + # #bytes #repetitions t[usec] Mbytes/sec + "pingpong": (int, int, float, float), + "alltoall": ( + int, + int, + float, + float, + float, + ), # #bytes #repetitions t_min[usec] t_max[usec] t_avg[usec] } - with open(path) as f: + with open(path) as f: # pylint: disable=unspecified-encoding for line in f: - if line.startswith('# Benchmarking '): + if line.startswith("# Benchmarking "): benchmark = line.split()[-1].lower() if benchmark not in COLTYPES: - raise ValueError('Do not know how to read %r benchmark in %s' % (benchmark, path)) + raise ValueError( + "Do not know how to read %r benchmark in %s" # pylint: disable=consider-using-f-string + % (benchmark, path) + ) converters = COLTYPES[benchmark] line = next(f) - if not line.startswith('# #processes = '): - raise ValueError('expected %s, got %s' % (expect, nprocs_line)) - n_procs = int(line.split('=')[-1].strip()) - while line.startswith('#'): - line = next(f) # may or may not include line "# .. additional processes waiting in MPI_Barrier", plus other # lines + expected = "# #processes = " + if not line.startswith(expected): + raise ValueError(f"expected {expected}, got {line}") + n_procs = int(line.split("=")[-1].strip()) + while line.startswith("#"): + # may or may not include line "# .. 
additional processes + # waiting in MPI_Barrier", plus other # lines + line = next(f) rows = [] while True: line = next(f).strip() - if line == '': + if line == "": break rows.append([f(v) for (f, v) in zip(converters, line.split())]) # turn data around: @@ -60,26 +81,30 @@ def read_imb_out(path): data[n_procs] = cols return data -if __name__ == '__main__': + +if __name__ == "__main__": import sys + d = read_imb_out(sys.argv[1]) if len(d) > 1: - raise ValueError('Found > 1 benchmark in', sys.argv[1]) + raise ValueError("Found > 1 benchmark in", sys.argv[1]) outdir = os.path.dirname(sys.argv[1]) for n, df in d.items(): fig, ax1 = plt.subplots() ax2 = ax1.twinx() - ax1.plot(df[0], df[2], label='latency', color='b') - ax2.plot(df[0], df[3], label='bandwidth', color='r') - ax1.set_xscale('log', base=2) - ax1.set_yscale('log', base=10) - ax1.xaxis.set_major_formatter(ticker.FuncFormatter(lambda x, pos: sizeof_fmt(x))) + ax1.plot(df[0], df[2], label="latency", color="b") + ax2.plot(df[0], df[3], label="bandwidth", color="r") + ax1.set_xscale("log", base=2) + ax1.set_yscale("log", base=10) + ax1.xaxis.set_major_formatter( + ticker.FuncFormatter(lambda x, pos: sizeof_fmt(x)) + ) ax1.grid(True, which="both") - ax1.set_xlabel('#bytes') - ax1.set_ylabel('latency ($\mu$s)', color='b') - ax2.set_ylabel('bandwidth (Mbytes/sec)', color='r') - fig.legend(loc='upper left') + ax1.set_xlabel("#bytes") + ax1.set_ylabel("latency ($\\mu$s)", color="b") + ax2.set_ylabel("bandwidth (Mbytes/sec)", color="r") + fig.legend(loc="upper left") plt.tight_layout() - figpath = os.path.join(outdir, 'pingpong.png') + figpath = os.path.join(outdir, "pingpong.png") plt.savefig(figpath) print(figpath) diff --git a/ansible/roles/hpctests/library/hpl_pq.py b/ansible/roles/hpctests/library/hpl_pq.py index 96eff80de..53262fadf 100644 --- a/ansible/roles/hpctests/library/hpl_pq.py +++ b/ansible/roles/hpctests/library/hpl_pq.py @@ -1,11 +1,11 @@ -#!/usr/bin/python +#!/usr/bin/python # pylint: disable=missing-module-docstring # -*- coding: utf-8 -*- # Copyright: (c) 2020, StackHPC # Apache 2 License -from ansible.module_utils.basic import AnsibleModule -import json + +from ansible.module_utils.basic import AnsibleModule # pylint: disable=import-error ANSIBLE_METADATA = { "metadata_version": "0.1", @@ -18,8 +18,9 @@ module: hpl_pq short_description: Calculate P and Q values for HPL. version_added: "0.0" -description: - - "Takes number of processes and returns a dict with keys 'P' and 'Q' giving appropriate values, i.e. with Q equal or slightly larger than P and P * Q == num_processes." +description: > + Takes number of processes and returns a dict with keys 'P' and 'Q' giving appropriate values, + i.e. with Q equal or slightly larger than P and P * Q == num_processes. options: num_processes: description: @@ -36,33 +37,39 @@ TODO """ + def factors(n): - """ Return a sequence of (a, b) tuples where a < b giving factors of n. - - Based on https://stackoverflow.com/a/6909532/916373 + """Return a sequence of (a, b) tuples where a < b giving factors of n. 
+ + Based on https://stackoverflow.com/a/6909532/916373 """ - return [(i, n//i) for i in range(1, int(n**0.5) + 1) if n % i == 0] + return [(i, n // i) for i in range(1, int(n**0.5) + 1) if n % i == 0] -def run_module(): - module_args = dict( - num_processes=dict(type="int", required=True), - ) + +def run_module(): # pylint: disable=missing-function-docstring + module_args = { + "num_processes": { + "type": "int", + "required": True, + }, + } module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) result = {"changed": False} if module.check_mode: module.exit_json(**result) - + num_processes = module.params["num_processes"] f = factors(num_processes) - p, q = f[-1] # nearest to square + p, q = f[-1] # nearest to square - result['grid'] = {'P':p, 'Q': q} + result["grid"] = {"P": p, "Q": q} module.exit_json(**result) -def main(): +def main(): # pylint: disable=missing-function-docstring run_module() + if __name__ == "__main__": main() diff --git a/ansible/roles/hpctests/library/plot_nxnlatbw.py b/ansible/roles/hpctests/library/plot_nxnlatbw.py index ade7d3ddf..ecc4c36de 100644 --- a/ansible/roles/hpctests/library/plot_nxnlatbw.py +++ b/ansible/roles/hpctests/library/plot_nxnlatbw.py @@ -1,11 +1,12 @@ -#!/usr/bin/python +#!/usr/bin/python # pylint: disable=missing-module-docstring # -*- coding: utf-8 -*- # Copyright: (c) 2020, StackHPC # Apache 2 License -from ansible.module_utils.basic import AnsibleModule -import json, os +import os + +from ansible.module_utils.basic import AnsibleModule # pylint: disable=import-error ANSIBLE_METADATA = { "metadata_version": "0.1", @@ -18,8 +19,10 @@ module: plot_nxnlatbw short_description: Read nxnlatbw output, report statistics and tabulate latencies version_added: "0.0" -description: - - "Reads output from running the nxnlatbw ping matrix. Return value includes a 'stats' key with min/max latency and bandwidth values. Generates an html table of pairwise latencies, coloured by value." +description: > + Reads output from running the nxnlatbw ping matrix. + Return value includes a 'stats' key with min/max latency and bandwidth values. + Generates an html table of pairwise latencies, coloured by value. options: src: description: @@ -32,8 +35,9 @@ required: true type: str nodes: - description: - - Comma-separated list of nodenames to label RANKS with - NB this should be provided in the same order as ranks + description: > + Comma-separated list of nodenames to label RANKS with - + NB this should be provided in the same order as ranks requirements: - "python >= 3.6" author: @@ -64,115 +68,175 @@ """ -def html_rows(rankAs, rankBs, nodes, data): - """ Create an HTML-format fragment defining table rows. - Args: - rankAs, rankBs: lists of ranks - nodes: list of nodenames in rank order - data: dict with keys (rankA, rankB) +def html_rows( + rankAs, rankBs, nodes, data +): # pylint: disable=invalid-name # pylint: disable=invalid-name + """Create an HTML-format fragment defining table rows. - Returns a string. + Args: + rankAs, rankBs: lists of ranks + nodes: list of nodenames in rank order + data: dict with keys (rankA, rankB) + + Returns a string. 
""" - + minv = min(data.values()) maxv = max(data.values()) rows = [] - for rankA in rankAs: # row + for rankA in rankAs: # row # pylint: disable=invalid-name if nodes: - outrow = ['%s [%s]' % (nodes[rankA], rankA)] + outrow = [ + # pylint: disable-next=consider-using-f-string + "%s [%s]" + % (nodes[rankA], rankA) + ] else: - outrow = ['%s' % rankA] - for rankB in rankBs: + outrow = [ + # pylint: disable-next=consider-using-f-string + "%s" + % rankA + ] + for rankB in rankBs: # pylint: disable=invalid-name val = data.get((rankA, rankB)) if val is not None: try: - lightness = 50 + (50 - 50 * ((val - minv) / (maxv - minv))) # want value in range LOW = 100 (white) -> HIGH 50(red) - except ZeroDivisionError: # no min-max spread + lightness = 50 + ( + 50 - 50 * ((val - minv) / (maxv - minv)) + ) # want value in range LOW = 100 (white) -> HIGH 50(red) + except ZeroDivisionError: # no min-max spread lightness = 100 - outrow += ['%.1f' % (lightness, val)] + outrow += [ + # pylint: disable-next=consider-using-f-string + '%.1f' + % (lightness, val) + ] else: - outrow += ['-'] - outrow += [''] - rows.append(' '.join(outrow)) - return '\n'.join(rows) - - -def run_module(): - module_args = dict( - src=dict(type="str", required=True), - dest=dict(type="str", required=True), - nodes=dict(type="str", required=False, default=None) - ) + outrow += ["-"] + outrow += [""] + rows.append(" ".join(outrow)) + return "\n".join(rows) + + +def run_module(): # pylint: disable=missing-function-docstring, too-many-locals + module_args = { + "src": { + "type": "str", + "required": True, + }, + "dest": { + "type": "str", + "required": True, + }, + "nodes": { + "type": "str", + "required": False, + "default": None, + }, + } module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) result = {"changed": False} - + src = os.path.expanduser(module.params["src"]) dest = os.path.expanduser(module.params["dest"]) nodes = module.params["nodes"] if nodes is not None: - nodes = nodes.split(',') - + nodes = nodes.split(",") + if module.check_mode: module.exit_json(**result) - # read latencies/bandwidths: + # read latencies/bandwidths: latencies = {} bandwidths = {} - with open(src) as nxn_f: + with open(src) as nxn_f: # pylint: disable=unspecified-encoding for ln, line in enumerate(nxn_f): - vals = line.split(',') - if vals[0] == 'src': + vals = line.split(",") + if vals[0] == "src": continue if len(vals) != 4: - print('warning: skipping line %i (%i values)' % (ln, len(vals))) + print( + # pylint: disable-next=consider-using-f-string + "warning: skipping line %i (%i values)" + % (ln, len(vals)) + ) continue - rankA, rankB, lat, bw = int(vals[0]), int(vals[1]), float(vals[2]), float(vals[3]) + # pylint: disable=invalid-name + ( + rankA, + rankB, + lat, + bw, + ) = ( + int(vals[0]), + int(vals[1]), + float(vals[2]), + float(vals[3]), + ) latencies[rankA, rankB] = lat bandwidths[rankA, rankB] = bw - + # pylint: enable=invalid-name + # get list of node IDs: - rankAs = sorted(set(k[0] for k in latencies)) - rankBs = sorted(set(k[1] for k in latencies)) + rankAs = sorted(set(k[0] for k in latencies)) # pylint: disable=invalid-name + rankBs = sorted(set(k[1] for k in latencies)) # pylint: disable=invalid-name if rankAs != rankBs: module.fail_json("Ranks extracted from result columns differed", **result) if nodes and len(nodes) != len(rankAs): - module.fail_json("Results contained %i ranks but %i node names provided" % (len(rankAs), len(nodes)), **result) + module.fail_json( + "Results contained %i ranks but %i node names 
provided" # pylint: disable=consider-using-f-string + % (len(rankAs), len(nodes)), + **result + ) # find min values: min_lat = min(latencies.values()) max_lat = max(latencies.values()) min_bw = min(bandwidths.values()) max_bw = max(bandwidths.values()) - + # create HTML fragments: - ranks = ' '.join('%s' % rankB for rankB in rankBs) + ranks = " ".join( + # pylint: disable-next=consider-using-f-string + "%s" % rankB + for rankB in rankBs + ) lat_rows = html_rows(rankAs, rankBs, nodes, latencies) bw_rows = html_rows(rankAs, rankBs, nodes, bandwidths) - page = HTML_TEMPLATE.format(min_lat=min_lat, max_lat=max_lat, min_bw=min_bw, max_bw=max_bw, ranks=ranks, lat_rows=lat_rows, bw_rows=bw_rows) + page = HTML_TEMPLATE.format( + min_lat=min_lat, + max_lat=max_lat, + min_bw=min_bw, + max_bw=max_bw, + ranks=ranks, + lat_rows=lat_rows, + bw_rows=bw_rows, + ) - with open(dest, 'w') as outf: + with open(dest, "w") as outf: # pylint: disable=unspecified-encoding outf.write(page) - result['changed'] = True - result['stats'] = { - 'min_latency (us)': min_lat, - 'max_latency (us)': max_lat, - 'min_bandwidth (MB/s)': min_bw, - 'max_bandwidth (MB/s)': max_bw, - 'min_bandwidth (Gbit/s)': min_bw / 125.0, - 'max_bandwidth (Gbit/s)': max_bw / 125.0, + result["changed"] = True + result["stats"] = { + "min_latency (us)": min_lat, + "max_latency (us)": max_lat, + "min_bandwidth (MB/s)": min_bw, + "max_bandwidth (MB/s)": max_bw, + "min_bandwidth (Gbit/s)": min_bw / 125.0, + "max_bandwidth (Gbit/s)": max_bw / 125.0, } module.exit_json(**result) -def main(): +def main(): # pylint: disable=missing-function-docstring run_module() + if __name__ == "__main__": main() diff --git a/ansible/roles/hpctests/library/read_imb_pingpong.py b/ansible/roles/hpctests/library/read_imb_pingpong.py index fb52ef472..d1777e576 100644 --- a/ansible/roles/hpctests/library/read_imb_pingpong.py +++ b/ansible/roles/hpctests/library/read_imb_pingpong.py @@ -1,11 +1,11 @@ -#!/usr/bin/python +#!/usr/bin/python # pylint: disable=missing-module-docstring # -*- coding: utf-8 -*- # Copyright: (c) 2020, StackHPC # Apache 2 License -from ansible.module_utils.basic import AnsibleModule -import json + +from ansible.module_utils.basic import AnsibleModule # pylint: disable=import-error ANSIBLE_METADATA = { "metadata_version": "0.1", @@ -39,42 +39,47 @@ """ CONVERTERS = (int, int, float, float) -COLUMNS = ('bytes', 'repetitions', 'latency', 'bandwidth') +COLUMNS = ("bytes", "repetitions", "latency", "bandwidth") + -def run_module(): - module_args = dict( - path=dict(type="str", required=True), - ) +def run_module(): # pylint: disable=missing-function-docstring + module_args = { + "path": { + "type": "str", + "required": True, + }, + } module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) result = {"changed": False} - + path = module.params["path"] if module.check_mode: module.exit_json(**result) columns = ([], [], [], []) - with open(path) as f: + with open(path) as f: # pylint: disable=unspecified-encoding for line in f: - if line == ' #bytes #repetitions t[usec] Mbytes/sec\n': + if line == " #bytes #repetitions t[usec] Mbytes/sec\n": while True: line = next(f).strip() - if line == '': + if line == "": break for ix, v in enumerate(line.split()): columns[ix].append(CONVERTERS[ix](v)) - - result['columns'] = { - 'bytes': columns[0], - 'repetitions': columns[1], - 'latency': columns[2], - 'bandwidth': columns[3], + + result["columns"] = { + "bytes": columns[0], + "repetitions": columns[1], + "latency": columns[2], + "bandwidth": 
columns[3], } module.exit_json(**result) -def main(): +def main(): # pylint: disable=missing-function-docstring run_module() + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/ansible/roles/hpctests/library/slurm_node_info.py b/ansible/roles/hpctests/library/slurm_node_info.py index 52e680018..c0f851f30 100644 --- a/ansible/roles/hpctests/library/slurm_node_info.py +++ b/ansible/roles/hpctests/library/slurm_node_info.py @@ -1,11 +1,11 @@ -#!/usr/bin/python +#!/usr/bin/python # pylint: disable=missing-module-docstring # -*- coding: utf-8 -*- # Copyright: (c) 2020, StackHPC # Apache 2 License -from ansible.module_utils.basic import AnsibleModule -import json + +from ansible.module_utils.basic import AnsibleModule # pylint: disable=import-error ANSIBLE_METADATA = { "metadata_version": "0.1", @@ -18,8 +18,10 @@ module: slurm_node_info short_description: Get information about Slurm nodes version_added: "0.0" -description: - - "Gets all the available information from Slurm's `sinfo` command about specified nodes. The returned `info` property is a dict with keys from sinfo --All parameters and values a list of strings in specified node order." +description: > + Gets all the available information from Slurm's `sinfo` command about specified nodes. + The returned `info` property is a dict with keys from sinfo -- + All parameters and values a list of strings in specified node order. options nodes: description: @@ -37,32 +39,42 @@ """ -def run_module(): - module_args = dict( - nodes=dict(type="list", required=True), - ) +def run_module(): # pylint: disable=missing-function-docstring + module_args = { + "nodes": { + "type": "list", + "required": True, + } + } module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) result = {"changed": False} if module.check_mode: module.exit_json(**result) - - _, stdout,_ = module.run_command("sinfo --Format All --Node", check_rc=True) # `--nodes` doesn't filter enough, other partitions are still shown + + _, stdout, _ = module.run_command( + "sinfo --Format All --Node", check_rc=True + ) # `--nodes` doesn't filter enough, other partitions are still shown lines = stdout.splitlines() info = {} - params = [v.strip() for v in lines[0].split('|')] - values = [line.split('|') for line in lines[1:]] - nodelist_ix = params.index('NODELIST') + params = [v.strip() for v in lines[0].split("|")] + values = [line.split("|") for line in lines[1:]] + nodelist_ix = params.index("NODELIST") print(values) for ix, param in enumerate(params): - info[param] = [nodeinfo[ix].strip() for nodeinfo in values if nodeinfo[nodelist_ix].strip() in module.params['nodes']] - result['info'] = info - + info[param] = [ + nodeinfo[ix].strip() + for nodeinfo in values + if nodeinfo[nodelist_ix].strip() in module.params["nodes"] + ] + result["info"] = info + module.exit_json(**result) -def main(): +def main(): # pylint: disable=missing-function-docstring run_module() + if __name__ == "__main__": main() diff --git a/ansible/roles/hpctests/meta/main.yml b/ansible/roles/hpctests/meta/main.yml index 8d471f031..af6069502 100644 --- a/ansible/roles/hpctests/meta/main.yml +++ b/ansible/roles/hpctests/meta/main.yml @@ -1,6 +1,8 @@ +--- galaxy_info: author: Steve Brasier company: StackHPC + description: HPC Tests - Meta # If the issue tracker for your role is not on github, uncomment the # next line and provide a value @@ -15,7 +17,7 @@ galaxy_info: # - CC-BY-4.0 license: Apache-2.0 - min_ansible_version: 2.1 + min_ansible_version: "2.1" # If this a 
Container Enabled role, provide the minimum Ansible Container version. # min_ansible_container_version: diff --git a/ansible/roles/hpctests/tasks/build-hpl.yml b/ansible/roles/hpctests/tasks/build-hpl.yml index 4fec6b75e..dab7510b9 100644 --- a/ansible/roles/hpctests/tasks/build-hpl.yml +++ b/ansible/roles/hpctests/tasks/build-hpl.yml @@ -1,57 +1,58 @@ --- - - name: Make directory - file: + ansible.builtin.file: path: "{{ hpctests_rootdir }}/hpl" state: directory + mode: "0755" - name: Download HPL sources - unarchive: + ansible.builtin.unarchive: src: "http://www.netlib.org/benchmark/hpl/hpl-{{ hpctests_hpl_version }}.tar.gz" - remote_src: yes + remote_src: true dest: "{{ hpctests_rootdir }}/hpl" - keep_newer: yes + keep_newer: true - name: Copy BLAS make file - command: + ansible.builtin.command: cmd: "cp setup/Make.Linux_PII_CBLAS Make.{{ hpctests_hpl_arch }}" chdir: "{{ hpctests_hpl_srcdir }}" creates: "{{ hpctests_hpl_srcdir }}/Make.{{ hpctests_hpl_arch }}" - name: Modify make file - replace: + ansible.builtin.replace: path: "{{ hpctests_hpl_srcdir }}/Make.{{ hpctests_hpl_arch }}" regexp: "{{ item.regexp }}" replace: "{{ item.replace }}" loop: - - regexp: '^TOPdir.*$' + - regexp: "^TOPdir.*$" replace: "TOPdir = {{ hpctests_hpl_srcdir }}" - - regexp: '^ARCH\s+=.*$' + - regexp: "^ARCH\\s+=.*$" replace: "ARCH = {{ hpctests_hpl_arch }}" - - regexp: '^MPdir.*$' + - regexp: "^MPdir.*$" replace: "MPdir = $(MPI_DIR)" - - regexp: '^MPinc.*$' + - regexp: "^MPinc.*$" replace: "MPinc = -I$(MPI_DIR)/include" - - regexp: '^MPlib.*$' + - regexp: "^MPlib.*$" replace: "MPlib = $(MPI_DIR)/lib/libmpi.so" - - regexp: '^LAdir.*$' + - regexp: "^LAdir.*$" replace: "LAdir = $(OPENBLAS_DIR)" - - regexp: '^LAinc.*$' - replace: "LAinc =" # not sure if this one is needed? - - regexp: '^LAlib.*$' + - regexp: "^LAinc.*$" + replace: "LAinc =" # not sure if this one is needed? 
+ - regexp: "^LAlib.*$" replace: "LAlib = $(OPENBLAS_LIB)/libopenblas.so" - - regexp: '^CC\s+=.*$' + - regexp: "^CC\\s+=.*$" replace: "CC = mpicc" - - regexp: '^LINKER\s+=.*$' + - regexp: "^LINKER\\s+=.*$" replace: "LINKER = mpicc" - name: Create build job script - template: + ansible.builtin.template: src: "hpl-build.sh.j2" dest: "{{ hpctests_hpl_srcdir }}/hpl-build-{{ hpctests_hpl_arch }}.sh" - + mode: "0644" + - name: Build HPL executable - shell: + ansible.builtin.command: cmd: "bash -l -c 'sbatch --wait hpl-build-{{ hpctests_hpl_arch }}.sh'" # need login shell for module command chdir: "{{ hpctests_hpl_srcdir }}" creates: "bin/{{ hpctests_hpl_arch }}/xhpl" diff --git a/ansible/roles/hpctests/tasks/hpl-solo.yml b/ansible/roles/hpctests/tasks/hpl-solo.yml index 4c495315b..78aebf29e 100644 --- a/ansible/roles/hpctests/tasks/hpl-solo.yml +++ b/ansible/roles/hpctests/tasks/hpl-solo.yml @@ -1,12 +1,14 @@ +--- # For further information on tuning HPL see e.g.: # - https://ulhpc-tutorials.readthedocs.io/en/latest/parallel/mpi/HPL/ # - https://community.arm.com/developer/tools-software/hpc/b/hpc-blog/posts/profiling-and-tuning-linpack-step-step-guide # - http://www.crc.nd.edu/~rich/CRC_Summer_Scholars_2014/HPL-HowTo.pdf - name: Make directory - file: + ansible.builtin.file: path: "{{ hpctests_rootdir }}/hpl-solo" state: directory + mode: "0755" - name: Get Slurm node info slurm_node_info: @@ -14,7 +16,7 @@ register: hpctests_nodeinfo - name: Check nodes are homogenous - assert: + ansible.builtin.assert: that: "{{ hpctests_nodeinfo.info[item] | unique | length == 1 }}" fail_msg: "Selected nodes are not homogenous: {{ item }} ({{ hpctests_nodeinfo.info['NODELIST'] }}) = {{ hpctests_nodeinfo.info[item] }}" loop: @@ -26,7 +28,7 @@ - name: Calculate number of processes (per node) # Will run array job, which is SAME on each node, so only need to deal with a single node's processors here # Also ignore any hyperthreading TODO: document - set_fact: + ansible.builtin.set_fact: hpctests_hplsolo_ntasks: "{{ (hpctests_nodeinfo.info['SOCKETS'][0]) | int * (hpctests_nodeinfo.info['CORES'][0] | int) }}" - name: Calculate problem shape @@ -37,50 +39,58 @@ - name: Calculate problem size # Based on example shown in http://www.crc.nd.edu/~rich/CRC_Summer_Scholars_2014/HPL-HowTo.pdf but we have MB not GB - set_fact: - hpctests_hplsolo_N: "{{ ((((( (hpctests_nodeinfo.info['MEMORY'][0] | int) * (hpctests_hpl_mem_frac | float) * 1024 * 1024 * 1) / 8) | root) / hpctests_hpl_NB) | int ) * hpctests_hpl_NB }}" -- debug: - msg: "Using {{ hpctests_hplsolo_ntasks }} process per node with P={{ hpctests_hplsolo_pq.grid.P }}, Q={{ hpctests_hplsolo_pq.grid.Q }} targeting {{ (hpctests_hpl_mem_frac | float) * 100 }}% of {{ hpctests_nodeinfo.info['MEMORY'][0] }} MB memory per node, block size (NB) = {{ hpctests_hpl_NB }}, problem size (N) = {{ hpctests_hplsolo_N }}" + ansible.builtin.set_fact: + # yamllint disable-line rule:line-length + hpctests_hplsolo_N: "{{ ((((((hpctests_nodeinfo.info['MEMORY'][0] | int) * (hpctests_hpl_mem_frac | float) * 1024 * 1024 * 1) / 8) | root) / hpctests_hpl_NB) + | int) * hpctests_hpl_NB }}" +- ansible.builtin.debug: + # yamllint disable rule:line-length + msg: "Using {{ hpctests_hplsolo_ntasks }} process per node with P={{ hpctests_hplsolo_pq.grid.P }}, Q={{ hpctests_hplsolo_pq.grid.Q }} targeting {{ (hpctests_hpl_mem_frac + | float) * 100 }}% of {{ hpctests_nodeinfo.info['MEMORY'][0] }} MB memory per node, block size (NB) = {{ hpctests_hpl_NB }}, problem size (N) = {{ hpctests_hplsolo_N + }}" + # 
yamllint enable rule:line-length - name: Get all nodes in partition - shell: "sinfo --Node --noheader --format %N --partition={{ hpctests_partition }}" + ansible.builtin.command: "sinfo --Node --noheader --format %N --partition={{ hpctests_partition }}" register: all_nodes changed_when: false - name: Calculate excluded nodes - set_fact: + ansible.builtin.set_fact: hpctests_hplsolo_excluded_nodes: "{{ all_nodes.stdout_lines | difference(hpctests_computes.stdout_lines) }}" - name: Copy HPL binary - copy: + ansible.builtin.copy: src: "{{ hpctests_hpl_srcdir }}/bin/{{ hpctests_hpl_arch }}/xhpl" dest: "{{ hpctests_rootdir }}/hpl-solo/xhpl-{{ hpctests_hpl_arch }}" mode: "u+x" - remote_src: yes + remote_src: true - name: Template out HPL.dat - template: + ansible.builtin.template: src: "HPL.dat.j2" dest: "{{ hpctests_rootdir }}/hpl-solo/HPL.dat" + mode: "0644" vars: - hpctests_hpl_N: "{{ hpctests_hplsolo_N }}" - hpctests_hpl_P: "{{ hpctests_hplsolo_pq.grid.P }}" - hpctests_hpl_Q: "{{ hpctests_hplsolo_pq.grid.Q }}" + hpctests_hpl_N: "{{ hpctests_hplsolo_N }}" + hpctests_hpl_P: "{{ hpctests_hplsolo_pq.grid.P }}" + hpctests_hpl_Q: "{{ hpctests_hplsolo_pq.grid.Q }}" - name: Create sbatch script - template: + ansible.builtin.template: src: hpl-solo.sh.j2 dest: "{{ hpctests_rootdir }}/hpl-solo/hpl-solo.sh" + mode: "0755" vars: hpctests_hplsolo_ntasks: 2 # TODO: FIXME -- name: Remove previous outputs +- name: Remove previous outputs # noqa: no-changed-when # As depending on the number of nodes there will be different numbers of output files for different partitions so won't all get overwritten - shell: + ansible.builtin.shell: cmd: "rm -f {{ hpctests_rootdir }}/hpl-solo/hpl-solo.sh.*.out" -- name: Run hpl-solo - shell: bash -l -c 'sbatch --wait hpl-solo.sh' # need login shell for module command +- name: Run hpl-solo # noqa: no-changed-when + ansible.builtin.command: bash -l -c 'sbatch --wait hpl-solo.sh' args: chdir: "{{ hpctests_rootdir }}/hpl-solo" async: "{{ 20 * 60 }}" # wait for up to 20 minutes @@ -89,7 +99,7 @@ - name: Check HPL completed OK tags: postpro - shell: "grep '1 tests completed and passed residual checks' *.out" + ansible.builtin.shell: "grep '1 tests completed and passed residual checks' *.out" args: chdir: "{{ hpctests_rootdir }}/hpl-solo" changed_when: false @@ -105,7 +115,7 @@ # HPL_pdgesv() start time Thu Feb 25 19:58:25 2021 # tags: postpro - shell: "grep '^W[R|C]' *.out | tr -s ' ' | cut -d ' ' -f 7" # tr -s squeezes multiple spaces to single, then take gflops column + ansible.builtin.shell: "set -o pipefail && grep '^W[R|C]' *.out | tr -s ' ' | cut -d ' ' -f 7" args: chdir: "{{ hpctests_rootdir }}/hpl-solo" changed_when: false @@ -113,8 +123,10 @@ - name: Summarise results tags: postpro - debug: + ansible.builtin.debug: + # yamllint disable rule:line-length msg: | + # yamllint disable-line rule:line-length Summary for hpl-solo on {{ hpctests_computes.stdout_lines | length }} nodes in '{{ hpctests_partition }}' partition, job ID {{ hpctests_hplsolo_sbatch.stdout.split()[-1] }}, device '{{ hpctests_ucx_net_devices }}': Max: {{ perf.stdout_lines | map('float') | max }} gflops @@ -122,4 +134,5 @@ Mean: {{ (perf.stdout_lines | map('float') | sum) / (hpctests_computes.stdout_lines | length) }} gflops Individual node results (gflops): - {{ dict(hpctests_computes.stdout_lines | zip(perf.stdout_lines | map('float') )) | to_nice_yaml }} + {{ dict(hpctests_computes.stdout_lines | zip(perf.stdout_lines | map('float'))) | to_nice_yaml }} + # yamllint enable rule:line-length diff --git 
a/ansible/roles/hpctests/tasks/main.yml b/ansible/roles/hpctests/tasks/main.yml index f0f0817a6..bee1b76d1 100644 --- a/ansible/roles/hpctests/tasks/main.yml +++ b/ansible/roles/hpctests/tasks/main.yml @@ -1,38 +1,39 @@ -- name: setup - block: - - include_tasks: setup.yml +--- +- name: Setup become: true become_user: "{{ hpctests_user }}" tags: always -- name: pingpong block: - - include_tasks: pingpong.yml - when: hpctests_computes.stdout_lines | length > 1 + - ansible.builtin.include_tasks: setup.yml +- name: Pingpong become: true become_user: "{{ hpctests_user }}" tags: pingpong -- name: pingmatrix block: - - include_tasks: pingmatrix.yml + - ansible.builtin.include_tasks: pingpong.yml when: hpctests_computes.stdout_lines | length > 1 +- name: Pingmatrix become: true become_user: "{{ hpctests_user }}" tags: pingmatrix -- name: build HPL block: - - include_tasks: build-hpl.yml + - ansible.builtin.include_tasks: pingmatrix.yml + when: hpctests_computes.stdout_lines | length > 1 +- name: Build HPL become: true become_user: "{{ hpctests_user }}" tags: - hpl-solo -- name: run HPL on individual nodes block: - - include_tasks: hpl-solo.yml + - ansible.builtin.include_tasks: build-hpl.yml +- name: Run HPL on individual nodes become: true become_user: "{{ hpctests_user }}" tags: - hpl-solo + block: + - ansible.builtin.include_tasks: hpl-solo.yml diff --git a/ansible/roles/hpctests/tasks/pingmatrix.yml b/ansible/roles/hpctests/tasks/pingmatrix.yml index 3d20b784b..fa214f9b9 100644 --- a/ansible/roles/hpctests/tasks/pingmatrix.yml +++ b/ansible/roles/hpctests/tasks/pingmatrix.yml @@ -1,40 +1,44 @@ --- - - name: Make directory - file: + ansible.builtin.file: path: "{{ hpctests_rootdir }}/pingmatrix" state: directory + mode: "0755" - name: Copy source - copy: + ansible.builtin.copy: src: mpi_nxnlatbw.c dest: "{{ hpctests_rootdir }}/pingmatrix/mpi_nxnlatbw.c" + mode: "0644" - name: Create sbatch script - template: + ansible.builtin.template: src: pingmatrix.sh.j2 dest: "{{ hpctests_rootdir }}/pingmatrix/pingmatrix.sh" + mode: "0755" -- name: Run ping matrix - shell: bash -l -c 'sbatch --wait pingmatrix.sh' # need login shell for module command +- name: Run ping matrix # noqa: no-changed-when + ansible.builtin.command: bash -l -c 'sbatch --wait pingmatrix.sh' args: chdir: "{{ hpctests_rootdir }}/pingmatrix" register: hpctests_pingmatrix_sbatch -# nxnlatbw outputs ranks, not nodenames which would be more useful for finding issues. The sbatch manpage says nodes provided via --nodelist are sorted, but doesn't specify how. -# Some testing using a "helloworld" program showed it is NOT sorted the same as python's sorted(), it's lexicographical. So we use scontrol to guarantee the same sort order. +# nxnlatbw outputs ranks, not nodenames which would be more useful for finding issues. +# The sbatch manpage says nodes provided via --nodelist are sorted, but doesn't specify how. +# Some testing using a "helloworld" program showed it is NOT sorted the same as python's sorted(), +# it's lexicographical. So we use scontrol to guarantee the same sort order. # Note this still doesn't fix any non-unique names but we should get a length mis-match at least with that. 
# although this looks a bit crazy: -- name: Expand node list - shell: "scontrol show hostnames {{ hpctests_nodes if hpctests_nodes is defined else (hpctests_computes.stdout_lines | join(',')) }}" +- name: Expand node list # noqa: no-changed-when + ansible.builtin.command: "scontrol show hostnames {{ hpctests_nodes if hpctests_nodes is defined else (hpctests_computes.stdout_lines | join(',')) }}" register: scontrol_hostnames -- name: Create sorted node expression - shell: "scontrol show hostlistsorted {{ scontrol_hostnames.stdout_lines | join(',') }}" +- name: Create sorted node expression # noqa: no-changed-when + ansible.builtin.command: "scontrol show hostlistsorted {{ scontrol_hostnames.stdout_lines | join(',') }}" register: scontrol_hostlistsorted -- name: Expand node list again - shell: "scontrol show hostnames {{ scontrol_hostlistsorted.stdout_lines | join(',') }}" +- name: Expand node list again # noqa: no-changed-when + ansible.builtin.command: "scontrol show hostnames {{ scontrol_hostlistsorted.stdout_lines | join(',') }}" register: slurm_names - name: Process output @@ -45,16 +49,19 @@ register: nxnlatbw - name: Fetch html results table to ansible control host - fetch: + ansible.builtin.fetch: src: "{{ hpctests_rootdir }}/pingmatrix/pingmatrix.html" dest: "{{ hpctests_outdir }}/pingmatrix.html" - flat: yes + flat: true - name: Summarise results - debug: + ansible.builtin.debug: + # yamllint disable rule:line-length msg: | + # yamllint disable-line rule:line-length Summary for pingmatrix pairwise over {{ slurm_names.stdout_lines | length }} nodes in '{{ hpctests_partition }}' partition, job ID {{ hpctests_pingmatrix_sbatch.stdout.split()[-1] }}, device '{{ hpctests_ucx_net_devices }}': - + {{ nxnlatbw['stats'] | to_nice_yaml }} - + Tabular output on ansible control host at {{ hpctests_outdir }}/pingmatrix.html + # yamllint enable rule:line-length diff --git a/ansible/roles/hpctests/tasks/pingpong.yml b/ansible/roles/hpctests/tasks/pingpong.yml index 3cde8c22b..3541f8d35 100644 --- a/ansible/roles/hpctests/tasks/pingpong.yml +++ b/ansible/roles/hpctests/tasks/pingpong.yml @@ -1,45 +1,46 @@ --- - - name: Make directory - file: + ansible.builtin.file: path: "{{ hpctests_rootdir }}/pingpong" state: directory + mode: "0755" - name: Create sbatch script - template: + ansible.builtin.template: src: pingpong.sh.j2 dest: "{{ hpctests_rootdir }}/pingpong/pingpong.sh" + mode: "0755" - name: Run pingpong block: - - name: Submit jobscript - shell: bash -l -c 'sbatch --wait pingpong.sh' # need login shell for module command + - name: Submit jobscript # noqa: no-changed-when + ansible.builtin.command: bash -l -c 'sbatch --wait pingpong.sh' args: chdir: "{{ hpctests_rootdir }}/pingpong" register: hpctests_pingpong_sbatch rescue: - name: Get slurm job output - slurp: + ansible.builtin.slurp: src: "{{ hpctests_rootdir }}/pingpong/pingpong.sh.out" register: _pingpong_out - name: Show job output - debug: + ansible.builtin.debug: msg: | PingPong output was: - + {{ _pingpong_out.content | b64decode }} failed_when: true -- set_fact: +- ansible.builtin.set_fact: _pingpong_jobid: "{{ hpctests_pingpong_sbatch.stdout.split()[-1] }}" -- set_fact: - _pingpong_local_output: "{{ hpctests_outdir }}/pingpong/{{_pingpong_jobid}}/pingpong.sh.out" +- ansible.builtin.set_fact: + _pingpong_local_output: "{{ hpctests_outdir }}/pingpong/{{ _pingpong_jobid }}/pingpong.sh.out" - name: Retrieve results file ansible.builtin.fetch: src: "{{ hpctests_rootdir }}/pingpong/pingpong.sh.out" dest: "{{ _pingpong_local_output }}" - 
flat: yes + flat: true - name: Read pingpong results read_imb_pingpong: @@ -48,30 +49,34 @@ delegate_to: localhost become: false -- name: Read nodes used - shell: "grep 'SLURM_JOB_NODELIST:' {{ _pingpong_local_output }}" +- name: Read nodes used # noqa: no-changed-when + ansible.builtin.command: "grep 'SLURM_JOB_NODELIST:' {{ _pingpong_local_output }}" register: hpctests_pingpong_run_nodes delegate_to: localhost become: false - name: Plot image - shell: + ansible.builtin.command: cmd: "python {{ role_path }}/files/plot_imb_pingpong.py {{ _pingpong_local_output }}" creates: "{{ _pingpong_local_output | dirname }}/latency.png" register: _pingpong_plot delegate_to: localhost become: false when: hpctests_pingpong_plot | bool - -- debug: + +- ansible.builtin.debug: + # yamllint disable rule:line-length msg: | + # yamllint disable-line rule:line-length Summary for pingpong using 2x scheduler-selected nodes in '{{ hpctests_partition }}' partition, job ID {{ _pingpong_jobid }}, device '{{ hpctests_ucx_net_devices }}': Nodes: {{ hpctests_pingpong_run_nodes.stdout.split()[1] }} Zero-size msg latency: {{ hpctests_pingpong_out['columns']['latency'][0] }} us + # yamllint disable-line rule:line-length Max bandwidth: {{ hpctests_pingpong_out['columns']['bandwidth'] | max }} Mbytes/s ({{ (hpctests_pingpong_out['columns']['bandwidth'] | max) / 125.0 }} Gbit/s) {% if hpctests_pingpong_plot %} See plot on localhost: {{ _pingpong_plot.stdout }} {% endif %} + # yamllint enable rule:line-length diff --git a/ansible/roles/hpctests/tasks/setup.yml b/ansible/roles/hpctests/tasks/setup.yml index 316b32880..cc9832a07 100644 --- a/ansible/roles/hpctests/tasks/setup.yml +++ b/ansible/roles/hpctests/tasks/setup.yml @@ -1,34 +1,36 @@ --- - - name: Get partition information - shell: "sinfo --format %P --noheader" + ansible.builtin.command: "sinfo --format %P --noheader" register: _sinfo_partitions changed_when: false - name: Select default partition if hpctests_partition not given - set_fact: - hpctests_partition: "{{ (_sinfo_partitions.stdout_lines | select('contains', '*') | first)[:-1] }}" + ansible.builtin.set_fact: + hpctests_partition: "{{ (_sinfo_partitions.stdout_lines | select('contains', '*') | first)[:-1] }}" when: hpctests_partition is not defined - name: Get info about compute nodes - shell: "sinfo --Node --noheader{%if hpctests_nodes is defined %} --nodes {{hpctests_nodes}}{% endif %} --partition {{hpctests_partition}} --format %N" + # yamllint disable-line rule:line-length + ansible.builtin.command: "sinfo --Node --noheader{%if hpctests_nodes is defined %} --nodes {{hpctests_nodes}}{% endif %} --partition {{hpctests_partition}} --format + %N" register: hpctests_computes changed_when: false failed_when: hpctests_computes.rc != 0 - name: Check compute node selection valid - assert: + ansible.builtin.assert: that: hpctests_computes.stdout_lines | length > 0 fail_msg: "No nodes selected - was variable `hpctests_nodes` set (correctly)?" 
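# Illustrative sketch only, not part of the patch: how the default-partition
# expression used in "Select default partition if hpctests_partition not given"
# above resolves, assuming a hypothetical `sinfo --format %P --noheader` output
# of "standard*" and "gpu". select('contains', '*') keeps the default partition
# (Slurm marks it with a trailing '*') and [:-1] strips that marker:
#
# - name: Demonstrate default partition selection (hypothetical data)
#   ansible.builtin.set_fact:
#     hpctests_partition: "{{ (['standard*', 'gpu'] | select('contains', '*') | first)[:-1] }}"
#   # result: hpctests_partition == "standard"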
- name: Create test root directory - file: + ansible.builtin.file: path: "{{ hpctests_rootdir }}" state: directory owner: "{{ hpctests_user }}" group: "{{ hpctests_group }}" + mode: "0755" - name: Set fact for UCX_NET_DEVICES - set_fact: + ansible.builtin.set_fact: hpctests_ucx_net_devices: "{{ hpctests_ucx_net_devices.get(hpctests_partition, 'all') }}" when: hpctests_ucx_net_devices is mapping diff --git a/ansible/roles/hpctests/templates/hpl-build.sh.j2 b/ansible/roles/hpctests/templates/hpl-build.sh.j2 old mode 100644 new mode 100755 diff --git a/ansible/roles/hpctests/templates/hpl-solo.sh.j2 b/ansible/roles/hpctests/templates/hpl-solo.sh.j2 old mode 100644 new mode 100755 diff --git a/ansible/roles/hpctests/templates/pingmatrix.sh.j2 b/ansible/roles/hpctests/templates/pingmatrix.sh.j2 old mode 100644 new mode 100755 diff --git a/ansible/roles/hpctests/templates/pingpong.sh.j2 b/ansible/roles/hpctests/templates/pingpong.sh.j2 old mode 100644 new mode 100755 diff --git a/ansible/roles/k3s/README.md b/ansible/roles/k3s/README.md index 68e8e2410..4031a002e 100644 --- a/ansible/roles/k3s/README.md +++ b/ansible/roles/k3s/README.md @@ -1,16 +1,12 @@ -k3s -===== +# k3s Installs k3s agent and server services on nodes and an ansible-init playbook to activate them. The service that each node will activate on init is determined by OpenStack metadata. Also includes Helm install. Currently only supports a single k3s-server (i.e one control node). Install based on the [official k3s ansible role](https://github.com/k3s-io/k3s-ansible). - -Requirements ------------- +## Requirements `azimuth_cloud.image_utils.linux_ansible_init` must have been run previously on targeted nodes during image build. -Role Variables --------------- +## Role Variables - `k3s_version`: Optional str. K3s version to install, see [official releases](https://github.com/k3s-io/k3s/releases/). 
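As a minimal sketch of how the role described in this README might be applied with a pinned release. The play shape and the `k3s` host group are illustrative assumptions (the group name appears in the role's server-runtime tasks, but in this appliance the role is normally wired in during image build and activated via ansible-init rather than run as a standalone play):

```yaml
# Hypothetical play - hosts/group and version pin are assumptions
- hosts: k3s
  become: true
  tasks:
    - name: Install k3s and helm via the role
      ansible.builtin.include_role:
        name: k3s
      vars:
        k3s_version: "v1.31.0+k3s1" # matches the role default in the next hunk
```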
diff --git a/ansible/roles/k3s/defaults/main.yml b/ansible/roles/k3s/defaults/main.yml index 984c63df9..38a5f73d1 100644 --- a/ansible/roles/k3s/defaults/main.yml +++ b/ansible/roles/k3s/defaults/main.yml @@ -1,8 +1,9 @@ +--- # Warning: changes to these variables won't be reflected in the cluster/image if k3s is already installed k3s_version: "v1.31.0+k3s1" k3s_selinux_release: v1.6.latest.1 k3s_selinux_rpm_version: 1.6-1 k3s_helm_version: v3.11.0 -k3s_bootstrap_token: '' # matches common environment default +k3s_bootstrap_token: "" # matches common environment default k3s_bootstrap_token_expiry: 10m k3s_server_name: "{{ None }}" # ansible managed diff --git a/ansible/roles/k3s/tasks/agent-runtime.yml b/ansible/roles/k3s/tasks/agent-runtime.yml index 8377817ce..732fceec1 100644 --- a/ansible/roles/k3s/tasks/agent-runtime.yml +++ b/ansible/roles/k3s/tasks/agent-runtime.yml @@ -1,5 +1,4 @@ --- - - name: Template k3s agent env file when: k3s_bootstrap_token != '' ansible.builtin.template: @@ -7,16 +6,16 @@ src: k3s-agent.service.env.j2 owner: root group: root - mode: 0640 + mode: "0640" register: _k3s_agent_token_result - name: Ensure password directory exists - ansible.builtin.file: + ansible.builtin.file: path: "/etc/rancher/node" state: directory owner: root group: root - mode: 0640 + mode: "0640" - name: Write node password ansible.builtin.copy: @@ -24,10 +23,10 @@ content: "{{ vault_k3s_node_password }}" owner: root group: root - mode: 0640 # normal k3s install is 644 but that doesn't feel right + mode: "0640" # normal k3s install is 644 but that doesn't feel right - name: Start/restart k3s agent - when: _k3s_agent_token_result.changed + when: _k3s_agent_token_result.changed # noqa: no-handler ansible.builtin.systemd: name: k3s-agent daemon_reload: true diff --git a/ansible/roles/k3s/tasks/install.yml b/ansible/roles/k3s/tasks/install.yml index c250f87a8..79efb6527 100644 --- a/ansible/roles/k3s/tasks/install.yml +++ b/ansible/roles/k3s/tasks/install.yml @@ -1,7 +1,6 @@ --- - - name: Check for existing k3s installation - stat: + ansible.builtin.stat: path: /var/lib/rancher/k3s register: stat_result @@ -9,62 +8,64 @@ # Using air-gapped install so containers are pre-installed to avoid rate-limiting from registries on cluster startup when: not stat_result.stat.exists block: + - name: Download k3s binary + ansible.builtin.get_url: + url: "https://github.com/k3s-io/k3s/releases/download/{{ k3s_version | urlencode }}/k3s" + dest: /usr/bin/k3s + owner: root + group: root + mode: "0755" - - name: Download k3s binary - ansible.builtin.get_url: - url: "https://github.com/k3s-io/k3s/releases/download/{{ k3s_version | urlencode }}/k3s" - dest: /usr/bin/k3s - owner: root - group: root - mode: "0755" - - - name: Install k3s SELinux policy package - yum: - name: "https://github.com/k3s-io/k3s-selinux/releases/download/{{ k3s_selinux_release }}/k3s-selinux-{{ k3s_selinux_rpm_version }}.el{{ ansible_distribution_major_version }}.noarch.rpm" - disable_gpg_check: true + - name: Install k3s SELinux policy package + ansible.builtin.dnf: + # yamllint disable-line rule:line-length + name: "https://github.com/k3s-io/k3s-selinux/releases/download/{{ k3s_selinux_release }}/k3s-selinux-{{ k3s_selinux_rpm_version }}.el{{ ansible_distribution_major_version }}.noarch.rpm" + disable_gpg_check: true - - name: Create image directory - ansible.builtin.file: - path: "/var/lib/rancher/k3s/agent/images" - state: directory + - name: Create image directory + ansible.builtin.file: + path: "/var/lib/rancher/k3s/agent/images" + 
state: directory + mode: "0755" - - name: Install k3s' internal images - ansible.builtin.get_url: - url: "https://github.com/k3s-io/k3s/releases/download/{{ k3s_version | urlencode }}/k3s-airgap-images-amd64.tar.zst" - dest: /var/lib/rancher/k3s/agent/images/k3s-airgap-images-amd64.tar.zst + - name: Install k3s' internal images + ansible.builtin.get_url: + url: "https://github.com/k3s-io/k3s/releases/download/{{ k3s_version | urlencode }}/k3s-airgap-images-amd64.tar.zst" + dest: /var/lib/rancher/k3s/agent/images/k3s-airgap-images-amd64.tar.zst + mode: "0644" - - name: Download k3s install script - ansible.builtin.get_url: - url: https://get.k3s.io/ - timeout: 120 - dest: /usr/bin/k3s-install.sh - owner: root - group: root - mode: "0755" + - name: Download k3s install script + ansible.builtin.get_url: + url: https://get.k3s.io/ + timeout: 120 + dest: /usr/bin/k3s-install.sh + owner: root + group: root + mode: "0755" - - name: Install k3s - ansible.builtin.shell: - cmd: /usr/bin/k3s-install.sh - environment: - INSTALL_K3S_VERSION: "{{ k3s_version }}" - INSTALL_K3S_EXEC: "{{ item }} --node-ip=${K3S_NODE_IP}" - INSTALL_K3S_SKIP_START: "true" - INSTALL_K3S_SKIP_ENABLE: "true" - INSTALL_K3S_BIN_DIR: "/usr/bin" - INSTALL_K3S_SKIP_DOWNLOAD: "true" - changed_when: true - loop: - - server --disable=traefik - - agent + - name: Install k3s + ansible.builtin.command: + cmd: /usr/bin/k3s-install.sh + environment: + INSTALL_K3S_VERSION: "{{ k3s_version }}" + INSTALL_K3S_EXEC: "{{ item }} --node-ip=${K3S_NODE_IP}" + INSTALL_K3S_SKIP_START: "true" + INSTALL_K3S_SKIP_ENABLE: "true" + INSTALL_K3S_BIN_DIR: "/usr/bin" + INSTALL_K3S_SKIP_DOWNLOAD: "true" + changed_when: true + loop: + - server --disable=traefik + - agent - name: Install helm - unarchive: + ansible.builtin.unarchive: src: "https://get.helm.sh/helm-{{ k3s_helm_version }}-linux-amd64.tar.gz" dest: /usr/bin extra_opts: "--strip-components=1" owner: root group: root - mode: 0755 + mode: "0755" remote_src: true - name: Add k3s kubeconfig as environment variable diff --git a/ansible/roles/k3s/tasks/server-runtime.yml b/ansible/roles/k3s/tasks/server-runtime.yml index 6c0878ee3..1221cdaa2 100644 --- a/ansible/roles/k3s/tasks/server-runtime.yml +++ b/ansible/roles/k3s/tasks/server-runtime.yml @@ -1,9 +1,9 @@ --- - - name: Template k3s env file ansible.builtin.template: dest: /etc/systemd/system/k3s.service.env src: k3s.service.env.j2 + mode: "0644" register: _k3s_env_file_status - name: Start k3s server @@ -14,9 +14,9 @@ enabled: true # Possible race here as there is a delay between agents disconnecting and being registered as down, probably won't be hit in general use though -- name: Check which k3s agents are connected +- name: Check which k3s agents are connected # noqa: no-changed-when ansible.builtin.shell: - cmd: kubectl get nodes --no-headers | grep -w Ready + cmd: set -o pipefail && kubectl get nodes --no-headers | grep -w Ready register: _k3s_connected_nodes retries: 6 # task may fail if server is not ready yet delay: 10 @@ -24,12 +24,12 @@ - when: _k3s_connected_nodes.stdout_lines | length != groups['k3s'] | length block: - - name: Generate new bootstrap token if not all agents are connected - no_log: true - shell: - cmd: "k3s token create --ttl {{ k3s_bootstrap_token_expiry }}" - register: _k3s_token_output + - name: Generate new bootstrap token if not all agents are connected # noqa: no-changed-when + no_log: true + ansible.builtin.command: + cmd: "k3s token create --ttl {{ k3s_bootstrap_token_expiry }}" + register: _k3s_token_output - - 
name: Set bootstrap token as fact - set_fact: - k3s_bootstrap_token: "{{ _k3s_token_output.stdout }}" + - name: Set bootstrap token as fact + ansible.builtin.set_fact: + k3s_bootstrap_token: "{{ _k3s_token_output.stdout }}" diff --git a/ansible/roles/k3s/templates/k3s-agent.service.env.j2 b/ansible/roles/k3s/templates/k3s-agent.service.env.j2 index b994b0680..94447655a 100644 --- a/ansible/roles/k3s/templates/k3s-agent.service.env.j2 +++ b/ansible/roles/k3s/templates/k3s-agent.service.env.j2 @@ -1,3 +1,3 @@ -K3S_NODE_IP={{ ansible_host }} -K3S_TOKEN={{ k3s_bootstrap_token }} -K3S_URL=https://{{ k3s_server_name }}:6443 +K3S_NODE_IP="{{ ansible_host }}" +K3S_TOKEN="{{ k3s_bootstrap_token }}" +K3S_URL="https://{{ k3s_server_name }}:6443" diff --git a/ansible/roles/k3s/templates/k3s.service.env.j2 b/ansible/roles/k3s/templates/k3s.service.env.j2 index 746e6d809..38fb9117c 100644 --- a/ansible/roles/k3s/templates/k3s.service.env.j2 +++ b/ansible/roles/k3s/templates/k3s.service.env.j2 @@ -1 +1 @@ -K3S_NODE_IP={{ ansible_host }} +K3S_NODE_IP="{{ ansible_host }}" diff --git a/ansible/roles/k9s/tasks/main.yml b/ansible/roles/k9s/tasks/main.yml index 674b4dffb..bebe7b82a 100644 --- a/ansible/roles/k9s/tasks/main.yml +++ b/ansible/roles/k9s/tasks/main.yml @@ -1,12 +1,12 @@ --- - - - name: Check if k9s is installed - ansible.builtin.stat: - path: "/usr/bin/k9s" - register: _k9s_stat_result +- name: Check if k9s is installed + ansible.builtin.stat: + path: "/usr/bin/k9s" + register: _k9s_stat_result - - name: Install k9s and clean up temporary files - block: +- name: Install k9s and clean up temporary files + when: not _k9s_stat_result.stat.exists + block: - name: Create install directory ansible.builtin.file: path: /tmp/k9s @@ -28,17 +28,16 @@ ansible.builtin.unarchive: src: /tmp/k9s/k9s_Linux_amd64.tar.gz dest: /tmp/k9s - remote_src: yes + remote_src: true - name: Add k9s to root path ansible.builtin.copy: src: /tmp/k9s/k9s dest: /usr/bin/k9s mode: u+rwx - remote_src: yes + remote_src: true - name: Cleanup k9s install directory ansible.builtin.file: path: /tmp/k9s state: absent - when: not _k9s_stat_result.stat.exists diff --git a/ansible/roles/lustre/README.md b/ansible/roles/lustre/README.md index 9abbb9b1b..2e165b2ab 100644 --- a/ansible/roles/lustre/README.md +++ b/ansible/roles/lustre/README.md @@ -7,24 +7,27 @@ Install and configure a Lustre client. This builds RPM packages from source. **NB:** Currently this only supports RockyLinux 9. ## Role Variables + The following variables control configuration of Lustre clients. + - `lustre_lnet_label`: Optional str. The "lnet label" part of the host's NID, e.g. `tcp0`. Only the `tcp` protocol type is currently supported. Default `tcp`. - `lustre_mgs_nid`: Required str. The NID(s) for the MGS, e.g. `192.168.227.11@tcp1` (separate mutiple MGS NIDs using `:`). - `lustre_mounts`: Required list. Define Lustre filesystems and mountpoints as a list of dicts with keys: - - `fs_name`: Required str. The name of the filesystem to mount - - `mount_point`: Required str. Path to mount filesystem at. - - `mount_state`: Optional mount state, as for [ansible.posix.mount](https://docs.ansible.com/ansible/latest/collections/ansible/posix/mount_module.html#parameter-state). Default is `lustre_mount_state`. - - `mount_options`: Optional mount options. Default is `lustre_mount_options`. + - `fs_name`: Required str. The name of the filesystem to mount + - `mount_point`: Required str. Path to mount filesystem at. 
+  - `mount_state`: Optional mount state, as for [ansible.posix.mount](https://docs.ansible.com/ansible/latest/collections/ansible/posix/mount_module.html#parameter-state). Default is `lustre_mount_state`.
+  - `mount_options`: Optional mount options. Default is `lustre_mount_options`.
 - `lustre_mount_state`. Optional default mount state for all mounts, as for [ansible.posix.mount](https://docs.ansible.com/ansible/latest/collections/ansible/posix/mount_module.html#parameter-state). Default is `mounted`.
 - `lustre_mount_options`. Optional default mount options. Default values are systemd defaults from [Lustre client docs](http://wiki.lustre.org/Mounting_a_Lustre_File_System_on_Client_Nodes).

 The following variables control the package build and install:
+
 - `lustre_version`: Optional str. Version of lustre to build, default `2.15.6/lu-18085`
-  which is the first version with EL9.5 support, plus a fix for https://jira.whamcloud.com/browse/LU-18085.
-- `lustre_repo`: Optional str. URL for Lustre repo. Default is a StackHPC repo
+  which is the first version with EL9.5 support, plus a fix for <https://jira.whamcloud.com/browse/LU-18085>.
+- `lustre_repo`: Optional str. URL for Lustre repository. Default is a StackHPC repository
   incorporating the above fix.
 - `lustre_build_packages`: Optional list. Prerequisite packages required to build Lustre. See `defaults/main.yml`.
 - `lustre_build_dir`: Optional str. Path to build lustre at, default `/tmp/lustre-release`.
 - `lustre_configure_opts`: Optional list. Options to `./configure` command. Default builds client rpms supporting Mellanox OFED, without support for GSS keys.
-- `lustre_rpm_globs`: Optional list. Shell glob patterns for rpms to install. Note order is important as the built RPMs are not in a yum repo. Default is just the `kmod-lustre-client` and `lustre-client` packages.
+- `lustre_rpm_globs`: Optional list. Shell glob patterns for rpms to install. Note order is important as the built RPMs are not in a yum repository. Default is just the `kmod-lustre-client` and `lustre-client` packages.
 - `lustre_build_cleanup`: Optional bool. Whether to uninstall prerequisite packages and delete the build directories etc. Default `true`.
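To show how the client variables fit together, here is a minimal sketch of host or group vars; the filesystem names and mount points are illustrative only (the MGS NID format follows the example given above):

```yaml
# Illustrative values only - substitute your own MGS NID and filesystems
lustre_mgs_nid: 192.168.227.11@tcp1
lustre_mounts:
  - fs_name: scratch
    mount_point: /mnt/lustre/scratch
  - fs_name: archive
    mount_point: /mnt/lustre/archive
    mount_state: present # overrides the lustre_mount_state default of 'mounted'
```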
diff --git a/ansible/roles/lustre/defaults/main.yml b/ansible/roles/lustre/defaults/main.yml index 72337733c..a840c5cb3 100644 --- a/ansible/roles/lustre/defaults/main.yml +++ b/ansible/roles/lustre/defaults/main.yml @@ -1,10 +1,11 @@ +--- lustre_repo: https://github.com/stackhpc/lustre-release.git -lustre_version: '2.15.6/lu-18085' # Fixes https://jira.whamcloud.com/browse/LU-18085 +lustre_version: "2.15.6/lu-18085" # Fixes https://jira.whamcloud.com/browse/LU-18085 lustre_lnet_label: tcp -#lustre_mgs_nid: +# lustre_mgs_nid: lustre_mounts: [] lustre_mount_state: mounted -lustre_mount_options: 'defaults,_netdev,noauto,x-systemd.automount,x-systemd.requires=lnet.service,nosuid,nodev' +lustre_mount_options: "defaults,_netdev,noauto,x-systemd.automount,x-systemd.requires=lnet.service,nosuid,nodev" # below variables are for build and should not generally require changes lustre_git_repo: "git://git.whamcloud.com/fs/lustre-release.git" diff --git a/ansible/roles/lustre/tasks/configure.yml b/ansible/roles/lustre/tasks/configure.yml index be5ba3597..fab9e60e4 100644 --- a/ansible/roles/lustre/tasks/configure.yml +++ b/ansible/roles/lustre/tasks/configure.yml @@ -1,5 +1,6 @@ +--- - name: Gather Lustre interface info - shell: + ansible.builtin.shell: cmd: | ip --json r get {{ _lustre_mgs_ip }} changed_when: false @@ -8,23 +9,23 @@ _lustre_mgs_ip: "{{ lustre_mgs_nid | split('@') | first }}" - name: Set facts for Lustre interface - set_fact: + ansible.builtin.set_fact: _lustre_interface: "{{ _lustre_ip_r_mgs_info.dev }}" _lustre_ip: "{{ _lustre_ip_r_mgs_info.prefsrc }}" vars: _lustre_ip_r_mgs_info: "{{ _lustre_ip_r_mgs.stdout | from_json | first }}" - name: Write LNet configuration file - template: + ansible.builtin.template: src: lnet.conf.j2 - dest: /etc/lnet.conf # exists from package install, expected by lnet service + dest: /etc/lnet.conf # exists from package install, expected by lnet service owner: root group: root mode: u=rw,go=r # from package install register: _lnet_conf - name: Ensure lnet service state - systemd: + ansible.builtin.systemd: name: lnet state: "{{ 'restarted' if _lnet_conf.changed else 'started' }}" @@ -32,6 +33,7 @@ ansible.builtin.file: path: "{{ item.mount_point }}" state: directory + mode: "0755" loop: "{{ lustre_mounts }}" when: "(item.mount_state | default(lustre_mount_state)) != 'absent'" diff --git a/ansible/roles/lustre/tasks/install.yml b/ansible/roles/lustre/tasks/install.yml index aedc3a505..7a91a3820 100644 --- a/ansible/roles/lustre/tasks/install.yml +++ b/ansible/roles/lustre/tasks/install.yml @@ -1,25 +1,26 @@ +--- - name: Install lustre build prerequisites ansible.builtin.dnf: name: "{{ lustre_build_packages }}" register: _lustre_dnf_build_packages - + - name: Clone lustre git repo ansible.builtin.git: repo: "{{ lustre_repo }}" dest: "{{ lustre_build_dir }}" version: "{{ lustre_version }}" -- name: Prepare for lustre configuration +- name: Prepare for lustre configuration # noqa: no-changed-when ansible.builtin.command: cmd: sh ./autogen.sh chdir: "{{ lustre_build_dir }}" -- name: Configure lustre build +- name: Configure lustre build # noqa: no-changed-when ansible.builtin.command: cmd: "./configure {{ lustre_configure_opts | join(' ') }}" chdir: "{{ lustre_build_dir }}" -- name: Build lustre +- name: Build lustre # noqa: no-changed-when ansible.builtin.command: cmd: make rpms chdir: "{{ lustre_build_dir }}" @@ -32,17 +33,18 @@ register: _lustre_find_rpms - name: Check rpms found - assert: + ansible.builtin.assert: that: _lustre_find_rpms.files | length 
    fail_msg: "No lustre rpms found with lustre_rpm_globs = {{ lustre_rpm_globs }}"

 - name: Install lustre rpms
+  # checkov:skip=CKV2_ANSIBLE_4: "Ensure that packages with untrusted or missing GPG signatures are not used by dnf"
   ansible.builtin.dnf:
-    name: "{{ _lustre_find_rpms.files | map(attribute='path')}}"
-    disable_gpg_check: yes
+    name: "{{ _lustre_find_rpms.files | map(attribute='path') }}"
+    disable_gpg_check: true

 - name: Delete lustre build dir
-  file:
+  ansible.builtin.file:
     path: "{{ lustre_build_dir }}"
     state: absent
   when: lustre_build_cleanup | bool
diff --git a/ansible/roles/lustre/tasks/validate.yml b/ansible/roles/lustre/tasks/validate.yml
index 609a77f31..6469ac151 100644
--- a/ansible/roles/lustre/tasks/validate.yml
+++ b/ansible/roles/lustre/tasks/validate.yml
@@ -1,20 +1,21 @@
+---
 - name: Check kernel-devel package is installed
-  command: "dnf list --installed kernel-devel-{{ ansible_kernel }}"
+  ansible.builtin.command: "dnf list --installed kernel-devel-{{ ansible_kernel }}"
   changed_when: false
 # NB: we don't check here the kernel will remain the same after reboot etc, see ofed/install.yml

 - name: Ensure SELinux in permissive mode
-  assert:
+  ansible.builtin.assert:
     that: selinux_state in ['permissive', 'disabled']
     fail_msg: "SELinux must be 'permissive' or 'disabled' for Lustre, not '{{ selinux_state }}'; see variable selinux_state"

 - name: Ensure lustre_mgs_nid is defined
-  assert:
+  ansible.builtin.assert:
     that: lustre_mgs_nid is defined
     fail_msg: Variable lustre_mgs_nid must be defined

 - name: Ensure lustre_mounts entries define filesystem name and mount point
-  assert:
+  ansible.builtin.assert:
     that:
       - item.fs_name is defined
       - item.mount_point is defined
diff --git a/ansible/roles/mysql/README.md b/ansible/roles/mysql/README.md
index 2c735dbb0..e85c17303 100644
--- a/ansible/roles/mysql/README.md
+++ b/ansible/roles/mysql/README.md
@@ -1,18 +1,14 @@
-mysql
-=====
+# MySQL

 Deploy containerised `mysql` server using Podman.
-
-Requirements
-------------
+## Requirements

 None.

-Role Variables
---------------
+## Role Variables

-- `mysql_root_password`: Required str. Password to set for `root` mysql user. **NB** This cannot be changed by this role once mysql server has initialised.
+- `mysql_root_password`: Required str. Password to set for `root` MySQL user. **NB** This cannot be changed by this role once MySQL server has initialised.
 - `mysql_tag`: Optional str. Tag for version of `mysql` container image to use. Default `8.0.30`.
 - `mysql_systemd_service_enabled`: Optional bool. Whether `mysql` service starts on boot. Default `yes`.
 - `mysql_state`: Optional str. As per `ansible.builtin.systemd:state`. Default is `started` or `restarted` as required.
@@ -22,13 +18,11 @@ Role Variables
 - `mysql_users`: Optional list of dicts defining users as per `community.mysql.mysql_user`. Default `[]`.
 - `mysql_databases`: Optional list of dicts defining databases as per `community.mysql.mysql_db`. Default `[]`. See the example below.

-Dependencies
-------------
+## Dependencies

 None.
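As an illustration of the `mysql_users` and `mysql_databases` formats, here is a sketch with hypothetical names; each entry is a dict of options passed to the corresponding `community.mysql` module:

```yaml
mysql_databases:
  - name: slurm_acct_db # hypothetical database; any community.mysql.mysql_db options may be given
mysql_users:
  - name: slurm # hypothetical user
    host: "%"
    password: "{{ vault_mysql_slurm_password }}"
    priv: "slurm_acct_db.*:ALL"
```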
-Example Playbook
-----------------
+## Example Playbook

 ```yaml
 - name: Setup DB
@@ -38,15 +32,13 @@ Example Playbook
     - mysql
   tasks:
     - include_role:
-      name: mysql
+        name: mysql
 ```

-License
--------
+## License

 Apache v2

-Author Information
-------------------
+## Author Information

-Steve Brasier steveb@stackhpc.com
+Steve Brasier
diff --git a/ansible/roles/mysql/defaults/main.yml b/ansible/roles/mysql/defaults/main.yml
index b15c8003e..9d549b3f9 100644
--- a/ansible/roles/mysql/defaults/main.yml
+++ b/ansible/roles/mysql/defaults/main.yml
@@ -1,9 +1,10 @@
+---
 # required:
 # mysql_root_password:
 # TODO: make it possible to CHANGE root password
 mysql_tag: 8.0.30
-mysql_systemd_service_enabled: yes
-#mysql_state: # default is started or restarted as required
+mysql_systemd_service_enabled: true
+# mysql_state: # default is started or restarted as required
 mysql_podman_user: "{{ ansible_user }}"
 mysql_datadir: /var/lib/mysql
 mysql_mysqld_options: [] # list of str options to mysqld, see `run -it --rm mysql:tag --verbose --help`
diff --git a/ansible/roles/mysql/tasks/configure.yml b/ansible/roles/mysql/tasks/configure.yml
index d4dd4cd54..dca4bde08 100644
--- a/ansible/roles/mysql/tasks/configure.yml
+++ b/ansible/roles/mysql/tasks/configure.yml
@@ -1,6 +1,7 @@
+---
 - name: Create environment file for mysql server root password
   # NB: This doesn't trigger a restart on changes as it will be ignored once mysql is initialised
-  copy:
+  ansible.builtin.copy:
     dest: /etc/sysconfig/mysqld
     content: |
       MYSQL_INITIAL_ROOT_PASSWORD='{{ mysql_root_password }}'
@@ -9,29 +10,31 @@
     mode: u=rw,go=

 - name: Ensure mysql service state
-  systemd:
+  ansible.builtin.systemd:
     name: mysql
     state: "{{ mysql_state | default('restarted' if _mysql_unitfile.changed else 'started') }}"
     enabled: "{{ mysql_systemd_service_enabled }}"
     daemon_reload: "{{ _mysql_unitfile.changed }}"

-- block:
-    - name: Wait for mysql to initialise
+- when: "mysql_state | default('unspecified') != 'stopped'"
+  block:
+    - name: Wait for mysql to initialise
       # NB: It is not sufficient to wait_for the port
-      community.mysql.mysql_info:
-        login_user: root
-        login_password: "{{ mysql_root_password }}"
-      no_log: "{{ no_log | default(true) }}"
-      register: _mysql_info
-      until: "'version' in _mysql_info"
-      retries: 90
-      delay: 2
+      community.mysql.mysql_info:
+        login_user: root
+        login_password: "{{ mysql_root_password }}"
+      no_log: "{{ no_log | default(true) }}"
+      register: _mysql_info
+      until: "'version' in _mysql_info"
+      retries: 90
+      delay: 2

-    - name: Ensure mysql databases created
-      community.mysql.mysql_db: "{{ item }}"
-      loop: "{{ mysql_databases}}"
+    - name: Ensure mysql databases created
+      # each item is a dict of community.mysql.mysql_db options, so pass it as the module arguments
+      community.mysql.mysql_db: "{{ item }}"
+      loop: "{{ mysql_databases }}"

-    - name: Ensure mysql users present
-      community.mysql.mysql_user: "{{ item }}"
-      loop: "{{ mysql_users }}"
-  when: "mysql_state | default('unspecified') != 'stopped'"
+    - name: Ensure mysql users present
+      # each item is a dict of community.mysql.mysql_user options
+      community.mysql.mysql_user: "{{ item }}"
+      loop: "{{ mysql_users }}"
diff --git a/ansible/roles/mysql/tasks/install.yml b/ansible/roles/mysql/tasks/install.yml
index 4ed5d30ba..a3c66d758 100644
--- a/ansible/roles/mysql/tasks/install.yml
+++ b/ansible/roles/mysql/tasks/install.yml
@@ -1,22 +1,24 @@
+---
 - name: Install pip
-  dnf:
+  ansible.builtin.dnf:
     name: python3-pip

 - name: Install python mysql client
-  pip:
+  ansible.builtin.pip:
     name:
       - pymysql
       - cryptography
     state: present

 - name: Create systemd mysql container unit file
-  template:
+  ansible.builtin.template:
    dest:
/etc/systemd/system/mysql.service src: mysql.service.j2 + mode: "0644" register: _mysql_unitfile - name: Pull container image containers.podman.podman_image: name: docker.io/library/mysql tag: "{{ mysql_tag }}" - become_user: "{{ mysql_podman_user }}" + # become_user: "{{ mysql_podman_user }}" # Commenting out as become_user does not imply become: true diff --git a/ansible/roles/mysql/tasks/main.yml b/ansible/roles/mysql/tasks/main.yml index 2b65e84b4..cc29fba63 100644 --- a/ansible/roles/mysql/tasks/main.yml +++ b/ansible/roles/mysql/tasks/main.yml @@ -1,2 +1,3 @@ -- import_tasks: install.yml -- import_tasks: configure.yml +--- +- ansible.builtin.import_tasks: install.yml +- ansible.builtin.import_tasks: configure.yml diff --git a/ansible/roles/ofed/README.md b/ansible/roles/ofed/README.md index ea41df682..96138c08c 100644 --- a/ansible/roles/ofed/README.md +++ b/ansible/roles/ofed/README.md @@ -1,20 +1,21 @@ # ofed This role installs Mellanox OFED: + - It checks that the running kernel is the latest installed one, and errors if not. - Installation uses the `mlnxofedinstall` command, with support for the running kernel -and (by default) without firmware updates. + and (by default) without firmware updates. As OFED installation takes a long time generally this should only be used during image build, for example by setting: -``` +```yaml environments/groups//groups: [ofed:children] builder ``` -# Role variables +## Role variables See `defaults/main.yml` diff --git a/ansible/roles/ofed/defaults/main.yml b/ansible/roles/ofed/defaults/main.yml index 0d040b55e..b5c9a07b9 100644 --- a/ansible/roles/ofed/defaults/main.yml +++ b/ansible/roles/ofed/defaults/main.yml @@ -1,5 +1,8 @@ -ofed_version: '23.10-3.2.2.0' # LTS -ofed_download_url: https://content.mellanox.com/ofed/MLNX_OFED-{{ ofed_version }}/MLNX_OFED_LINUX-{{ ofed_version }}-{{ ofed_distro }}{{ ofed_distro_version }}-{{ ofed_arch }}.tgz +--- +ofed_version: "23.10-3.2.2.0" # LTS +# yamllint disable-line rule:line-length +ofed_download_url: https://content.mellanox.com/ofed/MLNX_OFED-{{ ofed_version }}/MLNX_OFED_LINUX-{{ ofed_version }}-{{ ofed_distro }}{{ ofed_distro_version }}-{{ + ofed_arch }}.tgz ofed_distro: rhel # NB: not expected to work on other distros due to installation differences ofed_distro_version: "{{ ansible_distribution_version }}" # e.g. '8.9' ofed_distro_major_version: "{{ ansible_distribution_major_version }}" # e.g. '8' diff --git a/ansible/roles/ofed/tasks/install.yml b/ansible/roles/ofed/tasks/install.yml index 45f341bf9..e3561ef5e 100644 --- a/ansible/roles/ofed/tasks/install.yml +++ b/ansible/roles/ofed/tasks/install.yml @@ -1,30 +1,35 @@ +--- - name: Get installed kernels - command: dnf list --installed kernel + ansible.builtin.command: dnf list --installed kernel register: _ofed_dnf_kernels changed_when: false - name: Determine running kernel - command: uname -r # e.g. 4.18.0-513.18.1.el8_9.x86_64 + ansible.builtin.command: uname -r register: _ofed_loaded_kernel changed_when: false - name: Check current kernel is newest installed - assert: + ansible.builtin.assert: that: _ofed_kernel_current == _ofed_dnf_kernels_newest fail_msg: "Kernel {{ _ofed_loaded_kernel.stdout }} is loaded but newer {{ _ofed_dnf_kernels_newest }} is installed: consider rebooting?" 
vars: _ofed_kernel_current: >- {{ _ofed_loaded_kernel.stdout | regex_replace('\.(?:.(?!\.))+$', '') | regex_replace('\.(?:.(?!\.))+$', '') }} + # yamllint disable rule:line-length _ofed_dnf_kernels_newest: >- - {{ _ofed_dnf_kernels.stdout_lines[1:] | map('split') | map(attribute=1) | map('regex_replace', '\.(?:.(?!\.))+$', '') | community.general.version_sort | last }} - # dnf line format e.g. "kernel.x86_64 4.18.0-513.18.1.el8_9 @baseos " + # yamllint disable-line rule:line-length + {{ _ofed_dnf_kernels.stdout_lines[1:] | map('split') | map(attribute=1) | map('regex_replace', '\.(?:.(?!\.))+$', '') | community.general.version_sort | last + }} + # yamllint enable rule:line-length + # dnf line format e.g. "kernel.x86_64 4.18.0-513.18.1.el8_9 @baseos " - name: Enable epel - dnf: + ansible.builtin.dnf: name: epel-release - name: Check for existing OFED installation - command: ofed_info + ansible.builtin.command: ofed_info changed_when: false failed_when: - _ofed_info.rc > 0 @@ -32,7 +37,7 @@ register: _ofed_info - name: Install build prerequisites - dnf: + ansible.builtin.dnf: name: "{{ ofed_build_packages + (ofed_build_rl8_packages if ofed_distro_major_version == '8' else []) }}" when: "'MLNX_OFED_LINUX-' + ofed_version not in _ofed_info.stdout" # don't want to install a load of prereqs unnecessarily @@ -41,13 +46,13 @@ ansible.builtin.unarchive: src: "{{ ofed_download_url }}" dest: "{{ ofed_tmp_dir }}" - remote_src: yes - become: no + remote_src: true + become: false when: "'MLNX_OFED_LINUX-' + ofed_version not in _ofed_info.stdout" # Below from https://docs.nvidia.com/networking/display/mlnxofedv24010331/user+manual -- name: Run OFED install script - command: +- name: Run OFED install script # noqa: no-changed-when + ansible.builtin.command: cmd: > ./mlnxofedinstall --add-kernel-support @@ -63,13 +68,13 @@ async: "{{ 45 * 60 }}" # wait for up to 45 minutes poll: 15 # check every 15 seconds -- name: Update initramfs - command: +- name: Update initramfs # noqa: no-changed-when + ansible.builtin.command: cmd: dracut -f when: '"update your initramfs" in _ofed_install.stdout | default("")' failed_when: false # always shows errors due to deleted modules for inbox RDMA drivers -- name: Load the new driver - command: +- name: Load the new driver # noqa: no-changed-when + ansible.builtin.command: cmd: /etc/init.d/openibd restart when: '"To load the new driver" in _ofed_install.stdout | default("")' diff --git a/ansible/roles/ofed/tasks/main.yml b/ansible/roles/ofed/tasks/main.yml index e7a272f38..df97825c1 100644 --- a/ansible/roles/ofed/tasks/main.yml +++ b/ansible/roles/ofed/tasks/main.yml @@ -1 +1,2 @@ -- include_tasks: install.yml +--- +- ansible.builtin.include_tasks: install.yml diff --git a/ansible/roles/openondemand/README.md b/ansible/roles/openondemand/README.md index 099276c7e..b1fb6731c 100644 --- a/ansible/roles/openondemand/README.md +++ b/ansible/roles/openondemand/README.md @@ -17,9 +17,14 @@ This uses the [osc.ood](https://github.com/OSC/ood-ansible) Ansible role to prov ### General - `openondemand_clusters`: Required. Synonym for [osc.ood: clusters](https://github.com/OSC/ood-ansible#clusters) role variable. -- `openondemand_servername`: Required. Synonym for [osc.ood: servername](https://github.com/OSC/ood-ansible/blob/master/defaults/main/ood_portal.yml#L27) role variable. This defines what the Open Ondemand portal's Apache server uses for the [name-based virtual host](https://httpd.apache.org/docs/current/mod/core.html#servername). 
It should be the IP or hostname(+domain) part of the URL used to access Open Ondemand in the browser, e.g. `ondemand.mysite.org`. **NB:** If a domain or external IP is not available, specify the host's internal IP here and use SSH with a `DynamicForward` option and a SOCKS proxy to access this address.
+  Using SSH's `LocalForward` option is not recommended as the server name will have to be `localhost` which causes some issues.
+  Changing this value on an already deployed cluster requires a reboot of the login node for OOD app state to be correctly refreshed.

 ### Authentication
+
 See the Open Ondemand [Authentication docs](https://osc.github.io/ood-documentation/latest/authentication/overview.html) for an overview of the authentication process.

 - `openondemand_auth`: Required. Authentication method, either `'oidc'` or `'basic_pam'`. See relevant subsection below.
@@ -28,36 +33,41 @@ See the Open Ondemand [Authentication docs](https://osc.github.io/ood-documentat
 - `openondemand_username`: The remote authenticated username. See also `openondemand_oidc_remote_user_claim` if using OIDC authentication.

 #### OIDC authentication
+
 The following variables are active when `openondemand_auth` is `oidc`. This role uses the variables below plus a few required defaults to set the `osc.ood: ood_auth_openidc` [variable](https://github.com/OSC/ood-ansible#open-id-connect) - if the below is insufficient to correctly configure OIDC then set `ood_auth_openidc` directly.
+
 - `openondemand_oidc_client_id`: Required. Client ID, as specified by the OIDC provider.
 - `openondemand_oidc_client_secret`: Required. Client secret, as specified by the OIDC provider (should be vault-protected).
 - `openondemand_oidc_provider_url`: Required. URL including protocol for the OIDC provider.
 - `openondemand_oidc_crypto_passphrase`: Required. Random string (should be vault-protected).
 - `openondemand_oidc_scope`: Optional. A space-separated string giving the [OIDC scopes](https://auth0.com/docs/configure/apis/scopes/openid-connect-scopes) to request from the OIDC provider. What is available depends on the provider. Default: `openid profile preferred_username`.
 - `openondemand_oidc_remote_user_claim`: Optional. A string giving the [OIDC claim](https://auth0.com/docs/configure/apis/scopes/openid-connect-scopes#standard-claims) to use as the remote username.
What is available depends on the provider and the claims made. Default: `preferred_username`. The OIDC provider should be configured to redirect to `https://{{ openondemand_servername }}/oidc` with scopes as appropriate for `openondemand_oidc_scope`. - #### Basic/PAM authentication + This option uses HTTP Basic Authentication (i.e. browser prompt) to get a username and password. This is then checked against an existing local user using PAM. Note that HTTPS is configured by default, so the password is protected in transit, although there are [other](https://security.stackexchange.com/a/990) security concerns with Basic Authentication. No other authentication options are required for this method. ### SSL Certificates + This role enables SSL on the Open Ondemand server, using the following self-signed certificate & key which are autogenerated by the `mod_ssl` package installed as part of the `ondemand-apache` package. Replace with your own keys if required. + - `openondemand_ssl_cert`: Optional. Default `/etc/pki/tls/certs/localhost.crt`. - `openondemand_ssl_cert_key`: Optional. Default `/etc/pki/tls/private/localhost.key` ### Dashboard and application configuration + - `openondemand_dashboard_docs_url`: Optional. URL of docs to show under Help in dashboard. Default `(undefined)`. - `openondemand_dashboard_links`: Optional. List of mappings defining additional links to add as menu items in the dashboard. Keys are: - - `name`: Required. User-facing name for the link. - - `category`: Required. Menu to add link under, either a default one (e.g. `Files`, `Jobs`, `Clusters`, `Interactive Apps`) or a new category to add. - - `icon`: Optional. URL of icon, defaults to Open Ondemand clock icon as used in standard menus. - - `url`: Required. URL of link. - - `new_window`: Optional. Whether to open link in new window. Bool, default `false`. - - `app_name`: Optional. Unique name for app appended to `/var/www/ood/apps/sys/`. Default is `name`, useful if that is not unique or not suitable as a path component. + - `name`: Required. User-facing name for the link. + - `category`: Required. Menu to add link under, either a default one (e.g. `Files`, `Jobs`, `Clusters`, `Interactive Apps`) or a new category to add. + - `icon`: Optional. URL of icon, defaults to Open Ondemand clock icon as used in standard menus. + - `url`: Required. URL of link. + - `new_window`: Optional. Whether to open link in new window. Bool, default `false`. + - `app_name`: Optional. Unique name for app appended to `/var/www/ood/apps/sys/`. Default is `name`, useful if that is not unique or not suitable as a path component. - `openondemand_dashboard_support_url`: Optional. URL or email etc to show as support contact under Help in dashboard. Default `(undefined)`. - `openondemand_desktop_partition`: Optional. Name of Slurm partition to use for remote desktops. Requires a corresponding group named "openondemand_desktop" and entry in openhpc_partitions. - `openondemand_desktop_screensaver`: Optional. Whether to enable screen locking/screensaver. **NB:** Users must have passwords if this is enabled. Bool, default `false`. @@ -65,16 +75,19 @@ This role enables SSL on the Open Ondemand server, using the following self-sign - `openondemand_jupyter_partition`: Required. Name of Slurm partition to use for Jupyter Notebook servers. Requires a corresponding group named "openondemand_jupyter" and entry in openhpc_partitions. ### Monitoring + - `openondemand_exporter`: Optional. 
Install the Prometheus [ondemand_exporter](https://github.com/OSC/ondemand_exporter) on the `openondemand` node to export metrics about Open Ondemand itself. Default `true`.

 ### Proxying
+
 The Open Ondemand portal can proxy other servers. Variables:

-- `openondemand_host_regex`: Synomyn for the `osc.ood: host_regex` [variable](https://osc.github.io/ood-documentation/latest/app-development/interactive/setup/enable-reverse-proxy.html). A Python regex matching servernames which Open Ondemand should proxy. Enables proxying and restricts which addresses are proxied (for security). E.g. this might be:
+- `openondemand_host_regex`: Synonym for the `osc.ood: host_regex` [variable](https://osc.github.io/ood-documentation/latest/app-development/interactive/setup/enable-reverse-proxy.html). A Python regular expression matching servernames which Open Ondemand should proxy. Enables proxying and restricts which addresses are proxied (for security). E.g. this might be:

   `'({{ openhpc_cluster_name }}-compute-\d+)|({{ groups["grafana"] | first }})'`

   to proxy:
+
   - All "compute" nodes, e.g. for Open Ondemand interactive apps such as remote desktop and Jupyter notebook server.
   - The Grafana server - note a link to Grafana is always added to the Open Ondemand dashboard.

@@ -83,21 +96,22 @@ The Open Ondemand portal can proxy other servers. Variables:

 - `openondemand_node_proxy_directives`: Optional, default ''. Multiline string to insert into Apache directives definition for `node_uri` ([docs](https://osc.github.io/ood-documentation/master/reference/files/ood-portal-yml.html#configure-reverse-proxy)). Note that:
+
 - If Open Ondemand and Grafana are deployed, Grafana is automatically configured so that proxying it through Open Ondemand works.
 - The `osc.ood` role variables `node_uri` and `rnode_uri` are set automatically if `openondemand_host_regex` is set.

-# Dependencies
+## Dependencies

 - `osc.ood` role as described above.

-# Example Playbook
+## Example Playbook

 See `ansible/portal.yml`. Note the `main` playbook should be run on the `openondemand` node (i.e. the node to configure as hosting the Open Ondemand server/portal), and the other playbooks should be run on some subset of the `compute` group.

-# License
+## License

 Apache v2

-# Author Information
+## Author Information

 StackHPC Ltd.
diff --git a/ansible/roles/openondemand/defaults/main.yml b/ansible/roles/openondemand/defaults/main.yml
index 23359f01c..f45194b27 100644
--- a/ansible/roles/openondemand/defaults/main.yml
+++ b/ansible/roles/openondemand/defaults/main.yml
@@ -3,7 +3,6 @@
 # Authentication:
 openondemand_auth: # "oidc" or "basic_pam"
 openondemand_mapping_users: []
-
 ## Variables for `openondemand_auth=oidc` :
 openondemand_oidc_client_id:
 openondemand_oidc_client_secret:
@@ -19,22 +18,20 @@ openondemand_ssl_cert_key: /etc/pki/tls/private/localhost.key
 # Dashboard and application config:
 openondemand_dashboard_docs_url: (undefined)
 openondemand_dashboard_support_url: (undefined)
-openondemand_desktop_partition: ''
+openondemand_desktop_partition: ""
 openondemand_desktop_screensaver: false
 openondemand_filesapp_paths: []
-openondemand_jupyter_partition: ''
+openondemand_jupyter_partition: ""
 openondemand_dashboard_links: []
-
 # Monitoring:
 openondemand_exporter: true

 # Synonyms for osc:ood role vars:
 openondemand_clusters: {} # synonym for osc.ood:clusters
-openondemand_servername: ''
+openondemand_servername: ""
 openondemand_host_regex:
-
 # Other:
-openondemand_node_proxy_directives: '' # Added to Apache directives for `node_uri` forwarding.
+openondemand_node_proxy_directives: "" # Added to Apache directives for `node_uri` forwarding.

 openondemand_auth_defaults: # Defaults for OIDC auth - keys are osc.ood vars & can be overridden using the osc.ood var name in inventory
@@ -51,23 +48,23 @@ openondemand_auth_defaults:
       OIDCScope: "{{ openondemand_oidc_scope }}"
       OIDCRemoteUserClaim: "{{ openondemand_oidc_remote_user_claim }}"
     httpd_auth: # ood_portal.yml.j2
-      - 'AuthType openid-connect'
-      - 'Require valid-user'
-      - 'ProxyPreserveHost On' # see under https://grafana.com/blog/2022/02/08/grafana-7.5.15-and-8.3.5-released-with-moderate-severity-security-fixes/
+      - "AuthType openid-connect"
+      - "Require valid-user"
+      - "ProxyPreserveHost On" # see under https://grafana.com/blog/2022/02/08/grafana-7.5.15-and-8.3.5-released-with-moderate-severity-security-fixes/
     user_map_cmd: /opt/ood/ood_auth_map/bin/ood_auth_map.mapfile
     user_map_match: none
-  
+
   # Defaults for basic/PAM auth - see https://osc.github.io/ood-documentation/latest/authentication/pam.html
   basic_pam:
     httpd_auth: # ood_portal.yml.j2
-      - 'AuthType Basic'
+      - "AuthType Basic"
       - 'AuthName "Open OnDemand"'
-      - 'AuthBasicProvider PAM'
-      - 'AuthPAMService ood'
-      - 'Require valid-user'
-      - 'ProxyPreserveHost On' # see under https://grafana.com/blog/2022/02/08/grafana-7.5.15-and-8.3.5-released-with-moderate-severity-security-fixes/
-    user_map_cmd: null
-    user_map_match: '.*'
+      - "AuthBasicProvider PAM"
+      - "AuthPAMService ood"
+      - "Require valid-user"
+      - "ProxyPreserveHost On" # see under https://grafana.com/blog/2022/02/08/grafana-7.5.15-and-8.3.5-released-with-moderate-severity-security-fixes/
+    user_map_cmd:
+    user_map_match: ".*"

 # The below mapping is used to override osc.ood defaults. Keys are osc.ood variable names.
 # If you need to override *these* defaults (i.e. this role's vars are not sufficient) just set the corresponding osc.ood var as normal.
@@ -91,7 +88,7 @@ openondemand_osc_ood_defaults:
     - SSLHonorCipherOrder On
     - SSLCompression off
     - SSLSessionTickets Off
-  
+
   # User mapping:
   user_map_cmd: "{{ openondemand_auth_defaults[openondemand_auth | lower].user_map_cmd }}"
   user_map_match: "{{ openondemand_auth_defaults[openondemand_auth | lower].user_map_match }}"
diff --git a/ansible/roles/openondemand/files/missing_home_directory.html b/ansible/roles/openondemand/files/missing_home_directory.html
index db790c9b7..512fb920f 100644
--- a/ansible/roles/openondemand/files/missing_home_directory.html
+++ b/ansible/roles/openondemand/files/missing_home_directory.html
@@ -1,49 +1,54 @@

[This hunk's HTML markup was lost in extraction and cannot be recovered; only the page text survives. Both the old and new versions of missing_home_directory.html render the same content: a "Home directory not found" heading; an explanation that the user's home directory appears to be missing and that, on a first login with this account, accessing the system over SSH may be needed to trigger its creation; and "Open Shell to create home directory" and "Restart Web Server" actions. The new version only re-wraps and re-indents the markup.]
+ diff --git a/ansible/roles/openondemand/tasks/config_changes.yml b/ansible/roles/openondemand/tasks/config_changes.yml index f83c670b5..835411dca 100644 --- a/ansible/roles/openondemand/tasks/config_changes.yml +++ b/ansible/roles/openondemand/tasks/config_changes.yml @@ -1,5 +1,6 @@ +--- - name: Add Apache directives for node_uri forwarding - blockinfile: + ansible.builtin.blockinfile: path: /opt/ood/ood-portal-generator/templates/ood-portal.conf.erb block: "{{ openondemand_node_proxy_directives }}" insertafter: ' Header edit Set-Cookie "\^\(\[\^;\]\+\)" "\$1; Path=<%= @node_uri %>\/%{MATCH_HOST}e\/%{MATCH_PORT}e"' diff --git a/ansible/roles/openondemand/tasks/exporter.yml b/ansible/roles/openondemand/tasks/exporter.yml index e3c387a8d..f9100f7a0 100644 --- a/ansible/roles/openondemand/tasks/exporter.yml +++ b/ansible/roles/openondemand/tasks/exporter.yml @@ -1,10 +1,11 @@ +--- - name: Install ondemand prometheus exporter - yum: + ansible.builtin.dnf: name: ondemand_exporter when: openondemand_exporter - name: Start and enable ondemand prometheus exporter - service: + ansible.builtin.service: name: ondemand_exporter enabled: true state: started diff --git a/ansible/roles/openondemand/tasks/jupyter_compute.yml b/ansible/roles/openondemand/tasks/jupyter_compute.yml index a87d07da2..6df0c785f 100644 --- a/ansible/roles/openondemand/tasks/jupyter_compute.yml +++ b/ansible/roles/openondemand/tasks/jupyter_compute.yml @@ -1,32 +1,32 @@ +--- # Should be run on compute nodes you want to run jupyter notebook on # See https://osc.github.io/ood-documentation/latest/app-development/tutorials-interactive-apps/add-jupyter/software-requirements.html # - Will already have openssl and lmod - name: Ensure python3.9 installed - dnf: + ansible.builtin.dnf: name: python39 tags: install - name: Install jupyter venv # Requires separate step so that the upgraded pip is used to install packages - pip: + ansible.builtin.pip: name: pip - state: latest + state: latest # noqa: package-latest virtualenv: /opt/jupyter-py39 virtualenv_command: python3.9 -m venv tags: install - name: Copy jupyter requirements file - copy: + ansible.builtin.copy: src: jupyter_requirements.txt dest: /opt/jupyter-py39/jupyter_requirements.txt + mode: "0644" tags: install - name: Install jupyter package in venv - pip: + ansible.builtin.pip: virtualenv: /opt/jupyter-py39 virtualenv_command: python3.9 -m venv requirements: /opt/jupyter-py39/jupyter_requirements.txt tags: install - - diff --git a/ansible/roles/openondemand/tasks/main.yml b/ansible/roles/openondemand/tasks/main.yml index bd5706ecb..783be8911 100644 --- a/ansible/roles/openondemand/tasks/main.yml +++ b/ansible/roles/openondemand/tasks/main.yml @@ -1,7 +1,6 @@ --- - - name: Set osc.ood variables from this role's defaults if no overriding inventory var - set_fact: + ansible.builtin.set_fact: "{{ item.key }}": "{{ lookup('vars', item.key, default=item.value) }}" loop: "{{ openondemand_osc_ood_defaults | dict2items }}" when: (item.key in hostvars[inventory_hostname]) or (item.value) @@ -14,47 +13,48 @@ file: "{{ playbook_dir }}/roles/osc.ood/vars/Rocky/{{ ansible_distribution_major_version }}.yml" # if using PAM auth we need apache installed but NOT started so split the osc.ood role up: -- include_role: +- ansible.builtin.include_role: name: osc.ood tasks_from: install-package.yml vars_from: "Rocky/{{ ansible_distribution_major_version }}.yml" when: appliances_mode != 'configure' # can't set vars: from a dict hence the workaround above -- include_tasks: +- 
ansible.builtin.include_tasks: file: pam_auth.yml when: openondemand_auth | lower == 'basic_pam' -- include_tasks: +- ansible.builtin.include_tasks: file: config_changes.yml # The configure.yml playbook needs vars from Rocky (for nginx) and main if using OIDC auth. However vars_from doensn't take a list. # include_vars doens't interpolate from role vars, so we use that for main.yml which only requires things we override in the appliance inventory # and use vars_from for Rocky which requires interpolation from role vars. -#- include_vars: -# file: roles/osc.ood/vars/main.yml +# - include_vars: +# file: roles/osc.ood/vars/main.yml -- include_role: +- ansible.builtin.include_role: name: osc.ood tasks_from: configure.yml vars_from: main.yml - public: yes + public: true -- include_role: +- ansible.builtin.include_role: name: osc.ood tasks_from: install-apps.yml when: ood_install_apps -- include_role: +- ansible.builtin.include_role: name: osc.ood tasks_from: apps.yml # vars_from: Rocky.yml when: ood_apps - name: Ensure post_tasks dirs exists - file: + ansible.builtin.file: path: "{{ item }}" state: directory + mode: "0755" loop: # - /etc/ood/config/apps/dashboard/initializers - /etc/ood/config/locales @@ -62,15 +62,15 @@ - /etc/ood/config/pun/html - name: Create dashboard additional config directory - file: + ansible.builtin.file: path: /etc/ood/config/apps/dashboard/initializers state: directory - recurse: yes + recurse: true owner: root mode: o=rwX,go=rX - name: Create additional shortcuts in Files app - template: + ansible.builtin.template: src: files_shortcuts.rb.j2 dest: /etc/ood/config/apps/dashboard/initializers/ood.rb owner: root @@ -78,21 +78,22 @@ when: openondemand_filesapp_paths - name: Create job template directory - file: + ansible.builtin.file: path: "/etc/ood/config/apps/myjobs/templates/" state: directory - recurse: True + recurse: true owner: root group: root mode: o=rwX,go=rX - name: Copy web page to let users create their home directory - copy: + ansible.builtin.copy: src: missing_home_directory.html dest: /etc/ood/config/pun/html/missing_home_directory.html + mode: "0644" - name: Create mapping directory - file: + ansible.builtin.file: path: /etc/grid-security state: directory owner: root @@ -101,7 +102,7 @@ when: openondemand_mapping_users - name: Create mapping file - template: + ansible.builtin.template: dest: /etc/grid-security/grid-mapfile src: grid-mapfile.j2 owner: root @@ -110,15 +111,17 @@ when: openondemand_mapping_users - name: Create app directories for dashboard links - file: + ansible.builtin.file: path: /var/www/ood/apps/sys/{{ item.app_name | default(item.name) }} state: directory + mode: "0755" loop: "{{ openondemand_dashboard_links }}" - name: Create app manifests for dashboard links - template: + ansible.builtin.template: src: dashboard_app_links.yml.j2 dest: /var/www/ood/apps/sys/{{ item.app_name | default(item.name) }}/manifest.yml + mode: "0644" loop: "{{ openondemand_dashboard_links }}" # - name: Ensure ondemand-dex is running and active @@ -137,13 +140,13 @@ # - /usr/share/ondemand-dex/web/themes/ - name: Keyscan login host - command: + ansible.builtin.command: cmd: "ssh-keyscan {{ openondemand_clusters.slurm.v2.login.host }}" register: _openondemand_login_key changed_when: false - name: Add login hostkeys to known hosts - blockinfile: + ansible.builtin.blockinfile: path: /etc/ssh/ssh_known_hosts create: true block: "{{ _openondemand_login_key.stdout_lines | sort | join('\n') }}" diff --git a/ansible/roles/openondemand/tasks/pam_auth.yml 
b/ansible/roles/openondemand/tasks/pam_auth.yml index 6bc4bda36..2cf8a5b15 100644 --- a/ansible/roles/openondemand/tasks/pam_auth.yml +++ b/ansible/roles/openondemand/tasks/pam_auth.yml @@ -1,31 +1,31 @@ # https://osc.github.io/ood-documentation/latest/authentication/pam.html --- - name: Install Apache PAM module # Extracted from start of roles/openondemand/tasks/pam_auth.yml to ensure only installed during build - yum: + ansible.builtin.dnf: name: mod_authnz_pam - name: Enable Apache PAM module - lineinfile: + ansible.builtin.lineinfile: path: /etc/httpd/conf.modules.d/55-authnz_pam.conf line: LoadModule authnz_pam_module modules/mod_authnz_pam.so regexp: ^LoadModule authnz_pam_module modules/mod_authnz_pam.so - name: Set PAM service # TODO: might need subsequent modification?? - command: + ansible.builtin.command: cmd: cp /etc/pam.d/sshd /etc/pam.d/ood creates: /etc/pam.d/ood - name: Allow the Apache user to read /etc/shadow - file: + ansible.builtin.file: path: /etc/shadow - mode: 0640 + mode: "0640" group: apache - name: Allow httpd access to PAM in SELinux ansible.posix.seboolean: name: httpd_mod_auth_pam - state: yes - persistent: yes + state: true + persistent: true when: ansible_facts.selinux.status == 'enabled' # TODO: do we need to restart OOD here?? diff --git a/ansible/roles/openondemand/tasks/validate.yml b/ansible/roles/openondemand/tasks/validate.yml index 92e83d344..b22f51b86 100644 --- a/ansible/roles/openondemand/tasks/validate.yml +++ b/ansible/roles/openondemand/tasks/validate.yml @@ -1,4 +1,5 @@ +--- - name: Check Open Ondemand servername is defined - assert: + ansible.builtin.assert: that: openondemand_servername != '' fail_msg: "Variable `openondemand_servername` must be set on openondemand and (by default) grafana hosts. See ansible/roles/openondemand/README.md" diff --git a/ansible/roles/openondemand/tasks/vnc_compute.yml b/ansible/roles/openondemand/tasks/vnc_compute.yml index 8b6f6cdec..dc8d72b75 100644 --- a/ansible/roles/openondemand/tasks/vnc_compute.yml +++ b/ansible/roles/openondemand/tasks/vnc_compute.yml @@ -1,13 +1,15 @@ +--- # Should be run on compute nodes you want to run the graphical desktop on - name: Enable TurboVNC repo tags: install - get_url: + ansible.builtin.get_url: url: https://raw.githubusercontent.com/TurboVNC/repo/main/TurboVNC.repo dest: /etc/yum.repos.d/TurboVNC.repo + mode: "0644" - name: Install EPEL tags: install - yum: + ansible.builtin.dnf: name: epel-release - name: Check /etc/init.d @@ -28,7 +30,7 @@ - name: Install VNC-related packages tags: install - dnf: + ansible.builtin.dnf: name: - turbovnc-3.0.1 - nmap-ncat @@ -37,7 +39,7 @@ - name: Stop turbovnc service # This is not actually required - systemd: + ansible.builtin.systemd: name: tvncserver state: stopped enabled: false @@ -47,16 +49,18 @@ src: /etc/init.d.orig/ # trailing / to get contents dest: /etc/init.d remote_src: true + directory_mode: "0755" + mode: "0644" when: - init_d.stat.exists - not init_d.stat.islnk - name: Install Xfce desktop tags: install - yum: - name: '@Xfce' + ansible.builtin.dnf: + name: "@Xfce" when: appliances_mode != 'configure' # dnf group/module installs aren't idempotent so only run during build - + # - name: Ensure python3.9 installed # dnf: # name: python39 @@ -64,22 +68,23 @@ - name: Install websockify venv # Requires separate step so that the upgraded pip is used to install packages - pip: + ansible.builtin.pip: name: pip - state: latest + state: latest # noqa: package-latest virtualenv: /opt/websockify-py39 virtualenv_command: python3.9 -m 
venv tags: install - name: Install websockify package in venv - pip: + ansible.builtin.pip: name: websockify virtualenv: /opt/websockify-py39 virtualenv_command: python3 -m venv tags: install - name: Symlink websockify to where Open Ondemand expects - file: "{{ item }}" + ansible.builtin.file: + path: "{{ item }}" loop: - path: /opt/websockify state: directory @@ -87,7 +92,7 @@ dest: /opt/websockify/run state: link - name: Disable screensaver # as users might not have passwords - yum: + ansible.builtin.dnf: name: xfce4-screensaver state: absent when: not (openondemand_desktop_screensaver | bool) diff --git a/ansible/roles/opensearch/defaults/main.yml b/ansible/roles/opensearch/defaults/main.yml index 69e7f9c25..1b05521ad 100644 --- a/ansible/roles/opensearch/defaults/main.yml +++ b/ansible/roles/opensearch/defaults/main.yml @@ -1,9 +1,9 @@ --- # Used to set passwords -#opensearch_internal_users_path: +# opensearch_internal_users_path: opensearch_podman_user: "{{ ansible_user }}" -opensearch_version: '2.9.0' # https://hub.docker.com/r/opensearchproject/opensearch/tags +opensearch_version: "2.9.0" # https://hub.docker.com/r/opensearchproject/opensearch/tags opensearch_config_path: /usr/share/opensearch/config opensearch_data_path: /usr/share/opensearch/data opensearch_state: started # will be restarted if required diff --git a/ansible/roles/opensearch/handlers/main.yml b/ansible/roles/opensearch/handlers/main.yml index d3a040dbb..61f5bbfe6 100644 --- a/ansible/roles/opensearch/handlers/main.yml +++ b/ansible/roles/opensearch/handlers/main.yml @@ -1,7 +1,6 @@ --- - - name: Restart opensearch service - systemd: + ansible.builtin.systemd: name: opensearch.service state: "{{ 'restarted' if 'started' in opensearch_state else opensearch_state }}" enabled: "{{ opensearch_systemd_service_enabled }}" diff --git a/ansible/roles/opensearch/tasks/archive_data.yml b/ansible/roles/opensearch/tasks/archive_data.yml index 298f66a8e..cb3403e79 100644 --- a/ansible/roles/opensearch/tasks/archive_data.yml +++ b/ansible/roles/opensearch/tasks/archive_data.yml @@ -1,8 +1,9 @@ +--- # Remove data which was NOT indexed by Slurm Job ID # It will be re-ingested by filebeat from the slurmdbd, with that index - name: Ensure opensearch stopped - systemd: + ansible.builtin.systemd: name: opensearch state: stopped register: _opensearch_stop @@ -15,3 +16,4 @@ path: "{{ opensearch_data_path }}" dest: "{{ opensearch_data_path | dirname }}/data-{{ lookup('pipe', 'date --iso-8601=minutes') }}.tar.gz" remove: true + mode: "0644" diff --git a/ansible/roles/opensearch/tasks/certs.yml b/ansible/roles/opensearch/tasks/certs.yml index e40f65242..4eee580a3 100644 --- a/ansible/roles/opensearch/tasks/certs.yml +++ b/ansible/roles/opensearch/tasks/certs.yml @@ -1,5 +1,6 @@ +--- - name: Ensure host certs directory exists - file: + ansible.builtin.file: path: "{{ opensearch_config_path }}/certs" state: directory owner: "{{ opensearch_podman_user }}" diff --git a/ansible/roles/opensearch/tasks/install.yml b/ansible/roles/opensearch/tasks/install.yml index 9a0ffd361..f9bc1f8d0 100644 --- a/ansible/roles/opensearch/tasks/install.yml +++ b/ansible/roles/opensearch/tasks/install.yml @@ -1,25 +1,27 @@ +--- # safe to use during build - name: Increase maximum number of virtual memory maps # see https://opensearch.org/docs/2.0/opensearch/install/important-settings/ ansible.posix.sysctl: name: vm.max_map_count - value: '262144' + value: "262144" state: present - reload: yes + reload: true - name: Create systemd unit file - template: + 
ansible.builtin.template: dest: /etc/systemd/system/opensearch.service src: opensearch.service.j2 + mode: "0644" register: _opensearch_unit - name: Pull container image containers.podman.podman_image: name: docker.io/opensearchproject/opensearch tag: "{{ opensearch_version }}" - become_user: "{{ opensearch_podman_user }}" + # become_user: "{{ opensearch_podman_user }}" # Commenting out as become_user does not imply become: true -- name: Reload opensearch unit file - command: systemctl daemon-reload - when: _opensearch_unit.changed +- name: Reload opensearch unit file # noqa: no-changed-when + ansible.builtin.command: systemctl daemon-reload # noqa: command-instead-of-module + when: _opensearch_unit.changed # noqa: no-handler diff --git a/ansible/roles/opensearch/tasks/migrate-opendistro.yml b/ansible/roles/opensearch/tasks/migrate-opendistro.yml index 7cb5c8190..fd239bc11 100644 --- a/ansible/roles/opensearch/tasks/migrate-opendistro.yml +++ b/ansible/roles/opensearch/tasks/migrate-opendistro.yml @@ -1,3 +1,4 @@ +--- # Migrate data from existing containerised opendistro v1.12.0 to containerised opensearch 2.1.0. # # This relies on: @@ -22,7 +23,7 @@ dest: "{{ opensearch_data_path | dirname }}/" # copying a directory, so need to specify the parent for destination owner: "{{ opensearch_podman_user }}" group: "{{ opensearch_podman_user }}" - mode: 0770 + mode: "0770" vars: # from environments/common/inventory/group_vars/all/opendistro.yml: _default_opendistro_data_path: "{{ appliances_state_dir | default('/usr/share') }}/elasticsearch/data" diff --git a/ansible/roles/opensearch/tasks/runtime.yml b/ansible/roles/opensearch/tasks/runtime.yml index 7fe197abe..7247f1561 100644 --- a/ansible/roles/opensearch/tasks/runtime.yml +++ b/ansible/roles/opensearch/tasks/runtime.yml @@ -1,55 +1,54 @@ --- - - name: Check for existing opendistro service - stat: + ansible.builtin.stat: path: /etc/systemd/system/opendistro.service register: _opensearch_opendistro_service - name: Migrate opendistro data - import_tasks: + ansible.builtin.import_tasks: file: migrate-opendistro.yml when: _opensearch_opendistro_service.stat.exists - name: Remove opendistro service - file: + ansible.builtin.file: path: /etc/systemd/system/opendistro.service state: absent - name: Enumerate files in data directory - find: + ansible.builtin.find: path: "{{ opensearch_data_path }}" register: _find_opensearch_data - name: Archive incorrectly indexed data - import_tasks: archive_data.yml + ansible.builtin.import_tasks: archive_data.yml when: - _find_opensearch_data.files | length > 0 - "'slurm_jobid_index' not in _find_opensearch_data.files | map(attribute='path') | map('basename')" - name: Ensure required opensearch host directories exist - file: + ansible.builtin.file: state: directory path: "{{ item }}" owner: "{{ opensearch_podman_user }}" group: "{{ opensearch_podman_user }}" - mode: 0770 + mode: "0770" become: true loop: - "{{ opensearch_config_path }}" - "{{ opensearch_data_path }}" - name: Set indexed data flag - copy: + ansible.builtin.copy: dest: "{{ opensearch_data_path }}/slurm_jobid_index" content: | This is a flag file to indicate that filebeat is pushing data indexed by Slurm JobID to prevent duplicate OpenSearch records owner: "{{ opensearch_podman_user }}" group: "{{ opensearch_podman_user }}" + mode: "0644" - name: Create certs - import_tasks: certs.yml - + ansible.builtin.import_tasks: certs.yml - name: Template general configuration ansible.builtin.template: src: opensearch.yml.j2 @@ -58,27 +57,26 @@ group: "{{ 
opensearch_podman_user }}" # NOTE: root user in container maps to user on host, so this will appear as # owned by root in the container. - mode: 0660 + mode: "0660" notify: Restart opensearch service become: true - name: Template internal user configuration - template: - src: "{{ opensearch_internal_users_path }}" - dest: "{{ opensearch_config_path }}/internal_users.yml" - owner: "{{ opensearch_podman_user }}" - group: "{{ opensearch_podman_user }}" - # NOTE: root user in container maps to user on host, so this will appear as - # owned by root in the container. - mode: 0660 + ansible.builtin.template: + src: "{{ opensearch_internal_users_path }}" + dest: "{{ opensearch_config_path }}/internal_users.yml" + owner: "{{ opensearch_podman_user }}" + group: "{{ opensearch_podman_user }}" + # NOTE: root user in container maps to user on host, so this will appear as + # owned by root in the container. + mode: "0660" notify: Restart opensearch service become: true - name: Flush handlers - meta: flush_handlers - + ansible.builtin.meta: flush_handlers - name: Ensure opensearch service state - systemd: + ansible.builtin.systemd: name: opensearch.service state: "{{ opensearch_state }}" enabled: "{{ opensearch_systemd_service_enabled }}" diff --git a/ansible/roles/passwords/defaults/main.yml b/ansible/roles/passwords/defaults/main.yml index 95e3b6aca..a848431c2 100644 --- a/ansible/roles/passwords/defaults/main.yml +++ b/ansible/roles/passwords/defaults/main.yml @@ -1,7 +1,9 @@ --- slurm_appliance_secrets: + # yamllint disable-line rule:line-length vault_grafana_admin_password: "{{ secrets_openhpc_grafana_admin_password | default(vault_grafana_admin_password | default(lookup('password', '/dev/null'))) }}" + # yamllint disable-line rule:line-length vault_elasticsearch_admin_password: "{{ secrets_openhpc_elasticsearch_admin_password | default(vault_elasticsearch_admin_password | default(lookup('password', '/dev/null'))) }}" vault_mysql_root_password: "{{ secrets_openhpc_mysql_root_password | default(vault_mysql_root_password | default(lookup('password', '/dev/null'))) }}" vault_mysql_slurm_password: "{{ secrets_openhpc_mysql_slurm_password | default(vault_mysql_slurm_password | default(lookup('password', '/dev/null'))) }}" @@ -16,4 +18,5 @@ slurm_appliance_secrets: secrets_openhpc_mungekey_default: content: "{{ lookup('pipe', 'dd if=/dev/urandom bs=1 count=1024 2>/dev/null | base64') }}" +# yamllint disable-line rule:line-length openhpc_passwords_output_path: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') | default(undefined, true) | mandatory('You must define the APPLIANCES_ENVIRONMENT_ROOT environment variable') }}/inventory/group_vars/all/secrets.yml" diff --git a/ansible/roles/passwords/tasks/main.yml b/ansible/roles/passwords/tasks/main.yml index 743a6cda8..cb41cbbce 100644 --- a/ansible/roles/passwords/tasks/main.yml +++ b/ansible/roles/passwords/tasks/main.yml @@ -1,8 +1,8 @@ --- - - name: Template passwords - template: + ansible.builtin.template: src: passwords.yml dest: "{{ openhpc_passwords_output_path }}" + mode: "0644" delegate_to: localhost run_once: true diff --git a/ansible/roles/passwords/tasks/validate.yml b/ansible/roles/passwords/tasks/validate.yml index b30b0696e..6cde14445 100644 --- a/ansible/roles/passwords/tasks/validate.yml +++ b/ansible/roles/passwords/tasks/validate.yml @@ -1,4 +1,5 @@ +--- - name: Assert secrets created - assert: + ansible.builtin.assert: that: (hostvars[inventory_hostname].keys() | select('contains', 'vault_') | length) > 1 # 1 as may have 
vault_demo_user_password defined in dev fail_msg: "No inventory variables 'vault_*' found: Has ansible/adhoc/generate-passwords.yml been run?" diff --git a/ansible/roles/persist_hostkeys/defaults/main.yml b/ansible/roles/persist_hostkeys/defaults/main.yml index 3c0000466..0de0b7104 100644 --- a/ansible/roles/persist_hostkeys/defaults/main.yml +++ b/ansible/roles/persist_hostkeys/defaults/main.yml @@ -1,2 +1,3 @@ +--- persist_hostkeys_state_server: "{{ groups['control'] | first }}" persist_hostkeys_state_dir: "{{ hostvars[persist_hostkeys_state_server]['appliances_state_dir'] }}/hostkeys" diff --git a/ansible/roles/persist_hostkeys/tasks/main.yml b/ansible/roles/persist_hostkeys/tasks/main.yml index deff112f7..014feadb7 100644 --- a/ansible/roles/persist_hostkeys/tasks/main.yml +++ b/ansible/roles/persist_hostkeys/tasks/main.yml @@ -1,47 +1,47 @@ --- - - name: Generate persistent hostkeys in state directory delegate_to: "{{ persist_hostkeys_state_server }}" block: - - name: Ensure hostkeys directory exists on persistent storage - file: - path: "{{ persist_hostkeys_state_dir }}" - state: directory - owner: root - group: root - mode: 0600 + - name: Ensure hostkeys directory exists on persistent storage + ansible.builtin.file: + path: "{{ persist_hostkeys_state_dir }}" + state: directory + owner: root + group: root + mode: "0600" + + - name: Check for existing hostkeys + ansible.builtin.find: + paths: "{{ persist_hostkeys_state_dir }}/" + register: _files_found - - name: Check for existing hostkeys - find: - paths: "{{ persist_hostkeys_state_dir }}/" - register: _files_found + - name: Generate hostkeys # noqa: no-changed-when + when: _files_found.matched == 0 + ansible.builtin.shell: + # ssh-keygen -A needs a directory with an /etc/ssh suffix to write hostkeys into + cmd: | + mkdir -p {{ persist_hostkeys_state_dir }}/etc/ssh + ssh-keygen -A -N '' -f {{ persist_hostkeys_state_dir }} + mv {{ persist_hostkeys_state_dir }}/etc/ssh/* {{ persist_hostkeys_state_dir }} + rm -rf {{ persist_hostkeys_state_dir }}/etc/ssh - - name: Generate hostkeys - when: _files_found.matched == 0 - shell: - # ssh-keygen -A needs a directory with an /etc/ssh suffix to write hostkeys into - cmd: | - mkdir -p {{ persist_hostkeys_state_dir }}/etc/ssh - ssh-keygen -A -N '' -f {{ persist_hostkeys_state_dir }} - mv {{ persist_hostkeys_state_dir }}/etc/ssh/* {{ persist_hostkeys_state_dir }} - rm -rf {{ persist_hostkeys_state_dir }}/etc/ssh - - - name: Get created key names - find: - path: "{{ persist_hostkeys_state_dir }}/" - register: _find_ssh_keys + - name: Get created key names + ansible.builtin.find: + path: "{{ persist_hostkeys_state_dir }}/" + register: _find_ssh_keys - - name: Create in-memory copies of keys - ansible.builtin.slurp: - src: "{{ item.path }}" - loop: "{{ _find_ssh_keys.files }}" - register: _slurp_keys + - name: Create in-memory copies of keys + ansible.builtin.slurp: + src: "{{ item.path }}" + loop: "{{ _find_ssh_keys.files }}" + register: _slurp_keys - name: Copy keys to hosts no_log: true - copy: + ansible.builtin.copy: content: "{{ item.content | b64decode }}" dest: "/etc/ssh/{{ item.source | regex_search('[^/]+$') }}" + mode: "0644" loop: "{{ _slurp_keys.results }}" -- meta: reset_connection +- ansible.builtin.meta: reset_connection diff --git a/ansible/roles/persist_openhpc_secrets/tasks/main.yml b/ansible/roles/persist_openhpc_secrets/tasks/main.yml index 6ae9bcd59..c55766568 100644 --- a/ansible/roles/persist_openhpc_secrets/tasks/main.yml +++ 
b/ansible/roles/persist_openhpc_secrets/tasks/main.yml @@ -1,35 +1,34 @@ --- - - name: Check if OpenHPC secrets exist in persistent storage - stat: + ansible.builtin.stat: path: "{{ appliances_state_dir }}/ansible.facts.d/openhpc_secrets.fact" register: openhpc_secrets_stat - name: Ensure Ansible facts directories exist - file: + ansible.builtin.file: path: "{{ item }}" state: directory owner: root - mode: 0600 + mode: "0600" loop: - "{{ appliances_state_dir }}/ansible.facts.d" - "/etc/ansible/facts.d" - + - name: Write OpenHPC secrets - template: + ansible.builtin.template: src: openhpc_secrets.fact dest: "{{ appliances_state_dir }}/ansible.facts.d/openhpc_secrets.fact" owner: root - mode: 0600 + mode: "0600" when: "not openhpc_secrets_stat.stat.exists" - name: Symlink persistent facts to facts_path - file: + ansible.builtin.file: state: link src: "{{ appliances_state_dir }}/ansible.facts.d/openhpc_secrets.fact" dest: /etc/ansible/facts.d/openhpc_secrets.fact owner: root - + - name: Read facts ansible.builtin.setup: filter: ansible_local diff --git a/ansible/roles/podman/defaults/main.yml b/ansible/roles/podman/defaults/main.yml index 8b3c9ef55..fc76d062c 100644 --- a/ansible/roles/podman/defaults/main.yml +++ b/ansible/roles/podman/defaults/main.yml @@ -1,2 +1,3 @@ +--- podman_users: - name: "{{ ansible_user }}" diff --git a/ansible/roles/podman/tasks/config.yml b/ansible/roles/podman/tasks/config.yml index 74cf1d576..d2a7804aa 100644 --- a/ansible/roles/podman/tasks/config.yml +++ b/ansible/roles/podman/tasks/config.yml @@ -1,7 +1,6 @@ --- - - name: Up default resource limits - copy: + ansible.builtin.copy: content: | # WARNING: This file is managed by ansible, do not modify. # This is so non-root containers can use more resources. This is useful @@ -11,6 +10,7 @@ * soft nofile 65536 * hard nofile 65536 dest: /etc/security/limits.d/custom.conf + mode: "0644" become: true - name: Up number of non-root kernel keys permitted per user @@ -36,17 +36,17 @@ value: '"cgroupfs"' become: true -- name: reset ssh connection to allow user changes to affect 'current login user' - meta: reset_connection - +- name: Reset ssh connection to allow user changes to affect 'current login user' + ansible.builtin.meta: reset_connection - name: Ensure podman users exist - user: "{{ item }}" + ansible.builtin.user: + name: "{{ item }}" with_items: "{{ podman_users }}" register: podman_user_info - become: yes + become: true - name: Clear up podman temporary files on startup - copy: + ansible.builtin.copy: content: | # Created by ansible # Delete ephemeral podman files to avoid issues where /tmp is not of type tmpfs and persists across reboots. @@ -59,5 +59,5 @@ dest: /etc/tmpfiles.d/podman-local.conf owner: root group: root - mode: 0660 + mode: "0660" become: true diff --git a/ansible/roles/podman/tasks/prereqs.yml b/ansible/roles/podman/tasks/prereqs.yml index 362d3a13d..d7a4d869e 100644 --- a/ansible/roles/podman/tasks/prereqs.yml +++ b/ansible/roles/podman/tasks/prereqs.yml @@ -1,8 +1,8 @@ --- - name: Install OS packages - yum: + ansible.builtin.dnf: name: - podman - python3 state: installed - become: true \ No newline at end of file + become: true diff --git a/ansible/roles/proxy/README.md b/ansible/roles/proxy/README.md index 6d51fd9d4..d3b1b7285 100644 --- a/ansible/roles/proxy/README.md +++ b/ansible/roles/proxy/README.md @@ -4,7 +4,7 @@ Define http/s proxy configuration. ## Role variables -- `proxy_http_proxy`: Required. Address of http proxy. E.g. 
"http://10.1.0.28:3128" for a Squid proxy on default port. +- `proxy_http_proxy`: Required. Address of http proxy. E.g. "" for a Squid proxy on default port. - `proxy_https_proxy`: Optional. Address of https proxy. Default is `{{ proxy_http_proxy }}`. - `proxy_no_proxy_extra`: Optional. List of additional addresses not to proxy. Will be combined with default list which includes `inventory_hostname` (for hostnames) and `ansible_host` (for host IPs) for all Ansible hosts. - `proxy_dnf`: Optional bool. Whether to configure yum/dnf proxying through `proxy_http_proxy`. Default `true`. diff --git a/ansible/roles/proxy/defaults/main.yml b/ansible/roles/proxy/defaults/main.yml index fd2b079ec..33c699e8d 100644 --- a/ansible/roles/proxy/defaults/main.yml +++ b/ansible/roles/proxy/defaults/main.yml @@ -1,3 +1,4 @@ +--- # proxy_http_proxy: proxy_https_proxy: "{{ proxy_http_proxy }}" proxy_no_proxy_defaults: "{{ ['localhost', '127.0.0.1'] + groups['all'] + hostvars.values() | map(attribute='ansible_host') }}" diff --git a/ansible/roles/proxy/tasks/main.yml b/ansible/roles/proxy/tasks/main.yml index 70a7eca67..005d511b9 100644 --- a/ansible/roles/proxy/tasks/main.yml +++ b/ansible/roles/proxy/tasks/main.yml @@ -1,8 +1,9 @@ +--- - name: Define configuration in /etc/environment tags: proxy - lineinfile: + ansible.builtin.lineinfile: path: "/etc/environment" - create: yes + create: true owner: root group: root mode: o=rw,go=r @@ -18,7 +19,7 @@ value: "{{ proxy_no_proxy }}" - name: Define dnf proxy - ini_file: + community.general.ini_file: path: /etc/dnf/dnf.conf section: main option: "proxy" @@ -30,7 +31,7 @@ when: proxy_dnf | bool - name: Create systemd configuration directory - file: + ansible.builtin.file: path: /etc/systemd/system.conf.d/ state: directory owner: root @@ -44,9 +45,9 @@ section: Manager option: DefaultEnvironment value: >- - "http_proxy={{ proxy_http_proxy }}" - "https_proxy={{ proxy_http_proxy }}" - "no_proxy={{ proxy_no_proxy }}" + "http_proxy={{ proxy_http_proxy }}" + "https_proxy={{ proxy_http_proxy }}" + "no_proxy={{ proxy_no_proxy }}" no_extra_spaces: true owner: root group: root @@ -54,12 +55,11 @@ register: _copy_systemd_proxy when: proxy_systemd | bool -- name: Restart systemd - command: systemctl daemon-reexec - when: +- name: Restart systemd # noqa: no-changed-when + ansible.builtin.command: systemctl daemon-reexec # noqa: command-instead-of-module + when: - proxy_systemd | bool - _copy_systemd_proxy.changed | default(false) - name: Reset connection to get new /etc/environment - meta: reset_connection - # NB: conditionals not supported + ansible.builtin.meta: reset_connection diff --git a/ansible/roles/pulp_site/defaults/main.yml b/ansible/roles/pulp_site/defaults/main.yml index d30d1bdff..e115b5cb6 100644 --- a/ansible/roles/pulp_site/defaults/main.yml +++ b/ansible/roles/pulp_site/defaults/main.yml @@ -12,24 +12,26 @@ pulp_site_target_distribution_version: "{{ pulp_site_target_facts['distribution_ pulp_site_target_distribution_version_major: "{{ pulp_site_target_facts['distribution_major_version'] }}" pulp_site_rpm_info: -- name: "baseos-{{ pulp_site_target_distribution_version }}-{{ appliances_pulp_repos.baseos[pulp_site_target_distribution_version].timestamp }}" - subpath: "{{ appliances_pulp_repos.baseos[pulp_site_target_distribution_version] | appliances_repo_to_subpath }}" -- name: "appstream-{{ pulp_site_target_distribution_version }}-{{ appliances_pulp_repos.appstream[pulp_site_target_distribution_version].timestamp }}" - subpath: "{{ 
appliances_pulp_repos.appstream[pulp_site_target_distribution_version] | appliances_repo_to_subpath }}" -- name: "crb-{{ pulp_site_target_distribution_version }}-{{ appliances_pulp_repos.crb[pulp_site_target_distribution_version].timestamp }}" - subpath: "{{ appliances_pulp_repos.crb[pulp_site_target_distribution_version] | appliances_repo_to_subpath }}" -- name: "extras-{{ pulp_site_target_distribution_version }}-{{ appliances_pulp_repos.extras[pulp_site_target_distribution_version].timestamp }}" - subpath: "{{ appliances_pulp_repos.extras[pulp_site_target_distribution_version] | appliances_repo_to_subpath }}" -- name: "epel-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.epel[pulp_site_target_distribution_version_major].timestamp }}" - subpath: "{{ appliances_pulp_repos.epel[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" -- name: "ohpc-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.openhpc_base[pulp_site_target_distribution_version_major].timestamp }}" - subpath: "{{ appliances_pulp_repos.openhpc_base[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" -- name: "ohpc-updates-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.openhpc_updates[pulp_site_target_distribution_version_major].timestamp }}" - subpath: "{{ appliances_pulp_repos.openhpc_updates[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" -- name: "ceph-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.ceph[pulp_site_target_distribution_version_major].timestamp }}" - subpath: "{{ appliances_pulp_repos.ceph[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" -- name: "grafana-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.grafana.timestamp[pulp_site_target_distribution_version_major].timestamp }} - subpath: "{{ appliances_pulp_repos.grafana[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" + # yamllint disable rule:line-length + - name: "baseos-{{ pulp_site_target_distribution_version }}-{{ appliances_pulp_repos.baseos[pulp_site_target_distribution_version].timestamp }}" + subpath: "{{ appliances_pulp_repos.baseos[pulp_site_target_distribution_version] | appliances_repo_to_subpath }}" + - name: "appstream-{{ pulp_site_target_distribution_version }}-{{ appliances_pulp_repos.appstream[pulp_site_target_distribution_version].timestamp }}" + subpath: "{{ appliances_pulp_repos.appstream[pulp_site_target_distribution_version] | appliances_repo_to_subpath }}" + - name: "crb-{{ pulp_site_target_distribution_version }}-{{ appliances_pulp_repos.crb[pulp_site_target_distribution_version].timestamp }}" + subpath: "{{ appliances_pulp_repos.crb[pulp_site_target_distribution_version] | appliances_repo_to_subpath }}" + - name: "extras-{{ pulp_site_target_distribution_version }}-{{ appliances_pulp_repos.extras[pulp_site_target_distribution_version].timestamp }}" + subpath: "{{ appliances_pulp_repos.extras[pulp_site_target_distribution_version] | appliances_repo_to_subpath }}" + - name: "epel-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.epel[pulp_site_target_distribution_version_major].timestamp }}" + subpath: "{{ appliances_pulp_repos.epel[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" + - name: "ohpc-{{ pulp_site_target_distribution_version_major }}-{{ 
appliances_pulp_repos.openhpc_base[pulp_site_target_distribution_version_major].timestamp }}" + subpath: "{{ appliances_pulp_repos.openhpc_base[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" + - name: "ohpc-updates-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.openhpc_updates[pulp_site_target_distribution_version_major].timestamp }}" + subpath: "{{ appliances_pulp_repos.openhpc_updates[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" + - name: "ceph-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.ceph[pulp_site_target_distribution_version_major].timestamp }}" + subpath: "{{ appliances_pulp_repos.ceph[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" + - name: "grafana-{{ pulp_site_target_distribution_version_major }}-{{ appliances_pulp_repos.grafana[pulp_site_target_distribution_version_major].timestamp }}" + subpath: "{{ appliances_pulp_repos.grafana[pulp_site_target_distribution_version_major] | appliances_repo_to_subpath }}" + # yamllint enable rule:line-length pulp_site_rpm_repo_defaults: remote_username: "{{ pulp_site_upstream_username }}" diff --git a/ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py b/ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py index 50e912685..0d315be47 100644 --- a/ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py +++ b/ansible/roles/pulp_site/filter_plugins/pulp-list-filters.py @@ -1,31 +1,43 @@ +# pylint: disable=invalid-name, missing-module-docstring +# pylint: disable-next=missing-class-docstring, useless-object-inheritance class FilterModule(object): - def filters(self): + + def filters(self): # pylint: disable=missing-function-docstring return { - 'to_rpm_repos': self.to_rpm_repos, - 'to_rpm_pubs': self.to_rpm_pubs, - 'to_rpm_distros': self.to_rpm_distros + "to_rpm_repos": self.to_rpm_repos, + "to_rpm_pubs": self.to_rpm_pubs, + "to_rpm_distros": self.to_rpm_distros, } - def to_rpm_repos(self, list, pulp_url): - repo_list = map(lambda x: { - 'name': x['name'], - 'url': pulp_url+'/'+x['subpath'], - 'remote_username': x['remote_username'], - 'remote_password': x['remote_password'], - 'policy': x['policy'], - 'state': x['state'] }, list) + # pylint: disable-next=missing-function-docstring + def to_rpm_repos(self, list, pulp_url): # pylint: disable=redefined-builtin + repo_list = map( + lambda x: { + "name": x["name"], + "url": pulp_url + "/" + x["subpath"], + "remote_username": x["remote_username"], + "remote_password": x["remote_password"], + "policy": x["policy"], + "state": x["state"], + }, + list, + ) return repo_list - - def to_rpm_pubs(self, list): - pub_list = map(lambda x: { - 'repository': x['name'], - 'state': x['state'] }, list) + + # pylint: disable-next=missing-function-docstring + def to_rpm_pubs(self, list): # pylint: disable=redefined-builtin + pub_list = map(lambda x: {"repository": x["name"], "state": x["state"]}, list) return pub_list - - def to_rpm_distros(self, list): - distro_list = map(lambda x: { - 'name': x['name'], - 'repository': x['name'], - 'base_path': x['subpath'], - 'state': x['state'] }, list) - return distro_list \ No newline at end of file + + # pylint: disable-next=missing-function-docstring + def to_rpm_distros(self, list): # pylint: disable=redefined-builtin + distro_list = map( + lambda x: { + "name": x["name"], + "repository": x["name"], + "base_path": x["subpath"], + "state": x["state"], + }, + list, + ) + return distro_list diff --git 
a/ansible/roles/pulp_site/tasks/install.yml b/ansible/roles/pulp_site/tasks/install.yml index 39b4fcd97..f043740fa 100644 --- a/ansible/roles/pulp_site/tasks/install.yml +++ b/ansible/roles/pulp_site/tasks/install.yml @@ -1,24 +1,25 @@ --- - - name: Install packages - dnf: + ansible.builtin.dnf: name: - - podman + - podman - name: Create install directories ansible.builtin.file: state: directory path: "{{ pulp_site_install_dir }}/{{ item }}" + mode: "0755" loop: - - settings/certs - - pulp_storage - - pgsql - - containers + - settings/certs + - pulp_storage + - pgsql + - containers - name: Template settings file ansible.builtin.template: src: settings.py.j2 dest: "{{ pulp_site_install_dir }}/settings/settings.py" + mode: "0644" - name: Install pulp podman container containers.podman.podman_container: @@ -26,16 +27,16 @@ publish: - "{{ pulp_site_port }}:80" volume: - - "{{ pulp_site_install_dir }}/settings:/etc/pulp{{ pulp_site_selinux_suffix }}" - - "{{ pulp_site_install_dir }}/pulp_storage:/var/lib/pulp{{ pulp_site_selinux_suffix }}" - - "{{ pulp_site_install_dir }}/pgsql:/var/lib/pgsql{{ pulp_site_selinux_suffix }}" - - "{{ pulp_site_install_dir }}/containers:/var/lib/containers{{ pulp_site_selinux_suffix }}" + - "{{ pulp_site_install_dir }}/settings:/etc/pulp{{ pulp_site_selinux_suffix }}" + - "{{ pulp_site_install_dir }}/pulp_storage:/var/lib/pulp{{ pulp_site_selinux_suffix }}" + - "{{ pulp_site_install_dir }}/pgsql:/var/lib/pgsql{{ pulp_site_selinux_suffix }}" + - "{{ pulp_site_install_dir }}/containers:/var/lib/containers{{ pulp_site_selinux_suffix }}" device: /dev/fuse image: docker.io/pulp/pulp:3.68.1 -- name: Reset admin password once container has initialised +- name: Reset admin password once container has initialised # noqa: no-changed-when no_log: true - ansible.builtin.shell: + ansible.builtin.command: cmd: "podman exec pulp bash -c 'pulpcore-manager reset-admin-password -p {{ pulp_site_password }}'" register: _admin_reset_output until: 0 == _admin_reset_output.rc diff --git a/ansible/roles/pulp_site/tasks/sync.yml b/ansible/roles/pulp_site/tasks/sync.yml index 5ef2bc5f1..6dcf36609 100644 --- a/ansible/roles/pulp_site/tasks/sync.yml +++ b/ansible/roles/pulp_site/tasks/sync.yml @@ -1,5 +1,4 @@ --- - - ansible.builtin.assert: that: pulp_site_upstream_password != '' quiet: true @@ -19,34 +18,37 @@ ansible.builtin.file: path: ~/.config/pulp state: directory + mode: "0755" - name: Create config file no_log: true ansible.builtin.template: src: cli.toml.j2 dest: ~/.config/pulp/cli.toml - mode: '0644' + mode: "0644" -- block: - - name: Ensure squeezer cache exists - ansible.builtin.file: - path: "{{ _cache_dir }}" - state: directory +- vars: + _cache_dir: "~/.cache/squeezer/{{ pulp_site_url | regex_replace(':|/', '_') }}" - - name: Check if squeezer cache is populated - ansible.builtin.stat: - path: "{{ _cache_dir }}/api.json" - register: _cache_stat + block: + - name: Ensure squeezer cache exists + ansible.builtin.file: + path: "{{ _cache_dir }}" + state: directory + mode: "0755" - - name: Prepopulate squeezer cache # workaround for race on the cache - ansible.builtin.get_url: - url: "{{ pulp_site_url }}/pulp/api/v3/docs/api.json" - dest: "{{ _cache_dir }}/api.json" - timeout: 40 - when: not _cache_stat.stat.exists - vars: - _cache_dir: "~/.cache/squeezer/{{ pulp_site_url | regex_replace( ':|/' , '_' ) }}" + - name: Check if squeezer cache is populated + ansible.builtin.stat: + path: "{{ _cache_dir }}/api.json" + register: _cache_stat + - name: Prepopulate squeezer cache # 
workaround for race on the cache + ansible.builtin.get_url: + url: "{{ pulp_site_url }}/pulp/api/v3/docs/api.json" + dest: "{{ _cache_dir }}/api.json" + mode: "0644" + timeout: 40 + when: not _cache_stat.stat.exists - name: Get Pulp repos from release train ansible.builtin.include_role: name: stackhpc.pulp.pulp_repository diff --git a/ansible/roles/rebuild/README.md b/ansible/roles/rebuild/README.md index 4e4e87a8e..affc7b65f 100644 --- a/ansible/roles/rebuild/README.md +++ b/ansible/roles/rebuild/README.md @@ -1,17 +1,14 @@ -rebuild -========= +# rebuild -Enables reboot tool from https://github.com/stackhpc/slurm-openstack-tools.git +Enables reboot tool from <https://github.com/stackhpc/slurm-openstack-tools.git> to be run from control node. -Requirements ------------ +## Requirements An OpenStack clouds.yaml file containing credentials for a cloud under the "openstack" key. -Role Variables -------------- +## Role Variables The below is only used by this role's `main.yml` task file, i.e. when running the `ansible/site.yml` or `ansible/slurm.yml` playbooks: diff --git a/ansible/roles/rebuild/defaults/main.yml b/ansible/roles/rebuild/defaults/main.yml index 948283633..16e2141d4 100644 --- a/ansible/roles/rebuild/defaults/main.yml +++ b/ansible/roles/rebuild/defaults/main.yml @@ -4,9 +4,9 @@ rebuild_clouds_path: ~/.config/openstack/clouds.yaml rebuild_job_partitions: rebuild rebuild_job_name: "rebuild-{{ item }}" # item is nodename -rebuild_job_command: 'sleep 5' +rebuild_job_command: "sleep 5" rebuild_job_reboot: true -rebuild_job_options: '' +rebuild_job_options: "" rebuild_job_user: root rebuild_job_template: >- sbatch @@ -20,4 +20,4 @@ --output=/dev/null --wrap="{{ rebuild_job_command }}" {{ rebuild_job_options }} -#rebuild_job_hostlist: \ No newline at end of file +# rebuild_job_hostlist: diff --git a/ansible/roles/rebuild/tasks/main.yml b/ansible/roles/rebuild/tasks/main.yml index 5612ab515..e31da6562 100644 --- a/ansible/roles/rebuild/tasks/main.yml +++ b/ansible/roles/rebuild/tasks/main.yml @@ -1,7 +1,6 @@ --- - - name: Create /etc/openstack - file: + ansible.builtin.file: path: /etc/openstack state: directory owner: slurm @@ -9,7 +8,7 @@ mode: u=rX,g=rwX - name: Copy out clouds.yaml - copy: + ansible.builtin.copy: src: "{{ rebuild_clouds_path }}" dest: /etc/openstack/clouds.yaml owner: slurm @@ -17,5 +16,5 @@ mode: u=r,g=rw - name: Setup slurm tools - include_role: + ansible.builtin.include_role: name: slurm_tools diff --git a/ansible/roles/rebuild/tasks/rebuild.yml b/ansible/roles/rebuild/tasks/rebuild.yml index 466951f63..bc202df6b 100644 --- a/ansible/roles/rebuild/tasks/rebuild.yml +++ b/ansible/roles/rebuild/tasks/rebuild.yml @@ -1,11 +1,11 @@ +--- - name: Create rebuild jobs for partition - include_tasks: + ansible.builtin.include_tasks: file: rebuild_partition.yml args: apply: - become: yes + become: true become_user: "{{ rebuild_job_user }}" loop: "{{ rebuild_job_partitions | split(',') }}" loop_control: loop_var: _rebuild_job_current_partition - diff --git a/ansible/roles/rebuild/tasks/rebuild_partition.yml b/ansible/roles/rebuild/tasks/rebuild_partition.yml index 3b319e6e2..35c748a5c 100644 --- a/ansible/roles/rebuild/tasks/rebuild_partition.yml +++ b/ansible/roles/rebuild/tasks/rebuild_partition.yml @@ -1,4 +1,5 @@ -- name: Get list of nodes in partition +--- +- name: Get list of nodes in partition # noqa: no-changed-when ansible.builtin.command: cmd: >- sinfo @@ -9,13 +10,13 @@ register: _sinfo_partition when: rebuild_job_hostlist is not defined -- name: Expand rebuild_job_hostlist to host names +- 
name: Expand rebuild_job_hostlist to host names # noqa: no-changed-when ansible.builtin.command: cmd: "scontrol show hostnames {{ rebuild_job_hostlist }}" register: _scontrol_hostnames when: rebuild_job_hostlist is defined -- name: Submit rebuild jobs +- name: Submit rebuild jobs # noqa: no-changed-when ansible.builtin.command: cmd: "{{ rebuild_job_template }}" loop: "{{ _scontrol_hostnames.stdout_lines | default(_sinfo_partition.stdout_lines) }}" diff --git a/ansible/roles/resolv_conf/README.md b/ansible/roles/resolv_conf/README.md index 3746407ea..781ec49a3 100644 --- a/ansible/roles/resolv_conf/README.md +++ b/ansible/roles/resolv_conf/README.md @@ -3,9 +3,11 @@ Template out `/etc/resolv.conf`. ## Role variables + - `resolv_conf_nameservers`: List of up to 3 nameserver addresses. Notes: + - `NetworkManager` (if used) will be prevented from rewriting this file on boot. - If `/etc/resolv.conf` includes `127.0.0.1` (e.g. due to a FreeIPA server installation), then `resolv_conf_nameservers` is ignored and this role does not change `/etc/resolv.conf` - For hosts in the `resolv_conf` group, the `/etc/resolv.conf` created with `resolv_conf_nameservers` will diff --git a/ansible/roles/resolv_conf/defaults/main.yml b/ansible/roles/resolv_conf/defaults/main.yml index 37c97b786..44e2d85b6 100644 --- a/ansible/roles/resolv_conf/defaults/main.yml +++ b/ansible/roles/resolv_conf/defaults/main.yml @@ -1 +1,2 @@ +--- resolv_conf_nameservers: [] diff --git a/ansible/roles/resolv_conf/tasks/main.yml b/ansible/roles/resolv_conf/tasks/main.yml index 486ec181b..41ef9c1a5 100644 --- a/ansible/roles/resolv_conf/tasks/main.yml +++ b/ansible/roles/resolv_conf/tasks/main.yml @@ -1,3 +1,4 @@ +--- - name: Read nameservers from /etc/resolv.conf ansible.builtin.slurp: src: /etc/resolv.conf @@ -27,4 +28,4 @@ ansible.builtin.systemd: name: NetworkManager state: reloaded - when: _copy_nm_config.changed | default(false) + when: _copy_nm_config.changed | default(false) # noqa: no-handler diff --git a/ansible/roles/slurm_exporter/README.md b/ansible/roles/slurm_exporter/README.md index 7ade27357..3b42f134e 100644 --- a/ansible/roles/slurm_exporter/README.md +++ b/ansible/roles/slurm_exporter/README.md @@ -1,37 +1,34 @@ -slurm_exporter -============== +# slurm_exporter -Build, install and configure a Prometheus exporter for metrics about Slurm itself: https://github.com/vpenso/prometheus-slurm-exporter/ +Build, install and configure a Prometheus exporter for metrics about Slurm itself: <https://github.com/vpenso/prometheus-slurm-exporter/> -Requirements ------------ +## Requirements Rocky Linux 8.5 host. -Role Variables -------------- +## Role Variables See `defaults/main.yml` -Dependencies ------------ +## Dependencies None. -Example Playbook ---------------- +## Example Playbook - - name: Deploy Slurm exporter - hosts: control - become: true - tags: slurm_exporter - tasks: - - import_role: - name: slurm_exporter +```yaml +- name: Deploy Slurm exporter + hosts: control + become: true + tags: slurm_exporter + tasks: + - import_role: + name: slurm_exporter +``` Prometheus scrape configuration for this might look like: -``` +```text - job_name: "slurm_exporter" scrape_interval: 30s scrape_timeout: 30s @@ -40,12 +37,10 @@ - "{{ openhpc_slurm_control_host }}:9341" ``` -License ------- +## License Apache v2 -Author Information ------------------ +## Author Information StackHPC Ltd. 
diff --git a/ansible/roles/slurm_exporter/defaults/main.yml b/ansible/roles/slurm_exporter/defaults/main.yml index eda259b42..d0b5a0f1d 100644 --- a/ansible/roles/slurm_exporter/defaults/main.yml +++ b/ansible/roles/slurm_exporter/defaults/main.yml @@ -1,5 +1,5 @@ --- # see https://github.com/stackhpc/prometheus-slurm-exporter/releases - version follows upstream, release is stackhpc build -slurm_exporter_version: '0.21' -slurm_exporter_release: '1' +slurm_exporter_version: "0.21" +slurm_exporter_release: "1" slurm_exporter_state: started diff --git a/ansible/roles/slurm_exporter/handlers/main.yml b/ansible/roles/slurm_exporter/handlers/main.yml index b55c9c689..33266fe4d 100644 --- a/ansible/roles/slurm_exporter/handlers/main.yml +++ b/ansible/roles/slurm_exporter/handlers/main.yml @@ -1,7 +1,7 @@ --- - name: Restart slurm exporter become: true - systemd: + ansible.builtin.systemd: daemon_reload: true name: prometheus-slurm-exporter state: restarted diff --git a/ansible/roles/slurm_exporter/tasks/install.yml b/ansible/roles/slurm_exporter/tasks/install.yml index 49ee57fef..df8a63c0e 100644 --- a/ansible/roles/slurm_exporter/tasks/install.yml +++ b/ansible/roles/slurm_exporter/tasks/install.yml @@ -1,13 +1,15 @@ +--- - name: Install slurm_exporter package - dnf: - name: "https://github.com/stackhpc/prometheus-slurm-exporter/releases/download/{{ slurm_exporter_version }}/prometheus-slurm-exporter-{{ slurm_exporter_version }}-{{slurm_exporter_release}}.el8.x86_64.rpm" - disable_gpg_check: yes + # checkov:skip=CKV2_ANSIBLE_4: "Ensure that packages with untrusted or missing GPG signatures are not used by dnf" + ansible.builtin.dnf: + # yamllint disable-line rule:line-length + name: "https://github.com/stackhpc/prometheus-slurm-exporter/releases/download/{{ slurm_exporter_version }}/prometheus-slurm-exporter-{{ slurm_exporter_version }}-{{ slurm_exporter_release }}.el8.x86_64.rpm" + disable_gpg_check: true notify: Restart slurm exporter -- meta: flush_handlers - +- ansible.builtin.meta: flush_handlers - name: Ensure slurm exporter state - systemd: + ansible.builtin.systemd: name: prometheus-slurm-exporter state: "{{ slurm_exporter_state }}" enabled: true diff --git a/ansible/roles/slurm_exporter/tasks/main.yml b/ansible/roles/slurm_exporter/tasks/main.yml index 52b260f07..822e6922f 100644 --- a/ansible/roles/slurm_exporter/tasks/main.yml +++ b/ansible/roles/slurm_exporter/tasks/main.yml @@ -1,2 +1,2 @@ --- -- import_tasks: install.yml +- ansible.builtin.import_tasks: install.yml diff --git a/ansible/roles/slurm_stats/README.md b/ansible/roles/slurm_stats/README.md index f8bd38caf..c67e2c074 100644 --- a/ansible/roles/slurm_stats/README.md +++ b/ansible/roles/slurm_stats/README.md @@ -1,33 +1,25 @@ -stackhpc.slurm_openstack_tools.slurm-stats -========================================== +# stackhpc.slurm_openstack_tools.slurm-stats -Configures slurm-stats from https://github.com/stackhpc/slurm-openstack-tools.git which +Configures slurm-stats from <https://github.com/stackhpc/slurm-openstack-tools.git> which transforms sacct output into a form that is more amenable for importing into elasticsearch/loki. -Requirements ------------ +## Requirements -Role Variables -------------- +## Role Variables See `defaults/main.yml`. 
-Dependencies ------------ +## Dependencies -Example Playbook ---------------- +## Example Playbook - hosts: compute tasks: - import_role: name: slurm_stats - -License ------- +## License Apache-2.0 -Author Information ------------------ +## Author Information diff --git a/ansible/roles/slurm_stats/tasks/main.yml b/ansible/roles/slurm_stats/tasks/main.yml index 6f02405c6..257196e36 100644 --- a/ansible/roles/slurm_stats/tasks/main.yml +++ b/ansible/roles/slurm_stats/tasks/main.yml @@ -1,17 +1,17 @@ --- - - name: Setup slurm tools - include_role: + ansible.builtin.include_role: name: slurm_tools - name: Create a directory to house the log files - file: + ansible.builtin.file: state: directory path: /var/log/slurm-stats + mode: "0755" become: true - name: Create cron job - cron: + ansible.builtin.cron: name: Generate slurm stats minute: "*/5" user: root @@ -21,7 +21,7 @@ become: true - name: Setup log rotate - copy: + ansible.builtin.copy: content: | # WARNING: This file is managed by ansible, do not modify. /var/log/slurm-stats/finished_jobs.json { @@ -31,4 +31,5 @@ delaycompress } dest: /etc/logrotate.d/slurm-stats + mode: "0644" become: true diff --git a/ansible/roles/slurm_tools/README.md b/ansible/roles/slurm_tools/README.md index 9724c4460..07911cbdb 100644 --- a/ansible/roles/slurm_tools/README.md +++ b/ansible/roles/slurm_tools/README.md @@ -1,10 +1,8 @@ -slurm_tools -========= +# slurm_tools -Install python-based tools from https://github.com/stackhpc/slurm-openstack-tools.git into `/opt/slurm-tools/bin/`. +Install python-based tools from <https://github.com/stackhpc/slurm-openstack-tools.git> into `/opt/slurm-tools/bin/`. -Role Variables -------------- +## Role Variables - `pytools_editable`: Optional bool. Whether to install the package using `pip`'s editable mode (installing source to `/opt/slurm-tools/src`). Default `false`. diff --git a/ansible/roles/slurm_tools/tasks/main.yml b/ansible/roles/slurm_tools/tasks/main.yml index deedb034a..9f5eff08f 100644 --- a/ansible/roles/slurm_tools/tasks/main.yml +++ b/ansible/roles/slurm_tools/tasks/main.yml @@ -1,33 +1,33 @@ --- -- name: install python3 - package: +- name: Install python3 + ansible.builtin.package: name: python3,git become: true - name: Create virtualenv directory - file: + ansible.builtin.file: path: /opt/slurm-tools owner: "{{ pytools_user }}" group: "{{ pytools_user }}" state: directory + mode: "0755" become: true -- block: - - name: Upgrade pip - # This needs to a separate step so that we use the updated version - # to install the packages below. - pip: - name: pip - - - name: Create virtualenv - pip: - name: "git+https://github.com/stackhpc/slurm-openstack-tools.git@{{ pytools_gitref }}#egg=slurm_openstack_tools" - editable: "{{ pytools_editable }}" - - module_defaults: +- module_defaults: ansible.builtin.pip: virtualenv: /opt/slurm-tools virtualenv_command: "{{ 'python3.9 -m venv' if ansible_distribution_major_version == '8' else 'python3 -m venv' }}" state: latest become: true become_user: "{{ pytools_user }}" + block: + - name: Upgrade pip + # This needs to be a separate step so that we use the updated version + # to install the packages below. 
+ ansible.builtin.pip: + name: pip + + - name: Create virtualenv + ansible.builtin.pip: + name: "git+https://github.com/stackhpc/slurm-openstack-tools.git@{{ pytools_gitref }}#egg=slurm_openstack_tools" + editable: "{{ pytools_editable }}" diff --git a/ansible/roles/squid/README.md b/ansible/roles/squid/README.md index e514c3605..7b7b8db57 100644 --- a/ansible/roles/squid/README.md +++ b/ansible/roles/squid/README.md @@ -35,5 +35,5 @@ Where noted these map to squid parameters of the same name without the `squid_` http_access allow localhost # Finally deny all other access to this proxy http_access deny all - + See squid parameter. diff --git a/ansible/roles/squid/defaults/main.yml b/ansible/roles/squid/defaults/main.yml index 7457bdccf..b224d131a 100644 --- a/ansible/roles/squid/defaults/main.yml +++ b/ansible/roles/squid/defaults/main.yml @@ -1,3 +1,4 @@ +--- squid_conf_template: squid.conf.j2 squid_started: true squid_enabled: true @@ -5,8 +6,8 @@ squid_enabled: true squid_cache_mem: "{{ undef(hint='squid_cache_mem required, e.g. \"12 GB\"') }}" squid_cache_dir: /var/spool/squid squid_cache_disk: "{{ undef(hint='squid_cache_disk (in MB) required, e.g. \"1024\"') }}" # always in MB -squid_maximum_object_size_in_memory: '64 MB' -squid_maximum_object_size: '200 MB' +squid_maximum_object_size_in_memory: "64 MB" +squid_maximum_object_size: "200 MB" squid_http_port: 3128 squid_acls: acl anywhere src all # rely on openstack security groups squid_http_access: | diff --git a/ansible/roles/squid/handlers/main.yml b/ansible/roles/squid/handlers/main.yml index 135d98d3b..7448a013d 100644 --- a/ansible/roles/squid/handlers/main.yml +++ b/ansible/roles/squid/handlers/main.yml @@ -1,5 +1,6 @@ +--- - name: Restart squid - service: + ansible.builtin.service: name: squid state: restarted when: squid_started | bool diff --git a/ansible/roles/squid/tasks/configure.yml b/ansible/roles/squid/tasks/configure.yml index 0d4dec681..d1e49e382 100644 --- a/ansible/roles/squid/tasks/configure.yml +++ b/ansible/roles/squid/tasks/configure.yml @@ -1,5 +1,6 @@ +--- - name: Ensure squid cache directory exists - file: + ansible.builtin.file: path: "{{ squid_cache_dir }}" # based on what dnf package creates: owner: squid @@ -7,7 +8,7 @@ mode: u=rwx,g=rw,o= - name: Template squid configuration - template: + ansible.builtin.template: src: "{{ squid_conf_template }}" dest: /etc/squid/squid.conf owner: squid @@ -15,10 +16,9 @@ mode: ug=rwX,go= notify: Restart squid -- meta: flush_handlers - +- ansible.builtin.meta: flush_handlers - name: Ensure squid service state - systemd: + ansible.builtin.systemd: name: squid state: "{{ 'started' if squid_started | bool else 'stopped' }}" enabled: "{{ true if squid_enabled else false }}" diff --git a/ansible/roles/squid/tasks/install.yml b/ansible/roles/squid/tasks/install.yml index 672186c48..d60af91e8 100644 --- a/ansible/roles/squid/tasks/install.yml +++ b/ansible/roles/squid/tasks/install.yml @@ -1,3 +1,4 @@ +--- - name: Install squid package - dnf: + ansible.builtin.dnf: name: squid diff --git a/ansible/roles/squid/tasks/main.yml b/ansible/roles/squid/tasks/main.yml index 2b65e84b4..cc29fba63 100644 --- a/ansible/roles/squid/tasks/main.yml +++ b/ansible/roles/squid/tasks/main.yml @@ -1,2 +1,3 @@ -- import_tasks: install.yml -- import_tasks: configure.yml +--- +- ansible.builtin.import_tasks: install.yml +- ansible.builtin.import_tasks: configure.yml diff --git a/ansible/roles/sshd/defaults/main.yml b/ansible/roles/sshd/defaults/main.yml index c7a83b875..ca2f8c73f 100644 --- 
a/ansible/roles/sshd/defaults/main.yml +++ b/ansible/roles/sshd/defaults/main.yml @@ -1,3 +1,4 @@ +--- sshd_password_authentication: false sshd_disable_forwarding: true sshd_conf_src: sshd.conf.j2 diff --git a/ansible/roles/sshd/handlers/main.yml b/ansible/roles/sshd/handlers/main.yml index e11aa7801..e3e8b1c4a 100644 --- a/ansible/roles/sshd/handlers/main.yml +++ b/ansible/roles/sshd/handlers/main.yml @@ -1,4 +1,5 @@ +--- - name: Restart sshd - systemd: + ansible.builtin.systemd: name: sshd state: restarted diff --git a/ansible/roles/sshd/tasks/configure.yml b/ansible/roles/sshd/tasks/configure.yml index 359d782f6..f47d48c9e 100644 --- a/ansible/roles/sshd/tasks/configure.yml +++ b/ansible/roles/sshd/tasks/configure.yml @@ -1,17 +1,18 @@ +--- - name: Grab facts to determine distribution - setup: + ansible.builtin.setup: - name: Ensure drop in directory exists - file: + ansible.builtin.file: path: /etc/ssh/sshd_config.d/ state: directory owner: root group: root - mode: 700 + mode: "0700" become: true - name: Ensure drop in configuration is included - blockinfile: + ansible.builtin.blockinfile: dest: /etc/ssh/sshd_config content: | # To modify the system-wide sshd configuration, create .conf @@ -32,7 +33,7 @@ # Include /etc/ssh/sshd_config.d/*.conf # early on, which is generally held to be the correct approach, so adding # values to the end of that file won't work - template: + ansible.builtin.template: src: "{{ sshd_conf_src }}" dest: "{{ sshd_conf_dest }}" owner: root diff --git a/ansible/roles/sshd/tasks/export.yml b/ansible/roles/sshd/tasks/export.yml index 0c153ca94..a21daee7f 100644 --- a/ansible/roles/sshd/tasks/export.yml +++ b/ansible/roles/sshd/tasks/export.yml @@ -1,6 +1,7 @@ +--- # Exclusively used for compute-init - name: Inject host specific config template - template: + ansible.builtin.template: src: "{{ sshd_conf_src }}" dest: "/exports/cluster/hostconfig/{{ inventory_hostname }}/sshd.conf" owner: root diff --git a/ansible/roles/sshd/tasks/main.yml b/ansible/roles/sshd/tasks/main.yml index 84f493457..ec83d2b16 100644 --- a/ansible/roles/sshd/tasks/main.yml +++ b/ansible/roles/sshd/tasks/main.yml @@ -1 +1,2 @@ -- import_tasks: configure.yml +--- +- ansible.builtin.import_tasks: configure.yml diff --git a/ansible/roles/sssd/README.md b/ansible/roles/sssd/README.md index 5c9b50e6d..ad6de4af0 100644 --- a/ansible/roles/sssd/README.md +++ b/ansible/roles/sssd/README.md @@ -2,7 +2,6 @@ Install and configure [sssd](https://sssd.io/docs/introduction.html). - ## Role variables The only required configuration is to create a [sssd.conf](https://www.mankier.com/5/sssd.conf) template at the location specified by `sssd_conf_src`. 
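For illustration, a site could wire this up via group_vars; a minimal sketch, assuming a site-provided template file (the template path below is hypothetical, while `sssd_conf_src`, `sssd_install_ldap` and `sssd_enable_mkhomedir` are variables this role actually uses):

```yaml
# Hypothetical group_vars snippet -- the template location is an assumption,
# not a default shipped by the role.
sssd_conf_src: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/files/sssd.conf.j2"
sssd_install_ldap: true # also installs the packages listed in sssd_ldap_packages
sssd_enable_mkhomedir: true # requires oddjobd, which tasks/configure.yml starts
```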
diff --git a/ansible/roles/sssd/defaults/main.yml b/ansible/roles/sssd/defaults/main.yml index 5bc58c990..605e746c1 100644 --- a/ansible/roles/sssd/defaults/main.yml +++ b/ansible/roles/sssd/defaults/main.yml @@ -1,3 +1,4 @@ +--- sssd_packages: - sssd-common sssd_install_ldap: false diff --git a/ansible/roles/sssd/handlers/main.yml b/ansible/roles/sssd/handlers/main.yml index 72c36e736..4965b15bc 100644 --- a/ansible/roles/sssd/handlers/main.yml +++ b/ansible/roles/sssd/handlers/main.yml @@ -1,5 +1,6 @@ +--- - name: Restart sssd - systemd: + ansible.builtin.systemd: name: sssd state: restarted when: sssd_started | bool diff --git a/ansible/roles/sssd/tasks/configure.yml b/ansible/roles/sssd/tasks/configure.yml index c8ebd829e..66d86f6d8 100644 --- a/ansible/roles/sssd/tasks/configure.yml +++ b/ansible/roles/sssd/tasks/configure.yml @@ -1,5 +1,6 @@ +--- - name: Manage sssd.conf configuration - template: + ansible.builtin.template: src: "{{ sssd_conf_src }}" dest: "{{ sssd_conf_dest }}" owner: root @@ -7,29 +8,28 @@ mode: u=rw,go= notify: "Restart sssd" -- meta: flush_handlers - +- ansible.builtin.meta: flush_handlers - name: Ensure sssd service state - systemd: + ansible.builtin.systemd: name: sssd state: "{{ 'started' if sssd_started | bool else 'stopped' }}" enabled: "{{ sssd_enabled | bool }}" - name: Get current authselect configuration - command: authselect current --raw + ansible.builtin.command: authselect current --raw changed_when: false failed_when: - _authselect_current.rc != 0 - "'No existing configuration detected' not in _authselect_current.stdout" register: _authselect_current # stdout: sssd with-mkhomedir -- name: Configure nsswitch and PAM for SSSD - command: "authselect select sssd --force{% if sssd_enable_mkhomedir | bool %} with-mkhomedir{% endif %}" +- name: Configure nsswitch and PAM for SSSD # noqa: no-changed-when + ansible.builtin.command: "authselect select sssd --force{% if sssd_enable_mkhomedir | bool %} with-mkhomedir{% endif %}" when: "'sssd' not in _authselect_current.stdout" - name: "Ensure oddjob is started" - service: + ansible.builtin.service: name: oddjobd - state: 'started' + state: "started" enabled: true - when: sssd_enable_mkhomedir | bool \ No newline at end of file + when: sssd_enable_mkhomedir | bool diff --git a/ansible/roles/sssd/tasks/export.yml b/ansible/roles/sssd/tasks/export.yml index 0be66749e..607878648 100644 --- a/ansible/roles/sssd/tasks/export.yml +++ b/ansible/roles/sssd/tasks/export.yml @@ -1,9 +1,10 @@ +--- # Exclusively used for compute-init - name: Inject host specific config template - template: + ansible.builtin.template: src: "{{ sssd_conf_src }}" dest: "/exports/cluster/hostconfig/{{ inventory_hostname }}/sssd.conf" owner: root group: root mode: u=rw,go= - delegate_to: "{{ groups['control'] | first }}" \ No newline at end of file + delegate_to: "{{ groups['control'] | first }}" diff --git a/ansible/roles/sssd/tasks/install.yml b/ansible/roles/sssd/tasks/install.yml index 97aa82a2f..b7c8f114f 100644 --- a/ansible/roles/sssd/tasks/install.yml +++ b/ansible/roles/sssd/tasks/install.yml @@ -1,13 +1,14 @@ +--- - name: Ensure sssd packages are installed - dnf: + ansible.builtin.dnf: name: "{{ sssd_packages + sssd_ldap_packages if (sssd_install_ldap | bool) else [] }}" - name: Control if sssd should start on boot # Needs to be done here to prevent starting after image build, is enabled by default - systemd: + ansible.builtin.systemd: name: sssd enabled: "{{ sssd_enabled | bool }}" - name: Ensure mkhomedir packages are installed if 
required - dnf: + ansible.builtin.dnf: name: "{{ sssd_mkhomedir_packages }}" diff --git a/ansible/roles/sssd/tasks/main.yml b/ansible/roles/sssd/tasks/main.yml index 2b65e84b4..cc29fba63 100644 --- a/ansible/roles/sssd/tasks/main.yml +++ b/ansible/roles/sssd/tasks/main.yml @@ -1,2 +1,3 @@ -- import_tasks: install.yml -- import_tasks: configure.yml +--- +- ansible.builtin.import_tasks: install.yml +- ansible.builtin.import_tasks: configure.yml diff --git a/ansible/roles/systemd/README.md b/ansible/roles/systemd/README.md index e18599f17..9ec8cb82b 100644 --- a/ansible/roles/systemd/README.md +++ b/ansible/roles/systemd/README.md @@ -2,18 +2,17 @@ Create drop-in files for systemd services. -# Role Variables +## Role Variables + - `systemd_dropins`: Required. A mapping where keys = systemd service name, values are a dict as follows: - - `group`: Required str. Inventory group this drop-in applies to. - - `comment`: Optional str. Comment describing reason for drop-in. - - `content`: Required str. Content of drop-in file. -# systemd + - `group`: Required str. Inventory group this drop-in applies to. + - `comment`: Optional str. Comment describing reason for drop-in. + - `content`: Required str. Content of drop-in file. -Create drop-in files for systemd services. +## Role Variables - optional restart -# Role Variables - `systemd_dropins`: Required. A mapping where keys = systemd service name, values are a dict as follows: - - `group`: Required str. Inventory group this drop-in applies to. - - `comment`: Optional str. Comment describing reason for drop-in. - - `content`: Required str. Content of drop-in file. + - `group`: Required str. Inventory group this drop-in applies to. + - `comment`: Optional str. Comment describing reason for drop-in. + - `content`: Required str. Content of drop-in file. - `systemd_restart`: Optional bool. Whether to reload unit definitions and restart services. Default `false`. diff --git a/ansible/roles/systemd/defaults/main.yml b/ansible/roles/systemd/defaults/main.yml index 7ca54aa15..29b9b75e7 100644 --- a/ansible/roles/systemd/defaults/main.yml +++ b/ansible/roles/systemd/defaults/main.yml @@ -1,4 +1,5 @@ -#systemd_dropins: +--- +# systemd_dropins: # : # group: # comment: diff --git a/ansible/roles/systemd/tasks/main.yml b/ansible/roles/systemd/tasks/main.yml index 822a6767d..8fa6f4898 100644 --- a/ansible/roles/systemd/tasks/main.yml +++ b/ansible/roles/systemd/tasks/main.yml @@ -1,11 +1,12 @@ +--- # NB: As `systemd_TODO:` is defined in group_vars/all, all tasks here are conditional on group. 
- name: Make directory for unit dropins - file: + ansible.builtin.file: path: "/etc/systemd/system/{{ item.key }}.service.d/" state: directory owner: root group: root - mode: 0644 + mode: "0644" loop: "{{ systemd_dropins | dict2items }}" when: "item.value.group in group_names" @@ -17,14 +18,14 @@ dest: "/etc/systemd/system/{{ item.key }}.service.d/slurm_app.conf" owner: root group: root - mode: 0644 + mode: "0644" loop: "{{ systemd_dropins | dict2items }}" register: _systemd_dropins when: "item.value.group in group_names" -- name: Reload unit definitions - ansible.builtin.shell: - cmd: systemctl daemon-reload +- name: Reload unit definitions # noqa: no-changed-when + ansible.builtin.command: + cmd: systemctl daemon-reload # noqa: command-instead-of-module when: - _systemd_dropins.changed - systemd_restart | default(false) | bool diff --git a/ansible/roles/tuned/README.md b/ansible/roles/tuned/README.md index 34885af84..a4626c4ca 100644 --- a/ansible/roles/tuned/README.md +++ b/ansible/roles/tuned/README.md @@ -1,14 +1,11 @@ -tuned -========= +# tuned This role configures the TuneD tool for system tuning, ensuring optimal performance based on the profile settings defined. -Role Variables --------------- +## Role Variables See the [TuneD documentation](https://docs.redhat.com/en/documentation/red_hat_enterprise_linux/9/html/monitoring_and_managing_system_status_and_performance/getting-started-with-tuned_monitoring-and-managing-system-status-and-performance) for profile details. - - `tuned_profile_baremetal`: Optional str. Name of default profile for non-virtualised hosts. Default `hpc-compute`. - `tuned_profile_vm`: Optional str. Name of default profile for virtualised hosts. Default `virtual-guest`. - `tuned_profile`: Optional str. Name of profile to apply to host. Defaults to `tuned_profile_baremetal` or `tuned_profile_vm` as appropriate. 
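As a usage sketch (the group_vars file location is illustrative; the variable names come from this role's `defaults/main.yml`), a site could pin a stock TuneD profile rather than accepting the baremetal/VM autodetection:

```yaml
# Hypothetical group_vars snippet -- the file location is an assumption.
tuned_profile: latency-performance # a stock TuneD profile; overrides the default selection
tuned_enabled: true
tuned_started: true
```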
diff --git a/ansible/roles/tuned/defaults/main.yml b/ansible/roles/tuned/defaults/main.yml index 1426bbedd..8ddb13913 100644 --- a/ansible/roles/tuned/defaults/main.yml +++ b/ansible/roles/tuned/defaults/main.yml @@ -4,4 +4,4 @@ tuned_profile_baremetal: hpc-compute tuned_profile_vm: virtual-guest tuned_profile: "{{ tuned_profile_baremetal if ansible_virtualization_role != 'guest' else tuned_profile_vm }}" tuned_enabled: true -tuned_started: true +tuned_started: true diff --git a/ansible/roles/tuned/tasks/configure.yml b/ansible/roles/tuned/tasks/configure.yml index cf122d1fb..fa106483d 100644 --- a/ansible/roles/tuned/tasks/configure.yml +++ b/ansible/roles/tuned/tasks/configure.yml @@ -12,7 +12,7 @@ register: _tuned_profile_current changed_when: false -- name: Set TuneD profile +- name: Set TuneD profile # noqa: no-changed-when ansible.builtin.command: cmd: "tuned-adm profile {{ tuned_profile }}" when: diff --git a/ansible/roles/tuned/tasks/install.yml b/ansible/roles/tuned/tasks/install.yml index 0a2db4e42..0890684c1 100644 --- a/ansible/roles/tuned/tasks/install.yml +++ b/ansible/roles/tuned/tasks/install.yml @@ -12,5 +12,6 @@ path: /usr/lib/tuned/hpc-compute/tuned.conf section: sysctl option: vm.min_free_kbytes - value: '>135168' + value: ">135168" no_extra_spaces: true + mode: "0644" diff --git a/ansible/roles/tuned/tasks/main.yml b/ansible/roles/tuned/tasks/main.yml index ef0bea2d1..cc29fba63 100644 --- a/ansible/roles/tuned/tasks/main.yml +++ b/ansible/roles/tuned/tasks/main.yml @@ -1,3 +1,3 @@ --- -- import_tasks: install.yml -- import_tasks: configure.yml \ No newline at end of file +- ansible.builtin.import_tasks: install.yml +- ansible.builtin.import_tasks: configure.yml diff --git a/ansible/roles/zenith_proxy/defaults/main.yml b/ansible/roles/zenith_proxy/defaults/main.yml index 02267cb87..748ad71c6 100644 --- a/ansible/roles/zenith_proxy/defaults/main.yml +++ b/ansible/roles/zenith_proxy/defaults/main.yml @@ -15,7 +15,7 @@ zenith_proxy_pod_name: "{{ zenith_proxy_service_name }}" zenith_proxy_client_container_name: "{{ zenith_proxy_client_service_name }}" zenith_proxy_mitm_container_name: "{{ zenith_proxy_mitm_service_name }}" -zenith_proxy_image_tag: '0.14.0' +zenith_proxy_image_tag: "0.14.0" zenith_proxy_client_image_repository: ghcr.io/azimuth-cloud/zenith-client zenith_proxy_client_image: "{{ zenith_proxy_client_image_repository }}:{{ zenith_proxy_image_tag }}" @@ -27,14 +27,12 @@ zenith_proxy_upstream_scheme: http zenith_proxy_upstream_host: "{{ undef(hint = 'zenith_proxy_upstream_host is required') }}" zenith_proxy_upstream_port: "{{ undef(hint = 'zenith_proxy_upstream_port is required') }}" zenith_proxy_upstream_read_timeout: - zenith_proxy_client_token: "{{ undef(hint = 'zenith_proxy_client_token is required') }}" zenith_proxy_client_auth_skip: false zenith_proxy_client_auth_params: {} - -zenith_proxy_mitm_enabled: no +zenith_proxy_mitm_enabled: false zenith_proxy_mitm_listen_port: 8080 -zenith_proxy_mitm_auth_inject: none # valid values are 'basic' and 'bearer' +zenith_proxy_mitm_auth_inject: none # valid values are 'basic' and 'bearer' zenith_proxy_mitm_auth_basic_username: >- {{ undef(hint = 'zenith_proxy_mitm_auth_basic_username is required') diff --git a/ansible/roles/zenith_proxy/files/podman-pod-infra-attach.sh b/ansible/roles/zenith_proxy/files/podman-pod-infra-attach.sh old mode 100644 new mode 100755 index aab232a0a..0cdfae274 --- a/ansible/roles/zenith_proxy/files/podman-pod-infra-attach.sh +++ b/ansible/roles/zenith_proxy/files/podman-pod-infra-attach.sh @@ 
-14,4 +14,4 @@ echo "[INFO] Finding infra container for pod '$1'" INFRA_CONTAINER_ID="$(podman pod inspect --format '{{.InfraContainerID}}' "$1")" echo "[INFO] Attaching to infra container '${INFRA_CONTAINER_ID}'" -exec podman container attach --no-stdin ${INFRA_CONTAINER_ID} +exec podman container attach --no-stdin "${INFRA_CONTAINER_ID}" diff --git a/ansible/roles/zenith_proxy/tasks/main.yml b/ansible/roles/zenith_proxy/tasks/main.yml index 1a42b0438..db02dfe96 100644 --- a/ansible/roles/zenith_proxy/tasks/main.yml +++ b/ansible/roles/zenith_proxy/tasks/main.yml @@ -1,68 +1,72 @@ --- - - name: Install script for attaching to pod infra containers - copy: + ansible.builtin.copy: src: podman-pod-infra-attach.sh dest: /usr/bin/ mode: +x become: true - name: Create systemd unit for Zenith pod - template: + ansible.builtin.template: src: pod.service.j2 dest: /etc/systemd/system/{{ zenith_proxy_service_name }}.service + mode: "0644" become: true register: zenith_proxy_pod_systemd_unit - name: Ensure Zenith pod is started and enabled - service: + ansible.builtin.service: name: "{{ zenith_proxy_service_name }}.service" state: "{{ 'restarted' if zenith_proxy_pod_systemd_unit is changed else 'started' }}" - enabled: yes + enabled: true daemon_reload: "{{ zenith_proxy_pod_systemd_unit is changed }}" become: true -- block: +- become: true + when: zenith_proxy_mitm_enabled + + block: - name: Create systemd unit file for MITM proxy - template: + ansible.builtin.template: src: mitm.service.j2 dest: /etc/systemd/system/{{ zenith_proxy_mitm_service_name }}.service + mode: "0644" register: zenith_proxy_mitm_systemd_unit - name: Ensure MITM proxy is started and enabled - service: + ansible.builtin.service: name: "{{ zenith_proxy_mitm_service_name }}.service" state: "{{ 'restarted' if zenith_proxy_mitm_systemd_unit is changed else 'started' }}" - enabled: yes + enabled: true daemon_reload: "{{ zenith_proxy_mitm_systemd_unit is changed }}" - become: true - when: zenith_proxy_mitm_enabled - - name: Ensure Zenith config directory exists - file: + ansible.builtin.file: path: /etc/zenith/{{ zenith_proxy_service_name }} state: directory + mode: "0755" become: true - name: Write Zenith client configuration - template: + ansible.builtin.template: src: zenith-client.yaml.j2 dest: /etc/zenith/{{ zenith_proxy_service_name }}/client.yaml + mode: "0644" become: true register: zenith_proxy_client_config_file - name: Create directory to persist SSH key - file: + ansible.builtin.file: path: "{{ appliances_state_dir }}/{{ zenith_proxy_service_name }}-ssh" state: directory owner: "{{ zenith_proxy_podman_user }}" group: "{{ zenith_proxy_podman_user }}" + mode: "0755" become: true - name: Initialise Zenith client # Use a foreground command rather than the podman_container module as I could not # work out the combination of parameters that produced the desired behaviour :-( - command: >- + ansible.builtin.command: >- podman run --name {{ zenith_proxy_service_name }}-init --replace @@ -79,14 +83,15 @@ "token has already been used" not in zenith_proxy_client_init.stderr - name: Create systemd unit file for Zenith client - template: + ansible.builtin.template: src: client.service.j2 dest: /etc/systemd/system/{{ zenith_proxy_client_service_name }}.service + mode: "0644" become: true register: zenith_proxy_client_systemd_unit - name: Ensure Zenith client is started and enabled - service: + ansible.builtin.service: name: "{{ zenith_proxy_client_service_name }}.service" state: >- {{ @@ -98,6 +103,6 @@ ) else 'started' }} - enabled: yes 
+ enabled: true daemon_reload: "{{ zenith_proxy_client_systemd_unit is changed }}" become: true diff --git a/ansible/site.yml b/ansible/site.yml index d973d9cb3..c26f0a428 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -1,5 +1,4 @@ --- - - name: Run pre.yml hook vars: # hostvars not available here, so have to recalculate environment root: @@ -39,10 +38,8 @@ - name: Clean up and shutdown Packer VM hosts: builder - gather_facts: no - become: yes + gather_facts: false + become: true tasks: - - import_tasks: cleanup.yml + - ansible.builtin.import_tasks: cleanup.yml - community.general.shutdown: - -... \ No newline at end of file diff --git a/ansible/slurm.yml b/ansible/slurm.yml index d1bb93a9f..4f4072d60 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -1,65 +1,64 @@ --- - - name: Setup DB hosts: mysql become: true tags: - mysql tasks: - - include_role: - name: mysql + - ansible.builtin.include_role: + name: mysql - name: Setup slurm-driven rebuild hosts: rebuild:!builder - become: yes + become: true tags: - rebuild - openhpc tasks: - - import_role: + - ansible.builtin.import_role: name: rebuild - name: Set locked memory limits on user-facing nodes hosts: - compute - login - become: yes + become: true tags: - openhpc tasks: - - name: set memory limits - lineinfile: + - name: Set memory limits + ansible.builtin.lineinfile: path: /etc/security/limits.conf - regexp: '\* soft memlock unlimited' + regexp: "\\* soft memlock unlimited" line: "* soft memlock unlimited" - name: Block ssh to compute nodes for non-privileged users without running jobs hosts: compute - become: yes + become: true tags: - openhpc tasks: - name: Configure sshd pam module - blockinfile: + ansible.builtin.blockinfile: path: /etc/pam.d/sshd - insertafter: 'account\s+required\s+pam_nologin.so' + insertafter: "account\\s+required\\s+pam_nologin.so" block: | account sufficient pam_access.so account required pam_slurm.so - name: Configure login access control - blockinfile: + ansible.builtin.blockinfile: path: /etc/security/access.conf block: | +:adm:ALL -:ALL:ALL - # vagrant uses (deprecated) ansible_ssh_user + # vagrant uses (deprecated) ansible_ssh_user - name: Setup slurm hosts: openhpc - become: yes + become: true tags: - openhpc tasks: - - include_role: + - ansible.builtin.include_role: name: stackhpc.openhpc tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'main.yml' }}" diff --git a/ansible/validate.yml b/ansible/validate.yml index e307ec649..12076b7d3 100644 --- a/ansible/validate.yml +++ b/ansible/validate.yml @@ -1,12 +1,11 @@ --- - # Fail early if configuration is invalid - name: Validate secrets created hosts: localhost gather_facts: false tasks: - - import_role: + - ansible.builtin.import_role: name: passwords tasks_from: validate.yml @@ -14,7 +13,7 @@ hosts: all gather_facts: false tasks: - - assert: + - ansible.builtin.assert: that: groups['control'] | length fail_msg: "no hosts found in group 'control' - has control node been deployed?" @@ -23,10 +22,10 @@ gather_facts: false tags: openhpc tasks: - - import_role: + - ansible.builtin.import_role: name: stackhpc.openhpc tasks_from: validate.yml - - assert: + - ansible.builtin.assert: # noqa: run-once[task] that: "'enable_configless' in openhpc_config.SlurmctldParameters | default([])" fail_msg: | 'enable_configless' not found in openhpc_config.SlurmctldParameters - is variable openhpc_config overridden? 
@@ -40,7 +39,7 @@ gather_facts: false tags: filebeat tasks: - - import_role: + - ansible.builtin.import_role: name: filebeat tasks_from: validate.yml tags: validate @@ -55,17 +54,17 @@ - openondemand_server - grafana tasks: - - import_role: + - ansible.builtin.import_role: name: openondemand tasks_from: validate.yml - # This set of tasks will run if there are grafana hosts configured. - # It is a valid configuration to have a grafana group with hosts + # This set of tasks will run if there are grafana hosts configured. + # It is a valid configuration to have a grafana group with hosts # when *not* deploying openondemand. This would mean that openondemand # vars validated in the below task are not set in a way that passes # this set of validation tasks. To ensure that this validation does # not fail with a valid config, only run these tasks when the # openondemand group both exists *and* contains hosts. - when: + when: - "'openondemand' in groups" - groups['openondemand'] | length > 0 tags: @@ -77,7 +76,7 @@ hosts: freeipa tags: freeipa tasks: - - import_role: + - ansible.builtin.import_role: name: freeipa tasks_from: validate.yml @@ -85,6 +84,6 @@ hosts: lustre tags: lustre tasks: - - import_role: + - ansible.builtin.import_role: name: lustre tasks_from: validate.yml diff --git a/dev/ansible-ssh b/dev/ansible-ssh index 1e7bf756a..b2e13ff47 100755 --- a/dev/ansible-ssh +++ b/dev/ansible-ssh @@ -1,23 +1,28 @@ #!/usr/bin/env python3 # This tool allows you to ssh into a host using the ansible inventory. -# Example: ansible-ssh compute[0] -o GlobalKnownHostsFile=/dev/null -o UserKnownHostsFile=/dev/null +# Example: ansible-ssh compute[0] -o GlobalKnownHostsFile=/dev/null -o +# UserKnownHostsFile=/dev/null -import sys -import subprocess -import shlex import json import os +import shlex +import subprocess +import sys from collections import defaultdict + def _optional_arg(prototype, *values): # returns empty string if any of the values are falsey filtered = [value for value in values if value] return prototype.format(*values) if len(values) == len(filtered) else "" + if __name__ == "__main__": if len(sys.argv) < 2: - msg = (f"Usage: {sys.argv[0]} [args to pass to ssh]") + msg = ( + f"Usage: { + sys.argv[0]} [args to pass to ssh]") print(msg, file=sys.stderr) sys.exit(-1) @@ -25,7 +30,8 @@ if __name__ == "__main__": host = shlex.quote(sys.argv[1]) try: - output = subprocess.check_output(f'ansible-inventory --host { host }', shell=True) + output = subprocess.check_output( + f'ansible-inventory --host {host}', shell=True) except (subprocess.CalledProcessError) as e: msg = (f"[ERROR]: Is {host} missing from the inventory?") print(msg, file=sys.stderr) @@ -56,7 +62,5 @@ if __name__ == "__main__": base = shlex.split(f'ssh {port} {identity} {opts}') extras = sys.argv[2:] cmd = base + extras + [host] - print(f"[INFO]: Running: { subprocess.list2cmdline(cmd) }") - os.execvp(cmd[0],cmd) - - + print(f"[INFO]: Running: {subprocess.list2cmdline(cmd)}") + os.execvp(cmd[0], cmd) diff --git a/dev/delete-cluster.py b/dev/delete-cluster.py index 05f53fbfa..b8f24b13a 100755 --- a/dev/delete-cluster.py +++ b/dev/delete-cluster.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python # pylint: disable=invalid-name """ Delete infrastructure for a cluster without using Terraform. Useful for CI clusters. @@ -10,41 +10,53 @@ If --force is provided, it will delete all resources without confirmation. 
""" -import sys, json, subprocess +import json +import subprocess +import sys +CLUSTER_RESOURCES = ["server", "port", "volume"] -CLUSTER_RESOURCES = ['server', 'port', 'volume'] +# pylint: disable-next=missing-function-docstring, redefined-outer-name def delete_cluster(cluster_prefix, force=False): + to_delete = {} for resource_type in CLUSTER_RESOURCES: to_delete[resource_type] = [] - resource_list = subprocess.run(f'openstack {resource_type} list --format json', stdout=subprocess.PIPE, shell=True) + resource_list = subprocess.run( # pylint: disable=subprocess-run-check + f"openstack {resource_type} list --format json", + stdout=subprocess.PIPE, + shell=True, + ) resources = json.loads(resource_list.stdout) for item in resources: try: - if item['Name'] is not None and item['Name'].startswith(cluster_prefix): - print(resource_type, item['Name'], item['ID']) + if item["Name"] is not None and item["Name"].startswith(cluster_prefix): + print(resource_type, item["Name"], item["ID"]) to_delete[resource_type].append(item) - except: + except BaseException: print(resource_type, item) raise - - if force or input('Delete these (y/n)?:') == 'y': + + if force or input("Delete these (y/n)?:") == "y": for resource_type in CLUSTER_RESOURCES: - items = [v['ID'] for v in to_delete[resource_type]] + items = [v["ID"] for v in to_delete[resource_type]] if items: # delete all resources of each type in a single call for speed: - subprocess.run(f"openstack {resource_type} delete {' '.join(items)}", stdout=subprocess.PIPE, shell=True) - print(f'Deleted {len(items)} {resource_type}s') + subprocess.run( # pylint: disable=subprocess-run-check + f"openstack {resource_type} delete {' '.join(items)}", + stdout=subprocess.PIPE, + shell=True, + ) + print(f"Deleted {len(items)} {resource_type}s") else: - print('Cancelled - no resources deleted') + print("Cancelled - no resources deleted") -if __name__ == '__main__': + +if __name__ == "__main__": if len(sys.argv) < 2 or len(sys.argv) > 3: - print('ERROR: Incorrect argument(s).\n' + __doc__) - exit(1) - force_flag = '--force' in sys.argv + print("ERROR: Incorrect argument(s).\n" + __doc__) + exit(1) # pylint: disable=consider-using-sys-exit + force_flag = "--force" in sys.argv cluster_prefix = sys.argv[1] delete_cluster(cluster_prefix, force_flag) - diff --git a/dev/extract_logs.py b/dev/extract_logs.py index 65df0140e..3aecd352e 100644 --- a/dev/extract_logs.py +++ b/dev/extract_logs.py @@ -1,7 +1,7 @@ #!/usr/bin/env python """ -Process packer build workflow logs into CSV. Useful for timing +Process packer build workflow logs into CSV. Useful for timing dissemination. 
Usage: @@ -13,70 +13,94 @@ import csv import re -import os import sys -def convert_time_to_seconds(time_str): - h, m, s = time_str.split(':') + +def convert_time_to_seconds(time_str): # pylint: disable=missing-function-docstring + h, m, s = time_str.split(":") return int(h) * 3600 + int(m) * 60 + float(s) -def extract_log_info_and_generate_csv(log_file_path, output_csv_path, target_directory): + +# pylint: disable-next=missing-function-docstring, too-many-locals +def extract_log_info_and_generate_csv( + # pylint: disable=redefined-outer-name + log_file_path, + output_csv_path, + target_directory, + # pylint: enable=redefined-outer-name +): data = [] - unwanted_chars = re.compile(r'(\x1B\[[0-9;]*m)|([^\x00-\x7F])') + unwanted_chars = re.compile(r"(\x1B\[[0-9;]*m)|([^\x00-\x7F])") - with open(log_file_path, 'r') as file: + with open(log_file_path, "r") as file: # pylint: disable=unspecified-encoding lines = file.readlines() previous_task = None - for i in range(len(lines)): + for i in range(len(lines)): # pylint: disable=consider-using-enumerate if "TASK [" in lines[i]: - task_name = lines[i].strip().split('TASK [')[1].split(']')[0] + task_name = lines[i].strip().split("TASK [")[1].split("]")[0] - full_task_path = lines[i + 1].strip().split('task path: ')[1] + full_task_path = lines[i + 1].strip().split("task path: ")[1] if target_directory in full_task_path: - start_index = full_task_path.find(target_directory) + len(target_directory) + start_index = full_task_path.find(target_directory) + len( + target_directory + ) partial_task_path = full_task_path[start_index:] else: partial_task_path = full_task_path - partial_task_path = unwanted_chars.sub('', partial_task_path).strip() + partial_task_path = unwanted_chars.sub("", partial_task_path).strip() - time_to_complete = lines[i + 2].strip().split('(')[1].split(')')[0] + time_to_complete = lines[i + 2].strip().split("(")[1].split(")")[0] if previous_task: - previous_task[2] = time_to_complete # Shift the time to the previous task + # pylint: disable-next=unsupported-assignment-operation + previous_task[2] = ( + time_to_complete # Shift the time to the previous task + ) data.append(previous_task) - previous_task = [task_name, partial_task_path, None] # Placeholder for the next time_to_complete + previous_task = [ + task_name, + partial_task_path, + None, + ] # Placeholder for the next time_to_complete if previous_task: - previous_task[2] = time_to_complete if time_to_complete else 'N/A' + previous_task[2] = time_to_complete if time_to_complete else "N/A" data.append(previous_task) for row in data: - if row[2] != 'N/A': + if row[2] != "N/A": row[2] = convert_time_to_seconds(row[2]) data.sort(key=lambda x: x[2], reverse=True) for row in data: if isinstance(row[2], float): - row[2] = f'{int(row[2] // 3600):02}:{int((row[2] % 3600) // 60):02}:{row[2] % 60:.3f}' + row[2] = ( + f"{int(row[2] // 3600):02}:{int((row[2] % 3600) // 60):02}:{row[2] % 60:.3f}" + ) - with open(output_csv_path, 'w', newline='') as csvfile: + # pylint: disable-next=unspecified-encoding + with open(output_csv_path, "w", newline="") as csvfile: csvwriter = csv.writer(csvfile) - csvwriter.writerow(['Task Name', 'Task Path', 'Time to Complete']) + csvwriter.writerow(["Task Name", "Task Path", "Time to Complete"]) csvwriter.writerows(data) print(f"Data extracted, sorted, and saved to {output_csv_path}") - + + if len(sys.argv) != 2: - print("Path to workflow log plain text file should be provided as the only arg to this script") + print( + "Path to workflow log plain text file 
should be provided as the only arg to this script" + ) sys.exit(1) -log_file_path = sys.argv[1] # Input workflow log name -output_csv_path = log_file_path.replace('.txt', '.csv') # Output CSV name -target_directory = '/ansible/' # Shared directory for task path +log_file_path = sys.argv[1] # Input workflow log name +output_csv_path = log_file_path.replace(".txt", ".csv") # Output CSV name +# pylint: disable-next=invalid-name +target_directory = "/ansible/" # Shared directory for task path extract_log_info_and_generate_csv(log_file_path, output_csv_path, target_directory) diff --git a/dev/image-share.sh b/dev/image-share.sh index 93a57ca53..f109f162f 100755 --- a/dev/image-share.sh +++ b/dev/image-share.sh @@ -13,18 +13,18 @@ DEST=$2 IMAGE_NAME=$3 export OS_CLOUD=$SOURCE -SOURCE_PROJECT=$(openstack project show -c id -f value $SOURCE) +SOURCE_PROJECT=$(openstack project show -c id -f value "$SOURCE") export OS_CLOUD=$DEST -DEST_PROJECT=$(openstack project show -c id -f value $DEST) +DEST_PROJECT=$(openstack project show -c id -f value "$DEST") export OS_CLOUD=$SOURCE -IMAGE=$(openstack image show -c id -f value $IMAGE_NAME) +IMAGE=$(openstack image show -c id -f value "$IMAGE_NAME") echo "Sharing $IMAGE_NAME ($IMAGE) from $SOURCE ($SOURCE_PROJECT) ..." -openstack image set --shared $IMAGE +openstack image set --shared "$IMAGE" echo "Adding destination project $DEST ($DEST_PROJECT) ..." -openstack image add project $IMAGE $DEST_PROJECT +openstack image add project "$IMAGE" "$DEST_PROJECT" export OS_CLOUD=$DEST echo "Accepting share ..." -openstack image set --accept $IMAGE +openstack image set --accept "$IMAGE" echo "Done" diff --git a/dev/output_manifest.py b/dev/output_manifest.py index b68ed494a..39a6776a1 100755 --- a/dev/output_manifest.py +++ b/dev/output_manifest.py @@ -1,4 +1,5 @@ -#!/usr/bin/env python +#!/usr/bin/env python # pylint: disable=missing-module-docstring +# pylint: disable=line-too-long # Set github workflow output parameters defining image IDs from a packer manifest. # Usage: # ./packer/read_manifest.py packer/packer-manifest.json @@ -10,14 +11,23 @@ # which can be used in subsequent workflow steps: [1] # # [1]: https://docs.github.com/en/actions/using-workflows/workflow-commands-for-github-actions#example-setting-a-value +# pylint: enable=line-too-long + +import json +import sys -import sys, json output = {} -with open(sys.argv[1]) as f: +with open(sys.argv[1]) as f: # pylint: disable=unspecified-encoding data = json.load(f) -for build in data['builds']: - node_type = build['custom_data']['source'] - image_id = build['artifact_id'] - output[node_type] = image_id # NB: this deliberately gets the LAST build for a node type +for build in data["builds"]: + node_type = build["custom_data"]["source"] + image_id = build["artifact_id"] + output[node_type] = ( + image_id # NB: this deliberately gets the LAST build for a node type + ) for node_type, image_id in output.items(): - print('::set-output name=NEW_%s_IMAGE_ID::%s' % (node_type.upper(), image_id)) + print( + # pylint: disable-next=consider-using-f-string + "::set-output name=NEW_%s_IMAGE_ID::%s" + % (node_type.upper(), image_id) + ) diff --git a/dev/setup-env.sh b/dev/setup-env.sh index d2f6ae0c4..51db17cbb 100755 --- a/dev/setup-env.sh +++ b/dev/setup-env.sh @@ -5,33 +5,35 @@ set -euo pipefail PYTHON_VERSION=${PYTHON_VERSION:-} if [[ "$PYTHON_VERSION" == "" ]]; then - if [[ -f /etc/os-release ]]; then - . 
/etc/os-release
-        OS=$ID
-        OS_VERSION=$VERSION_ID
-    else
-        exit 1
-    fi
+  if [[ -f /etc/os-release ]]; then
+    # shellcheck disable=SC1091
+    . /etc/os-release
+    OS=$ID
+    OS_VERSION=$VERSION_ID
+  else
+    exit 1
+  fi

-    MAJOR_VERSION=$(echo $OS_VERSION | cut -d. -f1)
+  MAJOR_VERSION=$(echo "$OS_VERSION" | cut -d. -f1)

-    if [[ "$OS" == "ubuntu" && "$MAJOR_VERSION" == "22" ]]; then
-        PYTHON_VERSION="/usr/bin/python3.10"
-    elif [[ "$OS" == "rocky" && "$MAJOR_VERSION" == "8" ]]; then
-        # python3.9+ doesn't have selinux bindings
-        PYTHON_VERSION="/usr/bin/python3.8" # use `sudo yum install python38` on Rocky Linux 8 to install this
-    elif [[ "$OS" == "rocky" && "$MAJOR_VERSION" == "9" ]]; then
-        PYTHON_VERSION="/usr/bin/python3.9"
-    else
-        echo "Unsupported OS version: $OS $MAJOR_VERSION"
-        exit 1
-    fi
+  if [[ "$OS" == "ubuntu" && "$MAJOR_VERSION" == "22" ]]; then
+    PYTHON_VERSION="/usr/bin/python3.10"
+  elif [[ "$OS" == "rocky" && "$MAJOR_VERSION" == "8" ]]; then
+    # python3.9+ doesn't have selinux bindings
+    PYTHON_VERSION="/usr/bin/python3.8" # use `sudo yum install python38` on Rocky Linux 8 to install this
+  elif [[ "$OS" == "rocky" && "$MAJOR_VERSION" == "9" ]]; then
+    PYTHON_VERSION="/usr/bin/python3.9"
+  else
+    echo "Unsupported OS version: $OS $MAJOR_VERSION"
+    exit 1
+  fi
fi

if [[ ! -d "venv" ]]; then
-    $PYTHON_VERSION -m venv venv
+  $PYTHON_VERSION -m venv venv
fi
+# shellcheck disable=SC1091
. venv/bin/activate
pip install -U pip
pip install -r requirements.txt
diff --git a/docs/README.md b/docs/README.md
index dfa914468..c66868a73 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,6 +1,6 @@
 # StackHPC Slurm Appliance Documentation

-### Operator docs
+## Operator docs

 [Image build](image-build.md)

@@ -16,7 +16,7 @@

 [Sequence diagrams](sequence.md)

-### Configuration docs
+## Configuration docs

 [Alerting](alerting.md)

@@ -32,7 +32,7 @@

 [Persistent state](persistent-state.md)

-#### Experimental fetaures
+### Experimental features

 [Compute init](experimental/compute-init.md)

@@ -40,6 +40,6 @@

 [Slurm controlled rebuild](experimental/slurm-controlled-rebuild.md)

-### Contributor docs
+## Contributor docs

 [Adding functionality](adding-functionality.md)
diff --git a/docs/adding-functionality.md b/docs/adding-functionality.md
index 69d3b3a3f..53c705acb 100644
--- a/docs/adding-functionality.md
+++ b/docs/adding-functionality.md
@@ -1,9 +1,10 @@
 # Adding new functionality

 Please contact us for specific advice, but this generally involves:
+
 - Adding a role.
 - Adding a play calling that role into an existing playbook in `ansible/`, or adding a new playbook there and updating `site.yml`.
 - Adding a new (empty) group named after the role into `environments/common/inventory/groups` and a non-empty example group into `environments/common/layouts/everything`.
 - Adding new default group vars into `environments/common/inventory/group_vars/all//`.
 - Updating the default Packer build variables in `environments/common/inventory/group_vars/builder/defaults.yml`.
-- Updating READMEs.
+- Updating readmes.
diff --git a/docs/alerting.md b/docs/alerting.md
index b53c0fa40..e42530c5a 100644
--- a/docs/alerting.md
+++ b/docs/alerting.md
@@ -4,10 +4,10 @@

The [prometheus.io docs](https://prometheus.io/docs/alerting/latest/overview/)
describe the overall alerting process:

> Alerting with Prometheus is separated into two parts. Alerting rules in
- Prometheus servers send alerts to an Alertmanager.
The Alertmanager then
- manages those alerts, including silencing, inhibition, aggregation and
- sending out notifications via methods such as email, on-call notification
- systems, and chat platforms.
+> Prometheus servers send alerts to an Alertmanager. The Alertmanager then
+> manages those alerts, including silencing, inhibition, aggregation and
+> sending out notifications via methods such as email, on-call notification
+> systems, and chat platforms.

The general Prometheus configuration is described in
[monitoring-and-logging.md](./monitoring-and-logging.md#defaults-3) - note that
@@ -20,39 +20,36 @@
must be configured to generate notifications.

## Enabling alertmanager

-1. Ensure both the `prometheus` and `alertmanager` servers are deployed on the
-control node - for new environments the `cookiecutter` tool will have done
-this:
+1. Ensure both the `prometheus` and `alertmanager` servers are deployed on the control node - for new environments the `cookiecutter` tool will have done
+   this:

-    ```ini
-    # environments/site/groups:
-    [prometheus:children]
-    control
-
-    [alertmanager:children]
-    control
-    ```
+```ini
+# environments/site/groups:
+[prometheus:children]
+control

+[alertmanager:children]
+control
+```

-2. If the appliance was deployed before the alertmanager functionality was included,
-generate a password for the alertmanager UI user:
+2. If the appliance was deployed before the alertmanager functionality was included, generate a password for the alertmanager UI user:

-    ```shell
-    ansible-playbook ansible/adhoc/generate-passwords.yml
-    ```
+```shell
+ansible-playbook ansible/adhoc/generate-passwords.yml
+```

3. Configure a receiver to generate notifications from alerts. Currently a Slack
-integration is provided (see below) but alternative receivers could be defined
-via overriding role defaults.
-
+   integration is provided (see below) but alternative receivers could be defined
+   via overriding role defaults.
+
4. If desired, any other [role defaults](../ansible/roles/alertmanager/README.md)
-may be overriden in e.g. `environments/site/inventory/group_vars/all/alertmanager.yml`.
+   may be overridden in e.g. `environments/site/inventory/group_vars/all/alertmanager.yml`.

5. Run the `monitoring.yml` playbook (if the cluster is already up) to configure
-both alertmanager and prometheus:
+   both alertmanager and prometheus:

-    ```shell
-    ansible-playbook ansible/monitoring.yml
-    ```
+```shell
+ansible-playbook ansible/monitoring.yml
+```

## Access

@@ -76,7 +73,7 @@
of alerts via Slack.

1. Create an app with a bot token:

- Go to https://api.slack.com/apps
- select "Create an App"
- select "From scratch"
- Set app name and workspace fields, select "Create"
@@ -93,16 +90,20 @@
- Uncomment `vault_alertmanager_slack_integration_app_creds` and add the token
- Vault-encrypt that file:

-      ansible-vault encrypt environments/$ENV/inventory/group_vars/all/vault_alertmanager.yml
+```shell
+ansible-vault encrypt environments/$ENV/inventory/group_vars/all/vault_alertmanager.yml
+```

- Open `environments/$ENV/inventory/group_vars/all/alertmanager.yml`
- Uncomment the `alertmanager_slack_integration` mapping and set your alert
  channel name

3.
Invite the bot to your alerts channel

-- In the appropriate Slack channel type:

-    /invite @YOUR_BOT_NAME
+- In the appropriate Slack channel type:

+```text
+/invite @YOUR_BOT_NAME
+```

## Alerting Rules

@@ -112,15 +113,16 @@
which is defined for the appliance at

Two [cloudalchemy.prometheus](https://github.com/cloudalchemy/ansible-prometheus)
role variables are relevant:
+
- `prometheus_alert_rules_files`: Paths to check for files providing rules.
Note these are copied to Prometheus config directly, so jinja expressions for
Prometheus do not need escaping.
- `prometheus_alert_rules`: Yaml-format rules. Jinja templating here will be
-interpolated by Ansible, so templating intended for Prometheus must be escaped
-using `{% raw %}`/`{% endraw %}` tags.
+  interpolated by Ansible, so templating intended for Prometheus must be escaped
+  using `{% raw %}`/`{% endraw %}` tags.

By default, `prometheus_alert_rules_files` is set so that any `*.rules` files
-in a directory `files/prometheus/rules` in the current environment or *any*
+in a directory `files/prometheus/rules` in the current environment or _any_
parent environment are loaded. So usually, site-specific alerts should be added
by creating additional rules files in `environments/site/files/prometheus/rules`.
If the same file exists in more than one environment, the "child" file will take
@@ -128,6 +130,7 @@
precedence and any rules in the "parent" file will be ignored.

A set of default alert rule files is provided at
`environments/common/files/prometheus/rules/`. These cover:
+
- Some node-exporter metrics for disk, filesystems, memory and clock. Note no
  alerts are triggered on memory for compute nodes due to the intended use of
  those nodes.
@@ -137,6 +140,7 @@
These cover:
When defining additional rules, note the [labels defined](./monitoring-and-logging.md#prometheus_node_exporter_targets)
for node-exporter targets.
In future more alerts may be added for:
+
- smartctl-exporter-based rules for baremetal nodes where there is no
  infrastructure-level smart monitoring
- loss of "up" network interfaces
diff --git a/docs/chrony.md b/docs/chrony.md
index 0d6f8b100..a80cd4081 100644
--- a/docs/chrony.md
+++ b/docs/chrony.md
@@ -4,7 +4,7 @@
Use variables from the [mrlesmithjr.chrony](https://github.com/mrlesmithjr/ansib

For example in: `environments//inventory/group_vars/all/chrony`:

-```
+```yaml
---
chrony_ntp_servers:
  - server: ntp-0.example.org
@@ -17,5 +17,4 @@
chrony_ntp_servers:
      - option: iburst
      - option: minpoll
        val: 8
-
```
diff --git a/docs/ci.md b/docs/ci.md
index c6fa8900d..1352649f5 100644
--- a/docs/ci.md
+++ b/docs/ci.md
@@ -2,7 +2,6 @@

The `.github` directory contains a set of sample workflows which can be used by downstream site-specific configuration repositories to simplify ongoing maintenance tasks. These include:

-- An [upgrade check](.github/workflows/upgrade-check.yml.sample) workflow which automatically checks this upstream stackhpc/ansible-slurm-appliance repo for new releases and proposes a pull request to the downstream site-specific repo when a new release is published.
+- An [upgrade check](.github/workflows/upgrade-check.yml.sample) workflow which automatically checks this upstream stackhpc/ansible-slurm-appliance repository for new releases and proposes a pull request to the downstream site-specific repository when a new release is published.
- An [image upload](.github/workflows/upload-s3-image.yml.sample) workflow which takes an image name, downloads it from StackHPC's public S3 bucket if available, and uploads it to the target OpenStack cloud.
-
diff --git a/docs/environments.md b/docs/environments.md
index d1c492312..3b5a396bf 100644
--- a/docs/environments.md
+++ b/docs/environments.md
@@ -3,6 +3,7 @@

## Overview

An environment defines the configuration for a single instantiation of this Slurm appliance. Each environment is a directory in `environments/`, containing:
+
- Any deployment automation required - e.g. OpenTofu configuration or HEAT templates.
- An Ansible `inventory/` directory.
- An `activate` script which sets environment variables to point to this configuration.
@@ -13,18 +14,24 @@
All environments load the inventory from the `common` environment first, with th

### Environment-specific inventory structure

The ansible inventory for the environment is in `environments//inventory/`. It should generally contain:
-- A `hosts` file. This defines the hosts in the appliance. Generally it should be templated out by the deployment automation so it is also a convenient place to define variables which depend on the deployed hosts such as connection variables, IP addresses, ssh proxy arguments etc.
-- A `groups` file defining ansible groups, which essentially controls which features of the appliance are enabled and where they are deployed. This repository generally follows a convention where functionality is defined using ansible roles applied to a group of the same name, e.g. `openhpc` or `grafana`. The meaning and use of each group is described in comments in `environments/common/inventory/groups`. As the groups defined there for the common environment are empty, functionality is disabled by default and must be enabled in a specific environment's `groups` file. Two template examples are provided in `environments/commmon/layouts/` demonstrating a minimal appliance with only the Slurm cluster itself, and an appliance with all functionality.
+
+- A `hosts` file. This defines the hosts in the appliance. Generally it should be templated out by the deployment automation so it is also a convenient place to define variables which depend on the deployed hosts such as connection variables, IP addresses, SSH proxy arguments etc.
+- A `groups` file defining ansible groups, which essentially controls which features of the appliance are enabled and where they are deployed.
+  This repository generally follows a convention where functionality is defined using ansible roles applied to a group of the same name, e.g. `openhpc` or `grafana`.
+  The meaning and use of each group is described in comments in `environments/common/inventory/groups`.
+  As the groups defined there for the common environment are empty, functionality is disabled by default and must be enabled in a specific environment's `groups` file.
+  Two template examples are provided in `environments/common/layouts/` demonstrating a minimal appliance with only the Slurm cluster itself, and an appliance with all functionality.
- Optionally, group variable files in `group_vars//overrides.yml`, where the group names match the functional groups described above. These can be used to override the default configuration for each functionality, which are defined in `environments/common/inventory/group_vars/all/.yml` (the use of `all` here is due to ansible's precedence rules).
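+
+As a purely illustrative sketch of this overrides convention (the group, variable and value shown are hypothetical - check the relevant role defaults for the real names):
+
+```yaml
+# environments/$ENV/inventory/group_vars/grafana/overrides.yml
+# Hypothetical example: a variable set here overrides the default defined
+# in environments/common/inventory/group_vars/all/grafana.yml.
+grafana_port: 3000
+```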
Although most of the inventory uses the group convention described above there are a few special cases:
+
- The `control`, `login` and `compute` groups are special as they need to contain actual hosts rather than child groups, and so should generally be defined in the templated-out `hosts` file.
- The cluster name must be set on all hosts using `openhpc_cluster_name`. Using an `[all:vars]` section in the `hosts` file is usually convenient.
- `environments/common/inventory/group_vars/all/defaults.yml` contains some variables which are not associated with a specific role/feature. These are unlikely to need changing, but if necessary that could be done using a `environments//inventory/group_vars/all/overrides.yml` file.
- The `ansible/adhoc/generate-passwords.yml` playbook sets secrets for all hosts in `environments//inventory/group_vars/all/secrets.yml`.
- The Packer-based pipeline for building compute images creates a VM in groups `builder` and `compute`, allowing build-specific properties to be set in `environments/common/inventory/group_vars/builder/defaults.yml` or the equivalent inventory-specific path.
- Each Slurm partition must have:
-  - An inventory group `_` defining the hosts it contains - these must be homogenous w.r.t CPU and memory.
-  - An entry in the `openhpc_slurm_partitions` mapping in `environments//inventory/group_vars/openhpc/overrides.yml`.
+  - An inventory group `_` defining the hosts it contains - these must be homogeneous w.r.t. CPU and memory.
+  - An entry in the `openhpc_slurm_partitions` mapping in `environments//inventory/group_vars/openhpc/overrides.yml`.
  See the [openhpc role documentation](https://github.com/stackhpc/ansible-role-openhpc#slurmconf) for more options.
- On an OpenStack cloud, rebuilding/reimaging compute nodes from Slurm can be enabled by defining a `rebuild` group containing the relevant compute hosts (e.g. in the generated `hosts` file).
diff --git a/docs/experimental/compute-init.md b/docs/experimental/compute-init.md
index 8b5d5e389..dfad27bcf 100644
--- a/docs/experimental/compute-init.md
+++ b/docs/experimental/compute-init.md
@@ -2,7 +2,7 @@

See the role README.md

-# Changes to image / tofu state
+## Changes to image / tofu state

When a compute group has the `ignore_image_changes` parameter set to true,
changes to the `image_id` parameter (which defaults to `cluster_image_id`) are
@@ -14,17 +14,21 @@
role templates out hostvars to the control node, which means the "target" image
ID is then available on the control node. Subsequent work will use this to
rebuild the node via slurm.

-# CI workflow
+## CI workflow

The compute node rebuild is tested in CI after the tests for rebuilding the
login and control nodes. The process is as follows:

1. Compute nodes are reimaged:

-   ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml
+```shell
+ansible-playbook -v --limit compute ansible/adhoc/rebuild.yml
+```

2. Ansible-init runs against newly reimaged compute nodes

3.
Run sinfo and check nodes have expected slurm state

-   ansible-playbook -v ansible/ci/check_slurm.yml \ No newline at end of file
+```shell
+ansible-playbook -v ansible/ci/check_slurm.yml
+```
diff --git a/docs/experimental/pulp.md b/docs/experimental/pulp.md
index c6b437d20..21d5c93b3 100644
--- a/docs/experimental/pulp.md
+++ b/docs/experimental/pulp.md
@@ -5,13 +5,16 @@
In order to ensure reproducible builds, the appliance can build images using rep

## Deploying/configuring Pulp Server

### Deploying a Pulp server
+
A playbook is provided to install and configure a Pulp server on a given host. Admin credentials for this server are automatically generated through the `ansible/adhoc/generate-passwords.yml` playbook. This can be run with `ansible-playbook ansible/adhoc/deploy-pulp.yml -e "pulp_server="` where `target_host` is any resolvable host. This will print a Pulp URL which can be copied to your environments as appropriate. Ensure that the server is accessible on the specified port. Note access to this server's content isn't authenticated so assumes the server is deployed behind a secure network.

### Using an existing Pulp server
+
An existing Pulp server can be used to host Ark repos by overriding `pulp_site_password` and `appliances_pulp_url` in the target environment. Note that this assumes the same configuration as the appliance deployed Pulp i.e. no content authentication.

## Syncing Pulp content with Ark

-If the `pulp` group is added to the Packer build groups, the local Pulp server will be synced with Ark on build. You must authenticate with Ark by overriding `pulp_site_upstream_username` and `pulp_site_upstream_password` with your vault encrypted Ark dev credentials. `dnf_repos_username` and `dnf_repos_password` must remain unset to access content from the local Pulp. Content can also be synced by running `ansible/adhoc/sync-pulp.yml`. By default this syncs repositories for Rocky 9.5 with x86_64 architecture, but can be overridden by setting extra variables for `pulp_site_target_arch`, `pulp_site_target_distribution`, `pulp_site_target_distribution_version` and `pulp_site_target_distribution_version_major`.
+If the `pulp` group is added to the Packer build groups, the local Pulp server will be synced with Ark on build. You must authenticate with Ark by overriding `pulp_site_upstream_username` and `pulp_site_upstream_password` with your vault encrypted Ark dev credentials. `dnf_repos_username` and `dnf_repos_password` must remain unset to access content from the local Pulp.
+Content can also be synced by running `ansible/adhoc/sync-pulp.yml`. By default this syncs repositories for Rocky 9.5 with x86_64 architecture, but can be overridden by setting extra variables for `pulp_site_target_arch`, `pulp_site_target_distribution`, `pulp_site_target_distribution_version` and `pulp_site_target_distribution_version_major`.
diff --git a/docs/experimental/slurm-controlled-rebuild.md b/docs/experimental/slurm-controlled-rebuild.md
index 7f9efa22c..fc654d354 100644
--- a/docs/experimental/slurm-controlled-rebuild.md
+++ b/docs/experimental/slurm-controlled-rebuild.md
@@ -9,6 +9,7 @@
This provides a way to upgrade nodes with less impact than the normal approach.
> or usage may change with further development.

In summary, the way this functionality works is as follows:
+
1. The image reference(s) are manually updated in the OpenTofu configuration
   in the normal way.
2.
`tofu apply` is run which rebuilds the login and control nodes to the new @@ -20,7 +21,7 @@ In summary, the way this functionality works is as follows: and control nodes and the old image for the compute nodes. This playbook also: - Writes cluster configuration to the control node, using the - [compute_init](../../ansible/roles/compute_init/README.md) role. + [compute_init](../../ansible/roles/compute_init/README.md) role. - Configures an application credential and helper programs on the control node, using the [rebuild](../../ansible/roles/rebuild/README.md) role. 4. An admin submits Slurm jobs, one for each node, to a special "rebuild" @@ -34,7 +35,7 @@ In summary, the way this functionality works is as follows: configuration, and if it does not match, uses OpenStack to rebuild the node to the desired (updated) image. TODO: Describe the logic if they DO match -6. After a rebuild, the compute node runs various Ansible tasks during boot, +6. After a rebuild, the compute node runs various Ansible tasks during boot, controlled by the [compute_init](../../ansible/roles/compute_init/README.md) role, to fully configure the node again. It retrieves the required cluster configuration information from the control node via an NFS mount. @@ -47,7 +48,7 @@ In summary, the way this functionality works is as follows: To enable a compute node to rejoin the cluster after a rebuild, functionality must be built into the image. Before progressing you should check that all the functionality required for your cluster is currently supported by the -`compute_init` role. Review that role's [README](../../ansible/roles/compute_init/README.md) +`compute_init` role. Review that role's [Readme](../../ansible/roles/compute_init/README.md) against `environments/*/inventory/groups` files (and any similar files which define groups). Note that some functionality does not require support, e.g. because it does not run on compute nodes. @@ -55,9 +56,10 @@ because it does not run on compute nodes. ## Configuration The configuration of this is complex and involves: + - OpenTofu variables to stop tracking image changes on compute nodes - Definition of partition(s) to use for launching rebuild jobs -- Configuration of the [rebuild](../../ansible/roles/rebuild/README.md) role +- Configuration of the [rebuild](../../ansible/roles/rebuild/README.md) role to enable the Slurm controller to rebuild compute nodes via OpenStack. - Configuration of the [compute_init](../../ansible/roles/compute_init/README.md) role so that compute nodes rejoin the cluster after rebuilding - this is likely @@ -71,107 +73,110 @@ The configuration of this is complex and involves: relevant node group in the OpenTofu `compute` variable, set the parameter `ignore_image_changes: true`. E.g. - ```terraform - # environments/$ENV/main.tf: - ... - compute = { - general = { - nodes = ["general-0", "general-1"] - ignore_image_changes = true - ... - } - gpu = { - node = ["a100-0", "a100-1"] - ignore_image_changes = true - ... - } +```terraform +# environments/$ENV/main.tf: +... +compute = { + general = { + nodes = ["general-0", "general-1"] + ignore_image_changes = true + ... } - ... - ``` + gpu = { + node = ["a100-0", "a100-1"] + ignore_image_changes = true + ... + } +} +... +``` -3. Follow the [compute_init](../../ansible/roles/compute_init/README.md) README +3. Follow the [compute_init](../../ansible/roles/compute_init/README.md) readme to add OpenTofu and Ansible configuration for that role. 
The "rebootable" nodes should all be in the `compute_init` group with the
   `compute_init_enable` OpenTofu parameter set.

-4. If the [compute_init](../../ansible/roles/compute_init/README.md) README
+4. If the [compute_init](../../ansible/roles/compute_init/README.md) readme
   showed that a custom image is required for any entry in the
   `compute_init_enable` parameter, follow the usual process to build new
   images as required.

5. Update image references in the OpenTofu configuration. Normally these
   should be in:
-   - `environments/site/tofu/variables.tf`: `cluster_image_id` for the default
-     cluster image.
-   - `environments/$ENV/tofu/main.tf`: parameter `image_id` in node groups
-     defined in the `compute` or `login` variables, to override the default
-     image for specific node groups.
-5. Ensure `openhpc_partitions` contains a partition covering the nodes to run
+
+   - `environments/site/tofu/variables.tf`: `cluster_image_id` for the default
+     cluster image.
+   - `environments/$ENV/tofu/main.tf`: parameter `image_id` in node groups
+     defined in the `compute` or `login` variables, to override the default
+     image for specific node groups.
+
+6. Ensure `openhpc_partitions` contains a partition covering the nodes to run
   rebuild jobs. The default definition in
   `environments/common/inventory/group_vars/all/openhpc.yml` will
   automatically include this via `openhpc_rebuild_partition` also in that
   file. If modifying this, note the important parameters are:
-
-   - `name`: Partition name matching `rebuild` role variable `rebuild_partitions`,
-     default `rebuild`.
-   - `groups`: A list of nodegroup names, matching `openhpc_nodegroup` and
-     keys in the OpenTofu `compute` variable (see example in step 2 above).
-     Normally every compute node group should be listed here, unless
-     Slurm-controlled rebuild is not required for certain node groups.
-   - `default`: Must be set to `NO` so that it is not the default partition.
-   - `maxtime`: Maximum time to allow for rebuild jobs, in
-     [slurm.conf format](https://slurm.schedmd.com/slurm.conf.html#OPT_MaxTime).
-     The example here is 30 minutes, but see discussion below.
-   - `partition_params`: A mapping of additional parameters, which must be set
-     as follows:
-     - `PriorityJobFactor`: Ensures jobs in this partition (i.e. rebuild jobs)
-       are always scheduled before jobs in "normal" partitions on the same
-       nodes. This value is the highest which can be set. See
-       [slurm.conf docs](https://slurm.schedmd.com/slurm.conf.html#OPT_PriorityJobFactor).
-       Note this is used instead of `PriorityTier` as the latter (with the
-       default appliance configuration) allows rebuild jobs to preempt and
-       suspend running user jobs, which is probably undesirable.
-     - `Hidden`: Don't show this partition in e.g. `sinfo` for unpriviledged
-       users.
-     - `RootOnly`: Only allow the root user to submit jobs to this partition.
-     - `DisableRootJobs`: Don't disable the root user, in case this parameter
-       is set globally via `openhpc_config_extra`.
-     - `PreemptMode`: Don't allow reboot jobs to be preempted/suspended.
-     - `OverSubscribe`: Ensure that jobs run in this partition require the
-       entire node. This means they do not run on nodes as the same time as
-       user jobs running in partitions allowing non-exclusive use.
-
-   The value for `maxtime` needs to be sufficent not just for a single node
-   to be rebuilt, but also to allow for any batching in either OpenTofu or
-   in Nova - see remarks in the [production docs](../production.md).
-
-   If it is desirable to roll out changes more gradually, it is possible to
-   create multiple "rebuild" partitions, but it is necessary that:
-   - The rebuild partitions should not themselves overlap, else nodes may be
-     rebuilt more than once.
-   - Each rebuild partition should entirely cover one or more "normal"
-     partitions, to avoid the possibility of user jobs being scheduled to a
-     mix of nodes using old and new images.
-
-6. Configure the [rebuild](../../ansible/roles/rebuild/README.md) role:
-   - Add the `control` node into the `rebuild` group.
-   - Ensure an application credential to use for rebuilding nodes is available
-     on the deploy host (default location `~/.config/openstack/clouds.yaml`).
-   - If required, override `rebuild_clouds_path` or other variables in the site
-     environment.
-
-7. Run `tofu apply` as usual to apply the new OpenTofu configuration.
-
-   > [!NOTE]
-   > If the cluster image references were updated at step 5, this will be
-   > a disruptive operation and should be planned as part of a normal upgrade
-   > cycle.
-
-   > [!CAUTION]
-   > Due to OpenTofu/Terraform state limitations, this will plan to delete and
-   > recreate all compute nodes in node groups where `ignore_image_changes: true`.
-   > was not previously set. This is a one-time issue with adding this
-   > parameter, i.e. subsequent applys will not require this.
+
+   - `name`: Partition name matching `rebuild` role variable `rebuild_partitions`,
+     default `rebuild`.
+   - `groups`: A list of nodegroup names, matching `openhpc_nodegroup` and
+     keys in the OpenTofu `compute` variable (see example in step 2 above).
+     Normally every compute node group should be listed here, unless
+     Slurm-controlled rebuild is not required for certain node groups.
+   - `default`: Must be set to `NO` so that it is not the default partition.
+   - `maxtime`: Maximum time to allow for rebuild jobs, in
+     [slurm.conf format](https://slurm.schedmd.com/slurm.conf.html#OPT_MaxTime).
+     The example here is 30 minutes, but see discussion below.
+   - `partition_params`: A mapping of additional parameters, which must be set
+     as follows:
+     - `PriorityJobFactor`: Ensures jobs in this partition (i.e. rebuild jobs)
+       are always scheduled before jobs in "normal" partitions on the same
+       nodes. This value is the highest which can be set. See
+       [slurm.conf docs](https://slurm.schedmd.com/slurm.conf.html#OPT_PriorityJobFactor).
+       Note this is used instead of `PriorityTier` as the latter (with the
+       default appliance configuration) allows rebuild jobs to preempt and
+       suspend running user jobs, which is probably undesirable.
+     - `Hidden`: Don't show this partition in e.g. `sinfo` for unprivileged
+       users.
+     - `RootOnly`: Only allow the root user to submit jobs to this partition.
+     - `DisableRootJobs`: Don't disable the root user, in case this parameter
+       is set globally via `openhpc_config_extra`.
+     - `PreemptMode`: Don't allow reboot jobs to be preempted/suspended.
+     - `OverSubscribe`: Ensure that jobs run in this partition require the
+       entire node. This means they do not run on nodes at the same time as
+       user jobs running in partitions allowing non-exclusive use.
+
+   The value for `maxtime` needs to be sufficient not just for a single node
+   to be rebuilt, but also to allow for any batching in either OpenTofu or
+   in Nova - see remarks in the [production docs](../production.md).
+
+   If it is desirable to roll out changes more gradually, it is possible to
+   create multiple "rebuild" partitions, but it is necessary that:
+
+   - The rebuild partitions should not themselves overlap, else nodes may be
+     rebuilt more than once.
+   - Each rebuild partition should entirely cover one or more "normal"
+     partitions, to avoid the possibility of user jobs being scheduled to a
+     mix of nodes using old and new images.
+
+7. Configure the [rebuild](../../ansible/roles/rebuild/README.md) role:
+
+   - Add the `control` node into the `rebuild` group.
+   - Ensure an application credential to use for rebuilding nodes is available
+     on the deploy host (default location `~/.config/openstack/clouds.yaml`).
+   - If required, override `rebuild_clouds_path` or other variables in the site
+     environment.
+
+8. Run `tofu apply` as usual to apply the new OpenTofu configuration.
+
+   > [!NOTE]
+   > If the cluster image references were updated at step 5, this will be
+   > a disruptive operation and should be planned as part of a normal upgrade
+   > cycle.
+   >
+   > [!CAUTION]
+   > Due to OpenTofu/Terraform state limitations, this will plan to delete and
+   > recreate all compute nodes in node groups where `ignore_image_changes: true`.
+   > was not previously set. This is a one-time issue with adding this
+   > parameter, i.e. subsequent applys will not require this.

TODO: clarify whether, if the image is bumped at this point, the compute
nodes actually get recreated on the new or the old image??
@@ -193,7 +198,9 @@
However there is no need to drain compute nodes and create reservations etc.

Triggering rebuild jobs is done using the following playbook:

-    ansible-playbook ansible/adhoc/rebuild-via-slurm.yml
+```shell
+ansible-playbook ansible/adhoc/rebuild-via-slurm.yml
+```

This will create jobs to reimage every slurm-rebuildable node to the image
currently defined in the OpenTofu configuration.
@@ -204,17 +211,22 @@
example the following command will run in a non-default partition and does
not actually reboot/rebuild nodes, which may be useful for testing interactions
with other priority or QOS settings:

-    ansible-playbook ansible/adhoc/rebuild-via-slurm.yml -e 'rebuild_job_partitions=test rebuild_job_reboot=false'
+```shell
+ansible-playbook ansible/adhoc/rebuild-via-slurm.yml -e 'rebuild_job_partitions=test rebuild_job_reboot=false'
+```

## Testing

The below demonstrates testing this using the `.stackhpc` CI environment, using:
+
- A 2-node default "standard" partition.
- A 2-node "extra" partition (note this does not usually have any nodes by default).

In one terminal launch a watch of job state:

-    [root@RL9-control rocky]# clear && ~/ewatch/ewatch.py -n 1 -i '\d+:\d+' 'squeue --all --Format=PARTITION,NAME:25,USERNAME:11,STATE:12,NUMNODES:8,NODELIST'
+```shell
+[root@RL9-control rocky]# clear && ~/ewatch/ewatch.py -n 1 -i '\d+:\d+' 'squeue --all --Format=PARTITION,NAME:25,USERNAME:11,STATE:12,NUMNODES:8,NODELIST'
+```

This uses [ewatch](https://github.com/sjpb/ewatch) to summarise changes in
output.
In a second terminal, launch 2x normal jobs into the default ("standard")
partition:

-    [demo_user@RL9-login-0 ~]$ sbatch -N2 --job-name=JobA --wrap "sleep 20" && sbatch -N2 --job-name=JobB --wrap "sleep 10"
+```shell
+[demo_user@RL9-login-0 ~]$ sbatch -N2 --job-name=JobA --wrap "sleep 20" && sbatch -N2 --job-name=JobB --wrap "sleep 10"
+```

In a third terminal, trigger rebuild jobs:

-    .stackhpc/ (venv) [rocky@steveb-dev slurm-app-rl9]$ ansible-playbook ansible/adhoc/rebuild-via-slurm.yml -e 'rebuild_job_reboot=false rebuild_job_command="sleep 30"' -
+```shell
+.stackhpc/ (venv) [rocky@steveb-dev slurm-app-rl9]$ ansible-playbook ansible/adhoc/rebuild-via-slurm.yml -e 'rebuild_job_reboot=false rebuild_job_command="sleep 30"' -
+```

Back in the second terminal, submit more user jobs to either partition:

-    [demo_user@RL9-login-0 ~]$ sbatch -N2 --job-name=JobC --partition,standard,extra --wrap "sleep 10"
+```shell
+[demo_user@RL9-login-0 ~]$ sbatch -N2 --job-name=JobC --partition=standard,extra --wrap "sleep 10"
+```

The output from the first terminal should show:
+
- Job A runs on submission in the default "standard" partition.
- Job B pends for the default "standard" partition.
- Rebuild jobs run on submission in the "extra" partition and pend for the "standard" partition
@@ -246,48 +265,49 @@
The output from the first terminal should show:
- Job B runs in the "standard" partition

Example output:
-```
+
+```text
[2025-03-28T14:26:34.510466]
-PARTITION NAME USER STATE NODES NODELIST
-standard JobB demo_user PENDING 2
-standard JobA demo_user RUNNING 2 RL9-compute-[0-1]
+PARTITION NAME USER STATE NODES NODELIST
+standard JobB demo_user PENDING 2
+standard JobA demo_user RUNNING 2 RL9-compute-[0-1]

[2025-03-28T14:26:38.530213]
-PARTITION NAME USER STATE NODES NODELIST
-rebuild rebuild-RL9-compute-1 root PENDING 1
-rebuild rebuild-RL9-compute-0 root PENDING 1
-rebuild rebuild-RL9-extra-0 root RUNNING 1 RL9-extra-0
-rebuild rebuild-RL9-extra-1 root RUNNING 1 RL9-extra-1
-standard JobB demo_user PENDING 2
-standard JobA demo_user RUNNING 2 RL9-compute-[0-1]
-standard,extra JobC demo_user PENDING 2
+PARTITION NAME USER STATE NODES NODELIST
+rebuild rebuild-RL9-compute-1 root PENDING 1
+rebuild rebuild-RL9-compute-0 root PENDING 1
+rebuild rebuild-RL9-extra-0 root RUNNING 1 RL9-extra-0
+rebuild rebuild-RL9-extra-1 root RUNNING 1 RL9-extra-1
+standard JobB demo_user PENDING 2
+standard JobA demo_user RUNNING 2 RL9-compute-[0-1]
+standard,extra JobC demo_user PENDING 2

[2025-03-28T14:26:54.609651]
-PARTITION NAME USER STATE NODES NODELIST
-rebuild rebuild-RL9-compute-0 root RUNNING 1 RL9-compute-0
-rebuild rebuild-RL9-compute-1 root RUNNING 1 RL9-compute-1
-rebuild rebuild-RL9-extra-0 root RUNNING 1 RL9-extra-0
-rebuild rebuild-RL9-extra-1 root RUNNING 1 RL9-extra-1
-standard JobB demo_user PENDING 2
-standard,extra JobC demo_user PENDING 2
+PARTITION NAME USER STATE NODES NODELIST
+rebuild rebuild-RL9-compute-0 root RUNNING 1 RL9-compute-0
+rebuild rebuild-RL9-compute-1 root RUNNING 1 RL9-compute-1
+rebuild rebuild-RL9-extra-0 root RUNNING 1 RL9-extra-0
+rebuild rebuild-RL9-extra-1 root RUNNING 1 RL9-extra-1
+standard JobB demo_user PENDING 2
+standard,extra JobC demo_user PENDING 2

[2025-03-28T14:28:39.091571]
-PARTITION NAME USER STATE NODES NODELIST
-extra JobC demo_user RUNNING 2 RL9-extra-[0-1]
-rebuild rebuild-RL9-compute-0 root RUNNING 1 RL9-compute-0
-rebuild rebuild-RL9-compute-1 root RUNNING 1 RL9-compute-1
-standard JobB demo_user PENDING 2
+PARTITION NAME USER STATE NODES NODELIST
+extra
JobC demo_user RUNNING 2 RL9-extra-[0-1]
+rebuild rebuild-RL9-compute-0 root RUNNING 1 RL9-compute-0
+rebuild rebuild-RL9-compute-1 root RUNNING 1 RL9-compute-1
+standard JobB demo_user PENDING 2

[2025-03-28T14:28:49.139349]
-PARTITION NAME USER STATE NODES NODELIST
-rebuild rebuild-RL9-compute-0 root RUNNING 1 RL9-compute-0
-rebuild rebuild-RL9-compute-1 root RUNNING 1 RL9-compute-1
-standard JobB demo_user PENDING 2
+PARTITION NAME USER STATE NODES NODELIST
+rebuild rebuild-RL9-compute-0 root RUNNING 1 RL9-compute-0
+rebuild rebuild-RL9-compute-1 root RUNNING 1 RL9-compute-1
+standard JobB demo_user PENDING 2

[2025-03-28T14:28:55.168264]
-PARTITION NAME USER STATE NODES NODELIST
-standard JobB demo_user RUNNING 2 RL9-compute-[0-1]
+PARTITION NAME USER STATE NODES NODELIST
+standard JobB demo_user RUNNING 2 RL9-compute-[0-1]

[2025-03-28T14:29:05.216346]
-PARTITION NAME USER STATE NODES NODELIST
+PARTITION NAME USER STATE NODES NODELIST
```
diff --git a/docs/image-build.md b/docs/image-build.md
index dc968ebfd..71be030de 100644
--- a/docs/image-build.md
+++ b/docs/image-build.md
@@ -3,59 +3,67 @@

The appliance contains code and configuration to use [Packer](https://developer.hashicorp.com/packer) with the [OpenStack builder](https://www.packer.io/plugins/builders/openstack) to build images.

The Packer configuration defined here builds "fat images" which contain packages, binaries and container images but no cluster-specific configuration. Using these:
+
- Enables the image to be tested in CI before production use.
- Ensures re-deployment of the cluster or deployment of additional nodes can be completed even if packages are changed in upstream repositories (e.g. due to RockyLinux or OpenHPC updates).
- Improves deployment speed by reducing the number of package downloads.

The fat images StackHPC builds and tests in CI are available from [GitHub releases](https://github.com/stackhpc/ansible-slurm-appliance/releases). However with some additional configuration it is also possible to:
+
1. Build site-specific fat images from scratch.
2. Extend an existing fat image with additional functionality.
-
-# Usage
+## Usage

To build either a site-specific fat image from scratch, or to extend an existing StackHPC fat image:

1. Ensure the current OpenStack credentials have sufficient authorisation to upload images (this may or may not require the `member` role for an application credential, depending on your OpenStack configuration).
2. The provided dev credentials for StackHPC's "Ark" Pulp server must be added to the target environments. This is done by overriding `dnf_repos_username` and `dnf_repos_password` with your vault encrypted credentials in `environments//inventory/group_vars/all/pulp.yml`. See the [experimental docs](experimental/pulp.md) if you instead wish to use a local Pulp server.
3. Create a Packer [variable definition file](https://developer.hashicorp.com/packer/docs/templates/hcl_templates/variables#assigning-values-to-input-variables) at e.g. `environments//builder.pkrvars.hcl` containing at a minimum:
-
-   ```hcl
-   flavor = "general.v1.small" # VM flavor to use for builder VMs
-   networks = ["26023e3d-bc8e-459c-8def-dbd47ab01756"] # List of network UUIDs to attach the VM to
-   source_image_name = "Rocky-9-GenericCloud-Base-9.4" # Name of image to create VM with, i.e.
starting image
+inventory_groups = "control,login,compute" # Additional inventory groups to add build VM to
+```

Note that:

- The network used for the Packer VM must provide outbound internet access but does not need to provide access to resources which the final cluster nodes require (e.g. Slurm control node, network filesystem servers etc.).
- The flavor used must have sufficient memory for the build tasks, but otherwise does not need to match the final cluster nodes. Usually 8GB is sufficient. By default, the build VM is volume-backed to allow control of the root disk size (and hence final image size) so the flavor disk size does not matter.
- The source image should be either a RockyLinux GenericCloud image for a site-specific image build from scratch, or a StackHPC fat image if extending an existing image.
- The `inventory_groups` variable takes a comma-separated list of Ansible inventory groups to add the build VM to. This is in addition to the `builder` group which it is always added to. This controls which Ansible roles and functionality run during build, and hence what gets added to the image.
  All possible groups are listed in `environments/common/groups` but common options for this variable will be:
  - `update,control,login,compute`: The resultant image has all packages in the source image updated, and then packages for all types of nodes in the cluster are added. When using a GenericCloud image for `source_image_name` this builds a site-specific fat image from scratch.
+  - One or more specific groups which are not enabled in the appliance by default, e.g. `lustre`. When using a StackHPC fat image for `source_image_name` this extends the image with just this additional functionality.
4. Activate the venv and the relevant environment.
5. Build images using the relevant variable definition file, e.g.:
-       cd packer/
-       PACKER_LOG=1 /usr/bin/packer build -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
+```shell
+cd packer/
+PACKER_LOG=1 /usr/bin/packer build -on-error=ask -var-file=$PKR_VAR_environment_root/builder.pkrvars.hcl openstack.pkr.hcl
+```
-   **NB:** If the build fails while creating the volume, check if the source image has the `signature_verified` property:
+**NB:** If the build fails while creating the volume, check if the source image has the `signature_verified` property:
-       openstack image show $SOURCE_IMAGE
+```shell
+openstack image show $SOURCE_IMAGE
+```
-   If it does, remove this property:
+If it does, remove this property:
-       openstack image unset --property signature_verified $SOURCE_IMAGE
+```shell
+openstack image unset --property signature_verified $SOURCE_IMAGE
+```
-   then delete the failed volume, select cancelling the build when Packer queries, and then retry. This is [OpenStack bug 1823445](https://bugs.launchpad.net/cinder/+bug/1823445).
+then delete the failed volume, cancel the build when Packer prompts, and then retry. This is [OpenStack bug 1823445](https://bugs.launchpad.net/cinder/+bug/1823445).
-6. The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc` and including a timestamp and a shortened git hash.
+6. The built image will be automatically uploaded to OpenStack with a name prefixed `openhpc` and including a timestamp and a shortened Git hash.

-# Build Process
+## Build Process

In summary, Packer creates an OpenStack VM, runs Ansible on that, shuts it down, then creates an image from the root disk.

@@ -66,6 +74,7 @@ shows the use of the environment variable `$PKR_VAR_environment_root` (which its
using a path in a "parent" environment is likely to be more appropriate (as builds should not be environment-specific to allow testing before deployment to a production environment).

What is Slurm Appliance-specific are the details of how Ansible is run:
+
- The build VM is always added to the `builder` inventory group, which differentiates it from nodes in a cluster. This allows Ansible variables to be set differently during Packer builds, e.g. to prevent services starting. The defaults for this are in `environments/common/inventory/group_vars/builder/`, which could be extended or overridden for site-specific fat image builds using `builder` groupvars for the relevant environment. It also runs some builder-specific code (e.g. to clean up the image).
- The default fat image builds also add the build VM to the "top-level" `compute`, `control` and `login` groups. This ensures
@@ -76,9 +85,10 @@ What is Slurm Appliance-specific are the details of how Ansible is run:
groupvars is not sufficient (e.g. a role always attempts to configure or start services).

There are some things to be aware of when developing Ansible to run in a Packer build VM:
-  - Only some tasks make sense. E.g. any services with a reliance on the network cannot be started, and should not be enabled if, when creating an instance with the resulting image, the remote service will not be immediately present.
-  - Nothing should be written to the persistent state directory `appliances_state_dir`, as this is on the root filesystem rather than an OpenStack volume.
-  - Care should be taken not to leave data on the root filesystem which is not wanted in the final image (e.g secrets).
-  - Build VM hostnames are not the same as for equivalent "real" hosts and do not contain `login`, `control` etc. Therefore variables used by the build VM must be defined as groupvars not hostvars.
-  - Ansible may need to use a proxyjump to reach cluster nodes, which can be defined via Ansible's `ansible_ssh_common_args` variable. If Packer should not use the same proxy
-    to connect to build VMs (e.g. because build happens on a different network), this proxy configuration should not be added to the `all` group.
+
+- Only some tasks make sense. E.g. any services with a reliance on the network cannot be started, and should not be enabled if, when creating an instance with the resulting image, the remote service will not be immediately present.
+- Nothing should be written to the persistent state directory `appliances_state_dir`, as this is on the root filesystem rather than an OpenStack volume.
+- Care should be taken not to leave data on the root filesystem which is not wanted in the final image (e.g. secrets).
+- Build VM hostnames are not the same as for equivalent "real" hosts and do not contain `login`, `control` etc. Therefore variables used by the build VM must be defined as groupvars not hostvars.
+- Ansible may need to use a proxyjump to reach cluster nodes, which can be defined via Ansible's `ansible_ssh_common_args` variable. If Packer should not use the same proxy
+  to connect to build VMs (e.g. because build happens on a different network), this proxy configuration should not be added to the `all` group.
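+
+As a sketch of the `builder` groupvars mechanism described above (the filename and variable here are hypothetical, purely to show where such overrides live):
+
+```yaml
+# environments/site/inventory/group_vars/builder/overrides.yml - illustrative sketch
+# Hypothetical override applied only to the build VM, e.g. to stop a role
+# starting its service during image build:
+example_service_enabled: false # hypothetical variable name, not a real appliance variable
+```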
diff --git a/docs/k3s.README.md b/docs/k3s.README.md
index 1b6651159..500a7899b 100644
--- a/docs/k3s.README.md
+++ b/docs/k3s.README.md
@@ -1,8 +1,10 @@
# Overview
-A K3s cluster is deployed with the Slurm cluster. Both an agent and server instance of K3s is installed during image build and the correct service (determined by OpenStack metadata) will be
-enabled during boot. Nodes with the `k3s_server` metadata field defined will be configured as K3s agents (this field gives them the address of the server). The Slurm control node is currently configured as a server while all other nodes are configured as agents. Using multiple K3s servers isn't supported. Currently only the root user on the control node has
+
+A K3s cluster is deployed with the Slurm cluster. Both an agent and a server instance of K3s are installed during image build and the correct service (determined by OpenStack metadata) will be
+enabled during boot. Nodes with the `k3s_server` metadata field defined will be configured as K3s agents (this field gives them the address of the server). The Slurm control node is currently configured as a server while all other nodes are configured as agents. Using multiple K3s servers isn't supported. Currently only the root user on the control node has
access to the Kubernetes API. The `k3s` role installs Helm for package management. K9s is also installed in the image and can be used by the root user.

-# Idempotency
+## Idempotency
+
K3s is intended to only be installed during image build as it is configured by the appliance on first boot with `azimuth_cloud.image_utils.linux_ansible_init`.
Therefore, the `k3s` role isn't idempotent and changes to variables will not be reflected in the image when running `site.yml`.
diff --git a/docs/monitoring-and-logging.md b/docs/monitoring-and-logging.md
index 6913c285f..2d9de8079 100644
--- a/docs/monitoring-and-logging.md
+++ b/docs/monitoring-and-logging.md
@@ -39,14 +39,15 @@ Where `role_name` is the name of the internal role.

## Customising variables

-You should only customise the variables in `environments/common` if you are working on a feature that you intend to contribute back. Instead you should override the variables in the environment relevant to your deployment. This is possible since inventories later in the inheritance chain have greater precedence. Please see [README.md](../README.md#environments) for a more detailed explanation. This notice exists to avoid the need to need to keep repeating this point in the following sections. Where it is noted that you should customise a variable, it is implied that this change should be made to your own environment e.g `environments/production` in preference to `environments/common`, even when
+You should only customise the variables in `environments/common` if you are working on a feature that you intend to contribute back. Instead you should override the variables in the environment relevant to your deployment. This is possible since inventories later in the inheritance chain have greater precedence. Please see [README.md](../README.md#environments) for a more detailed explanation.
+This notice exists to avoid the need to keep repeating this point in the following sections. Where it is noted that you should customise a variable, it is implied that this change should be made to your own environment e.g. `environments/production` in preference to `environments/common`, even when
this is not explicitly stated.

## filebeat

This section details the configuration of filebeat.

-### Defaults
+### filebeat defaults

Filebeat is configured by the internal `filebeat` role which can be found here:

@@ -56,7 +57,7 @@ The appliance defaults for the `filebeat` role can be found at the following loc

> [environments/common/inventory/group_vars/all/filebeat.yml](../environments/common/inventory/group_vars/all/filebeat.yml)

-### Overview
+### filebeat overview

Filebeat is configured to scrape the output of slurm stats. Slurm stats produces a json log file in the following location on the host:

@@ -73,9 +74,9 @@ This file is configurable by the `filebeat_config_path` variable.

It is not currently possible to partially override `filebeat.yml`. You will have to configure `filebeat_config_path` to refer to another file, copying the parts of the default configuration you want to keep. Pull requests are welcomed to add the functionality needed to allow for partial overrides.

-### Placement
+### filebeat placement

-The `filebeat` group controls the placement of the `filebeat` service. The default configuration scrapes the `slurm_stats` service output. This requires a `filebeat` instance to be co-located with the `slurm_stats` service.
+The `filebeat` group controls the placement of the `filebeat` service. The default configuration scrapes the `slurm_stats` service output. This requires a `filebeat` instance to be co-located with the `slurm_stats` service.

In the simplest configuration, a single host should be assigned to the `filebeat` and `slurm_stats` groups. The host assigned to the `slurm_stats` group should be the same host as assigned to the `filebeat` group.
More advanced configurations are possible, but require overriding `filebeat_config_path` using `group` or `host` variables.
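+
+For example, a minimal sketch of such an override (the template location is an assumption for illustration):
+
+```yaml
+# environments/site/inventory/group_vars/filebeat/overrides.yml - illustrative sketch
+# Point the filebeat role at a site-specific configuration file:
+filebeat_config_path: "{{ appliances_environment_root }}/files/filebeat.yml" # assumed path
+```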
@@ -83,18 +84,18 @@

This section details the configuration of grafana.

-### Defaults
+### grafana defaults

Internally, we use the [cloudalchemy.grafana](https://github.com/cloudalchemy/ansible-grafana) role. You can customise any of the variables that the role supports. For a full list, please see the [upstream documentation](https://github.com/cloudalchemy/ansible-grafana). The appliance defaults can be found here:

> [environments/common/inventory/group_vars/all/grafana.yml](../environments/common/inventory/group_vars/all/grafana.yml)

-### Placement
+### grafana placement

The `grafana` group controls the placement of the grafana service. Load balancing is currently unsupported so it is important that you only assign one host to this group.

-### Access
+### grafana access

If Open OnDemand is enabled then by default this is used to proxy Grafana, otherwise Grafana is accessed through the first . See `grafana_url` in [environments/common/inventory/group_vars/all/grafana.yml](../environments/common/inventory/group_vars/all/grafana.yml). The port used (variable `grafana_port`) defaults to `3000`.

@@ -159,7 +160,7 @@ This can be customised with the `grafana_datasources` variable.

This section details the configuration of OpenSearch.

-### Defaults
+### opensearch defaults

The internal `opensearch` role is used to configure the service. The list of variables that can be customised can be found in:

@@ -169,11 +170,11 @@ The appliance defaults are in the following file:

> [environments/common/inventory/group_vars/all/opensearch.yml](../environments/common/inventory/group_vars/all/opensearch.yml)

-### Placement
+### opensearch placement

The `opensearch` group determines the placement of the OpenSearch service. Load balancing is currently unsupported so it is important that you only assign one host to this group.

-### Access
+### opensearch access

By default, OpenSearch only listens on the loopback interface. It should therefore be placed on the same node as `filebeat` and `grafana` which need to access the OpenSearch API.

@@ -185,9 +186,9 @@ The default set of users is defined in:

This defines the following accounts:

-| username | password | purpose |
-| ------------- | ------------------------------------------------|-------------------------------------------|
-| admin | | User of highest privilege |
+| username | password                              | purpose                   |
+| -------- | ------------------------------------ | ------------------------- |
+| admin    |                                       | User of highest privilege |

Where the password field refers to a variable containing the actual password. These are generated by the `generate-passwords.yml` adhoc playbook (see [README.md](../README.md#creating-a-slurm-appliance)).

@@ -208,7 +209,7 @@

This section details the configuration of prometheus.

-### Defaults
+### Prometheus defaults

Internally, we use the [cloudalchemy.prometheus](https://github.com/cloudalchemy/ansible-prometheus) role. You can customise any of the variables that the role supports. For a full list, please see the [upstream documentation](https://github.com/cloudalchemy/ansible-prometheus).

The appliance defaults can be found here:
@@ -217,19 +218,20 @@

Prometheus will be functional by default but the following variables should commonly be modified:
+
- `prometheus_web_external_url`
- `prometheus_storage_retention`
- `prometheus_storage_retention_size`
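+
+For example, a minimal sketch overriding these in site groupvars (the URL is an assumed site-specific value; the retention settings shown are the appliance defaults):
+
+```yaml
+# environments/site/inventory/group_vars/all/prometheus.yml - illustrative sketch
+prometheus_web_external_url: "http://prometheus.example.org:9090" # assumed address
+prometheus_storage_retention: "31d"
+prometheus_storage_retention_size: "100GB"
+```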
-### Placement
+### Prometheus placement

The `prometheus` group determines the placement of the prometheus service. Load balancing is currently unsupported so it is important that you only assign one host to this group.

-### Access
+### Prometheus access

Prometheus is exposed on port `9090` on all hosts in the prometheus group. Currently, the configuration assumes a single host. Following the reference layout in `environments/common/layouts/everything`, this will be set to the slurm `control` node; Prometheus would then be accessible from:

-    > http://:9090
+> http://:9090

The port can be customised by overriding the `prometheus_web_external_url` variable.

@@ -268,7 +270,7 @@ The list can be customised by overriding the `collect[]` parameter of the `node`

> [environments/common/inventory/group_vars/all/prometheus.yml](../environments/common/inventory/group_vars/all/prometheus.yml).

-Variables in this file should *not* be customised directly, but should be overridden in your `environment`. See [README.md](../README.md#environments) which details the process of overriding default variables in more detail.
+Variables in this file should _not_ be customised directly, but should be overridden in your `environment`. See [README.md](../README.md#environments) which details the process of overriding default variables in more detail.

### custom ansible filters

@@ -276,12 +278,13 @@

Groups prometheus targets. Metrics from `node_exporter` hosts have two labels applied:
-  - `env`: This is set from the Ansible variable `prometheus_env` if present
-    (e.g. from hostvars or groupvars), defaulting to `ungrouped`. This can be
-    used to group metrics by some arbitrary "environment", e.g. rack.
-  - `group`: This refers to the "top-level" inventory group for the host and
-    is one of `control`, `login`, `compute` or `other`. This can be used to
-    define rules for specific host functionalities.
+
+- `env`: This is set from the Ansible variable `prometheus_env` if present
+  (e.g. from hostvars or groupvars), defaulting to `ungrouped`. This can be
+  used to group metrics by some arbitrary "environment", e.g. rack.
+- `group`: This refers to the "top-level" inventory group for the host and
+  is one of `control`, `login`, `compute` or `other`. This can be used to
+  define rules for specific host functionalities.
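+
+For example, a minimal sketch setting this label via groupvars (the `rack1` inventory group is hypothetical):
+
+```yaml
+# environments/site/inventory/group_vars/rack1/prometheus.yml - illustrative sketch
+# All node_exporter metrics from hosts in the (hypothetical) rack1 group get env="rack1":
+prometheus_env: rack1
+```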
## slurm-stats

@@ -291,16 +294,12 @@ Slurm stats periodically queries the slurm accounting database to gather informa

The polling of this data is controlled by a cron job. The default is to scrape the data every 5 minutes.

-### Defaults
+### slurm-stats defaults

-slurm-stats is configured `slurm-stats` role in the [slurm_openstack_tools collection](https://github.com/stackhpc/ansible_collection_slurm_openstack_tools). Currently there is no customisation of this role in the common environment i.e we are just using role defaults. It is possible to override these by setting the relevant variable in your environment config. See [here](https://github.com/stackhpc/ansible_collection_slurm_openstack_tools/tree/main/roles/slurm-stats) for a list of variables that can be set.
+slurm-stats is configured by the `slurm-stats` role in the [slurm_openstack_tools collection](https://github.com/stackhpc/ansible_collection_slurm_openstack_tools). Currently there is no customisation of this role in the common environment, i.e. we are just using role defaults.
+It is possible to override these by setting the relevant variable in your environment config. See [here](https://github.com/stackhpc/ansible_collection_slurm_openstack_tools/tree/main/roles/slurm-stats) for a list of variables that can be set.

-
-### Placement
+### slurm-stats placement

The `slurm_stats` group controls the placement of the `slurm_stats` service.
-This should be configured to be a group with a single host. That host must be co-located on the same host as the `filebeat` service that scrapes its output.
-
-
-
+This should be configured to be a group with a single host, which must also run the `filebeat` service that scrapes its output.
diff --git a/docs/networks.md b/docs/networks.md
index bd10c380d..dc0afa499 100644
--- a/docs/networks.md
+++ b/docs/networks.md
@@ -2,11 +2,12 @@

The default OpenTofu configurations in the appliance do not provision networks,
subnets or associated infrastructure such as routers. The requirements are that:
+
1. At least one network exists.
2. The first network defined spans all nodes, referred to as the "access network".
3. Only one subnet per network is attached to nodes.
4. At least one network on each node provides outbound internet access (either
-directly, or via a proxy).
+   directly, or via a proxy).

Addresses on the "access network" are used as the `ansible_host` IPs.

@@ -35,6 +36,7 @@ Note that if an OpenStack subnet has a gateway IP defined then by default nodes
with ports attached to that subnet get a default route set via that gateway.

## Single network
+
This is the simplest possible configuration. A single network and subnet is
used for all nodes. The subnet provides outbound internet access via the
default route defined by the subnet gateway (often an OpenStack router to an external

@@ -51,6 +53,7 @@ cluster_networks = [
```

## Multiple homogeneous networks
+
This is similar to the above, except each node has multiple networks. The
first network, "netA" is the access network. Note that only one subnet must
have a gateway defined, else default routes via both subnets will be present causing

@@ -75,7 +78,6 @@ vnic_types = {
...
```
-
## Additional networks on some nodes

This example shows how to modify variables for specific node groups. In this

@@ -119,13 +121,14 @@ In some multiple network configurations it may be necessary to manage default
routes rather than them being automatically created from a subnet gateway.
This can be done using the tofu variable `gateway_ip` which can be set for
the cluster and/or overridden on the compute and login groups. If this is set:
+
- a default route via that address will be created on the appropriate interface
  during boot if it does not exist
- any other default routes will be removed

For example the cluster configuration below has a "campus" network with a
default gateway which provides inbound SSH / ondemand access and outbound
-internet attached only to the login nodes, and a "data" network attached to
+internet attached only to the login nodes, and a "data" network attached to
The "data" network has no gateway IP set on its subnet to avoid dual default routes and routing conflicts on the multi-homed login nodes, but does have outbound connectivity via a router: @@ -182,7 +185,7 @@ compute # environments/$SITE/inventory/group_vars/all/squid.yml: # these are just examples squid_cache_disk: 1024 # MB -squid_cache_mem: '12 GB' +squid_cache_mem: "12 GB" ``` Note that name resolution must still be possible and may require defining an diff --git a/docs/openondemand.md b/docs/openondemand.md index 70a3bc642..d0a458323 100644 --- a/docs/openondemand.md +++ b/docs/openondemand.md @@ -2,40 +2,44 @@ The appliance can deploy the Open OnDemand portal. This page describes how to enable this and the default appliance configuration/behaviour. Note that detailed configuration documentation is provided by: -- The README for the included `openondemand` role in this repo - [ansible/roles/openondemand/README.md](../ansible/roles/openondemand/README.md). -- The README and default variables for the underlying "official" role which the above wraps - [Open OnDemand Ansible Role](https://github.com/OSC/ood-ansible) +- The readme for the included `openondemand` role in this repository - [ansible/roles/openondemand/README.md](../ansible/roles/openondemand/README.md). +- The readme and default variables for the underlying "official" role which the above wraps - [Open OnDemand Ansible Role](https://github.com/OSC/ood-ansible) - The documentation for Open OnDemand [itself](https://osc.github.io/ood-documentation/latest/index.html) This appliance can deploy and configure: + - The Open OnDemand server itself (usually on a single login node). - User authentication using one of: - - An external OIDC provider. - - HTTP basic authentication and PAM. + - An external OIDC provider. + - HTTP basic authentication and PAM. - Virtual desktops on compute nodes. - Jupyter nodebook servers on compute nodes. - Proxying of Grafana (usually deployed on the control node) via the Open OnDemand portal. - Links to additional filesystems and pages from the Open OnDemand Dashboard. - A Prometheus exporter for the Open OnDemand server and related Grafana dashboard -For examples of all of the above see the `smslabs-example` environment in this repo. +For examples of all of the above see the `smslabs-example` environment in this repository. + +## Enabling Open OnDemand -# Enabling Open OnDemand To enable the Open OnDemand server, add single host to the `openondemand` inventory group. Generally, this should be a node in the `login` group, as Open OnDemand must be able to access Slurm commands. To enable compute nodes for virtual desktops or Jupyter notebook servers (accessed through the Open OnDemand portal), add nodes/groups to the `openondemand_desktop` and `openondemand_jupyter` inventory groups respectively. These may be all or a subset of the `compute` group. The above functionality is configured by running the `ansible/portal.yml` playbook. This is automatically run as part of `ansible/site.yml`. -# Default configuration +## Default configuration See the [ansible/roles/openondemand/README.md](../ansible/roles/openondemand/README.md) for more details on the variables described below. 
@@ -45,5 +49,8 @@
The appliance automatically configures Open OnDemand to proxy Grafana and adds a link to it on the Open OnDemand dashboard. This means no external access to Grafana is required (if Open OnDemand is deployed on a login node with external access).[^1]

[^1]: Note that if `openondemand_auth` is `basic_pam` and anonymous Grafana login is enabled, the appliance will (by default) configure Open OnDemand's Apache server to remove the Authorisation header from proxying of all `node/` addresses. This is done as otherwise Grafana tries to use this header to authenticate, which fails with the default configuration where only the admin Grafana user `grafana` is created. Note that the removal of this header in this configuration means it cannot be used to authenticate proxied interactive applications - however the appliance-deployed remote desktop and Jupyter Notebook server applications use other authentication methods. An alternative if using `basic_pam` is not to enable anonymous Grafana login and to create Grafana users matching the local users (e.g. in `environments//hooks/post.yml`).

-# Access
-By default the appliance authenticates against OOD with basic auth through PAM. When creating a new environment, a new user with username `demo_user` will be created. Its password is found under `vault_openondemand_default_user` in the appliance secrets store in `environments/{ENV}/inventory/group_vars/all/secrets.yml`. Other users can be defined by overriding the `basic_users_users` variable in your environment (templated into `environments/{ENV}/inventory/group_vars/all/basic_users.yml` by default).
+## Access
+
+By default, the appliance authenticates against Open OnDemand with basic auth through PAM. When creating a new environment, a new user with username `demo_user` will be created.
+Its password is found under `vault_openondemand_default_user` in the appliance secrets store in `environments/{ENV}/inventory/group_vars/all/secrets.yml`.
+Other users can be defined by overriding the `basic_users_users` variable in your environment (templated into `environments/{ENV}/inventory/group_vars/all/basic_users.yml` by default).
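+
+For example, a minimal sketch of such an override (the username and vault variable are hypothetical; `password_hash` is the standard Ansible filter):
+
+```yaml
+# environments/site/inventory/group_vars/all/basic_users.yml - illustrative sketch
+basic_users_users:
+  - name: alice # hypothetical user
+    password: "{{ vault_alice_password | password_hash('sha512') }}" # assumed vault-defined secret
+```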
diff --git a/docs/operations.md b/docs/operations.md
index 7d5d24a44..251aa380c 100644
--- a/docs/operations.md
+++ b/docs/operations.md
@@ -3,6 +3,7 @@
This page describes the commands required for common operations.

All subsequent sections assume that:
+
- Commands are run from the repository root, unless otherwise indicated by a `cd` command.
- An Ansible vault secret is configured.
- The correct private key is available to Ansible.
@@ -15,24 +16,27 @@
Review any [site-specific documentation](site/README.md) for more details on the above.

-# Deploying a Cluster
+## Deploying a Cluster

This follows the same process as defined in the main [README.md](../README.md) for the default configuration.

Note that tags as defined in the various sub-playbooks defined in `ansible/` may be used to only run part of the tasks in `site.yml`.

-# SSH to Cluster Nodes
+## SSH to Cluster Nodes

This depends on how the cluster is accessed.

The script `dev/ansible-ssh` may generally be used to connect to a host specified by an `inventory_hostname` using the same connection details as Ansible. If this does not work:
+
- Instance IPs are normally defined in `ansible_host` variables in an inventory file `environments/$ENV/inventory/hosts{,.yml}`.
-- The ssh user is defined by `ansible_user`, default is `rocky`. This may be overridden in your environment.
+- The SSH user is defined by `ansible_user`; the default is `rocky`. This may be overridden in your environment.
- If a jump host is required the user and address may be defined in the above inventory file.

-# Modifying general Slurm.conf parameters
+## Modifying general Slurm.conf parameters
+
Parameters for [slurm.conf](https://slurm.schedmd.com/slurm.conf.html) can be added to an `openhpc_config_extra` mapping in `environments/$SITE_ENV/inventory/group_vars/all/openhpc.yml`.

Note that values in this mapping may be:
+
- A string, which will be inserted as-is.
- A list, which will be converted to a comma-separated string.

This allows specifying `slurm.conf` contents in a YAML-format, Ansible-native way.

**NB:** The appliance provides some default values in `environments/common/inventory/group_vars/all/openhpc.yml:openhpc_config_default` which is combined with the above. The `enable_configless` flag in the `SlurmCtldParameters` key this sets must not be overridden - a validation step checks this has not happened.

-See [Reconfiguring Slurm](#Reconfiguring-Slurm) to apply changes.
+See [Reconfiguring Slurm](#reconfiguring-slurm) to apply changes.

-# Modifying Slurm Partition-specific Configuration
+## Modifying Slurm Partition-specific Configuration

Modify the `openhpc_slurm_partitions` mapping usually in `environments/$SITE_ENV/inventory/group_vars/all/openhpc.yml` as described for [stackhpc.openhpc:slurmconf](https://github.com/stackhpc/ansible-role-openhpc#slurmconf) (note the relevant version of this role is defined in the `requirements.yml`)

Note an Ansible inventory group for the partition is required. This is generally

**NB:** `default:NO` must be set on all non-default partitions, otherwise the last defined partition will always be set as the default.

-See [Reconfiguring Slurm](#Reconfiguring-Slurm) to apply changes.
+See [Reconfiguring Slurm](#reconfiguring-slurm) to apply changes.
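+
+For example, a minimal sketch covering both of the above (the parameter values and the `gpu` partition are illustrative assumptions):
+
+```yaml
+# environments/$SITE_ENV/inventory/group_vars/all/openhpc.yml - illustrative sketch
+openhpc_config_extra:
+  PreemptMode: requeue # string value, inserted as-is
+  SchedulerParameters: # list value, converted to a comma-separated string
+    - defer
+    - max_rpc_cnt=100
+openhpc_slurm_partitions:
+  - name: standard
+  - name: gpu # hypothetical non-default partition
+    default: "NO"
+```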
+
+## Adding an Additional Partition

-# Adding an Additional Partition
This is usually a two-step process:

- If new nodes are required, define a new node group by adding an entry to the `compute` mapping in `environments/$ENV/tofu/main.tf` assuming the default OpenTofu configuration:
-  - The key is the partition name.
-  - The value should be a mapping, with the parameters defined in `environments/$SITE_ENV/tofu/compute/variables.tf`, but in brief will need at least `flavor` (name) and `nodes` (a list of node name suffixes).
-- Add a new partition to the partition configuration as described under [Modifying Slurm Partition-specific Configuration](#Modifying-Slurm-Partition-specific-Configuration).
+  - The key is the partition name.
+  - The value should be a mapping, with the parameters defined in `environments/$SITE_ENV/tofu/compute/variables.tf`, but in brief will need at least `flavor` (name) and `nodes` (a list of node name suffixes).
+- Add a new partition to the partition configuration as described under [Modifying Slurm Partition-specific Configuration](#modifying-slurm-partition-specific-configuration).
+
+Deploying the additional nodes and applying these changes requires rerunning both OpenTofu and the Ansible site.yml playbook - follow [Deploying a Cluster](#deploying-a-cluster).
-Deploying the additional nodes and applying these changes requires rerunning both OpenTofu and the Ansible site.yml playbook - follow [Deploying a Cluster](#Deploying-a-Cluster).

+## Adding Additional Packages

-# Adding Additional Packages
By default, the following utility packages are installed during the StackHPC image build:
+
- htop
- nano
- screen
- tmux
- wget
- bind-utils
- net-tools
- postfix
- git
- latest python version for system (3.6 for Rocky 8.9 and 3.12 for Rocky 9.4)

Additional packages can be added during image builds by:
+
- adding the `extra_packages` group to the build `inventory_groups` (see
-[docs/image-build.md](./image-build.md))
+  [docs/image-build.md](./image-build.md))
- defining a list of packages in `appliances_extra_packages_other` in e.g.
-`environments/$SITE_ENV/inventory/group_vars/all/defaults.yml`. For example:
+  `environments/$SITE_ENV/inventory/group_vars/all/defaults.yml`. For example:

-    ```yaml
-    # environments/foo-base/inventory/group_vars/all/defaults.yml:
-    appliances_extra_packages_other:
-      - somepackage
-      - anotherpackage
-    ```
+  ```yaml
+  # environments/foo-base/inventory/group_vars/all/defaults.yml:
+  appliances_extra_packages_other:
+    - somepackage
+    - anotherpackage
+  ```

For packages which come from repositories mirrored by StackHPC's "Ark" Pulp server (including rocky, EPEL and OpenHPC repositories), this will require either [Ark
@@ -99,19 +107,21 @@ the OpenHPC installation guide (linked from the
"user-facing" OpenHPC packages such as compilers, MPI libraries etc. include corresponding `lmod` modules.

-Packages *may* also be installed during the site.yml, by adding the `cluster`
+Packages _may_ also be installed during the site.yml, by adding the `cluster`
group into the `extra_packages` group. An error will occur if Ark credentials
are defined in this case, as they are readable by unprivileged users in the
`.repo` files and a local Pulp mirror must be used instead.

-If additional repositories are required, these could be added/enabled as necessary in a play added to `environments/$SITE_ENV/hooks/{pre,post}.yml` as appropriate. Note such a play should NOT exclude the builder group, so that the repositories are also added to built images. There are various Ansible modules which might be useful for this:
-    - `ansible.builtin.yum_repository`: Add a repo from an URL providing a 'repodata' directory.
-    - `ansible.builtin.rpm_key` : Add a GPG key to the RPM database.
-    - `ansible.builtin.get_url`: Can be used to install a repofile directly from an URL (e.g. https://turbovnc.org/pmwiki/uploads/Downloads/TurboVNC.repo)
-    - `ansible.builtin.dnf`: Can be used to install 'release packages' providing repos, e.g. `epel-release`, `ohpc-release`.
+If additional repositories are required, these could be added/enabled as necessary in a play added to `environments/$SITE_ENV/hooks/{pre,post}.yml` as appropriate.
+Note such a play should NOT exclude the builder group, so that the repositories are also added to built images.
+There are various Ansible modules which might be useful for this:
+
+- `ansible.builtin.yum_repository`: Add a repository from a URL providing a 'repodata' directory.
+- `ansible.builtin.rpm_key`: Add a GPG key to the RPM database.
+- `ansible.builtin.get_url`: Can be used to install a repofile directly from a URL (e.g. <https://turbovnc.org/pmwiki/uploads/Downloads/TurboVNC.repo>)
+- `ansible.builtin.dnf`: Can be used to install 'release packages' providing repos, e.g. `epel-release`, `ohpc-release`.
+
+The packages to be installed from that repository could also be defined in that play. Note using the `dnf` module with a list for its `name` parameter is more efficient and allows better dependency resolution than calling the module in a loop.
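+
+For example, a minimal sketch of such a play (the repository URL and package name are hypothetical):
+
+```yaml
+# environments/$SITE_ENV/hooks/post.yml - illustrative sketch
+- hosts: cluster,builder # NB: does not exclude the builder group
+  become: true
+  tasks:
+    - name: Add an extra repository
+      ansible.builtin.yum_repository:
+        name: example
+        description: Example extra repository
+        baseurl: https://repo.example.org/rocky/$releasever/ # hypothetical URL
+        gpgcheck: false
+    - name: Install packages from the extra repository
+      ansible.builtin.dnf:
+        name: # a list here resolves dependencies better than looping the module
+          - example-package # hypothetical package
+```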
Adding these repos/packages to the cluster/image would then require running:

@@ -121,35 +131,35 @@
as appropriate.

TODO: improve description about adding these to extra images.

-
-# Reconfiguring Slurm
+## Reconfiguring Slurm

At a minimum run:

    ansible-playbook ansible/slurm.yml --tags openhpc
-
**NB:** This will restart all daemons if the `slurm.conf` has any changes, even if technically only a `scontrol reconfigure` is required.
-
-# Running the MPI Test Suite
+## Running the MPI Test Suite

See [ansible/roles/hpctests/README.md](ansible/roles/hpctests/README.md) for a description of these. They can be run using

    ansible-playbook ansible/adhoc/hpctests.yml

Note that:
+
- The above role provides variables to select specific partitions, nodes and interfaces which may be required. If not set in inventory, these can be passed as extravars:

    ansible-playbook ansible/adhoc/hpctests.yml -e hpctests_myvar=foo
+
- The HPL-based test is only reasonably optimised on Intel processors due to the libraries and default parallelisation scheme used. For AMD processors it is recommended this
-is skipped using:
+  is skipped using:

-    ansible-playbook ansible/adhoc/hpctests.yml --skip-tags hpl-solo.
+  ansible-playbook ansible/adhoc/hpctests.yml --skip-tags hpl-solo

Review any [site-specific documentation](site/README.md) for more details.

-# Running CUDA Tests
+## Running CUDA Tests
+
This uses the [cuda-samples](https://github.com/NVIDIA/cuda-samples/) utilities "deviceQuery" and "bandwidthTest" to test GPU functionality. It automatically runs on any
host in the `cuda` inventory group:

@@ -157,7 +167,7 @@

**NB:** This test is not launched through Slurm, so confirm nodes are free/out of service or use `--limit` appropriately.

-# Ad-hoc Commands and Playbooks
+## Ad-hoc Commands and Playbooks

A set of utility playbooks for managing a running appliance are provided in `ansible/adhoc` - run these by activating the environment and using:
diff --git a/docs/persistent-state.md b/docs/persistent-state.md
index a895f2e44..e2bec1c00 100644
--- a/docs/persistent-state.md
+++ b/docs/persistent-state.md
@@ -3,6 +3,7 @@
To enable cluster state to persist beyond individual node lifetimes (e.g.
to survive a cluster deletion or rebuild) set `appliances_state_dir` to the path of a directory on persistent storage, such as an OpenStack volume. At present this will affect the following: + - `slurmctld` state, i.e. the Slurm queue. - The MySQL database for `slurmdbd`, i.e. Slurm accounting information as shown by the `sacct` command. - Prometheus database @@ -23,6 +24,7 @@ A new cookiecutter-produced environment supports persistent state in the default **NB: The default OpenTofu is provided as a working example and for internal CI use - therefore this volume is deleted when running `tofu destroy` - this may not be appropriate for a production environment.** In general, the Prometheus data is likely to be the only sizeable state stored. The size of this can be influenced through [Prometheus role variables](https://github.com/cloudalchemy/ansible-prometheus#role-variables), e.g.: + - `prometheus_storage_retention` - [default](../environments/common/inventory/group_vars/all/prometheus.yml) 31d - `prometheus_storage_retention_size` - [default](../environments/common/inventory/group_vars/all/prometheus.yml) 100GB - `prometheus_global.scrape_interval` and `scrape_interval` for [specific scrape definitions](../environments/common/inventory/group_vars/all/prometheus.yml) diff --git a/docs/production.md b/docs/production.md index e52e9d180..35f78ece8 100644 --- a/docs/production.md +++ b/docs/production.md @@ -8,9 +8,10 @@ production-ready deployments. requires instance deletion/recreation. - At least three environments should be created: - - `site`: site-specific base environment - - `production`: production environment - - `staging`: staging environment + + - `site`: site-specific base environment + - `production`: production environment + - `staging`: staging environment A `dev` environment should also be created if considered required, or this can be left until later. @@ -20,49 +21,47 @@ production-ready deployments. `environments/$ENV/ansible.cfg` file modifying so that they point to the `site` environment: - ```ini - inventory = ../common/inventory,../site/inventory,inventory - ``` + ```ini + inventory = ../common/inventory,../site/inventory,inventory + ``` - To avoid divergence of configuration all possible overrides for group/role -vars should be placed in `environments/site/inventory/group_vars/all/*.yml` -unless the value really is environment-specific (e.g. DNS names for -`openondemand_servername`). + vars should be placed in `environments/site/inventory/group_vars/all/*.yml` + unless the value really is environment-specific (e.g. DNS names for + `openondemand_servername`). - Where possible hooks should also be placed in `environments/site/hooks/` -and referenced from the `site` and `production` environments, e.g.: + and referenced from the `site` and `production` environments, e.g.: - ```yaml - # environments/production/hooks/pre.yml: - - name: Import parent hook - import_playbook: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/../site/hooks/pre.yml" - ``` +```yaml +# environments/production/hooks/pre.yml: +- name: Import parent hook + import_playbook: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/../site/hooks/pre.yml" +``` - OpenTofu configurations should be defined in the `site` environment and used as a module from the other environments. 
This can be done with the cookie-cutter generated configurations:
+
-  - Delete the *contents* of the cookie-cutter generated `tofu/` directories
+  - Delete the _contents_ of the cookie-cutter generated `tofu/` directories
    from the `production` and `staging` environments.
  - Create a `main.tf` in those directories which uses `site/tofu/` as a
    [module](https://opentofu.org/docs/language/modules/), e.g.:

-    ```
+```text
+...
+module "cluster" {
+  source = "../../site/tofu/"
+  cluster_name = "foo"
 ...
-    module "cluster" {
-        source = "../../site/tofu/"
-
-        cluster_name = "foo"
-        ...
-    }
-    ```
-
-    Note that:
-    - Environment-specific variables (`cluster_name`) should be hardcoded
-    into the cluster module block.
-    - Environment-independent variables (e.g. maybe `cluster_net` if the
-    same is used for staging and production) should be set as *defaults*
-    in `environments/site/tofu/variables.tf`, and then don't need to
-    be passed in to the module.
+}
+```
+
+Note that:
+
+- Environment-specific variables (`cluster_name`) should be hardcoded
+  into the cluster module block.
+- Environment-independent variables (e.g. maybe `cluster_net` if the
+  same is used for staging and production) should be set as _defaults_
+  in `environments/site/tofu/variables.tf`, and then don't need to
+  be passed in to the module.

- Vault-encrypt secrets. Running the `generate-passwords.yml` playbook creates
  a secrets file at `environments/$ENV/inventory/group_vars/all/secrets.yml`.

@@ -78,55 +77,64 @@

- By default, the cookiecutter-provided OpenTofu configuration provisions
  two volumes and attaches them to the control node:
-    - "$cluster_name-home" for NFS-shared home directories
-    - "$cluster_name-state" for monitoring and Slurm data
-    The volumes mean this data is persisted when the control node is rebuilt.
-    However if the cluster is destroyed with `tofu destroy` then the volumes will
-    also be deleted. This is undesirable for production environments and usually
-    also for staging environments. Therefore the volumes should be manually
-    created, e.g. via the CLI:
-
-        openstack volume create --size 200 mycluster-home # size in GB
-        openstack volume create --size 100 mycluster-state
-
-    and OpenTofu configured to use those volumes instead of managing them itself
-    by setting:
-
-        home_volume_provisioning = "attach"
-        state_volume_provisioning = "attach"
-
-    either for a specific environment within the cluster module block in
-    `environments/$ENV/tofu/main.tf`, or as the site default by changing the
-    default in `environments/site/tofu/variables.tf`.
-
-    For a development environment allowing OpenTofu to manage the volumes using
-    the default value of `"manage"` for those varibles is usually appropriate, as
-    it allows for multiple clusters to be created with this environment.
-
-    If no home volume at all is required because the home directories are provided
-    by a parallel filesystem (e.g. manila) set
-
-        home_volume_provisioning = "none"
-
-    In this case the NFS share for home directories is automatically disabled.
-
-    **NB:** To apply "attach" options to existing clusters, first remove the
-    volume(s) from the tofu state, e.g.:
-
-        tofu state list # find the volume(s)
-        tofu state rm 'module.cluster.openstack_blockstorage_volume_v3.state[0]'
-
-    This leaves the volume itself intact, but means OpenTofu "forgets" it. Then
-    set the "attach" options and run `tofu apply` again - this should show there
-    are no changes planned.
+
+  - "$cluster_name-home" for NFS-shared home directories
+  - "$cluster_name-state" for monitoring and Slurm data
+    The volumes mean this data is persisted when the control node is rebuilt.
+    However if the cluster is destroyed with `tofu destroy` then the volumes will
+    also be deleted. This is undesirable for production environments and usually
+    also for staging environments. Therefore the volumes should be manually
+    created, e.g. via the CLI:
+
+```shell
+openstack volume create --size 200 mycluster-home # size in GB
+openstack volume create --size 100 mycluster-state
+```
+
+and OpenTofu configured to use those volumes instead of managing them itself
+by setting:
+
+```text
+home_volume_provisioning = "attach"
+state_volume_provisioning = "attach"
+```
+
+either for a specific environment within the cluster module block in
+`environments/$ENV/tofu/main.tf`, or as the site default by changing the
+default in `environments/site/tofu/variables.tf`.
+
+For a development environment allowing OpenTofu to manage the volumes using
+the default value of `"manage"` for those variables is usually appropriate, as
+it allows for multiple clusters to be created with this environment.
+
+If no home volume at all is required because the home directories are provided
+by a parallel filesystem (e.g. manila), set
+
+```text
+home_volume_provisioning = "none"
+```
+
+In this case the NFS share for home directories is automatically disabled.
+
+**NB:** To apply "attach" options to existing clusters, first remove the
+volume(s) from the tofu state, e.g.:
+
+```shell
+tofu state list # find the volume(s)
+tofu state rm 'module.cluster.openstack_blockstorage_volume_v3.state[0]'
+```
+
+This leaves the volume itself intact, but means OpenTofu "forgets" it. Then
+set the "attach" options and run `tofu apply` again - this should show there
+are no changes planned.

- Enable `etc_hosts` templating:

-  ```yaml
-  # environments/site/inventory/groups:
-  [etc_hosts:children]
-  cluster
-  ```
+```ini
+# environments/site/inventory/groups:
+[etc_hosts:children]
+cluster
+```

- Configure Open OnDemand - see [specific documentation](openondemand.md).

@@ -141,7 +149,7 @@
[PR 485](https://github.com/stackhpc/ansible-slurm-appliance/pull/485).

- Note [PR 473](https://github.com/stackhpc/ansible-slurm-appliance/pull/473)
-    may help identify any site-specific configuration.
+  may help identify any site-specific configuration.

- See the [hpctests docs](../ansible/roles/hpctests/README.md) for advice on
  raising `hpctests_hpl_mem_frac` during tests.

@@ -151,11 +159,13 @@
10 ports or 10 instances can be deployed at once. This should be raised by
modifying `environments/$ENV/activate` to add a line like:

-    export TF_CLI_ARGS_apply="-parallelism=25"
+```text
+export TF_CLI_ARGS_apply="-parallelism=25"
+```

-The value chosen should be the highest value demonstrated during testing.
-Note that any time spent blocked due to this parallelism limit does not count
-against the (un-overridable) internal OpenTofu timeout of 30 minutes
+The value chosen should be the highest value demonstrated during testing.
+Note that any time spent blocked due to this parallelism limit does not count
+against the (un-overridable) internal OpenTofu timeout of 30 minutes.

- By default, OpenStack Nova also [limits](https://docs.openstack.org/nova/latest/configuration/config.html#DEFAULT.max_concurrent_builds)
  the number of concurrent instance builds to 10. This is per Nova controller,
diff --git a/docs/sequence.md b/docs/sequence.md
index 8723674e9..814929059 100644
--- a/docs/sequence.md
+++ b/docs/sequence.md
@@ -1,10 +1,9 @@
# Slurm Appliance Sequences

-
-
## Image build

This sequence applies to both:
+
- "fatimage" builds, starting from GenericCloud images and using
  control,login,compute inventory groups to install all packages, e.g.
  StackHPC CI builds

@@ -86,9 +85,10 @@ sequenceDiagram

This sequence applies to active clusters, after running the `site.yml` playbook
for the first time.

Slurm controlled rebuild requires that:
+
- Compute groups in the OpenTofu `compute` variable have:
-    - `ignore_image_changes: true`
-    - `compute_init_enable: ['compute', ... ]`
+  - `ignore_image_changes: true`
+  - `compute_init_enable: ['compute', ... ]`
- The Ansible `rebuild` inventory group contains the `control` group.

TODO: should also document how compute-init does NOT run if the `site.yml`

@@ -126,8 +126,9 @@ sequenceDiagram
end
nodes->>nodes: srun task completes
```
+
Notes:
+
1. And/or login/compute group overrides
2. Running on control node
3. On hosts targeted by job
-
diff --git a/docs/site/README.md b/docs/site/README.md
index ee147875c..6597ea4e9 100644
--- a/docs/site/README.md
+++ b/docs/site/README.md
@@ -2,5 +2,4 @@

This document is a placeholder for any site-specific documentation, e.g. environment descriptions.

-#TODO: list things which should commonly be specified here.
-
+## TODO: list things which should commonly be specified here
diff --git a/docs/upgrades.md b/docs/upgrades.md
index 05d9b832d..e62aa5c46 100644
--- a/docs/upgrades.md
+++ b/docs/upgrades.md
@@ -6,59 +6,70 @@ Generally, upstream releases will happen roughly monthly. Releases may contain n

Any site-specific instructions in [docs/site/README.md](site/README.md) should be reviewed in tandem with this.

This document assumes the deployment repository has:
+
1. Remotes:
-    - `origin` referring to the site-specific remote repository.
-    - `stackhpc` referring to the StackHPC repository at https://github.com/stackhpc/ansible-slurm-appliance.git.
+   - `origin` referring to the site-specific remote repository.
+   - `stackhpc` referring to the StackHPC repository at <https://github.com/stackhpc/ansible-slurm-appliance.git>.
2. Branches:
-    - `main` - following `main/origin`, the current site-specific code deployed to production.
-    - `upstream` - following `main/stackhpc`, i.e. the upstream `main` branch from `stackhpc`.
+   - `main` - following `main/origin`, the current site-specific code deployed to production.
+   - `upstream` - following `main/stackhpc`, i.e. the upstream `main` branch from `stackhpc`.
3. The following environments:
-    - `$PRODUCTION`: a production environment, as defined by e.g. `environments/production/`.
-    - `$STAGING`: a production environment, as defined by e.g. `environments/staging/`.
-    - `$SITE_ENV`: a base site-specific environment, as defined by e.g. `environments/mysite/`.
+   - `$PRODUCTION`: a production environment, as defined by e.g. `environments/production/`.
+   - `$STAGING`: a staging environment, as defined by e.g. `environments/staging/`.
+   - `$SITE_ENV`: a base site-specific environment, as defined by e.g. `environments/mysite/`.
**NB:** Commands which should be run on the Slurm login node are shown below prefixed `[LOGIN]$`.
All other commands should be run on the Ansible deploy host.

1. Update the `upstream` branch from the `stackhpc` remote, including tags:

-       git fetch stackhpc main --tags
+```shell
+git fetch stackhpc main --tags
+```

1. Identify the latest release from the [Slurm appliance release page](https://github.com/stackhpc/ansible-slurm-appliance/releases). Below this release is shown as `vX.Y`.

1. Ensure your local site branch is up to date and create a new branch from it for the site-specific release code:

-       git checkout main
-       git pull --prune
-       git checkout -b update/vX.Y
+```shell
+git checkout main
+git pull --prune
+git checkout -b update/vX.Y
+```

1. Merge the upstream code into your release branch:

-       git merge vX.Y
+```shell
+git merge vX.Y
+```

-    It is possible this will introduce merge conflicts; fix these following the usual git
-    prompts. Generally merge conflicts should only exist where functionality which was added
-    for your site (not in a hook) has subsequently been merged upstream.
+It is possible this will introduce merge conflicts; fix these following the usual Git
+prompts. Generally merge conflicts should only exist where functionality which was added
+for your site (not in a hook) has subsequently been merged upstream.

1. Push this branch and create a PR:

-       git push
-       # follow instructions
+```shell
+git push
+# follow instructions
+```

1. Review the PR to see if any added/changed functionality requires alteration of site-specific configuration. In general changes to existing functionality will aim to be backward compatible. Alteration of site-specific configuration will usually only be necessary to use new functionality or where functionality has been upstreamed as above.
-    Make changes as necessary.
+Make changes as necessary.

1. Identify image(s) from the relevant [Slurm appliance release](https://github.com/stackhpc/ansible-slurm-appliance/releases), and download using the link on the release plus the image name, e.g. for an image `openhpc-ofed-RL8-240906-1042-32568dbb`:

-       wget https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/openhpc-images/openhpc-ofed-RL8-240906-1042-32568dbb
+```shell
+wget https://object.arcus.openstack.hpc.cam.ac.uk/swift/v1/AUTH_3a06571936a0424bb40bc5c672c4ccb1/openhpc-images/openhpc-ofed-RL8-240906-1042-32568dbb
+```

-    Note that some releases may not include new images. In this case use the image from the latest previous release with new images.
+Note that some releases may not include new images. In this case use the image from the latest previous release with new images.

1. If required, build an "extra" image with local modifications, see [docs/image-build.md](./image-build.md).

@@ -71,33 +82,42 @@

1. Declare a future outage window to cluster users. A [Slurm reservation](https://slurm.schedmd.com/scontrol.html#lbAQ) can be used to prevent jobs running during that window, e.g.:

-       [LOGIN]$ sudo scontrol create reservation Flags=MAINT ReservationName="upgrade-vX.Y" StartTime=2024-10-16T08:00:00 EndTime=2024-10-16T10:00:00 Nodes=ALL Users=root
+```shell
+[LOGIN]$ sudo scontrol create reservation Flags=MAINT ReservationName="upgrade-vX.Y" StartTime=2024-10-16T08:00:00 EndTime=2024-10-16T10:00:00 Nodes=ALL Users=root
+```

-    Note a reservation cannot be created if it may overlap with currently running jobs (defined by job or partition time limits).
+Note a reservation cannot be created if it may overlap with currently running jobs (defined by job or partition time limits).

1. At the outage window, check there are no jobs running:

-       [LOGIN]$ squeue
+```shell
+[LOGIN]$ squeue
+```

1. Deploy the branch created above to production, i.e. activate the production environment, run OpenTofu to reimage or
-delete/recreate instances with the new images (depending on how the root disk is defined), and run Ansible's `site.yml`
-playbook to reconfigure the cluster, e.g. as described in the main [README.md](../README.md).
+   delete/recreate instances with the new images (depending on how the root disk is defined), and run Ansible's `site.yml`
+   playbook to reconfigure the cluster, e.g. as described in the main [README.md](../README.md).

1. Check Slurm is up:

-       [LOGIN]$ sinfo -R
-
-    The `-R` shows the reason for any nodes being down.
+```shell
+[LOGIN]$ sinfo -R
+```
+
+The `-R` shows the reason for any nodes being down.

1. If the above shows nodes down for having been "unexpectedly rebooted", resume them:

-       [LOGIN]$ sudo scontrol update state=RESUME nodename=$HOSTLIST_EXPR
+```shell
+[LOGIN]$ sudo scontrol update state=RESUME nodename=$HOSTLIST_EXPR
+```

-    where the hostlist expression might look like e.g. `general-[0-1]` to reset state for nodes 0 and 1 of the general partition.
+where the hostlist expression might look like e.g. `general-[0-1]` to reset state for nodes 0 and 1 of the general partition.

1. Delete the reservation:

-       [LOGIN]$ sudo scontrol delete ReservationName="upgrade-slurm-v1.160"
+```shell
+[LOGIN]$ sudo scontrol delete ReservationName="upgrade-vX.Y"
+```

1. Tell users the cluster is available again.
-
diff --git a/environments/.caas/README.md b/environments/.caas/README.md
index 4a08433b0..84028453a 100644
--- a/environments/.caas/README.md
+++ b/environments/.caas/README.md
@@ -3,9 +3,10 @@

Environment for default Azimuth Slurm.

This is not intended to be manually deployed.

Non-standard things for this environment:
+
- There is no activate script.
-- `ansible.cgf` is provided in the repo root, as expected by the caas operator.
-- `ANSIBLE_INVENTORY` is set in the cluster type template, using a path relative to the
+- `ansible.cfg` is provided in the repository root, as expected by the caas operator.
+- `ANSIBLE_INVENTORY` is set in the cluster type template, using a path relative to the
  runner project directory:

      azimuth_caas_stackhpc_slurm_appliance_template:
        ...
        envVars:
          ANSIBLE_INVENTORY: environments/common/inventory,environments/.caas/inventory

-    Ansible then defines `ansible_inventory_sources` which contains absolute paths, and
-    that is used to derive the `appliances_environment_root` and
-    `appliances_repository_root`.
+  Ansible then defines `ansible_inventory_sources` which contains absolute paths, and
+  that is used to derive the `appliances_environment_root` and
+  `appliances_repository_root`.
diff --git a/environments/.caas/hooks/post.yml b/environments/.caas/hooks/post.yml index eaaeb23f9..cf606c746 100644 --- a/environments/.caas/hooks/post.yml +++ b/environments/.caas/hooks/post.yml @@ -1,9 +1,10 @@ +--- # Configure the Zenith clients that are required # Note zenith hosts are in podman group - hosts: grafana tasks: - name: Deploy the Zenith client for Grafana - include_role: + ansible.builtin.include_role: name: zenith_proxy vars: zenith_proxy_service_name: zenith-monitoring @@ -11,7 +12,7 @@ zenith_proxy_upstream_host: "{{ ansible_host }}" # IP zenith_proxy_upstream_port: "{{ grafana_port }}" zenith_proxy_client_token: "{{ zenith_token_monitoring }}" - zenith_proxy_mitm_enabled: yes + zenith_proxy_mitm_enabled: true zenith_proxy_mitm_auth_inject: basic zenith_proxy_mitm_auth_basic_username: "{{ grafana_security.admin_user }}" zenith_proxy_mitm_auth_basic_password: "{{ grafana_security.admin_password }}" @@ -20,7 +21,7 @@ - hosts: openondemand tasks: - name: Deploy the Zenith client for OOD - include_role: + ansible.builtin.include_role: name: zenith_proxy vars: zenith_proxy_service_name: zenith-ood @@ -29,7 +30,7 @@ zenith_proxy_upstream_host: "{{ ansible_host }}" # IP zenith_proxy_upstream_port: 443 zenith_proxy_client_token: "{{ zenith_token_ood }}" - zenith_proxy_mitm_enabled: yes + zenith_proxy_mitm_enabled: true zenith_proxy_mitm_auth_inject: basic zenith_proxy_mitm_auth_basic_username: azimuth zenith_proxy_mitm_auth_basic_password: "{{ vault_azimuth_user_password }}" @@ -40,14 +41,15 @@ become: false gather_facts: false tasks: - - import_role: + - ansible.builtin.import_role: name: hpctests when: cluster_run_validation | default(false) | bool # Write the outputs as the final task - hosts: localhost tasks: - - debug: var=outputs + - ansible.builtin.debug: + var: outputs vars: # Ansible has a fit when there are two 'hostvars' evaluations in a resolution chain, # so we have to repeat logic here unfortunately @@ -62,4 +64,4 @@ if zenith_fqdn_ood is not defined else {} ) - }} \ No newline at end of file + }} diff --git a/environments/.caas/hooks/pre.yml b/environments/.caas/hooks/pre.yml index 8c99e5953..052e6e561 100644 --- a/environments/.caas/hooks/pre.yml +++ b/environments/.caas/hooks/pre.yml @@ -1,8 +1,9 @@ --- - # Generate k3s token - name: Generate k3s token - # NB: Although this generates a new token on each run, the actual token set in metadata is retrieved from a set-once tofu resource, hence only the first value ever generated is relevant. + # NB: Although this generates a new token on each run, the actual token set in + # metadata is retrieved from a set-once tofu resource, hence only the first + # value ever generated is relevant. 
hosts: openstack tasks: - ansible.builtin.set_fact: @@ -17,18 +18,18 @@ # Ensure that the secrets are generated and persisted on the control host - name: Generate and persist secrets hosts: control - gather_facts: no - become: yes + gather_facts: false + become: true roles: - persist_openhpc_secrets # validate.yml asserts presence of a control group which doesn't exist when # destroying infra, so only validate when we're not destroying - hosts: openstack - gather_facts: no - become: no + gather_facts: false + become: false tasks: - - set_fact: + - ansible.builtin.set_fact: appliances_validate: false when: "cluster_state | default('') == 'absent'" @@ -40,21 +41,21 @@ gather_facts: false tasks: - name: Set up Ansible user - user: "{{ (appliances_local_users_default | selectattr('user.name', 'eq', appliances_local_users_ansible_user_name))[0]['user'] }}" - become_method: "sudo" + ansible.builtin.user: + name: "{{ (appliances_local_users_default | selectattr('user.name', 'eq', appliances_local_users_ansible_user_name))[0]['user'] }}" + become_method: "ansible.builtin.sudo" # Need to change working directory otherwise we try to switch back to non-existent directory. - become_flags: '-i' + become_flags: "-i" become: true - hosts: cluster - gather_facts: no + gather_facts: false tasks: - name: Reset persistent SSH connections - meta: reset_connection - + ansible.builtin.meta: reset_connection - hosts: localhost - gather_facts: no - become: no + gather_facts: false + become: false tasks: - name: Add hosts to dnf_repos group to enable repofiles ansible.builtin.add_host: diff --git a/environments/.caas/inventory/group_vars/all/basic_users.yml b/environments/.caas/inventory/group_vars/all/basic_users.yml index 0e381486e..ef19398f1 100644 --- a/environments/.caas/inventory/group_vars/all/basic_users.yml +++ b/environments/.caas/inventory/group_vars/all/basic_users.yml @@ -1,3 +1,4 @@ +--- basic_users_users: - name: azimuth # Hash the password with a salt that is different for each host diff --git a/environments/.caas/inventory/group_vars/all/cluster.yml b/environments/.caas/inventory/group_vars/all/cluster.yml index b06314c03..de7d1a43b 100644 --- a/environments/.caas/inventory/group_vars/all/cluster.yml +++ b/environments/.caas/inventory/group_vars/all/cluster.yml @@ -1,3 +1,4 @@ +--- # Account for the fact we are running outside of the expected environment system: caas_inventory: "{{ ansible_inventory_sources | last }}" # ansible_inventory_sources is absolute appliances_environment_root: "{{ caas_inventory | dirname }}" diff --git a/environments/.caas/inventory/group_vars/all/grafana.yml b/environments/.caas/inventory/group_vars/all/grafana.yml index 10fdc926c..d83146723 100644 --- a/environments/.caas/inventory/group_vars/all/grafana.yml +++ b/environments/.caas/inventory/group_vars/all/grafana.yml @@ -1 +1,2 @@ +--- grafana_auth_anonymous: "{{ groups['openondemand'] | count > 0 }}" diff --git a/environments/.caas/inventory/group_vars/all/hpctests.yml b/environments/.caas/inventory/group_vars/all/hpctests.yml index a6a2c9174..f4ade940c 100644 --- a/environments/.caas/inventory/group_vars/all/hpctests.yml +++ b/environments/.caas/inventory/group_vars/all/hpctests.yml @@ -1,8 +1,9 @@ +--- # Skip plotting pingpong as matplotlib not in runner environment hpctests_pingpong_plot: false # In Azimuth, the Ansible controller is an ephemeral pod, so all that matters is that -# this is a location that is writable by the container user +# this is a location that is writable by the container user 
hpctests_outdir: "{{ playbook_dir }}/.tmp/hpctests" # hpctests run by default in Azimuth but not trying to stress-test the nodes diff --git a/environments/.caas/inventory/group_vars/all/manila.yml b/environments/.caas/inventory/group_vars/all/manila.yml index 226ac210a..ebd1dde12 100644 --- a/environments/.caas/inventory/group_vars/all/manila.yml +++ b/environments/.caas/inventory/group_vars/all/manila.yml @@ -1,3 +1,4 @@ +--- caas_manila_home: share_name: "{{ cluster_name }}-home" mount_path: /home @@ -14,4 +15,6 @@ caas_manila_project: mount_group: root mount_mode: ugo=rwX -os_manila_mount_shares: "{{ ([caas_manila_home] if cluster_home_manila_share | bool else []) + ([caas_manila_project] if cluster_project_manila_share | bool else []) }}" +# yamllint disable-line rule:line-length +os_manila_mount_shares: "{{ ([caas_manila_home] if cluster_home_manila_share | bool else []) + ([caas_manila_project] if cluster_project_manila_share | bool else + []) }}" diff --git a/environments/.caas/inventory/group_vars/all/nfs.yml b/environments/.caas/inventory/group_vars/all/nfs.yml index f42422601..d5070e5da 100644 --- a/environments/.caas/inventory/group_vars/all/nfs.yml +++ b/environments/.caas/inventory/group_vars/all/nfs.yml @@ -1,10 +1,11 @@ +--- nfs_server: "{{ nfs_server_default }}" caas_nfs_home: - comment: Export /exports/home from Slurm control node as /home nfs_enable: - server: "{{ inventory_hostname in groups['control'] }}" - clients: "{{ inventory_hostname in groups['cluster'] }}" + server: "{{ inventory_hostname in groups['control'] }}" + clients: "{{ inventory_hostname in groups['cluster'] }}" nfs_export: "/exports/home" # assumes skeleton TF is being used nfs_client_mnt_point: "/home" diff --git a/environments/.caas/inventory/group_vars/all/openhpc.yml b/environments/.caas/inventory/group_vars/all/openhpc.yml index 74f196c6f..56c8b907d 100644 --- a/environments/.caas/inventory/group_vars/all/openhpc.yml +++ b/environments/.caas/inventory/group_vars/all/openhpc.yml @@ -1,3 +1,4 @@ +--- openhpc_cluster_name: "{{ cluster_name }}" # Provision a single "standard" compute nodegroup using the supplied diff --git a/environments/.caas/inventory/group_vars/all/openondemand.yml b/environments/.caas/inventory/group_vars/all/openondemand.yml index 4dc0b9337..83b15a2d8 100644 --- a/environments/.caas/inventory/group_vars/all/openondemand.yml +++ b/environments/.caas/inventory/group_vars/all/openondemand.yml @@ -6,4 +6,3 @@ openondemand_desktop_partition: "{{ openhpc_partitions[0]['name'] }}" httpd_listen_addr_port: - 80 - 443 - diff --git a/environments/.caas/inventory/group_vars/all/zenith.yml b/environments/.caas/inventory/group_vars/all/zenith.yml index 56dd0ca16..652f2daf3 100644 --- a/environments/.caas/inventory/group_vars/all/zenith.yml +++ b/environments/.caas/inventory/group_vars/all/zenith.yml @@ -1 +1,2 @@ +--- zenith_proxy_podman_user: podman diff --git a/environments/.caas/inventory/group_vars/openstack.yml b/environments/.caas/inventory/group_vars/openstack.yml index f76c05033..83dff89de 100644 --- a/environments/.caas/inventory/group_vars/openstack.yml +++ b/environments/.caas/inventory/group_vars/openstack.yml @@ -1,3 +1,4 @@ +--- # The default Terraform state key for backends that support it terraform_state_key: "cluster/{{ cluster_id }}/tfstate" diff --git a/environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml b/environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml index ab10eff20..5a105bd72 100644 --- a/environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml +++ 
b/environments/.caas/ui-meta/slurm-infra-fast-volume-type.yml @@ -1,8 +1,8 @@ +--- name: "slurm" label: "Slurm" -description: >- - Batch cluster running the Slurm workload manager, the Open - OnDemand web interface, and custom monitoring. +description: >- + Batch cluster running the Slurm workload manager, the Open OnDemand web interface, and custom monitoring. logo: https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Slurm_logo.svg/158px-Slurm_logo.svg.png requires_ssh_key: true @@ -49,7 +49,7 @@ parameters: count_parameter: compute_count min_ram: 2048 min_disk: 20 - + - name: home_volume_size label: Home volume size (GB) description: The size of the cloud volume to use for home directories. @@ -79,7 +79,7 @@ parameters: this volume, 10GB is set aside for cluster state and the remaining space is used to store cluster metrics. - The oldest metrics records in the [Prometheus](https://prometheus.io/) database will be + The oldest metrics records in the [Prometheus](https://prometheus.io/) database will be discarded to ensure that the database does not grow larger than this volume. kind: cloud.volume_size immutable: true @@ -98,6 +98,7 @@ parameters: options: checkboxLabel: Run post-configuration validation? +# yamllint disable rule:line-length usage_template: |- # Accessing the cluster using Open OnDemand @@ -137,6 +138,7 @@ usage_template: |- Other parts of the filesystem may be affected during a patch operation, including any packages that have been installed using `dnf`. +# yamllint enable rule:line-length services: - name: ood @@ -145,4 +147,3 @@ services: - name: monitoring label: Monitoring icon_url: https://raw.githubusercontent.com/cncf/artwork/master/projects/prometheus/icon/color/prometheus-icon-color.png - diff --git a/environments/.caas/ui-meta/slurm-infra-manila-home.yml b/environments/.caas/ui-meta/slurm-infra-manila-home.yml index 4a01bb6fa..6255f468f 100644 --- a/environments/.caas/ui-meta/slurm-infra-manila-home.yml +++ b/environments/.caas/ui-meta/slurm-infra-manila-home.yml @@ -1,9 +1,9 @@ +--- # Exactly as for slurm-infra.yml but to allow for separate manila/non-manila home appliances name: "slurm-manila-home" label: "Slurm (CephFS home)" -description: >- - Batch cluster running the Slurm workload manager, the Open - OnDemand web interface, and custom monitoring. +description: >- + Batch cluster running the Slurm workload manager, the Open OnDemand web interface, and custom monitoring. This version uses CephFS for home directories. logo: https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Slurm_logo.svg/158px-Slurm_logo.svg.png @@ -52,7 +52,7 @@ parameters: count_parameter: compute_count min_ram: 2048 min_disk: 20 - + - name: home_volume_size label: Home share size (GB) description: The size of the share to use for home directories. @@ -69,7 +69,7 @@ parameters: this volume, 10GB is set aside for cluster state and the remaining space is used to store cluster metrics. - The oldest metrics records in the [Prometheus](https://prometheus.io/) database will be + The oldest metrics records in the [Prometheus](https://prometheus.io/) database will be discarded to ensure that the database does not grow larger than this volume. kind: cloud.volume_size immutable: true @@ -88,6 +88,7 @@ parameters: options: checkboxLabel: Run post-configuration validation? 
+# yamllint disable rule:line-length usage_template: |- # Accessing the cluster using Open OnDemand @@ -127,6 +128,7 @@ usage_template: |- Other parts of the filesystem may be affected during a patch operation, including any packages that have been installed using `dnf`. +# yamllint enable rule:line-length services: - name: ood diff --git a/environments/.caas/ui-meta/slurm-infra.yml b/environments/.caas/ui-meta/slurm-infra.yml index 36b89281d..b2d438385 100644 --- a/environments/.caas/ui-meta/slurm-infra.yml +++ b/environments/.caas/ui-meta/slurm-infra.yml @@ -1,8 +1,8 @@ +--- name: "slurm" label: "Slurm" -description: >- - Batch cluster running the Slurm workload manager, the Open - OnDemand web interface, and custom monitoring. +description: >- + Batch cluster running the Slurm workload manager, the Open OnDemand web interface, and custom monitoring. logo: https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Slurm_logo.svg/158px-Slurm_logo.svg.png requires_ssh_key: true @@ -49,7 +49,7 @@ parameters: count_parameter: compute_count min_ram: 2048 min_disk: 20 - + - name: home_volume_size label: Home volume size (GB) description: The size of the cloud volume to use for home directories. @@ -66,7 +66,7 @@ parameters: this volume, 10GB is set aside for cluster state and the remaining space is used to store cluster metrics. - The oldest metrics records in the [Prometheus](https://prometheus.io/) database will be + The oldest metrics records in the [Prometheus](https://prometheus.io/) database will be discarded to ensure that the database does not grow larger than this volume. kind: cloud.volume_size immutable: true @@ -85,6 +85,7 @@ parameters: options: checkboxLabel: Run post-configuration validation? +# yamllint disable rule:line-length usage_template: |- # Accessing the cluster using Open OnDemand @@ -124,6 +125,7 @@ usage_template: |- Other parts of the filesystem may be affected during a patch operation, including any packages that have been installed using `dnf`. 
+# yamllint enable rule:line-length services: - name: ood diff --git a/environments/.stackhpc/hooks/post-bootstrap.yml b/environments/.stackhpc/hooks/post-bootstrap.yml index df3902698..3e24212f0 100644 --- a/environments/.stackhpc/hooks/post-bootstrap.yml +++ b/environments/.stackhpc/hooks/post-bootstrap.yml @@ -1,17 +1,19 @@ +--- - hosts: podman:!builder - become: yes + become: true gather_facts: false tags: podman tasks: - name: Configure container image registry to avoid docker.io ratelimits - copy: + ansible.builtin.copy: dest: /etc/containers/registries.conf.d/003-arcus-mirror.conf content: | [[registry]] location="docker.io/library/" prefix="docker.io/library/" - + [[registry.mirror]] location = "{{ podman_registry_address }}" insecure = true + mode: "0644" when: "ci_cloud == 'ARCUS'" diff --git a/environments/.stackhpc/hooks/pre.yml b/environments/.stackhpc/hooks/pre.yml index 305713a61..e810c205a 100644 --- a/environments/.stackhpc/hooks/pre.yml +++ b/environments/.stackhpc/hooks/pre.yml @@ -1,17 +1,18 @@ +--- - hosts: control:!builder - become: yes + become: true gather_facts: false tasks: - name: Output OS version - command: cat /etc/redhat-release + ansible.builtin.command: cat /etc/redhat-release changed_when: false - name: Write CI-generated inventory and secrets for debugging ansible.builtin.copy: dest: /etc/ci-config/ src: "{{ item }}" - directory_mode: 0400 - mode: 0400 + directory_mode: "0400" + mode: "0400" owner: root group: root no_log: "{{ no_log | default(true) }}" diff --git a/environments/.stackhpc/inventory/group_vars/all/basic_users.yml b/environments/.stackhpc/inventory/group_vars/all/basic_users.yml index e2088ffd9..235814ce1 100644 --- a/environments/.stackhpc/inventory/group_vars/all/basic_users.yml +++ b/environments/.stackhpc/inventory/group_vars/all/basic_users.yml @@ -1,3 +1,6 @@ +--- + +# yamllint disable-line rule:line-length test_demo_user_password: "{{ lookup('env', 'DEMO_USER_PASSWORD') | default(vault_demo_user_password, true) }}" # CI uses env, debug can set vault_demo_user_password basic_users_users: diff --git a/environments/.stackhpc/inventory/group_vars/all/bastion.yml b/environments/.stackhpc/inventory/group_vars/all/bastion.yml index a1001e862..ea2ad00e5 100644 --- a/environments/.stackhpc/inventory/group_vars/all/bastion.yml +++ b/environments/.stackhpc/inventory/group_vars/all/bastion.yml @@ -1,3 +1,4 @@ +--- ci_cloud: "{{ lookup('env', 'CI_CLOUD') }}" bastion_config: ARCUS: diff --git a/environments/.stackhpc/inventory/group_vars/all/freeipa.yml b/environments/.stackhpc/inventory/group_vars/all/freeipa.yml index 9a979ab16..a92f011ac 100644 --- a/environments/.stackhpc/inventory/group_vars/all/freeipa.yml +++ b/environments/.stackhpc/inventory/group_vars/all/freeipa.yml @@ -1,3 +1,4 @@ +--- # This file provides examples of using freeipa role variables. These are NOT functional in CI as freeipa_{server,client} groups are not defined. 
# NB: Users defined this way have expired passwords diff --git a/environments/.stackhpc/inventory/group_vars/all/grafana.yml b/environments/.stackhpc/inventory/group_vars/all/grafana.yml index 14fefa945..3c49fd5d7 100644 --- a/environments/.stackhpc/inventory/group_vars/all/grafana.yml +++ b/environments/.stackhpc/inventory/group_vars/all/grafana.yml @@ -1 +1,2 @@ +--- grafana_auth_anonymous: true diff --git a/environments/.stackhpc/inventory/group_vars/all/hpctests.yml b/environments/.stackhpc/inventory/group_vars/all/hpctests.yml index e8cfcea5f..472462169 100644 --- a/environments/.stackhpc/inventory/group_vars/all/hpctests.yml +++ b/environments/.stackhpc/inventory/group_vars/all/hpctests.yml @@ -1 +1,2 @@ +--- hpctests_user: demo_user diff --git a/environments/.stackhpc/inventory/group_vars/all/manila.yml b/environments/.stackhpc/inventory/group_vars/all/manila.yml index 59f935873..b37a13059 100644 --- a/environments/.stackhpc/inventory/group_vars/all/manila.yml +++ b/environments/.stackhpc/inventory/group_vars/all/manila.yml @@ -1,3 +1,4 @@ +--- os_manila_mount_shares_arcus: - share_name: slurm-v2-home mount_path: /project diff --git a/environments/.stackhpc/inventory/group_vars/all/openhpc.yml b/environments/.stackhpc/inventory/group_vars/all/openhpc.yml index 5aac5f8ad..ae1342b72 100644 --- a/environments/.stackhpc/inventory/group_vars/all/openhpc.yml +++ b/environments/.stackhpc/inventory/group_vars/all/openhpc.yml @@ -1,3 +1,4 @@ +--- openhpc_config_extra: SlurmctldDebug: debug SlurmdDebug: debug diff --git a/environments/.stackhpc/inventory/group_vars/all/openondemand.yml b/environments/.stackhpc/inventory/group_vars/all/openondemand.yml index 735da25df..baa4d1aa4 100644 --- a/environments/.stackhpc/inventory/group_vars/all/openondemand.yml +++ b/environments/.stackhpc/inventory/group_vars/all/openondemand.yml @@ -1,6 +1,7 @@ +--- openondemand_auth: basic_pam openondemand_jupyter_partition: standard openondemand_desktop_partition: standard -#openondemand_dashboard_support_url: -#openondemand_dashboard_docs_url: -#openondemand_filesapp_paths: +# openondemand_dashboard_support_url: +# openondemand_dashboard_docs_url: +# openondemand_filesapp_paths: diff --git a/environments/.stackhpc/inventory/group_vars/all/podman.yml b/environments/.stackhpc/inventory/group_vars/all/podman.yml index b9d4109ed..02d7e7fbb 100644 --- a/environments/.stackhpc/inventory/group_vars/all/podman.yml +++ b/environments/.stackhpc/inventory/group_vars/all/podman.yml @@ -1,2 +1,3 @@ +--- arcus_podman_registry_address: 192.168.3.95:5000 podman_registry_address: "{{ arcus_podman_registry_address if ci_cloud == 'ARCUS' else '' }}" diff --git a/environments/.stackhpc/inventory/group_vars/all/tuned.yml b/environments/.stackhpc/inventory/group_vars/all/tuned.yml index f1cb034ee..a8074e796 100644 --- a/environments/.stackhpc/inventory/group_vars/all/tuned.yml +++ b/environments/.stackhpc/inventory/group_vars/all/tuned.yml @@ -1,2 +1,3 @@ +--- # Set profile which is not default (on VMs) for testing tuned_profile: hpc-compute diff --git a/environments/.stackhpc/inventory/group_vars/builder.yml b/environments/.stackhpc/inventory/group_vars/builder.yml index 10b15adac..788666ab0 100644 --- a/environments/.stackhpc/inventory/group_vars/builder.yml +++ b/environments/.stackhpc/inventory/group_vars/builder.yml @@ -1,4 +1,5 @@ -#update_enable: false # Can uncomment for speed debugging non-update related build issues +--- +# update_enable: false # Can uncomment for speed debugging non-update related build issues 
sssd_install_ldap: true # include sssd-ldap package in fatimage # update_enable: false # Can uncomment for speed debugging non-update related build issues diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index 6ab4c4233..91f2b563d 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { - "cluster_image": { - "RL8": "openhpc-RL8-250610-1435-d0ef926e", - "RL9": "openhpc-RL9-250610-1435-d0ef926e" - } + "cluster_image": { + "RL8": "openhpc-RL8-250610-1435-d0ef926e", + "RL9": "openhpc-RL9-250610-1435-d0ef926e" + } } diff --git a/environments/.stackhpc/tofu/main.tf b/environments/.stackhpc/tofu/main.tf index ad1549164..552be4b03 100644 --- a/environments/.stackhpc/tofu/main.tf +++ b/environments/.stackhpc/tofu/main.tf @@ -4,99 +4,106 @@ terraform { required_version = ">= 0.14" required_providers { openstack = { - source = "terraform-provider-openstack/openstack" + source = "terraform-provider-openstack/openstack" version = "~>3.0.0" } } } variable "environment_root" { - type = string - description = "Path to environment root, automatically set by activate script" + type = string + description = "Path to environment root, automatically set by activate script" } variable "cluster_name" { - type = string - description = "Name for cluster, used as prefix for resources - set by environment var in CI" + type = string + description = "Name for cluster, used as prefix for resources - set by environment var in CI" } variable "os_version" { - type = string + type = string description = "RL8 or RL9" - default = "RL9" + default = "RL9" } variable "cluster_image" { - description = "single image for all cluster nodes, keyed by os_version - a convenience for CI" - type = map(string) + description = "single image for all cluster nodes, keyed by os_version - a convenience for CI" + type = map(string) } +# tflint-ignore: terraform_typed_variables variable "cluster_networks" {} +# tflint-ignore: terraform_typed_variables variable "vnic_types" { - default = {} + default = {} } -variable "state_volume_type"{ - default = null +# tflint-ignore: terraform_typed_variables +variable "state_volume_type" { + default = null } -variable "home_volume_type"{ - default = null +# tflint-ignore: terraform_typed_variables +variable "home_volume_type" { + default = null } +# tflint-ignore: terraform_typed_variables variable "control_node_flavor" {} +# tflint-ignore: terraform_typed_variables variable "other_node_flavor" {} +# tflint-ignore: terraform_typed_variables variable "volume_backed_instances" { - default = false + default = false } data "openstack_images_image_v2" "cluster" { - name = var.cluster_image[var.os_version] - most_recent = true + name = var.cluster_image[var.os_version] + most_recent = true } module "cluster" { - source = "../../skeleton/{{cookiecutter.environment}}/tofu/" + source = "../../skeleton/{{cookiecutter.environment}}/tofu/" - cluster_name = var.cluster_name - cluster_networks = var.cluster_networks - vnic_types = var.vnic_types - key_pair = "slurm-app-ci" - cluster_image_id = data.openstack_images_image_v2.cluster.id - control_node_flavor = var.control_node_flavor + cluster_name = var.cluster_name + cluster_networks = var.cluster_networks + vnic_types = var.vnic_types + key_pair = "slurm-app-ci" + cluster_image_id = data.openstack_images_image_v2.cluster.id + control_node_flavor = var.control_node_flavor + login = { login 
= { - login = { - nodes = ["login-0"] - flavor = var.other_node_flavor - } + nodes = ["login-0"] + flavor = var.other_node_flavor } - compute = { - standard = { # NB: can't call this default! - nodes = ["compute-0", "compute-1"] - flavor = var.other_node_flavor - compute_init_enable = ["compute", "chrony", "etc_hosts", "nfs", "basic_users", "eessi", "tuned", "cacerts"] - ignore_image_changes = true - } - # Normally-empty partition for testing: - extra = { - nodes = [] - #nodes = ["extra-0", "extra-1"] - flavor = var.other_node_flavor - } + } + compute = { + standard = { # NB: can't call this default! + nodes = ["compute-0", "compute-1"] + flavor = var.other_node_flavor + compute_init_enable = ["compute", "chrony", "etc_hosts", "nfs", "basic_users", "eessi", "tuned", "cacerts"] + ignore_image_changes = true + } + # Normally-empty partition for testing: + extra = { + nodes = [] + #nodes = ["extra-0", "extra-1"] + flavor = var.other_node_flavor + } } - volume_backed_instances = var.volume_backed_instances + volume_backed_instances = var.volume_backed_instances - environment_root = var.environment_root - # Can reduce volume size a lot for short-lived CI clusters: - state_volume_size = 10 - home_volume_size = 20 + environment_root = var.environment_root + # Can reduce volume size a lot for short-lived CI clusters: + state_volume_size = 10 + home_volume_size = 20 - state_volume_type = var.state_volume_type - home_volume_type = var.home_volume_type + state_volume_type = var.state_volume_type + home_volume_type = var.home_volume_type } diff --git a/environments/README.md b/environments/README.md index 722c358ba..dbbb93e68 100644 --- a/environments/README.md +++ b/environments/README.md @@ -32,12 +32,12 @@ for usage instructions for that component. ### common Shared configuration for all environments. This is not -intended to be used as a standalone environment, hence the README does *not* detail +intended to be used as a standalone environment, hence the readme does _not_ detail how to provision the infrastructure. ### skeleton -Skeleton directory that is used as a template to create a new environemnt. +Skeleton directory that is used as a template to create a new environment. ## Defining an environment @@ -50,7 +50,7 @@ Once you have answered all questions, a new environment directory will be created. The directory will be named according to the answer you gave for `environment`. -Follow the README in the new directory to perform initial configuration. +Follow the readme in the new directory to perform initial configuration. ## Activating environments @@ -66,13 +66,12 @@ hosts from the associated group in the inventory. A pattern we use is to name th ansible inventory `group` after the name of the `role` that configures it. The playbook that runs this role targets hosts in that group. The `common` environment typically defines all groups as the empty group. You must explicitly opt-in and add hosts to these groups -to configure that service. For example, if you don't want to deploy and configure grafana, +to configure that service. For example, if you don't want to deploy and configure grafana, you simply do not add any hosts to the `grafana` group in the inventory. This allows us to -have a shared ansible code base as we can define playbooks to configure all things, +have a shared ansible codebase as we can define playbooks to configure all things, but these playbooks end up not being run if no host is in the associated group. -See also: - - `common/inventory/groups` for a list of all groups.
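+For example, opting in to Grafana might just mean adding hosts to its group in
+the environment's inventory (an illustrative snippet; group membership varies
+by site):
+
+```ini
+[grafana:children]
+control
+```
+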
+See also: - `common/inventory/groups` for a list of all groups. ## Overriding configuration diff --git a/environments/common/files/filebeat/filebeat.yml b/environments/common/files/filebeat/filebeat.yml index 0f7186b3a..7f19aa093 100644 --- a/environments/common/files/filebeat/filebeat.yml +++ b/environments/common/files/filebeat/filebeat.yml @@ -1,3 +1,4 @@ +--- filebeat.config: modules: path: ${path.config}/modules.d/*.yml diff --git a/environments/common/inventory/group_vars/all/alertmanager.yml b/environments/common/inventory/group_vars/all/alertmanager.yml index c677aaa29..8f5ef0f8e 100644 --- a/environments/common/inventory/group_vars/all/alertmanager.yml +++ b/environments/common/inventory/group_vars/all/alertmanager.yml @@ -1,5 +1,5 @@ - -alertmanager_port: '9093' # defined here as required for prometheus +--- +alertmanager_port: "9093" # defined here as required for prometheus alertmanager_slack_receiver_name: slack-receiver alertmanager_slack_receiver_send_resolved: true diff --git a/environments/common/inventory/group_vars/all/ansible_init.yml b/environments/common/inventory/group_vars/all/ansible_init.yml index df4060f94..0a5198bf0 100644 --- a/environments/common/inventory/group_vars/all/ansible_init.yml +++ b/environments/common/inventory/group_vars/all/ansible_init.yml @@ -1,3 +1,4 @@ +--- ansible_init_wait: 300 # seconds ansible_init_pip_packages: diff --git a/environments/common/inventory/group_vars/all/basic_users.yml b/environments/common/inventory/group_vars/all/basic_users.yml index d94d12982..8d5f86a8b 100644 --- a/environments/common/inventory/group_vars/all/basic_users.yml +++ b/environments/common/inventory/group_vars/all/basic_users.yml @@ -3,7 +3,6 @@ # See ansible/roles/basic_users/README.md for variable definitions. basic_users_users: [] - # The following are defined for the purpose of compute-init basic_users_homedir_server: "{{ groups['control'] | first }}" -basic_users_homedir_client: "{{ groups['login'] | first }}" \ No newline at end of file +basic_users_homedir_client: "{{ groups['login'] | first }}" diff --git a/environments/common/inventory/group_vars/all/defaults.yml b/environments/common/inventory/group_vars/all/defaults.yml index 1af2b36e5..b5adb889d 100644 --- a/environments/common/inventory/group_vars/all/defaults.yml +++ b/environments/common/inventory/group_vars/all/defaults.yml @@ -5,7 +5,7 @@ appliances_repository_root: "{{ lookup('env', 'APPLIANCES_REPO_ROOT') }}" appliances_environment_root: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}" appliances_environment_name: "{{ appliances_environment_root | basename | regex_replace('\\W+', '') }}" # [a-zA-Z0-9_] only appliances_cockpit_state: absent # RHEL cockpit installed but not enabled in genericcloud images; appliance defaults to removing it -#appliances_state_dir: # define an absolute path here to use for persistent state: NB: This is defined as /var/lib/state in inventory by the default Terraform +# appliances_state_dir: # define an absolute path here to use for persistent state: NB: This is defined as /var/lib/state in inventory by the default Terraform appliances_mode: configure appliances_pulp_url: https://ark.stackhpc.com @@ -29,53 +29,53 @@ alertmanager_address: "{{ hostvars[groups['alertmanager'].0].api_address }}" appliances_local_users_ansible_user_name: "{{ ansible_ssh_user | default(ansible_user) }}" appliances_local_users_podman_uid: 1001 # UID for podman user - normally next UID after default user appliances_local_users_podman: # also used in 
environments/common/inventory/group_vars/all/podman.yml:podman_users - name: podman - comment: Used for running all containers - # Would like to set subuid so that we that we know what will appear in /etc/subuid - # See: https://github.com/ansible/ansible/issues/68199 - home: /var/lib/podman - uid: "{{ appliances_local_users_podman_uid }}" + name: podman + comment: Used for running all containers + # Would like to set subuid so that we know what will appear in /etc/subuid + # See: https://github.com/ansible/ansible/issues/68199 + home: /var/lib/podman + uid: "{{ appliances_local_users_podman_uid }}" appliances_local_users_default: - - user: - name: "{{ appliances_local_users_ansible_user_name }}" - home: /var/lib/{{ appliances_local_users_ansible_user_name }} - move_home: true - local: true + - user: + name: "{{ appliances_local_users_ansible_user_name }}" + home: /var/lib/{{ appliances_local_users_ansible_user_name }} + move_home: true + local: true - - user: "{{ appliances_local_users_podman }}" - enable: "{{ 'podman' in group_names }}" + - user: "{{ appliances_local_users_podman }}" + enable: "{{ 'podman' in group_names }}" - - user: - name: slurm - comment: SLURM resource manager - home: /etc/slurm - shell: /sbin/nologin - uid: 202 - system: true + - user: + name: slurm + comment: SLURM resource manager + home: /etc/slurm + shell: /sbin/nologin + uid: 202 + system: true - - group: - name: prometheus - gid: 976 - user: - name: prometheus - uid: 981 - home: "{{ prometheus_db_dir }}" - shell: /usr/sbin/nologin - system: true - enable: "{{ 'prometheus' in group_names }}" + - group: + name: prometheus + gid: 976 + user: + name: prometheus + uid: 981 + home: "{{ prometheus_db_dir }}" + shell: /usr/sbin/nologin + system: true + enable: "{{ 'prometheus' in group_names }}" - - group: - name: grafana - gid: 979 - user: - name: grafana - comment: grafana user - uid: 984 - home: /usr/share/grafana - shell: /sbin/nologin - system: true - enable: "{{ 'grafana' in group_names }}" + - group: + name: grafana + gid: 979 + user: + name: grafana + comment: grafana user + uid: 984 + home: /usr/share/grafana + shell: /sbin/nologin + system: true + enable: "{{ 'grafana' in group_names }}" # Override this to add extra users whilst keeping the defaults.
appliances_local_users_extra: [] # see format of appliances_local_users_default above @@ -84,17 +84,16 @@ appliances_local_users: "{{ (appliances_local_users_default + appliances_local_u ################## bootstrap: extra package installs ###################################### appliances_extra_packages_default: - - htop - - nano - - screen - - tmux - - wget - - bind-utils - - net-tools - - postfix - - git - - "{{ 'python36' if ansible_distribution_version == '8.9' else 'python312' }}" + - htop + - nano + - screen + - tmux + - wget + - bind-utils + - net-tools + - postfix + - git + - "{{ 'python36' if ansible_distribution_version == '8.9' else 'python312' }}" appliances_extra_packages_other: [] - appliances_extra_packages: "{{ (appliances_extra_packages_default + appliances_extra_packages_other) | select | list }}" diff --git a/environments/common/inventory/group_vars/all/filebeat.yml b/environments/common/inventory/group_vars/all/filebeat.yml index d268af1ba..4b91726bf 100644 --- a/environments/common/inventory/group_vars/all/filebeat.yml +++ b/environments/common/inventory/group_vars/all/filebeat.yml @@ -4,4 +4,4 @@ filebeat_config_path: "{{ appliances_repository_root }}/environments/common/files/filebeat/filebeat.yml" # User that runs the filebeat container -filebeat_podman_user: podman \ No newline at end of file +filebeat_podman_user: podman diff --git a/environments/common/inventory/group_vars/all/firewalld.yml b/environments/common/inventory/group_vars/all/firewalld.yml index 3548045ed..4b4f54b95 100644 --- a/environments/common/inventory/group_vars/all/firewalld.yml +++ b/environments/common/inventory/group_vars/all/firewalld.yml @@ -1,3 +1,4 @@ +--- # See ansible/roles/firewalld/README.md # for variable definitions. @@ -9,14 +10,14 @@ firewalld_configs_default: # name: An arbitrary name or description # group: An ansible group name - this rule is applied if the node is in this group # rule: A dict of parameters passed to the `ansible.posix.firewalld` module. - # FaiBy default we rely on openstack security groups so + # By default we rely on openstack security groups so - name: Make firewalld permissive group: openhpc rule: zone: public state: enabled target: ACCEPT - permanent: yes + permanent: true firewalld_configs_extra: [] # list of dicts with parameters as for firewalld_configs_default diff --git a/environments/common/inventory/group_vars/all/freeipa_server.yml b/environments/common/inventory/group_vars/all/freeipa_server.yml index 7f0fee713..64a1f7a26 100644 --- a/environments/common/inventory/group_vars/all/freeipa_server.yml +++ b/environments/common/inventory/group_vars/all/freeipa_server.yml @@ -1,3 +1,4 @@ +--- # See ansible/roles/freeipa/README.md # These vars are only used when freeipa_server is enabled. They are not required when enabling only freeipa_client freeipa_realm: "{{ openhpc_cluster_name | upper }}.{{ cluster_domain_suffix | upper }}" diff --git a/environments/common/inventory/group_vars/all/grafana.yml b/environments/common/inventory/group_vars/all/grafana.yml index b03d16f37..b428849f6 100644 --- a/environments/common/inventory/group_vars/all/grafana.yml +++ b/environments/common/inventory/group_vars/all/grafana.yml @@ -2,7 +2,7 @@ # See: https://github.com/cloudalchemy/ansible-grafana # for variable definitions.
-grafana_version: '10.4.18' +grafana_version: "10.4.18" # need to copy some role defaults here so we can use in inventory: grafana_port: 3000 @@ -53,9 +53,9 @@ grafana_dashboards_default: grafana_dashboards: "{{ grafana_dashboards_default + (openondemand_dashboard if groups.get('openondemand') else []) }}" grafana_security: - admin_user: grafana - admin_password: "{{ vault_grafana_admin_password }}" - allow_embedding: true + admin_user: grafana + admin_password: "{{ vault_grafana_admin_password }}" + allow_embedding: true grafana_datasources: - name: prometheus @@ -77,7 +77,7 @@ grafana_datasources: timeField: "@timestamp" # Have to set flavor and version, but ansible/roles/opensearch/templates/opensearch.yml.j2 fakes version for filebeat # so need to set to fake version here: - version: '7.10.2' + version: "7.10.2" flavor: elasticsearch editable: true # readOnly: false @@ -99,7 +99,7 @@ grafana_server: # appliance specific: serve_from_sub_path: "{{ grafana_serve_from_sub_path }}" - +# yamllint disable-line rule:line-length grafana_auth_anonymous: false # Enable anonymous View-only login - see implications: https://grafana.com/docs/grafana/latest/administration/security/#implications-of-enabling-anonymous-access-to-dashboards _grafana_auth_anon_cfg: diff --git a/environments/common/inventory/group_vars/all/k3s.yml b/environments/common/inventory/group_vars/all/k3s.yml index a7ba0a0bf..aa7172f87 100644 --- a/environments/common/inventory/group_vars/all/k3s.yml +++ b/environments/common/inventory/group_vars/all/k3s.yml @@ -1 +1,2 @@ +--- k3s_bootstrap_token: "{{ hostvars[groups['k3s_server'] | first].k3s_bootstrap_token | default('') }}" diff --git a/environments/common/inventory/group_vars/all/manila.yml b/environments/common/inventory/group_vars/all/manila.yml index baccd4432..0497d3d50 100644 --- a/environments/common/inventory/group_vars/all/manila.yml +++ b/environments/common/inventory/group_vars/all/manila.yml @@ -1,3 +1,4 @@ +--- # Default configuration for manila file shares, see # https://github.com/stackhpc/ansible-role-os-manila-mount # for all variable definitions, and override in your environment. 
diff --git a/environments/common/inventory/group_vars/all/mysql.yml b/environments/common/inventory/group_vars/all/mysql.yml index d5245fe1f..2c320af42 100644 --- a/environments/common/inventory/group_vars/all/mysql.yml +++ b/environments/common/inventory/group_vars/all/mysql.yml @@ -17,7 +17,7 @@ mysql_datadir: "{{ appliances_state_dir | default('/var/lib') }}/mysql" mysql_databases: - name: slurm_acct_db - config_file: '' + config_file: "" login_user: root login_password: "{{ mysql_root_password }}" login_host: "{{ mysql_host }}" diff --git a/environments/common/inventory/group_vars/all/nfs.yml b/environments/common/inventory/group_vars/all/nfs.yml index aec2213f1..a6851ebd0 100644 --- a/environments/common/inventory/group_vars/all/nfs.yml +++ b/environments/common/inventory/group_vars/all/nfs.yml @@ -14,22 +14,22 @@ nfs_export_clients: "{{ _nfs_node_ips }}" nfs_configuration_home_volume: # volume-backed home directories - comment: Export /exports/home from Slurm control node as /home nfs_enable: - server: "{{ inventory_hostname in groups['control'] }}" - # Don't mount share on control node: - clients: "{{ inventory_hostname in groups['cluster'] and inventory_hostname not in groups['control'] }}" + server: "{{ inventory_hostname in groups['control'] }}" + # Don't mount share on control node: + clients: "{{ inventory_hostname in groups['cluster'] and inventory_hostname not in groups['control'] }}" nfs_server: "{{ nfs_server_default }}" nfs_export: "/exports/home" # assumes skeleton TF is being used nfs_client_mnt_point: "/home" # prevent tunnelling and setuid binaries: # NB: these are stackhpc.nfs role defaults but are set here to prevent being # accidentally overridden via default options - nfs_export_options: 'rw,secure,root_squash' + nfs_export_options: "rw,secure,root_squash" nfs_configuration_compute_nodes: # cluster configuration for compute_init/slurm-controlled rebuild - comment: Export /exports/cluster from Slurm control node nfs_enable: - server: "{{ inventory_hostname in groups['control'] }}" - clients: false + server: "{{ inventory_hostname in groups['control'] }}" + clients: false nfs_export: "/exports/cluster" nfs_configurations_extra: [] # site-specific nfs shares diff --git a/environments/common/inventory/group_vars/all/openhpc.yml b/environments/common/inventory/group_vars/all/openhpc.yml index 005bdf04d..fb8137831 100644 --- a/environments/common/inventory/group_vars/all/openhpc.yml +++ b/environments/common/inventory/group_vars/all/openhpc.yml @@ -9,7 +9,7 @@ openhpc_enable: database: "{{ inventory_hostname in groups['control'] }}" runtime: true openhpc_slurm_service_enabled: true -openhpc_slurm_accounting_storage_type: 'accounting_storage/slurmdbd' +openhpc_slurm_accounting_storage_type: "accounting_storage/slurmdbd" openhpc_slurmdbd_mysql_database: slurm_acct_db openhpc_slurmdbd_mysql_password: "{{ vault_mysql_slurm_password }}" openhpc_slurmdbd_mysql_username: slurm @@ -18,17 +18,18 @@ openhpc_slurmdbd_host: "{{ openhpc_slurm_control_host }}" openhpc_rebuild_partition: # not a role var - could actually add more indirection here for things we're expecting to be modified, e.g.
groups and maxtime name: rebuild nodegroups: "{{ cluster_compute_groups | default([]) }}" - default: NO + default: false maxtime: 30 partition_params: PriorityJobFactor: 65533 - Hidden: YES - RootOnly: YES - DisableRootJobs: NO - PreemptMode: 'OFF' + Hidden: true + RootOnly: true + DisableRootJobs: false + PreemptMode: "OFF" OverSubscribe: EXCLUSIVE openhpc_nodegroups: "{{ cluster_compute_groups | map('community.general.dict_kv', 'name') }}" # create nodegroup for each compute group openhpc_user_partitions: "{{ openhpc_nodegroups }}" # create partition for each nodegroup (actually role default) - this is what we'd expect to be changed +# yamllint disable-line rule:line-length openhpc_partitions: "{{ openhpc_user_partitions + ([openhpc_rebuild_partition] if groups['rebuild'] | length > 0 else []) }}" # auto-create rebuild partition if reqd. openhpc_packages_default: # system packages @@ -55,6 +56,7 @@ openhpc_config_rebuild: - reboot_from_controller ResumeTimeout: 300 openhpc_config_extra: {} +# yamllint disable-line rule:line-length openhpc_config: "{{ openhpc_config_default | combine(openhpc_config_rebuild if groups['rebuild'] | length > 0 else {}, openhpc_config_extra, list_merge='append') }}" openhpc_state_save_location: "{{ appliances_state_dir + '/slurmctld' if appliances_state_dir is defined else '/var/spool' }}" @@ -65,12 +67,10 @@ openhpc_install_type: ohpc # 'ohpc' or 'generic', see https://github.com/stackhp ohpc_openhpc_repos: "9": [] "8": [] - # overriding to ensure doesn't overwrite Ark epel repo ohpc_default_extra_repos: "9": [] "8": [] - # configure slurm database pre-upgrade backups: openhpc_slurm_accounting_storage_service: mysql openhpc_slurm_accounting_storage_backup_cmd: >- diff --git a/environments/common/inventory/group_vars/all/openondemand.yml b/environments/common/inventory/group_vars/all/openondemand.yml index 2df138072..b25ea399f 100644 --- a/environments/common/inventory/group_vars/all/openondemand.yml +++ b/environments/common/inventory/group_vars/all/openondemand.yml @@ -1,11 +1,11 @@ --- -# See: ansible/roles/openondemand/README.md +# See: ansible/roles/openondemand/README.md # for variable definitions. # NB: Variables prefixed ood_ are all from https://github.com/OSC/ood-ansible -ondemand_package_version: '3.1.10' # used in ansible/cleanup.yml +ondemand_package_version: "3.1.10" # used in ansible/cleanup.yml ondemand_package: ondemand-"{{ ondemand_package_version }}" # osc.ood role var controlling installed package openondemand_servername: "{{ hostvars[groups['openondemand'].0].ansible_host if groups['openondemand'] else '' }}" @@ -17,7 +17,7 @@ openondemand_desktop_partition: "{{ openhpc_partitions[0]['name'] }}" # Regex defining hosts which openondemand can proxy; the default regex is compute nodes (for apps) and grafana host, # e.g. if the group `compute` has hosts `compute-{0,1,2,..}` this will be '(compute-\d+)|(control)'. -# The autogenerated regex may need overriding if compute node names do not contain numbers in a consistent position +# The autogenerated regex may need overriding if compute node names do not contain numbers in a consistent position # or include regex special characters. 
openondemand_host_regex: "{{ (groups['compute'] + groups['grafana']) | to_ood_regex }}" @@ -85,7 +85,7 @@ openondemand_clusters_grafana: ood_install_apps_defaults: jupyter: repo: https://github.com/OSC/bc_example_jupyter.git - version: master # defaults (optional) + version: master # defaults (optional) ood_install_apps: "{{ ood_install_apps_defaults if openondemand_jupyter_partition | default(none) else {} }}" # https://github.com/OSC/ondemand/tree/master/apps/bc_desktop @@ -123,6 +123,7 @@ openondemand_apps_desktop_default: - <%= "--nodelist=#{node}" %> openondemand_apps_desktop: "{{ {'bc_desktop':openondemand_apps_desktop_default} if openondemand_desktop_partition | default(none) else {} }}" +# yamllint disable-line rule:line-length # See https://osc.github.io/ood-documentation/latest/app-development/tutorials-interactive-apps/add-jupyter.html#app-development-tutorials-interactive-apps-add-jupyter openondemand_apps_jupyter_default: title: Jupyter Notebook @@ -157,8 +158,8 @@ openondemand_apps_jupyter_default: openondemand_apps_jupyter: "{{ {'jupyter':openondemand_apps_jupyter_default} if openondemand_jupyter_partition | default(none) else {} }}" # osc.ood:ood_apps - see https://github.com/OSC/ood-ansible#ood_apps -openondemand_dashboard_support_url: '' -openondemand_dashboard_docs_url: '' +openondemand_dashboard_support_url: "" +openondemand_dashboard_docs_url: "" openondemand_apps: files: env: @@ -185,11 +186,11 @@ openondemand_scrape_configs: scrape_timeout: 20s scrape_interval: 2m static_configs: - - targets: - - "{{ openondemand_address }}:9301" - labels: - environment: "{{ appliances_environment_name }}" - service: "openondemand" + - targets: + - "{{ openondemand_address }}:9301" + labels: + environment: "{{ appliances_environment_name }}" + service: "openondemand" openondemand_dashboard: - dashboard_id: 13465 @@ -198,8 +199,12 @@ openondemand_dashboard: replacement: prometheus revision_id: 1 -_opeonondemand_unset_auth: ' RequestHeader unset Authorization' +_opeonondemand_unset_auth: " RequestHeader unset Authorization" # Fix grafana proxying for basic auth if anonymous grafana access enabled: +# yamllint disable-line rule:line-length openondemand_node_proxy_directives: "{{ _opeonondemand_unset_auth if (openondemand_auth == 'basic_pam' and 'openondemand_host_regex' and groups['grafana'] | length > 0 and hostvars[ groups['grafana'] | first]._grafana_auth_is_anonymous) else '' }}" -# Reason: OOD server forwards headers to proxied servers, so when if using basic auth Grafana gets passed the Open Ondemand user. This probably isn't a Grafana user so it throws an auth error. If anonymous access is enabled we can work around this by not forwarding auth header. +# Reason: OOD server forwards headers to proxied servers, so when using basic auth +# Grafana gets passed the Open OnDemand user. +# This probably isn't a Grafana user so it throws an auth error. +# If anonymous access is enabled we can work around this by not forwarding the auth header.
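For example, a site whose node names don't fit the autogenerated pattern might override the regex (the node naming here is hypothetical):

```yaml
# environments/$ENV/inventory/group_vars/all/openondemand.yml
openondemand_host_regex: '(gpu-[a-z]+)|(viz-\d+)|(control)'
```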
diff --git a/environments/common/inventory/group_vars/all/os-manila-mount.yml b/environments/common/inventory/group_vars/all/os-manila-mount.yml index 6b25d62cb..aa8a6b8f3 100644 --- a/environments/common/inventory/group_vars/all/os-manila-mount.yml +++ b/environments/common/inventory/group_vars/all/os-manila-mount.yml @@ -1,3 +1,4 @@ +--- # Empty repo lists from stackhpc.ansible-role-os-manila-mount role defaults, as these repofiles are # now generated by dnf_repos to allow injecting Ark creds: os_manila_mount_ceph_rpm_repos: [] diff --git a/environments/common/inventory/group_vars/all/podman.yml b/environments/common/inventory/group_vars/all/podman.yml index 8ca8eb1eb..a6d38f5a4 100644 --- a/environments/common/inventory/group_vars/all/podman.yml +++ b/environments/common/inventory/group_vars/all/podman.yml @@ -1 +1,2 @@ +--- podman_users: "{{ [appliances_local_users_podman] }}" # user to use for podman diff --git a/environments/common/inventory/group_vars/all/prometheus.yml b/environments/common/inventory/group_vars/all/prometheus.yml index f4587e672..6b33ce889 100644 --- a/environments/common/inventory/group_vars/all/prometheus.yml +++ b/environments/common/inventory/group_vars/all/prometheus.yml @@ -11,8 +11,8 @@ prometheus_db_dir: "{{ appliances_state_dir | default('/var/lib') }}/prometheus" prometheus_alertmanager_config_default: - static_configs: - - targets: - - "{{ alertmanager_address }}:{{ alertmanager_port }}" + - targets: + - "{{ alertmanager_address }}:{{ alertmanager_port }}" basic_auth: username: alertmanager # cloudalchemy.prometheus/preflight checks this config so it must be @@ -44,37 +44,38 @@ prometheus_targets: other: "{{ groups.get('node_exporter', []) | difference(groups['openhpc']) | prometheus_node_exporter_targets(hostvars, 'prometheus_env', 'other') }}" prometheus_scrape_configs_default: -- job_name: "prometheus" - metrics_path: "/metrics" - static_configs: - - targets: - - "{{ prometheus_address }}:9090" -- job_name: "grafana" - static_configs: - - targets: - - "{{ grafana_api_address }}:{{ grafana_port }}" -- job_name: "node" - file_sd_configs: - - files: - - /etc/prometheus/file_sd/control.yml - - /etc/prometheus/file_sd/login.yml - - /etc/prometheus/file_sd/compute.yml - - /etc/prometheus/file_sd/other.yml - relabel_configs: - # strip off port - - source_labels: ['__address__'] - separator: ':' - regex: '(.*):.*' - target_label: 'instance' - replacement: '${1}' - scrape_interval: 30s - scrape_timeout: 20s + - job_name: "prometheus" + metrics_path: "/metrics" + static_configs: + - targets: + - "{{ prometheus_address }}:9090" + - job_name: "grafana" + static_configs: + - targets: + - "{{ grafana_api_address }}:{{ grafana_port }}" + - job_name: "node" + file_sd_configs: + - files: + - /etc/prometheus/file_sd/control.yml + - /etc/prometheus/file_sd/login.yml + - /etc/prometheus/file_sd/compute.yml + - /etc/prometheus/file_sd/other.yml + relabel_configs: + # strip off port + - source_labels: + - '__address__' + separator: ':' + regex: '(.*):.*' + target_label: 'instance' + replacement: '${1}' + scrape_interval: 30s + scrape_timeout: 20s -- job_name: "slurm_exporter" - scrape_interval: 30s - scrape_timeout: 30s - static_configs: - - targets: - - "{{ openhpc_slurm_control_host }}:{{ slurm_exporter_port }}" + - job_name: "slurm_exporter" + scrape_interval: 30s + scrape_timeout: 30s + static_configs: + - targets: + - "{{ openhpc_slurm_control_host }}:{{ slurm_exporter_port }}" prometheus_scrape_configs: "{{ prometheus_scrape_configs_default + 
(openondemand_scrape_configs if groups['openondemand'] | count > 0 else []) }}" diff --git a/environments/common/inventory/group_vars/all/proxy.yml b/environments/common/inventory/group_vars/all/proxy.yml index d606ee1d9..ba8fb4b95 100644 --- a/environments/common/inventory/group_vars/all/proxy.yml +++ b/environments/common/inventory/group_vars/all/proxy.yml @@ -1,2 +1,3 @@ +--- # default proxy address to first squid api address port 3128 if squid group non-empty, else empty string to avoid breaking hostvars proxy_http_proxy: "{{ 'http://' + hostvars[groups['squid'].0].api_address + ':' + (squid_http_port | string) if groups['squid'] else '' }}" diff --git a/environments/common/inventory/group_vars/all/pulp.yml b/environments/common/inventory/group_vars/all/pulp.yml index 22bb83216..492a84aa1 100644 --- a/environments/common/inventory/group_vars/all/pulp.yml +++ b/environments/common/inventory/group_vars/all/pulp.yml @@ -1,3 +1,4 @@ +--- pulp_site_port: 8080 # If using Ark directly (no local Pulp server), override the following with Ark creds diff --git a/environments/common/inventory/group_vars/all/slurm_exporter.yml b/environments/common/inventory/group_vars/all/slurm_exporter.yml index 490231077..072c436b1 100644 --- a/environments/common/inventory/group_vars/all/slurm_exporter.yml +++ b/environments/common/inventory/group_vars/all/slurm_exporter.yml @@ -1,3 +1,4 @@ +--- slurm_exporter_port: 9341 # as defined by [1] and implemented in [2] -#[1]: https://github.com/prometheus/prometheus/wiki/Default-port-allocations -#[2]: https://github.com/stackhpc/prometheus-slurm-exporter/blob/master/lib/systemd/prometheus-slurm-exporter.service +# [1]: https://github.com/prometheus/prometheus/wiki/Default-port-allocations +# [2]: https://github.com/stackhpc/prometheus-slurm-exporter/blob/master/lib/systemd/prometheus-slurm-exporter.service diff --git a/environments/common/inventory/group_vars/all/squid.yml b/environments/common/inventory/group_vars/all/squid.yml index 59557291b..4218c5c95 100644 --- a/environments/common/inventory/group_vars/all/squid.yml +++ b/environments/common/inventory/group_vars/all/squid.yml @@ -1 +1,2 @@ +--- squid_http_port: 3128 # defined here for proxy role diff --git a/environments/common/inventory/group_vars/all/sshd.yaml b/environments/common/inventory/group_vars/all/sshd.yaml index 5d4ed228f..cf22b12c9 100644 --- a/environments/common/inventory/group_vars/all/sshd.yaml +++ b/environments/common/inventory/group_vars/all/sshd.yaml @@ -1 +1,2 @@ +--- sshd_password_authentication: "{{ sssd_install_ldap | default(false) | bool }}" diff --git a/environments/common/inventory/group_vars/all/systemd.yml b/environments/common/inventory/group_vars/all/systemd.yml index ae72a7882..bc267d5d1 100644 --- a/environments/common/inventory/group_vars/all/systemd.yml +++ b/environments/common/inventory/group_vars/all/systemd.yml @@ -1,3 +1,4 @@ +--- _systemd_requiresmount_statedir: | {% if appliances_state_dir is defined %} [Unit] diff --git a/environments/common/inventory/group_vars/all/timestamps.yml b/environments/common/inventory/group_vars/all/timestamps.yml index a7a4be269..2f89849a2 100644 --- a/environments/common/inventory/group_vars/all/timestamps.yml +++ b/environments/common/inventory/group_vars/all/timestamps.yml @@ -1,76 +1,77 @@ +--- appliances_pulp_repos: appstream: - '8.10': + "8.10": path: rocky/8.10/AppStream/x86_64/os timestamp: 20250328T030013 - '9.4': + "9.4": path: rocky/9.4/AppStream/x86_64/os timestamp: 20241112T003151 - '9.5': + "9.5": path: 
rocky/9.5/AppStream/x86_64/os timestamp: 20250328T031822 baseos: - '8.10': + "8.10": path: rocky/8.10/BaseOS/x86_64/os timestamp: 20250328T030013 - '9.4': + "9.4": path: rocky/9.4/BaseOS/x86_64/os timestamp: 20241115T011711 - '9.5': + "9.5": path: rocky/9.5/BaseOS/x86_64/os timestamp: 20250326T030636 ceph: - '8': + "8": path: centos/8-stream/storage/x86_64/ceph-quincy timestamp: 20231104T015751 - '9': + "9": path: centos/9-stream/storage/x86_64/ceph-reef timestamp: 20240923T233036 crb: - '8.10': + "8.10": path: rocky/8.10/PowerTools/x86_64/os timestamp: 20250328T030013 - '9.4': + "9.4": path: rocky/9.4/CRB/x86_64/os timestamp: 20241115T003133 - '9.5': + "9.5": path: rocky/9.5/CRB/x86_64/os timestamp: 20250325T031428 epel: - '8': + "8": path: epel/8/Everything/x86_64 timestamp: 20250609T000109 - '9': + "9": path: epel/9/Everything/x86_64 timestamp: 20250609T000109 extras: - '8.10': + "8.10": path: rocky/8.10/extras/x86_64/os timestamp: 20250327T030422 - '9.4': + "9.4": path: rocky/9.4/extras/x86_64/os timestamp: 20241118T002802 - '9.5': + "9.5": path: rocky/9.5/extras/x86_64/os timestamp: 20250328T031822 openhpc_base: - '8': + "8": path: OpenHPC/2/EL_8 timestamp: 20241218T154614 - '9': + "9": path: OpenHPC/3/EL_9 timestamp: 20241218T154614 openhpc_updates: - '8': + "8": path: OpenHPC/2/updates/EL_8 timestamp: 20250512T003315 - '9': + "9": path: OpenHPC/3/updates/EL_9 timestamp: 20250510T003301 grafana: - '8': + "8": path: grafana/oss/rpm timestamp: 20250505T025259 - '9': + "9": path: grafana/oss/rpm timestamp: 20250505T025259 diff --git a/environments/common/inventory/group_vars/all/update.yml b/environments/common/inventory/group_vars/all/update.yml index 715d418c7..a0b10ce3b 100644 --- a/environments/common/inventory/group_vars/all/update.yml +++ b/environments/common/inventory/group_vars/all/update.yml @@ -1,12 +1,13 @@ --- update_enable: false -# These variables define the packages updates and are passed to ansible's yum module parameters with the same names: https://docs.ansible.com/ansible/latest/collections/ansible/builtin/yum_module.html -update_name: '*' +# These variables define the packages updates and are passed to ansible's yum module parameters +# with the same names: https://docs.ansible.com/ansible/latest/collections/ansible/builtin/yum_module.html +update_name: "*" update_state: latest update_exclude: - grafana - apptainer # see https://github.com/stackhpc/ansible-slurm-appliance/pull/245 update_disablerepo: omit # Log changes during update here on localhost: -update_log_path: "{{ appliances_environment_root }}/logs/{{ inventory_hostname }}-updates.log" +update_log_path: "{{ appliances_environment_root }}/logs/{{ inventory_hostname }}-updates.log" diff --git a/environments/common/layouts/README.md b/environments/common/layouts/README.md index e87ad93ef..42f73aac5 100644 --- a/environments/common/layouts/README.md +++ b/environments/common/layouts/README.md @@ -3,4 +3,4 @@ This folder contains some predefined group mappings. You can copy them into an environment folder if you wish to modify them or just reference them directly in ansible.cfg as another inventory file. If you are referencing them in the -inventory file, it is advisable to put them just after the common environment. \ No newline at end of file +inventory file, it is advisable to put them just after the common environment. 
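For example, an environment's `ansible.cfg` might reference a layout as an extra inventory source (a sketch assuming the `everything` layout and skeleton-style relative paths):

```ini
[defaults]
inventory = ../common/inventory,../common/layouts/everything,inventory
```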
diff --git a/environments/skeleton/cookiecutter.json b/environments/skeleton/cookiecutter.json index 93b8e7e8c..3eb7acfa3 100644 --- a/environments/skeleton/cookiecutter.json +++ b/environments/skeleton/cookiecutter.json @@ -1,4 +1,4 @@ { - "environment": "foo", - "description" : "Describe the environment here" + "environment": "foo", + "description": "Describe the environment here" } diff --git a/environments/skeleton/{{cookiecutter.environment}}/README.md b/environments/skeleton/{{cookiecutter.environment}}/README.md index 202ca677c..89fe6b4af 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/README.md +++ b/environments/skeleton/{{cookiecutter.environment}}/README.md @@ -2,4 +2,4 @@ {{ cookiecutter.description }} -See the main README.md in the repo root for an overview and general install instructions. Any environment-specific instructions should be added here. \ No newline at end of file +See the main README.md in the repository root for an overview and general install instructions. Any environment-specific instructions should be added here. diff --git a/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/basic_users.yml b/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/basic_users.yml index dc993c3b8..4b4287cfa 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/basic_users.yml +++ b/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/basic_users.yml @@ -1,3 +1,4 @@ +--- basic_users_users: - name: demo_user password: "{% raw %}{{ vault_demo_user_password | password_hash('sha512', 65534 | random(seed=inventory_hostname) | string) }}{% endraw %}" # idempotent diff --git a/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/grafana.yml b/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/grafana.yml index 521616a1b..3c49fd5d7 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/grafana.yml +++ b/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/grafana.yml @@ -1 +1,2 @@ -grafana_auth_anonymous: true \ No newline at end of file +--- +grafana_auth_anonymous: true diff --git a/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/hpctests.yml b/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/hpctests.yml index e8cfcea5f..472462169 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/hpctests.yml +++ b/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/hpctests.yml @@ -1 +1,2 @@ +--- hpctests_user: demo_user diff --git a/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/vault_alertmanager.yml b/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/vault_alertmanager.yml index 4375ed725..02abb000d 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/vault_alertmanager.yml +++ b/environments/skeleton/{{cookiecutter.environment}}/inventory/group_vars/all/vault_alertmanager.yml @@ -1,3 +1,3 @@ # Add a bot token here THEN VAULT-ENCRYPT this file! 
-#vault_alertmanager_slack_integration_app_creds: '' +# vault_alertmanager_slack_integration_app_creds: '' diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/baremetal-node-list.py b/environments/skeleton/{{cookiecutter.environment}}/tofu/baremetal-node-list.py index 14bc3ce4c..d07e59fb4 100755 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/baremetal-node-list.py +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/baremetal-node-list.py @@ -1,32 +1,33 @@ -#!/usr/bin/env python -""" opentofu external data program to list baremetal nodes +#!/usr/bin/env python # pylint: disable=invalid-name +"""opentofu external data program to list baremetal nodes - Example usage: +Example usage: - data "external" "example" { - program = [this_file] - } + data "external" "example" { + program = [this_file] + } - The external data resource's result attribute then contains a mapping of - Ironic node names to their UUIDs. +The external data resource's result attribute then contains a mapping of +Ironic node names to their UUIDs. - An empty list is returned if: - - There are no baremetal nodes - - The listing fails for any reason, e.g. - - there is no baremetal service - - admin credentials are required and are not provided +An empty list is returned if: +- There are no baremetal nodes +- The listing fails for any reason, e.g. + - there is no baremetal service + - admin credentials are required and are not provided """ -import openstack import json +import openstack # pylint: disable=import-error + nodes = [] -proxy = None +proxy = None # pylint: disable=invalid-name output = {} conn = openstack.connection.from_config() try: - proxy = getattr(conn, 'baremetal', None) -except Exception: + proxy = getattr(conn, "baremetal", None) +except Exception: # pylint: disable=broad-exception-caught pass if proxy is not None: nodes = proxy.nodes() diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/compute.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/compute.tf index 4e6186e35..c2744934e 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/compute.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/compute.tf @@ -4,41 +4,41 @@ module "compute" { for_each = var.compute # must be set for group: - nodes = each.value.nodes + nodes = each.value.nodes flavor = each.value.flavor # always taken from top-level value: - cluster_name = var.cluster_name + cluster_name = var.cluster_name cluster_domain_suffix = var.cluster_domain_suffix - key_pair = var.key_pair - environment_root = var.environment_root - + key_pair = var.key_pair + environment_root = var.environment_root + # can be set for group, defaults to top-level value: - image_id = lookup(each.value, "image_id", var.cluster_image_id) - vnic_types = lookup(each.value, "vnic_types", var.vnic_types) + image_id = lookup(each.value, "image_id", var.cluster_image_id) + vnic_types = lookup(each.value, "vnic_types", var.vnic_types) volume_backed_instances = lookup(each.value, "volume_backed_instances", var.volume_backed_instances) - root_volume_size = lookup(each.value, "root_volume_size", var.root_volume_size) - root_volume_type = lookup(each.value, "root_volume_type", var.root_volume_type) - gateway_ip = lookup(each.value, "gateway_ip", var.gateway_ip) - nodename_template = lookup(each.value, "nodename_template", var.cluster_nodename_template) + root_volume_size = lookup(each.value, "root_volume_size", var.root_volume_size) + root_volume_type = lookup(each.value, "root_volume_type", 
var.root_volume_type) + gateway_ip = lookup(each.value, "gateway_ip", var.gateway_ip) + nodename_template = lookup(each.value, "nodename_template", var.cluster_nodename_template) # optionally set for group: networks = concat(var.cluster_networks, lookup(each.value, "extra_networks", [])) # here null means "use module var default" - extra_volumes = lookup(each.value, "extra_volumes", null) - compute_init_enable = lookup(each.value, "compute_init_enable", null) + extra_volumes = lookup(each.value, "extra_volumes", null) + compute_init_enable = lookup(each.value, "compute_init_enable", null) ignore_image_changes = lookup(each.value, "ignore_image_changes", null) - match_ironic_node = lookup(each.value, "match_ironic_node", null) - availability_zone = lookup(each.value, "availability_zone", null) - ip_addresses = lookup(each.value, "ip_addresses", null) + match_ironic_node = lookup(each.value, "match_ironic_node", null) + availability_zone = lookup(each.value, "availability_zone", null) + ip_addresses = lookup(each.value, "ip_addresses", null) # computed # not using openstack_compute_instance_v2.control.access_ip_v4 to avoid # updates to node metadata on deletion/recreation of the control node: - control_address = openstack_networking_port_v2.control[var.cluster_networks[0].network].all_fixed_ips[0] - security_group_ids = [for o in data.openstack_networking_secgroup_v2.nonlogin: o.id] - baremetal_nodes = data.external.baremetal_nodes.result - + control_address = openstack_networking_port_v2.control[var.cluster_networks[0].network].all_fixed_ips[0] + security_group_ids = [for o in data.openstack_networking_secgroup_v2.nonlogin : o.id] + baremetal_nodes = data.external.baremetal_nodes.result + # input dict validation: group_name = each.key group_keys = keys(each.value) diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/control.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/control.tf index 7e2e51470..5c973825d 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/control.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/control.tf @@ -1,26 +1,26 @@ locals { control_volumes = concat( # convert maps to lists with zero or one entries: - [for v in data.openstack_blockstorage_volume_v3.state: v], - [for v in data.openstack_blockstorage_volume_v3.home: v] + [for v in data.openstack_blockstorage_volume_v3.state : v], + [for v in data.openstack_blockstorage_volume_v3.home : v] ) nodename = templatestring( var.cluster_nodename_template, { - node = "control", - cluster_name = var.cluster_name, + node = "control", + cluster_name = var.cluster_name, cluster_domain_suffix = var.cluster_domain_suffix, - environment_name = basename(var.environment_root) + environment_name = basename(var.environment_root) } ) } resource "openstack_networking_port_v2" "control" { - for_each = {for net in var.cluster_networks: net.network => net} + for_each = { for net in var.cluster_networks : net.network => net } - name = "${var.cluster_name}-control-${each.key}" - network_id = data.openstack_networking_network_v2.cluster_net[each.key].id + name = "${var.cluster_name}-control-${each.key}" + network_id = data.openstack_networking_network_v2.cluster_net[each.key].id admin_state_up = "true" fixed_ip { @@ -29,7 +29,7 @@ resource "openstack_networking_port_v2" "control" { } no_security_groups = lookup(each.value, "no_security_groups", false) - security_group_ids = lookup(each.value, "no_security_groups", false) ? 
[] : [for o in data.openstack_networking_secgroup_v2.nonlogin: o.id] + security_group_ids = lookup(each.value, "no_security_groups", false) ? [] : [for o in data.openstack_networking_secgroup_v2.nonlogin : o.id] binding { vnic_type = lookup(var.vnic_types, each.key, "normal") @@ -37,45 +37,45 @@ resource "openstack_networking_port_v2" "control" { } resource "openstack_compute_instance_v2" "control" { - - name = split(".", local.nodename)[0] - image_id = var.cluster_image_id + + name = split(".", local.nodename)[0] + image_id = var.cluster_image_id flavor_name = var.control_node_flavor - key_pair = var.key_pair - + key_pair = var.key_pair + # root device: block_device { - uuid = var.cluster_image_id - source_type = "image" - destination_type = var.volume_backed_instances ? "volume" : "local" - volume_size = var.volume_backed_instances ? var.root_volume_size : null - volume_type = var.volume_backed_instances ? var.root_volume_type : null - boot_index = 0 - delete_on_termination = true + uuid = var.cluster_image_id + source_type = "image" + destination_type = var.volume_backed_instances ? "volume" : "local" + volume_size = var.volume_backed_instances ? var.root_volume_size : null + volume_type = var.volume_backed_instances ? var.root_volume_type : null + boot_index = 0 + delete_on_termination = true } dynamic "block_device" { for_each = local.control_volumes content { destination_type = "volume" - source_type = "volume" - boot_index = -1 - uuid = block_device.value.id # actually openstack_blockstorage_volume_v3 id + source_type = "volume" + boot_index = -1 + uuid = block_device.value.id # actually openstack_blockstorage_volume_v3 id } } dynamic "network" { - for_each = {for net in var.cluster_networks: net.network => net} + for_each = { for net in var.cluster_networks : net.network => net } content { - port = openstack_networking_port_v2.control[network.key].id + port = openstack_networking_port_v2.control[network.key].id access_network = network.key == var.cluster_networks[0].network } } metadata = { environment_root = var.environment_root - access_ip = openstack_networking_port_v2.control[var.cluster_networks[0].network].all_fixed_ips[0] - gateway_ip = var.gateway_ip + access_ip = openstack_networking_port_v2.control[var.cluster_networks[0].network].all_fixed_ips[0] + gateway_ip = var.gateway_ip } user_data = <<-EOF diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/data.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/data.tf index 443c52282..f90f2f0dc 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/data.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/data.tf @@ -1,5 +1,6 @@ +# tflint-ignore: terraform_required_providers data "external" "baremetal_nodes" { # returns an empty map if cannot list baremetal nodes program = ["${path.module}/baremetal-node-list.py"] - query = {} + query = {} } diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/inventory.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/inventory.tf index 81ac46d6c..01489743d 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/inventory.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/inventory.tf @@ -1,14 +1,15 @@ +# tflint-ignore: terraform_required_providers resource "local_file" "hosts" { - content = templatefile("${path.module}/inventory.tpl", - { - "cluster_name": var.cluster_name, - "cluster_domain_suffix": var.cluster_domain_suffix, - "control": openstack_compute_instance_v2.control - 
"login_groups": module.login - "compute_groups": module.compute - "state_dir": var.state_dir - "cluster_home_volume": var.home_volume_provisioning != "none" - }, - ) + content = templatefile("${path.module}/inventory.tpl", + { + "cluster_name" : var.cluster_name, + "cluster_domain_suffix" : var.cluster_domain_suffix, + "control" : openstack_compute_instance_v2.control + "login_groups" : module.login + "compute_groups" : module.compute + "state_dir" : var.state_dir + "cluster_home_volume" : var.home_volume_provisioning != "none" + }, + ) filename = "../inventory/hosts.yml" } diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/login.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/login.tf index 301822ef0..6940f7bfc 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/login.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/login.tf @@ -4,44 +4,44 @@ module "login" { for_each = var.login # must be set for group: - nodes = each.value.nodes + nodes = each.value.nodes flavor = each.value.flavor # always taken from top-level value: - cluster_name = var.cluster_name + cluster_name = var.cluster_name cluster_domain_suffix = var.cluster_domain_suffix - key_pair = var.key_pair - environment_root = var.environment_root - + key_pair = var.key_pair + environment_root = var.environment_root + # can be set for group, defaults to top-level value: - image_id = lookup(each.value, "image_id", var.cluster_image_id) - vnic_types = lookup(each.value, "vnic_types", var.vnic_types) + image_id = lookup(each.value, "image_id", var.cluster_image_id) + vnic_types = lookup(each.value, "vnic_types", var.vnic_types) volume_backed_instances = lookup(each.value, "volume_backed_instances", var.volume_backed_instances) - root_volume_size = lookup(each.value, "root_volume_size", var.root_volume_size) - root_volume_type = lookup(each.value, "root_volume_type", var.root_volume_type) - gateway_ip = lookup(each.value, "gateway_ip", var.gateway_ip) - nodename_template = lookup(each.value, "nodename_template", var.cluster_nodename_template) - + root_volume_size = lookup(each.value, "root_volume_size", var.root_volume_size) + root_volume_type = lookup(each.value, "root_volume_type", var.root_volume_type) + gateway_ip = lookup(each.value, "gateway_ip", var.gateway_ip) + nodename_template = lookup(each.value, "nodename_template", var.cluster_nodename_template) + # optionally set for group networks = concat(var.cluster_networks, lookup(each.value, "extra_networks", [])) # here null means "use module var default" - extra_volumes = lookup(each.value, "extra_volumes", null) - fip_addresses = lookup(each.value, "fip_addresses", null) - fip_network = lookup(each.value, "fip_network", null) + extra_volumes = lookup(each.value, "extra_volumes", null) + fip_addresses = lookup(each.value, "fip_addresses", null) + fip_network = lookup(each.value, "fip_network", null) match_ironic_node = lookup(each.value, "match_ironic_node", null) availability_zone = lookup(each.value, "availability_zone", null) - ip_addresses = lookup(each.value, "ip_addresses", null) + ip_addresses = lookup(each.value, "ip_addresses", null) # can't be set for login - compute_init_enable = [] + compute_init_enable = [] ignore_image_changes = false # computed # not using openstack_compute_instance_v2.control.access_ip_v4 to avoid # updates to node metadata on deletion/recreation of the control node: - control_address = openstack_networking_port_v2.control[var.cluster_networks[0].network].all_fixed_ips[0] - 
security_group_ids = [for o in data.openstack_networking_secgroup_v2.login: o.id] - baremetal_nodes = data.external.baremetal_nodes.result + control_address = openstack_networking_port_v2.control[var.cluster_networks[0].network].all_fixed_ips[0] + security_group_ids = [for o in data.openstack_networking_secgroup_v2.login : o.id] + baremetal_nodes = data.external.baremetal_nodes.result # input dict validation: group_name = each.key diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/main.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/main.tf index dc639f7ec..e88ac1a50 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/main.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/main.tf @@ -2,7 +2,7 @@ terraform { required_version = ">= 1.7" # templatestring() function required_providers { openstack = { - source = "terraform-provider-openstack/openstack" + source = "terraform-provider-openstack/openstack" version = "~>3.0.0" } } diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/network.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/network.tf index 0a86b8fb7..43c2e5da4 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/network.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/network.tf @@ -1,14 +1,14 @@ data "openstack_networking_network_v2" "cluster_net" { - for_each = {for net in var.cluster_networks: net.network => net} + for_each = { for net in var.cluster_networks : net.network => net } name = each.value.network } data "openstack_networking_subnet_v2" "cluster_subnet" { - for_each = {for net in var.cluster_networks: net.network => net} + for_each = { for net in var.cluster_networks : net.network => net } name = each.value.subnet } @@ -22,13 +22,13 @@ data "openstack_identity_auth_scope_v3" "scope" { data "openstack_networking_secgroup_v2" "login" { for_each = toset(var.login_security_groups) - name = each.key + name = each.key tenant_id = data.openstack_identity_auth_scope_v3.scope.project_id } data "openstack_networking_secgroup_v2" "nonlogin" { for_each = toset(var.nonlogin_security_groups) - name = each.key + name = each.key tenant_id = data.openstack_identity_auth_scope_v3.scope.project_id } diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/main.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/main.tf index f29828499..03fbec461 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/main.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/main.tf @@ -2,7 +2,7 @@ terraform { required_version = ">= 0.14" required_providers { openstack = { - source = "terraform-provider-openstack/openstack" + source = "terraform-provider-openstack/openstack" version = "~>3.0.0" } } diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/network.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/network.tf index f5763b97b..5a66d322f 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/network.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/network.tf @@ -1,14 +1,14 @@ data "openstack_networking_network_v2" "network" { - for_each = {for net in var.networks: net.network => net} + for_each = { for net in var.networks : net.network => net } name = each.value.network } data "openstack_networking_subnet_v2" "subnet" { - for_each = {for net in var.networks: net.network => net} + for_each = 
{ for net in var.networks : net.network => net } name = each.value.subnet } diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf index 5e8449381..743cca347 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf @@ -1,5 +1,5 @@ locals { - all_compute_volumes = {for v in setproduct(var.nodes, keys(var.extra_volumes)): "${v[0]}-${v[1]}" => {"node" = v[0], "volume" = v[1]}} + all_compute_volumes = { for v in setproduct(var.nodes, keys(var.extra_volumes)) : "${v[0]}-${v[1]}" => { "node" = v[0], "volume" = v[1] } } # e.g. with # var.nodes = ["compute-0", "compute-1"] # var.extra_volumes = { @@ -12,16 +12,16 @@ locals { # Workaround for lifecycle meta-argument only taking static values compute_instances = var.ignore_image_changes ? openstack_compute_instance_v2.compute_fixed_image : openstack_compute_instance_v2.compute - + # Define nodenames here to avoid repetition nodenames = { - for n in var.nodes: n => templatestring( + for n in var.nodes : n => templatestring( var.nodename_template, { - node = n, - cluster_name = var.cluster_name, + node = n, + cluster_name = var.cluster_name, cluster_domain_suffix = var.cluster_domain_suffix, - environment_name = basename(var.environment_root) + environment_name = basename(var.environment_root) } ) } @@ -29,39 +29,39 @@ locals { resource "openstack_blockstorage_volume_v3" "compute" { - for_each = local.all_compute_volumes + for_each = local.all_compute_volumes - name = "${var.cluster_name}-${each.key}" - description = "Compute node ${each.value.node} volume ${each.value.volume}" - size = var.extra_volumes[each.value.volume].size + name = "${var.cluster_name}-${each.key}" + description = "Compute node ${each.value.node} volume ${each.value.volume}" + size = var.extra_volumes[each.value.volume].size } resource "openstack_compute_volume_attach_v2" "compute" { for_each = local.all_compute_volumes - instance_id = local.compute_instances["${each.value.node}"].id - volume_id = openstack_blockstorage_volume_v3.compute["${each.key}"].id + instance_id = local.compute_instances[each.value.node].id + volume_id = openstack_blockstorage_volume_v3.compute[each.key].id } resource "openstack_networking_port_v2" "compute" { - for_each = {for item in setproduct(var.nodes, var.networks): + for_each = { for item in setproduct(var.nodes, var.networks) : "${item[0]}-${item[1].network}" => { - node_idx = index(var.nodes, item[0]) - net = item[1] - } + node_idx = index(var.nodes, item[0]) + net = item[1] + } } - name = "${var.cluster_name}-${each.key}" - network_id = data.openstack_networking_network_v2.network[each.value.net.network].id + name = "${var.cluster_name}-${each.key}" + network_id = data.openstack_networking_network_v2.network[each.value.net.network].id admin_state_up = "true" fixed_ip { - subnet_id = data.openstack_networking_subnet_v2.subnet[each.value.net.network].id + subnet_id = data.openstack_networking_subnet_v2.subnet[each.value.net.network].id ip_address = try(var.ip_addresses[each.value.net.network][each.value.node_idx], null) } - + no_security_groups = lookup(each.value.net, "no_security_groups", false) security_group_ids = lookup(each.value.net, "no_security_groups", false) ? [] : var.security_group_ids @@ -74,40 +74,40 @@ resource "openstack_compute_instance_v2" "compute_fixed_image" { for_each = var.ignore_image_changes ? 
toset(var.nodes) : [] - name = split(".", local.nodenames[each.key])[0] - image_id = var.image_id + name = split(".", local.nodenames[each.key])[0] + image_id = var.image_id flavor_name = var.flavor - key_pair = var.key_pair + key_pair = var.key_pair dynamic "block_device" { - for_each = var.volume_backed_instances ? [1]: [] + for_each = var.volume_backed_instances ? [1] : [] content { - uuid = var.image_id - source_type = "image" - destination_type = "volume" - volume_size = var.root_volume_size - volume_type = var.root_volume_type - boot_index = 0 + uuid = var.image_id + source_type = "image" + destination_type = "volume" + volume_size = var.root_volume_size + volume_type = var.root_volume_type + boot_index = 0 delete_on_termination = true } } dynamic "network" { - for_each = {for net in var.networks: net.network => net} + for_each = { for net in var.networks : net.network => net } content { - port = openstack_networking_port_v2.compute["${each.key}-${network.key}"].id + port = openstack_networking_port_v2.compute["${each.key}-${network.key}"].id access_network = network.key == var.networks[0].network } } metadata = merge( { - environment_root = var.environment_root - control_address = var.control_address - access_ip = openstack_networking_port_v2.compute["${each.key}-${var.networks[0].network}"].all_fixed_ips[0] - gateway_ip = var.gateway_ip + environment_root = var.environment_root + control_address = var.control_address + access_ip = openstack_networking_port_v2.compute["${each.key}-${var.networks[0].network}"].all_fixed_ips[0] + gateway_ip = var.gateway_ip }, - {for e in var.compute_init_enable: e => true} + { for e in var.compute_init_enable : e => true } ) user_data = <<-EOF @@ -128,41 +128,41 @@ resource "openstack_compute_instance_v2" "compute_fixed_image" { resource "openstack_compute_instance_v2" "compute" { for_each = var.ignore_image_changes ? [] : toset(var.nodes) - - name = split(".", local.nodenames[each.key])[0] - image_id = var.image_id + + name = split(".", local.nodenames[each.key])[0] + image_id = var.image_id flavor_name = var.flavor - key_pair = var.key_pair + key_pair = var.key_pair dynamic "block_device" { - for_each = var.volume_backed_instances ? [1]: [] + for_each = var.volume_backed_instances ? 
[1] : [] content { - uuid = var.image_id - source_type = "image" - destination_type = "volume" - volume_size = var.root_volume_size - volume_type = var.root_volume_type - boot_index = 0 + uuid = var.image_id + source_type = "image" + destination_type = "volume" + volume_size = var.root_volume_size + volume_type = var.root_volume_type + boot_index = 0 delete_on_termination = true } } - + dynamic "network" { - for_each = {for net in var.networks: net.network => net} + for_each = { for net in var.networks : net.network => net } content { - port = openstack_networking_port_v2.compute["${each.key}-${network.key}"].id + port = openstack_networking_port_v2.compute["${each.key}-${network.key}"].id access_network = network.key == var.networks[0].network } } metadata = merge( { - environment_root = var.environment_root - control_address = var.control_address - access_ip = openstack_networking_port_v2.compute["${each.key}-${var.networks[0].network}"].all_fixed_ips[0] - gateway_ip = var.gateway_ip + environment_root = var.environment_root + control_address = var.control_address + access_ip = openstack_networking_port_v2.compute["${each.key}-${var.networks[0].network}"].all_fixed_ips[0] + gateway_ip = var.gateway_ip }, - {for e in var.compute_init_enable: e => true} + { for e in var.compute_init_enable : e => true } ) user_data = <<-EOF @@ -175,7 +175,7 @@ resource "openstack_compute_instance_v2" "compute" { } resource "openstack_networking_floatingip_associate_v2" "fip" { - for_each = {for idx in range(length(var.fip_addresses)): var.nodes[idx] => var.fip_addresses[idx]} # zip, fip_addresses can be shorter + for_each = { for idx in range(length(var.fip_addresses)) : var.nodes[idx] => var.fip_addresses[idx] } # zip, fip_addresses can be shorter floating_ip = each.value port_id = openstack_networking_port_v2.compute["${each.key}-${length(var.networks) == 1 ? 
var.networks[0].network : var.fip_network}"].id @@ -183,7 +183,7 @@ resource "openstack_networking_floatingip_associate_v2" "fip" { } output "compute_instances" { - value = local.compute_instances + value = local.compute_instances } output "image_id" { diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf index 72c3f004d..8baf5dbfb 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf @@ -1,192 +1,193 @@ variable "nodes" { - type = list(string) - description = "list of node names for partition" + type = list(string) + description = "list of node names for partition" } variable "flavor" { - type = string - description = "Name of flavor for partition" + type = string + description = "Name of flavor for partition" } variable "cluster_name" { - type = string + type = string } variable "cluster_domain_suffix" { - type = string - default = "invalid" + type = string + default = "invalid" } variable "key_pair" { - type = string - description = "Name of an existing keypair in OpenStack" + type = string + description = "Name of an existing keypair in OpenStack" } variable "image_id" { - type = string - description = "ID of image for the partition" + type = string + description = "ID of image for the partition" } variable "environment_root" { - type = string - description = "Path to environment root, automatically set by activate script" + type = string + description = "Path to environment root, automatically set by activate script" } variable "vnic_types" { - type = map(string) - default = {} + type = map(string) + default = {} } variable "volume_backed_instances" { - description = "Whether to use volumes for root disks" - type = bool - default = false + description = "Whether to use volumes for root disks" + type = bool + default = false } variable "root_volume_size" { - description = "Size of volume for root volumes if using volume backed instances, in Gb" - type = number - default = 40 + description = "Size of volume for root volumes if using volume backed instances, in Gb" + type = number + default = 40 } variable "root_volume_type" { - type = string - default = null + type = string + default = null } variable "extra_volumes" { - description = <<-EOF + description = <<-EOF Mapping defining additional volumes to create and attach. Keys are unique volume name. 
Values are a mapping with: size: Size of volume in GB **NB**: The order in /dev is not guaranteed to match the mapping EOF - type = map( - object({ - size = number - }) - ) - default = {} - nullable = false + type = map( + object({ + size = number + }) + ) + default = {} + nullable = false } variable "security_group_ids" { - type = list + type = list(any) } variable "control_address" { - description = "Name/address of control node" - type = string + description = "Name/address of control node" + type = string } variable "compute_init_enable" { - type = list(string) - description = "Groups to activate for ansible-init compute rebuilds" - default = [] - nullable = false + type = list(string) + description = "Groups to activate for ansible-init compute rebuilds" + default = [] + nullable = false } variable "ignore_image_changes" { - type = bool - description = "Whether to ignore changes to the image_id parameter" - default = false - nullable = false + type = bool + description = "Whether to ignore changes to the image_id parameter" + default = false + nullable = false } variable "networks" { - type = list(map(string)) + type = list(map(string)) } variable "fip_addresses" { - type = list(string) - description = <<-EOT + type = list(string) + description = <<-EOT List of addresses of floating IPs to associate with nodes, in same order as nodes parameter. The floating IPs must already be allocated to the project. EOT - default = [] - nullable = false + default = [] + nullable = false } variable "fip_network" { - type = string - description = <<-EOT + type = string + description = <<-EOT Name of network containing ports to attach FIPs to. Only required if multiple networks are defined. EOT - default = "" - nullable = false + default = "" + nullable = false } variable "ip_addresses" { - type = map(list(string)) - description = <<-EOT + type = map(list(string)) + description = <<-EOT Mapping of list of fixed IP addresses for nodes, keyed by network name, in same order as nodes parameter. For any networks not specified here the cloud will select addresses. NB: Changing IP addresses after deployment may hit terraform provider bugs. 
EOT - default = {} - nullable = false - validation { - condition = length(setsubtract(keys(var.ip_addresses), var.networks[*].network)) == 0 - error_message = "Keys in ip_addresses for nodegroup \"${var.group_name}\" must match network names in var.cluster_networks" - } - validation { - condition = alltrue([for v in values(var.ip_addresses): length(v) == length(var.nodes)]) - error_message = "Values in ip_addresses for nodegroup \"${var.group_name}\" must be a list of the same length as var.nodes" - } + default = {} + nullable = false + validation { + condition = length(setsubtract(keys(var.ip_addresses), var.networks[*].network)) == 0 + error_message = "Keys in ip_addresses for nodegroup \"${var.group_name}\" must match network names in var.cluster_networks" + } + validation { + condition = alltrue([for v in values(var.ip_addresses) : length(v) == length(var.nodes)]) + error_message = "Values in ip_addresses for nodegroup \"${var.group_name}\" must be a list of the same length as var.nodes" + } } variable "match_ironic_node" { - type = bool - description = "Whether to launch instances on the Ironic node of the same name as each cluster node" - default = false - nullable = false + type = bool + description = "Whether to launch instances on the Ironic node of the same name as each cluster node" + default = false + nullable = false } variable "availability_zone" { - type = string - description = "Name of availability zone - ignored unless match_ironic_node is true" - default = "nova" - nullable = false + type = string + description = "Name of availability zone - ignored unless match_ironic_node is true" + default = "nova" + nullable = false } variable "baremetal_nodes" { - type = map(string) - default = {} + type = map(string) + default = {} } variable "gateway_ip" { - type = string - default = "" + type = string + default = "" } variable "nodename_template" { - type = string - default = "" + type = string + default = "" } variable "group_name" { - type = string + type = string } +# tflint-ignore: terraform_unused_declarations variable "group_keys" { - type = list - validation { - condition = length(setsubtract(var.group_keys, var.allowed_keys)) == 0 - error_message = <<-EOT + type = list(any) + validation { + condition = length(setsubtract(var.group_keys, var.allowed_keys)) == 0 + error_message = <<-EOT Node group '${var.group_name}' contains invalid key(s) ${ - join(", ", setsubtract(var.group_keys, var.allowed_keys))}. + join(", ", setsubtract(var.group_keys, var.allowed_keys))}. Valid keys are ${join(", ", var.allowed_keys)}. EOT - } +} } variable "allowed_keys" { - type = list - # don't provide a default here as allowed keys may depend on module use + type = list(any) + # don't provide a default here as allowed keys may depend on module use } diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/read-inventory-secrets.py b/environments/skeleton/{{cookiecutter.environment}}/tofu/read-inventory-secrets.py index e3de2f492..3728f5604 100755 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/read-inventory-secrets.py +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/read-inventory-secrets.py @@ -1,36 +1,42 @@ -#!/usr/bin/env python -""" opentofu external data program to load inventory string variables from - a (possibly vault-encrypted) secrets file. +#!/usr/bin/env python # pylint: disable=invalid-name +"""opentofu external data program to load inventory string variables from +a (possibly vault-encrypted) secrets file. 
- Example usage: +Example usage: - data "external" "example" { - program = [this_file] + data "external" "example" { + program = [this_file] - query = { - path = "${path.module}/../inventory/group_vars/all/secrets.yml" - } + query = { + path = "${path.module}/../inventory/group_vars/all/secrets.yml" } + } - The external data resource's result attribute then contains a mapping of - variable names to values. +The external data resource's result attribute then contains a mapping of +variable names to values. - NB: Only keys/values where values are strings are returned, in line with - the external program protocol. +NB: Only keys/values where values are strings are returned, in line with +the external program protocol. - NB: This approach is better than e.g. templating inventory vars as the - inventory doesn't need to be valid, which is helpful when opentofu will - template out hosts/groups. +NB: This approach is better than e.g. templating inventory vars as the +inventory doesn't need to be valid, which is helpful when opentofu will +template out hosts/groups. """ -import sys, json, subprocess, yaml -input = sys.stdin.read() -secrets_path = json.loads(input)['path'] +import json +import subprocess +import sys -with open(secrets_path) as f: +import yaml # pylint: disable=import-error + +input = sys.stdin.read() # pylint: disable=redefined-builtin +secrets_path = json.loads(input)["path"] + +with open(secrets_path) as f: # pylint: disable=unspecified-encoding header = f.readline() - if header.startswith('$ANSIBLE_VAULT'): - cmd = ['ansible-vault', 'view', secrets_path] + if header.startswith("$ANSIBLE_VAULT"): + cmd = ["ansible-vault", "view", secrets_path] + # pylint: disable-next=subprocess-run-check ansible = subprocess.run(cmd, capture_output=True, text=True) contents = ansible.stdout else: diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf index c17db6584..f6500c859 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf @@ -1,17 +1,17 @@ variable "cluster_name" { - type = string - description = "Name of cluster, used as part of domain name" + type = string + description = "Name of cluster, used as part of domain name" } variable "cluster_domain_suffix" { - type = string - description = "Domain suffix for cluster" - default = "internal" + type = string + description = "Domain suffix for cluster" + default = "internal" } variable "cluster_networks" { - type = list(map(string)) - description = <<-EOT + type = list(map(string)) + description = <<-EOT List of mappings defining networks. Mapping key/values: network: Required. Name of existing network subnet: Required. Name of existing subnet @@ -20,34 +20,34 @@ variable "cluster_networks" { } variable "key_pair" { - type = string - description = "Name of an existing keypair in OpenStack" + type = string + description = "Name of an existing keypair in OpenStack" } variable "control_ip_addresses" { - type = map(string) - description = <<-EOT + type = map(string) + description = <<-EOT Mapping of fixed IP addresses for control node, keyed by network name. For any networks not specified here the cloud will select an address. NB: Changing IP addresses after deployment may hit terraform provider bugs. 
EOT - default = {} - validation { - # check all keys are network names in cluster_networks - condition = length(setsubtract(keys(var.control_ip_addresses), var.cluster_networks[*].network)) == 0 - error_message = "Keys in var.control_ip_addresses must match network names in var.cluster_networks" - } + default = {} + validation { + # check all keys are network names in cluster_networks + condition = length(setsubtract(keys(var.control_ip_addresses), var.cluster_networks[*].network)) == 0 + error_message = "Keys in var.control_ip_addresses must match network names in var.cluster_networks" + } } variable "control_node_flavor" { - type = string - description = "Flavor name for control node" + type = string + description = "Flavor name for control node" } variable "login" { - default = {} - description = <<-EOF + default = {} + description = <<-EOF Mapping defining homogenous groups of login nodes. Multiple groups may be useful for e.g. separating nodes for ssh and Open Ondemand usage, or to define login nodes with different capabilities such as high-memory. @@ -84,17 +84,17 @@ variable "login" { nodename_template: Overrides variable cluster_nodename_template EOF - type = any + type = any } variable "cluster_image_id" { - type = string - description = "ID of default image for the cluster" + type = string + description = "ID of default image for the cluster" } variable "compute" { - default = {} - description = <<-EOF + default = {} + description = <<-EOF Mapping defining homogenous groups of compute nodes. Groups are used in Slurm partition definitions. @@ -127,36 +127,36 @@ variable "compute" { nodename_template: Overrides variable cluster_nodename_template EOF - type = any # can't do any better; TF type constraints can't cope with heterogeneous inner mappings + type = any # can't do any better; TF type constraints can't cope with heterogeneous inner mappings } variable "environment_root" { - type = string - description = "Path to environment root, automatically set by activate script" + type = string + description = "Path to environment root, automatically set by activate script" } variable "state_dir" { - type = string - description = "Path to state directory on control node" - default = "/var/lib/state" + type = string + description = "Path to state directory on control node" + default = "/var/lib/state" } variable "state_volume_size" { - type = number - description = "Size of state volume on control node, in GB" - default = 150 # GB + type = number + description = "Size of state volume on control node, in GB" + default = 150 # GB } variable "state_volume_type" { - type = string - description = "Type of state volume, if not default type" - default = null + type = string + description = "Type of state volume, if not default type" + default = null } variable "state_volume_provisioning" { - type = string - default = "manage" - description = <<-EOT + type = string + default = "manage" + description = <<-EOT How to manage the state volume. Valid values are: "manage": (Default) OpenTofu will create a volume "$cluster_name-state" and delete it when the cluster is destroyed. A volume @@ -167,36 +167,36 @@ variable "state_volume_provisioning" { intact if the cluster is destroyed. Use for production environments. 
EOT - validation { - condition = contains(["manage", "attach"], var.state_volume_provisioning) - error_message = <<-EOT + validation { + condition = contains(["manage", "attach"], var.state_volume_provisioning) + error_message = <<-EOT home_volume_provisioning must be "manage" or "attach" EOT - } + } } variable "home_volume_size" { - type = number - description = "Size of state volume on control node, in GB." - default = 100 - validation { - condition = var.home_volume_provisioning == "manage" ? var.home_volume_size > 0 : true - error_message = <<-EOT + type = number + description = "Size of state volume on control node, in GB." + default = 100 + validation { + condition = var.home_volume_provisioning == "manage" ? var.home_volume_size > 0 : true + error_message = <<-EOT home_volume_size must be > 0 when var.home_volume_provisioning == "manage" EOT - } + } } variable "home_volume_type" { - type = string - default = null - description = "Type of home volume, if not default type" + type = string + default = null + description = "Type of home volume, if not default type" } variable "home_volume_provisioning" { - type = string - default = "manage" - description = <<-EOT + type = string + default = "manage" + description = <<-EOT How to manage the home volume. Valid values are: "manage": (Default) OpenTofu will create a volume "$cluster_name-home" and delete it when the cluster is destroyed. A volume @@ -209,67 +209,67 @@ variable "home_volume_provisioning" { "none": No home volume is used. Use if /home is provided by a parallel filesystem, e.g. manila. EOT - validation { - condition = contains(["manage", "attach", "none"], var.home_volume_provisioning) - error_message = <<-EOT + validation { + condition = contains(["manage", "attach", "none"], var.home_volume_provisioning) + error_message = <<-EOT home_volume_provisioning must be one of "manage", "attach" or "none" EOT - } + } } variable "vnic_types" { - type = map(string) - description = <<-EOT + type = map(string) + description = <<-EOT Default VNIC types, keyed by network name. See https://registry.terraform.io/providers/terraform-provider-openstack/openstack/latest/docs/resources/networking_port_v2#vnic_type If not given this defaults to the "normal" type. 
EOT - default = {} + default = {} } variable "login_security_groups" { - type = list(string) - description = "Name of preexisting security groups to apply to login nodes" - default = [ - "default", # allow all in-cluster services - "SSH", # access via ssh - "HTTPS", # access OpenOndemand - ] + type = list(string) + description = "Name of preexisting security groups to apply to login nodes" + default = [ + "default", # allow all in-cluster services + "SSH", # access via ssh + "HTTPS", # access OpenOndemand + ] } variable "nonlogin_security_groups" { - type = list(string) - description = "Name of preexisting security groups to apply to non-login nodes" - default = [ - "default", # allow all in-cluster services - ] + type = list(string) + description = "Name of preexisting security groups to apply to non-login nodes" + default = [ + "default", # allow all in-cluster services + ] } variable "volume_backed_instances" { - description = "Whether to use volumes for root disks" - type = bool - default = false + description = "Whether to use volumes for root disks" + type = bool + default = false } variable "root_volume_size" { - description = "Size of volume for root volumes if using volume backed instances, in Gb" - type = number - default = 40 + description = "Size of volume for root volumes if using volume backed instances, in Gb" + type = number + default = 40 } variable "root_volume_type" { - description = "Type of root volume, if using volume backed instances. If unset, the target cloud default volume type is used." - type = string - default = null + description = "Type of root volume, if using volume backed instances. If unset, the target cloud default volume type is used." + type = string + default = null } variable "gateway_ip" { - description = "Address to add default route via" - type = string - default = "" + description = "Address to add default route via" + type = string + default = "" } variable "cluster_nodename_template" { - description = <<-EOT + description = <<-EOT Template for node fully-qualified names. The following interpolations can be used: $${cluster_name}: From var.cluster_name @@ -279,6 +279,6 @@ variable "cluster_nodename_template" { node $${environment_name}: The last element of the current environment's path EOT - type = string - default = "$${cluster_name}-$${node}.$${cluster_name}.$${cluster_domain_suffix}" + type = string + default = "$${cluster_name}-$${node}.$${cluster_name}.$${cluster_domain_suffix}" } diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/volumes.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/volumes.tf index 18a6a0969..46b63eb58 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/volumes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/volumes.tf @@ -1,59 +1,59 @@ resource "openstack_blockstorage_volume_v3" "state" { - # NB: Changes to this resource's "address" i.e. (label or for_each key) - # may lose state data for existing clusters using this volume + # NB: Changes to this resource's "address" i.e. (label or for_each key) + # may lose state data for existing clusters using this volume - count = var.state_volume_provisioning == "manage" ? 1 : 0 + count = var.state_volume_provisioning == "manage" ? 
1 : 0 - name = "${var.cluster_name}-state" # last word used to label filesystem - description = "State for control node" - size = var.state_volume_size - volume_type = var.state_volume_type + name = "${var.cluster_name}-state" # last word used to label filesystem + description = "State for control node" + size = var.state_volume_size + volume_type = var.state_volume_type } data "openstack_blockstorage_volume_v3" "state" { -/* We use a data resource whether or not TF is managing the volume, so the + /* We use a data resource whether or not TF is managing the volume, so the logic is all in one place. But that means this needs a dependency on the actual resource to avoid a race. Because there may be no volume, this has to use for_each. */ - for_each = toset( - (var.state_volume_provisioning == "manage") ? - [for v in openstack_blockstorage_volume_v3.state: v.name] : - ["${var.cluster_name}-state"] - ) + for_each = toset( + (var.state_volume_provisioning == "manage") ? + [for v in openstack_blockstorage_volume_v3.state : v.name] : + ["${var.cluster_name}-state"] + ) - name = each.key + name = each.key } resource "openstack_blockstorage_volume_v3" "home" { - # NB: Changes to this resource's "address" i.e. (label or for_each key) - # may lose user data for existing clusters using this volume + # NB: Changes to this resource's "address" i.e. (label or for_each key) + # may lose user data for existing clusters using this volume - count = var.home_volume_provisioning == "manage" ? 1 : 0 + count = var.home_volume_provisioning == "manage" ? 1 : 0 - name = "${var.cluster_name}-home" # last word used to label filesystem - description = "Home for control node" - size = var.home_volume_size - volume_type = var.home_volume_type + name = "${var.cluster_name}-home" # last word used to label filesystem + description = "Home for control node" + size = var.home_volume_size + volume_type = var.home_volume_type } data "openstack_blockstorage_volume_v3" "home" { -/* Comments as for the state volume. */ + /* Comments as for the state volume. */ - for_each = toset( - (var.home_volume_provisioning == "manage") ? - [for v in openstack_blockstorage_volume_v3.home: v.name] : - (var.home_volume_provisioning == "attach") ? - ["${var.cluster_name}-home"] : - [] - ) + for_each = toset( + (var.home_volume_provisioning == "manage") ? + [for v in openstack_blockstorage_volume_v3.home : v.name] : + (var.home_volume_provisioning == "attach") ? + ["${var.cluster_name}-home"] : + [] + ) - name = each.key + name = each.key } diff --git a/packer/openhpc_extravars.yml b/packer/openhpc_extravars.yml index 66f668649..8bfcd5989 100644 --- a/packer/openhpc_extravars.yml +++ b/packer/openhpc_extravars.yml @@ -1 +1,2 @@ -workaround_ansible_issue_61497: yes # extravars files can't be empty +--- +workaround_ansible_issue_61497: true # extravars files can't be empty diff --git a/requirements.yml b/requirements.yml index 729280df5..d70c593af 100644 --- a/requirements.yml +++ b/requirements.yml @@ -55,4 +55,3 @@ collections: version: 0.0.15 - name: stackhpc.pulp version: 0.5.5 -... 
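The `volumes.tf` hunks earlier in this patch only re-align an existing manage-or-attach pattern, but its shape is worth spelling out: the volume resource is gated on the provisioning mode, and the companion data source derives its `for_each` key from the resource itself when managing, so the read cannot race the create. A generic sketch, where `example_provisioning` and `example_volume_size` are assumed variables:

```hcl
# Sketch of the manage-or-attach pattern; "example" names are illustrative
# assumptions, not part of this patch series.
resource "openstack_blockstorage_volume_v3" "example" {
  count = var.example_provisioning == "manage" ? 1 : 0

  name = "${var.cluster_name}-example"
  size = var.example_volume_size
}

data "openstack_blockstorage_volume_v3" "example" {
  # Keying on the managed resource's own name makes the data source depend
  # on the resource, so the lookup only runs after the volume exists.
  for_each = toset(
    var.example_provisioning == "manage" ?
    [for v in openstack_blockstorage_volume_v3.example : v.name] :
    ["${var.cluster_name}-example"]
  )

  name = each.key
}
```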
diff --git a/super-linter.env b/super-linter.env
index df2f160b5..5362c92f0 100644
--- a/super-linter.env
+++ b/super-linter.env
@@ -12,3 +12,16 @@ VALIDATE_ANSIBLE=false
 
 # Don't validate YAML prettier because yamllint is sufficient
 VALIDATE_YAML_PRETTIER=false
+
+# Getting false positives with terrascan that seemingly can't be masked
+VALIDATE_TERRAFORM_TERRASCAN=false
+
+# Doesn't seem possible to exclude files with terragrunt
+VALIDATE_TERRAGRUNT=false
+
+# TODO: address the following.
+# Temporarily disable these linters,
+# there are select issues remaining with each that can be addressed individually
+VALIDATE_GITHUB_ACTIONS=false
+VALIDATE_SHELL_SHFMT=false
+VALIDATE_YAML=false

From 1c298e9128270b8fe401372ac8041d45b960d12e Mon Sep 17 00:00:00 2001
From: Max Norton
Date: Fri, 18 Jul 2025 09:08:13 +0100
Subject: [PATCH 03/37] Update GH workflow so linting always runs before any
 other jobs

---
 .github/workflows/extra.yml     | 17 +-----
 .github/workflows/main.yml      | 96 +++++++++++++++++++++++++++++++++
 .github/workflows/stackhpc.yml  | 24 +--------
 .github/workflows/trivyscan.yml |  6 +--
 4 files changed, 99 insertions(+), 44 deletions(-)
 create mode 100644 .github/workflows/main.yml

diff --git a/.github/workflows/extra.yml b/.github/workflows/extra.yml
index a076a6980..25defa95e 100644
--- a/.github/workflows/extra.yml
+++ b/.github/workflows/extra.yml
@@ -1,22 +1,7 @@
+---
 name: Test extra build
 on:
   workflow_dispatch:
-  push:
-    branches:
-      - main
-    paths:
-      - 'environments/.stackhpc/tofu/cluster_image.auto.tfvars.json'
-      - 'ansible/roles/doca/**'
-      - 'ansible/roles/cuda/**'
-      - 'ansible/roles/lustre/**'
-      - '.github/workflows/extra.yml'
-  pull_request:
-    paths:
-      - 'environments/.stackhpc/tofu/cluster_image.auto.tfvars.json'
-      - 'ansible/roles/doca/**'
-      - 'ansible/roles/cuda/**'
-      - 'ansible/roles/lustre/**'
-      - '.github/workflows/extra.yml'
 
 permissions:
   contents: read
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
new file mode 100644
index 000000000..45e76c468
--- /dev/null
+++ b/.github/workflows/main.yml
@@ -0,0 +1,96 @@
+---
+name: Test on push and pull request
+
+permissions:
+  actions: write
+  contents: read
+  packages: write
+  # To report GitHub Actions status checks
+  statuses: write
+  id-token: write
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS
+  cancel-in-progress: true
+
+jobs:
+  lint:
+    name: Lint
+    uses: ./.github/workflows/lint.yml
+
+  files_changed:
+    name: Determine files changed
+    needs: lint
+    # continue-on-error: true # Uncomment once integration is finished
+    runs-on: ubuntu-latest
+    # Map a step output to a job output
+    outputs:
+      should_skip: ${{ steps.skip_check.outputs.should_skip }}
+      paths_result: ${{ steps.skip_check.outputs.paths_result }}
+    steps:
+      - id: skip_check
+        # For security we use the commit of fkirc/skip-duplicate-actions@v5
+        uses: fkirc/skip-duplicate-actions@f75f66ce1886f00957d99748a42c724f4330bdcf
+        with:
+          cancel_others: true
+          paths_filter: |
+            extra:
+              paths:
+                - 'environments/.stackhpc/tofu/cluster_image.auto.tfvars.json'
+                - 'ansible/roles/doca/**'
+                - 'ansible/roles/cuda/**'
+                - 'ansible/roles/lustre/**'
+                - '.github/workflows/extra.yml'
+            stackhpc:
+              paths:
+                - '**'
+                - '!dev/**'
+                - 'dev/setup-env.sh'
+                - '!docs/**'
+                - '!README.md'
+                - '!.gitignore'
+                - '!.github/workflows/'
+                - '.github/workflows/stackhpc'
+            trivvyscan:
+              paths:
+                - 'environments/.stackhpc/tofu/cluster_image.auto.tfvars.json'
+
+  extra:
+    name: Test extra build
+    needs: files_changed
+    if: needs.files_changed.outputs.should_skip != 'true' &&
+      !fromJSON(needs.files_changed.outputs.paths_result).extra.should_skip
+    #uses: ./.github/workflows/extra.yml
+    steps: # TEST
+      - run: echo "Running: Test extra build..." && sleep 30 # TEST
+    runs-on: ubuntu-latest # TEST
+    #secrets: inherit
+
+  stackhpc:
+    name: Test deployment and reimage on OpenStack
+    needs: files_changed
+    if: needs.files_changed.outputs.should_skip != 'true' &&
+      !fromJSON(needs.files_changed.outputs.paths_result).stackhpc.should_skip
+    #uses: ./.github/workflows/stackhpc.yml
+    steps: # TEST
+      - run: echo "Running: Test deployment and reimage on OpenStack..." && sleep 30 # TEST
+    runs-on: ubuntu-latest # TEST
+    #secrets: inherit
+
+  trivvyscan:
+    name: Trivy scan image for vulnerabilities
+    needs: files_changed
+    if: github.event_name == 'pull_request' &&
+      needs.files_changed.outputs.should_skip != 'true' &&
+      !fromJSON(needs.files_changed.outputs.paths_result).trivvyscan.should_skip
+    #uses: ./.github/workflows/trivvyscan.yml
+    steps: # TEST
+      - run: echo "Running: Trivy scan image for vulnerabilities..." && sleep 30 # TEST
+    runs-on: ubuntu-latest # TEST
+    #secrets: inherit
diff --git a/.github/workflows/stackhpc.yml b/.github/workflows/stackhpc.yml
index ef81ae4fe..45c9f43d1 100644
--- a/.github/workflows/stackhpc.yml
+++ b/.github/workflows/stackhpc.yml
@@ -1,29 +1,7 @@
-
+---
 name: Test deployment and reimage on OpenStack
 on:
   workflow_dispatch:
-  push:
-    branches:
-      - main
-    paths:
-      - '**'
-      - '!dev/**'
-      - 'dev/setup-env.sh'
-      - '!docs/**'
-      - '!README.md'
-      - '!.gitignore'
-      - '!.github/workflows/'
-      - '.github/workflows/stackhpc'
-  pull_request:
-    paths:
-      - '**'
-      - '!dev/**'
-      - 'dev/setup-env.sh'
-      - '!docs/**'
-      - '!README.md'
-      - '!.gitignore'
-      - '!.github/workflows/'
-      - '.github/workflows/stackhpc'
 
 permissions:
   contents: read
diff --git a/.github/workflows/trivyscan.yml b/.github/workflows/trivyscan.yml
index 5980a0b17..7a279f391 100644
--- a/.github/workflows/trivyscan.yml
+++ b/.github/workflows/trivyscan.yml
@@ -1,11 +1,7 @@
+---
 name: Trivy scan image for vulnerabilities
 on:
   workflow_dispatch:
-  pull_request:
-    branches:
-      - main
-    paths:
-      - 'environments/.stackhpc/tofu/cluster_image.auto.tfvars.json'
 
 permissions:
   contents: read

From 572bdd08ab4098c8f8e469d5050d231da2be83fd Mon Sep 17 00:00:00 2001
From: Max Norton
Date: Fri, 18 Jul 2025 09:15:46 +0100
Subject: [PATCH 04/37] Update GH workflow so linting always runs before any
 other jobs

---
 .github/workflows/main.yml | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 45e76c468..d84969797 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -68,7 +68,10 @@ jobs:
       !fromJSON(needs.files_changed.outputs.paths_result).extra.should_skip
     #uses: ./.github/workflows/extra.yml
     steps: # TEST
-      - run: echo "Running: Test extra build..." && sleep 30 # TEST
+      - name: Test extra build...
+        uses: jakejarvis/wait-action@master
+        with:
+          time: '120s'
     runs-on: ubuntu-latest # TEST
     #secrets: inherit
 
@@ -79,7 +82,10 @@ jobs:
       !fromJSON(needs.files_changed.outputs.paths_result).stackhpc.should_skip
     #uses: ./.github/workflows/stackhpc.yml
     steps: # TEST
-      - run: echo "Running: Test deployment and reimage on OpenStack..." && sleep 30 # TEST
+      - name: Test deployment and reimage on OpenStack...
+ uses: jakejarvis/wait-action@master + with: + time: '120s' runs-on: ubuntu-latest # TEST #secrets: inherit @@ -91,6 +97,9 @@ jobs: !fromJSON(needs.files_changed.outputs.paths_result).trivvyscan.should_skip #uses: ./.github/workflows/trivvyscan.yml steps: # TEST - - run: echo "Running: Trivy scan image for vulnerabilities..." && sleep 30 # TEST + - name: Trivy scan image for vulnerabilities... + uses: jakejarvis/wait-action@master + with: + time: '120s' runs-on: ubuntu-latest # TEST #secrets: inherit From f6477c717a3abe3db13d9c16a0a83603b42fa084 Mon Sep 17 00:00:00 2001 From: Max Norton Date: Tue, 29 Jul 2025 15:02:00 +0100 Subject: [PATCH 05/37] Fix linting issues on the merge of origin/main --- ansible/extras.yml | 10 +- ansible/fatimage.yml | 10 +- ansible/final.yml | 12 +- ansible/monitoring.yml | 2 +- ansible/roles/compute_init/README.md | 4 +- .../roles/compute_init/files/compute-init.yml | 3 +- ansible/roles/compute_init/tasks/export.yml | 2 +- ansible/roles/cuda/tasks/facts.yml | 2 +- .../roles/dnf_repos/tasks/disable_repos.yml | 2 +- ansible/roles/doca/defaults/main.yml | 3 +- ansible/roles/eessi/tasks/configure.yml | 5 +- ansible/roles/eessi/tasks/main.yml | 4 +- ansible/roles/fail2ban/tasks/configure.yml | 9 +- ansible/roles/fail2ban/tasks/install.yml | 4 +- ansible/roles/filebeat/defaults/main.yml | 2 +- ansible/roles/nhc/README.md | 12 +- ansible/roles/nhc/tasks/export.yml | 1 + ansible/roles/nhc/tasks/main.yml | 2 +- ansible/roles/ofed/defaults/main.yml | 1 + ansible/roles/podman/tasks/main.yml | 4 +- ansible/roles/rebuild/tasks/configure.yml | 4 +- ansible/roles/rebuild/tasks/install.yml | 2 +- .../roles/slurm_exporter/tasks/configure.yml | 2 +- .../roles/slurm_exporter/tasks/install.yml | 2 +- ansible/roles/slurm_recompile/README.md | 16 +- ansible/roles/slurm_recompile/tasks/main.yml | 16 +- ansible/roles/slurm_stats/tasks/configure.yml | 8 +- ansible/roles/slurm_stats/tasks/install.yml | 2 +- ansible/site.yml | 1 - ansible/slurm.yml | 4 +- ansible/validate.yml | 10 +- docs/experimental/isolated-clusters.md | 140 +++++++++--------- docs/mig.md | 55 ++++--- .../tofu/cluster_image.auto.tfvars.json | 8 +- environments/.stackhpc/tofu/main.tf | 6 +- .../common/inventory/group_vars/all/proxy.yml | 2 +- .../tofu/additional.tf | 40 ++--- .../tofu/control.tf | 2 +- .../tofu/inventory.tf | 2 +- .../tofu/login.tf | 2 +- .../tofu/node_group/nodes.tf | 2 +- .../tofu/node_group/variables.tf | 4 +- .../tofu/variables.tf | 5 +- 43 files changed, 220 insertions(+), 209 deletions(-) diff --git a/ansible/extras.yml b/ansible/extras.yml index b6f9f575f..02b0d402f 100644 --- a/ansible/extras.yml +++ b/ansible/extras.yml @@ -52,16 +52,16 @@ - name: Setup vGPU hosts: vgpu - become: yes - gather_facts: yes + become: true + gather_facts: true tags: vgpu tasks: - - include_role: + - ansible.builtin.include_role: name: stackhpc.linux.vgpu tasks_from: "{{ 'configure.yml' if appliances_mode == 'configure' else 'install.yml' }}" handlers: - - name: reboot - fail: + - name: reboot # noqa: name[casing] + ansible.builtin.fail: msg: Reboot handler for stackhpc.linux.vgpu role fired unexpectedly. This was supposed to be unreachable. 
- name: Persist hostkeys across rebuilds diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 75e5a4497..0d16fab5c 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -179,7 +179,7 @@ name: ondemand_exporter when: "'openondemand' in group_names" - - name: slurm exporter + - name: Slurm exporter ansible.builtin.include_role: name: slurm_exporter tasks_from: install.yml @@ -258,18 +258,18 @@ - ansible.builtin.import_role: name: cloudalchemy.grafana tasks_from: install.yml - - import_role: + - ansible.builtin.import_role: name: cloudalchemy.grafana tasks_from: plugins.yml - - include_role: # done in same play so it can use handlers from cloudalchemy.grafana + - ansible.builtin.include_role: # done in same play so it can use handlers from cloudalchemy.grafana name: grafana-dashboards - name: Add support for NVIDIA GPU auto detection to Slurm hosts: cuda - become: yes + become: true tasks: - name: Recompile slurm - import_role: + ansible.builtin.import_role: name: slurm_recompile vars: slurm_recompile_with_nvml: "{{ groups.cuda | length > 0 }}" diff --git a/ansible/final.yml b/ansible/final.yml index 3e715dfa0..d9842046c 100644 --- a/ansible/final.yml +++ b/ansible/final.yml @@ -1,5 +1,5 @@ - hosts: dnf_repos - become: yes + become: true tags: dnf_repos tasks: - name: Disable pulp repos @@ -12,19 +12,19 @@ hosts: compute_init:!builder # NB: done last so other roles can prepare configuration etc tags: compute_init - become: yes + become: true tasks: - - include_role: + - ansible.builtin.include_role: name: compute_init tasks_from: export.yml - hosts: proxy gather_facts: false tags: proxy - become: yes + become: true tasks: - - include_role: + - ansible.builtin.include_role: name: proxy vars: proxy_state: absent - when: proxy_remove | default(false) | bool == true + when: proxy_remove | default(false) | bool diff --git a/ansible/monitoring.yml b/ansible/monitoring.yml index 2f6dd46de..c8225a0ba 100644 --- a/ansible/monitoring.yml +++ b/ansible/monitoring.yml @@ -71,7 +71,7 @@ - name: Skip prometheus install if prometheus binaries exist and prometheus_version not defined # i.e. 
if prometheus_version isn't defined we don't care, so use what's already there ansible.builtin.set_fact: - prometheus_skip_install: "{{ false if prometheus_version is defined else true }}" + prometheus_skip_install: "{{ false if prometheus_version is defined else true }}" when: "(prometheus_binaries.results | map(attribute='stat') | map(attribute='exists')) + [prometheus_skip_install is not defined]" - ansible.builtin.import_role: name: cloudalchemy.prometheus diff --git a/ansible/roles/compute_init/README.md b/ansible/roles/compute_init/README.md index 21cccaaeb..cc8b2dec7 100644 --- a/ansible/roles/compute_init/README.md +++ b/ansible/roles/compute_init/README.md @@ -54,7 +54,7 @@ it also requires an image build with the role name added to the | bootstrap.yml | dnf_repos | None at present [2] | - | | bootstrap.yml | cacerts | Supported [3] | - | | bootstrap.yml | squid | Not relevant for compute nodes | n/a | -| bootstrap.yml | tuned | Fully supported | No | +| bootstrap.yml | tuned | Fully supported | No | | bootstrap.yml | freeipa_server | Not relevant for compute nodes | n/a | | bootstrap.yml | cockpit | None required - use image build | No | | bootstrap.yml | firewalld | Not relevant for compute nodes | n/a | @@ -91,7 +91,7 @@ it also requires an image build with the role name added to the | portal.yml | (openondemand vnc desktop) | None required - use image build | No | | portal.yml | (openondemand jupyter server) | None required - use image build | No | | monitoring.yml | node_exporter | None required - use image build | No | -| monitoring.yml | (other monitoring) | Not relevant for compute nodes | - | +| monitoring.yml | (other monitoring) | Not relevant for compute nodes | - | | disable-repos.yml | dnf_repos | None at present [2] | - | | hooks/post.yml | ? | None at present | - | diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index eec6cb753..a2c554c99 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -297,7 +297,6 @@ - name: Ensure CVMFS config is setup # noqa: no-changed-when ansible.builtin.command: cmd: "cvmfs_config setup" - when: enable_eessi - name: Configure VGPUs ansible.builtin.include_role: @@ -366,7 +365,7 @@ tasks_from: boot.yml when: enable_nhc - - name: Ensure node is resumed + - name: Ensure node is resumed # noqa: no-changed-when # TODO: consider if this is always safe for all job states? 
ansible.builtin.command: scontrol update state=resume nodename={{ ansible_hostname }} register: _scontrol_update diff --git a/ansible/roles/compute_init/tasks/export.yml b/ansible/roles/compute_init/tasks/export.yml index 2282fd48a..5b31bd685 100644 --- a/ansible/roles/compute_init/tasks/export.yml +++ b/ansible/roles/compute_init/tasks/export.yml @@ -101,7 +101,7 @@ when: "'sshd' in group_names" - name: Export generated NHC config - import_role: + ansible.builtin.import_role: name: nhc tasks_from: export.yml when: "'nhc' in group_names" diff --git a/ansible/roles/cuda/tasks/facts.yml b/ansible/roles/cuda/tasks/facts.yml index 0d60457de..787f02692 100644 --- a/ansible/roles/cuda/tasks/facts.yml +++ b/ansible/roles/cuda/tasks/facts.yml @@ -1,4 +1,4 @@ --- - name: Set cuda_facts_version_short - set_fact: + ansible.builtin.set_fact: cuda_facts_version_short: "{{ cuda_version_short }}" diff --git a/ansible/roles/dnf_repos/tasks/disable_repos.yml b/ansible/roles/dnf_repos/tasks/disable_repos.yml index 9f8abe6d9..9300c42a5 100644 --- a/ansible/roles/dnf_repos/tasks/disable_repos.yml +++ b/ansible/roles/dnf_repos/tasks/disable_repos.yml @@ -28,5 +28,5 @@ path: "{{ item.path }}" regexp: '^enabled\ ?=\ ?1' replace: 'enabled=0' - backup: yes + backup: true loop: "{{ _dnf_repo_files.files }}" diff --git a/ansible/roles/doca/defaults/main.yml b/ansible/roles/doca/defaults/main.yml index 59fc0e657..7f28ef8b4 100644 --- a/ansible/roles/doca/defaults/main.yml +++ b/ansible/roles/doca/defaults/main.yml @@ -1,4 +1,5 @@ --- -doca_version:"2.9.3" # 2.9 is LTS, last to support ConnectX-4, 3 years for bug fixes and CVE updates + +doca_version: "2.9.3" # 2.9 is LTS, last to support ConnectX-4, 3 years for bug fixes and CVE updates doca_profile: doca-ofed doca_repo_url: "https://linux.mellanox.com/public/repo/doca/{{ doca_version }}/rhel{{ ansible_distribution_version }}/{{ ansible_architecture }}/" diff --git a/ansible/roles/eessi/tasks/configure.yml b/ansible/roles/eessi/tasks/configure.yml index b3083761c..2c765d20c 100644 --- a/ansible/roles/eessi/tasks/configure.yml +++ b/ansible/roles/eessi/tasks/configure.yml @@ -7,10 +7,11 @@ option: "{{ item.key }}" value: "{{ item.value }}" no_extra_spaces: true + mode: "0644" loop: "{{ cvmfs_config | dict2items }}" # NOTE: Not clear how to make this idempotent -- name: Ensure CVMFS config is setup - command: +- name: Ensure CVMFS config is setup # noqa: no-changed-when + ansible.builtin.command: cmd: "cvmfs_config setup" diff --git a/ansible/roles/eessi/tasks/main.yml b/ansible/roles/eessi/tasks/main.yml index 79d326ceb..e5e078714 100644 --- a/ansible/roles/eessi/tasks/main.yml +++ b/ansible/roles/eessi/tasks/main.yml @@ -1,4 +1,4 @@ --- -- include_tasks: install.yml -- include_tasks: configure.yml +- ansible.builtin.include_tasks: install.yml +- ansible.builtin.include_tasks: configure.yml diff --git a/ansible/roles/fail2ban/tasks/configure.yml b/ansible/roles/fail2ban/tasks/configure.yml index e4951f726..6bde88a8b 100644 --- a/ansible/roles/fail2ban/tasks/configure.yml +++ b/ansible/roles/fail2ban/tasks/configure.yml @@ -1,15 +1,16 @@ --- - name: Create config - template: + ansible.builtin.template: dest: /etc/fail2ban/jail.local src: jail.local.j2 + mode: "0644" notify: Restart fail2ban -- name: flush handlers - meta: flush_handlers +- name: Flush handlers + ansible.builtin.meta: flush_handlers - name: Ensure fail2ban running even if no config change - service: + ansible.builtin.service: name: fail2ban state: started enabled: true diff --git 
a/ansible/roles/fail2ban/tasks/install.yml b/ansible/roles/fail2ban/tasks/install.yml index 65f3bfef2..e745a4f41 100644 --- a/ansible/roles/fail2ban/tasks/install.yml +++ b/ansible/roles/fail2ban/tasks/install.yml @@ -1,10 +1,10 @@ --- - name: Install EPEL repo - package: + ansible.builtin.package: name: epel-release - name: Install fail2ban packages - package: + ansible.builtin.package: name: - fail2ban-server - fail2ban-firewalld diff --git a/ansible/roles/filebeat/defaults/main.yml b/ansible/roles/filebeat/defaults/main.yml index bdd02a2b7..1701427f0 100644 --- a/ansible/roles/filebeat/defaults/main.yml +++ b/ansible/roles/filebeat/defaults/main.yml @@ -1,6 +1,6 @@ --- -#filebeat_config_path: undefined # REQUIRED. Path to filebeat.yml configuration file template +# filebeat_config_path: undefined # REQUIRED. Path to filebeat.yml configuration file template filebeat_debug: false # Note all the below can only be set/changed using the install.yml task file: diff --git a/ansible/roles/nhc/README.md b/ansible/roles/nhc/README.md index 8831e0eee..a826932aa 100644 --- a/ansible/roles/nhc/README.md +++ b/ansible/roles/nhc/README.md @@ -22,6 +22,7 @@ compute ``` When the `anisble/site.yml` playbook is run this will automatically: + 1. Add NHC-related configuration to the `slurm.conf` Slurm configuration file. The default configuration is defined in `openhpc_config_nhc` (see [environments/common/inventory/group_vars/all/openhpc.yml](../../../environments/common/inventory/group_vars/all/openhpc.yml)). @@ -33,10 +34,11 @@ When the `anisble/site.yml` playbook is run this will automatically: 2. Template out node health check rules using Ansible facts for each compute node. Currently these check: - - Filesystem mounts - - Ethernet interfaces - See `/etc/nhc/nhc.conf` on a compute node for the full configuration. + - Filesystem mounts + - Ethernet interfaces + + See `/etc/nhc/nhc.conf` on a compute node for the full configuration. If a node healthcheck run fails, Slurm will mark the node `DOWN`. With the default [alerting configuration](../../../docs/alerting.md) this will trigger @@ -52,15 +54,17 @@ an alert. ## Structure This role contains 3x task files, which run at different times: + - `main.yml`: Runs from `site.yml` -> `slurm.yml`. Templates health check configuration to nodes. - `export.yml`: Runs from `site.yml` -> `final.yml` via role `compute_init` tasks `export.yml`. Templates health check configuration to the cluster NFS - share for compute-init. + share for compute-init. - `boot.yml`: Runs on boot via `compute_init/files/compute-init.yml`. Copies the node's generated health check configuration from the cluster share to local disk. 
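 
 As an illustration of the rule format (a sketch only, not the appliance's
 actual generated output), entries in `/etc/nhc/nhc.conf` take the standard
 NHC `<target> || <check>` form:
 
 ```text
 * || check_fs_mount_rw -t nfs -f /home
 * || check_hw_eth eth0
 ```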
Note that the `stackhpc.openhpc` role: + - Installs the required package - Configures slurm.conf parameterss diff --git a/ansible/roles/nhc/tasks/export.yml b/ansible/roles/nhc/tasks/export.yml index afa440ffb..d6b1120e4 100644 --- a/ansible/roles/nhc/tasks/export.yml +++ b/ansible/roles/nhc/tasks/export.yml @@ -3,4 +3,5 @@ ansible.builtin.template: src: "{{ nhc_config_template }}" dest: "/exports/cluster/hostconfig/{{ inventory_hostname }}/nhc.conf" + mode: "0644" delegate_to: "{{ groups['control'] | first }}" diff --git a/ansible/roles/nhc/tasks/main.yml b/ansible/roles/nhc/tasks/main.yml index 5f6034f7d..a5071136a 100644 --- a/ansible/roles/nhc/tasks/main.yml +++ b/ansible/roles/nhc/tasks/main.yml @@ -1,4 +1,4 @@ - +--- - name: Ensure NHC configuration directory exists # When running site.yml after login/control upgrade, nhc group might be # enabled in repo, but as the compute nodes have not yet been upgraded they diff --git a/ansible/roles/ofed/defaults/main.yml b/ansible/roles/ofed/defaults/main.yml index a0cb2162b..422ccc1f2 100644 --- a/ansible/roles/ofed/defaults/main.yml +++ b/ansible/roles/ofed/defaults/main.yml @@ -1,5 +1,6 @@ --- ofed_version: "24.10-3.2.5.0" # LTS +# yamllint disable-line rule:line-length ofed_download_url: https://content.mellanox.com/ofed/MLNX_OFED-{{ ofed_version }}/MLNX_OFED_LINUX-{{ ofed_version }}-{{ ofed_distro }}{{ ofed_distro_version }}-{{ ofed_arch }}.tgz ofed_distro: rhel # NB: not expected to work on other distros due to installation differences ofed_distro_version: "{{ ansible_distribution_version }}" # e.g. '8.9' diff --git a/ansible/roles/podman/tasks/main.yml b/ansible/roles/podman/tasks/main.yml index 2b65e84b4..2538c7f52 100644 --- a/ansible/roles/podman/tasks/main.yml +++ b/ansible/roles/podman/tasks/main.yml @@ -1,2 +1,2 @@ -- import_tasks: install.yml -- import_tasks: configure.yml +- ansible.builtin.import_tasks: install.yml +- ansible.builtin.import_tasks: configure.yml diff --git a/ansible/roles/rebuild/tasks/configure.yml b/ansible/roles/rebuild/tasks/configure.yml index 78a3b7b55..801e2eac8 100644 --- a/ansible/roles/rebuild/tasks/configure.yml +++ b/ansible/roles/rebuild/tasks/configure.yml @@ -1,7 +1,7 @@ --- - name: Create /etc/openstack - file: + ansible.builtin.file: path: /etc/openstack state: directory owner: slurm @@ -9,7 +9,7 @@ mode: u=rX,g=rwX - name: Copy out clouds.yaml - copy: + ansible.builtin.copy: src: "{{ rebuild_clouds_path }}" dest: /etc/openstack/clouds.yaml owner: slurm diff --git a/ansible/roles/rebuild/tasks/install.yml b/ansible/roles/rebuild/tasks/install.yml index 1152426e6..1c1b63a14 100644 --- a/ansible/roles/rebuild/tasks/install.yml +++ b/ansible/roles/rebuild/tasks/install.yml @@ -1,3 +1,3 @@ - name: Setup slurm tools - include_role: + ansible.builtin.include_role: name: slurm_tools diff --git a/ansible/roles/slurm_exporter/tasks/configure.yml b/ansible/roles/slurm_exporter/tasks/configure.yml index e511be02b..d8f2aae52 100644 --- a/ansible/roles/slurm_exporter/tasks/configure.yml +++ b/ansible/roles/slurm_exporter/tasks/configure.yml @@ -1,5 +1,5 @@ - name: Ensure slurm exporter state - systemd: + ansible.builtin.systemd: name: prometheus-slurm-exporter state: "{{ slurm_exporter_state }}" enabled: true diff --git a/ansible/roles/slurm_exporter/tasks/install.yml b/ansible/roles/slurm_exporter/tasks/install.yml index 75ac499de..48196dd5c 100644 --- a/ansible/roles/slurm_exporter/tasks/install.yml +++ b/ansible/roles/slurm_exporter/tasks/install.yml @@ -7,4 +7,4 @@ disable_gpg_check: true notify: Restart 
slurm exporter -- meta: flush_handlers +- ansible.builtin.meta: flush_handlers diff --git a/ansible/roles/slurm_recompile/README.md b/ansible/roles/slurm_recompile/README.md index e42572aea..27b162cd9 100644 --- a/ansible/roles/slurm_recompile/README.md +++ b/ansible/roles/slurm_recompile/README.md @@ -1,28 +1,22 @@ # slurm_recompile -================= Recompiles slurm from source RPMs and installs the packages that were built. -Requirements ------------- +## Requirements -Role Variables --------------- +## Role Variables See `defaults/main.yml`. -Dependencies ------------- +## Dependencies -Example Playbook ----------------- +## Example Playbook - hosts: compute tasks: - import_role: name: slurm_recompile -License -------- +## License Apache-2.0 diff --git a/ansible/roles/slurm_recompile/tasks/main.yml b/ansible/roles/slurm_recompile/tasks/main.yml index 4720a6aef..22961d636 100644 --- a/ansible/roles/slurm_recompile/tasks/main.yml +++ b/ansible/roles/slurm_recompile/tasks/main.yml @@ -1,6 +1,6 @@ --- - name: Get facts about CUDA installation - import_role: + ansible.builtin.import_role: name: cuda tasks_from: facts.yml @@ -9,15 +9,16 @@ manager: auto - name: Set fact containing slurm package facts - set_fact: + ansible.builtin.set_fact: slurm_package: "{{ ansible_facts.packages['slurm-slurmd-ohpc'].0 }}" - name: Install build packages ansible.builtin.dnf: name: "{{ slurm_recompile_build_packages }}" -- name: Recompile and install slurm packages - shell: | +- name: Recompile and install slurm packages # noqa: no-changed-when + # yamllint disable rule:line-length + ansible.builtin.shell: | #!/bin/bash source /etc/profile set -eux @@ -27,17 +28,18 @@ dnf builddep -y slurm.spec rpmbuild -bb{% if slurm_recompile_with_nvml | bool %} -D "_with_nvml --with-nvml=/usr/local/cuda-{{ cuda_facts_version_short }}/targets/x86_64-linux/"{% endif %} slurm.spec dnf reinstall -y /root/rpmbuild/RPMS/x86_64/*.rpm + # yamllint enable rule:line-length become: true - name: Workaround missing symlink # Workaround path issue: https://groups.google.com/g/slurm-users/c/cvGb4JnK8BY - command: ln -s /lib64/libnvidia-ml.so.1 /lib64/libnvidia-ml.so + ansible.builtin.command: ln -s /lib64/libnvidia-ml.so.1 /lib64/libnvidia-ml.so args: creates: /lib64/libnvidia-ml.so when: slurm_recompile_with_nvml | bool -- name: Cleanup Dependencies - shell: | +- name: Cleanup Dependencies # noqa: no-changed-when + ansible.builtin.shell: | #!/bin/bash set -eux set -o pipefail diff --git a/ansible/roles/slurm_stats/tasks/configure.yml b/ansible/roles/slurm_stats/tasks/configure.yml index 6bd87b276..e83c33f29 100644 --- a/ansible/roles/slurm_stats/tasks/configure.yml +++ b/ansible/roles/slurm_stats/tasks/configure.yml @@ -1,13 +1,14 @@ --- - name: Create a directory to house the log files - file: + ansible.builtin.file: state: directory path: /var/log/slurm-stats + mode: "0755" become: true - name: Create cron job - cron: + ansible.builtin.cron: name: Generate slurm stats minute: "*/5" user: root @@ -17,7 +18,7 @@ become: true - name: Setup log rotate - copy: + ansible.builtin.copy: content: | # WARNING: This file is managed by ansible, do not modify. 
/var/log/slurm-stats/finished_jobs.json { @@ -27,4 +28,5 @@ delaycompress } dest: /etc/logrotate.d/slurm-stats + mode: "0644" become: true diff --git a/ansible/roles/slurm_stats/tasks/install.yml b/ansible/roles/slurm_stats/tasks/install.yml index 748272eb6..981bf84ef 100644 --- a/ansible/roles/slurm_stats/tasks/install.yml +++ b/ansible/roles/slurm_stats/tasks/install.yml @@ -1,5 +1,5 @@ --- - name: Setup slurm tools - include_role: + ansible.builtin.include_role: name: slurm_tools diff --git a/ansible/site.yml b/ansible/site.yml index 54c8fdb7e..79b71e10a 100644 --- a/ansible/site.yml +++ b/ansible/site.yml @@ -36,4 +36,3 @@ when: hook_path | exists - ansible.builtin.import_playbook: final.yml - diff --git a/ansible/slurm.yml b/ansible/slurm.yml index 106919264..99efb464d 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -67,8 +67,8 @@ - name: Setup Node Health Checks # Has to be done here as it requires openhpc repos etc for installation hosts: nhc:!builder - become: yes + become: true tags: nhc tasks: - - include_role: + - ansible.builtin.include_role: name: nhc diff --git a/ansible/validate.yml b/ansible/validate.yml index b3c0b2387..5e838f53d 100644 --- a/ansible/validate.yml +++ b/ansible/validate.yml @@ -25,7 +25,7 @@ # the actual installed version. # So this compares requirements.yml against a .last version produced by a # successful dev/setup-env.sh run. - - assert: + - ansible.builtin.assert: that: "{{ _requirements_current == _requirements_installed }}" fail_msg: | Ansible Galaxy installs are out of date: @@ -33,7 +33,7 @@ {% for req in _requirements_installed | difference(_requirements_current) %} {{ req }} {% endfor %} - + Run dev/setup-env.sh to fix this. vars: # note difference filter requires lists, so need to rearrange yaml from files. @@ -135,10 +135,10 @@ - name: Validate vGPU configuration hosts: vgpu - become: yes - gather_facts: yes + become: true + gather_facts: true tags: vgpu tasks: - - include_role: + - ansible.builtin.include_role: name: stackhpc.linux.vgpu tasks_from: validate.yml diff --git a/docs/experimental/isolated-clusters.md b/docs/experimental/isolated-clusters.md index 304b625d2..9745746f0 100644 --- a/docs/experimental/isolated-clusters.md +++ b/docs/experimental/isolated-clusters.md @@ -15,9 +15,9 @@ network is shown in the table below. Note that: 1. The `hpl` test from the `ansible/adhoc/hpctests.yml` playbook is not functional and must be skipped using: - ```shell - ansible-playbook ansible/adhoc/hpctests.yml --skip-tags hpl-solo - ``` + ```shell + ansible-playbook ansible/adhoc/hpctests.yml --skip-tags hpl-solo + ``` 2. Using [EESSI](https://www.eessi.io/docs/) necessarily requires outbound network access for the CernVM File System. However this can be provided @@ -28,59 +28,61 @@ network is shown in the table below. Note that: ## Support by feature for isolated networks See above for definition of "Default" features. In the "Isolated?" column: + - "Y": Feature works without outbound internet access. - "N": Known not to work. - "?": Not investigated at present. -| Inventory group/role | Default? | Isolated? | -| ----------------------| -------- | --------- | -| alertmanager | Y | Y | -| ansible_init | Y | Y | -| basic_users | Y | Y | -| block_devices | Y | No (depreciated) | -| cacerts | - | Y | -| chrony | - | Y | -| compute_init | - | Y | -| cuda | - | ? 
| -| eessi | Y | Y - see above | -| etc_hosts | Y | Y | -| extra_packages | - | No | -| fail2ban | Y | Y | -| filebeat | Y | Y | -| firewalld | Y | Y | -| freeipa_client | - | Y - image build required | -| gateway | n/a | n/a - build only | -| grafana | Y | Y | -| hpctests | Y | Y - except hpl-solo, see above | -| k3s_agent | - | ? | -| k3s_server | - | ? | -| k9s | - | ? | -| lustre | - | ? | -| manila | Y | Y | -| mysql | Y | Y | -| nfs | Y | Y | -| nhc | Y | Y | -| node_exporter | Y | Y | -| openhpc | Y | Y | -| openondemand | Y | Y | -| openondemand_desktop | Y | Y | -| openondemand_jupyter | Y | Y | -| opensearch | Y | Y | -| podman | Y | Y | -| persist_hostkeys | Y | Y | -| prometheus | Y | Y | -| proxy | - | Y | -| resolv_conf | - | ? | -| slurm_exporter | Y | Y | -| slurm_stats | Y | Y | -| squid | - | ? | -| sshd | - | ? | -| sssd | - | ? | -| systemd | Y | Y | -| tuned | - | Y | -| update | - | No | +| Inventory group/role | Default? | Isolated? | +| -------------------- | -------- | ------------------------------ | +| alertmanager | Y | Y | +| ansible_init | Y | Y | +| basic_users | Y | Y | +| block_devices | Y | No (depreciated) | +| cacerts | - | Y | +| chrony | - | Y | +| compute_init | - | Y | +| cuda | - | ? | +| eessi | Y | Y - see above | +| etc_hosts | Y | Y | +| extra_packages | - | No | +| fail2ban | Y | Y | +| filebeat | Y | Y | +| firewalld | Y | Y | +| freeipa_client | - | Y - image build required | +| gateway | n/a | n/a - build only | +| grafana | Y | Y | +| hpctests | Y | Y - except hpl-solo, see above | +| k3s_agent | - | ? | +| k3s_server | - | ? | +| k9s | - | ? | +| lustre | - | ? | +| manila | Y | Y | +| MySQL | Y | Y | +| nfs | Y | Y | +| nhc | Y | Y | +| node_exporter | Y | Y | +| openhpc | Y | Y | +| openondemand | Y | Y | +| openondemand_desktop | Y | Y | +| openondemand_jupyter | Y | Y | +| opensearch | Y | Y | +| podman | Y | Y | +| persist_hostkeys | Y | Y | +| prometheus | Y | Y | +| proxy | - | Y | +| resolv_conf | - | ? | +| slurm_exporter | Y | Y | +| slurm_stats | Y | Y | +| squid | - | ? | +| sshd | - | ? | +| sssd | - | ? | +| systemd | Y | Y | +| tuned | - | Y | +| update | - | No | ## Image build + A site image build may be required, either for features using packages not present in StackHPC images (e.g `freeipa_client`) or to [add additional packages](../operations.md#adding-additional-packages). Clearly in this case the build VM does require outbound internet access. For an @@ -90,7 +92,7 @@ proxy is available the image build can be configured to use that, e.g.: ```yaml # environments/$ENV/builder.pkrvars.hcl: -... +--- inventory_groups = 'proxy,freeipa_client' ``` @@ -104,7 +106,7 @@ proxy_http_address: squid.mysite.org ```yaml # environments/$ENV/group_vars/builder/vault_overrrides.yml: # NB: vault-encrypt this file -vault_proxy_basic_password: 'super-secret-password' +vault_proxy_basic_password: "super-secret-password" ``` See [ansible/roles/proxy/README.md](../../ansible/roles/proxy/README.md) and @@ -125,28 +127,32 @@ default security groups are less restrictive than these. Assuming nodes and the deploy host have a security group `isolated` applied then the following rules are required: - # allow outbound DNS - ALLOW IPv4 53/tcp to 0.0.0.0/0 - ALLOW IPv4 53/udp to 0.0.0.0/0 - - # allow everything within the cluster: - ALLOW IPv4 from isolated - ALLOW IPv4 to isolated - - # allow hosts to reach metadata server (e.g. 
for cloud-init keys): - ALLOW IPv4 80/tcp to 169.254.169.254/32 +```text +# allow outbound DNS +ALLOW IPv4 53/tcp to 0.0.0.0/0 +ALLOW IPv4 53/udp to 0.0.0.0/0 + +# allow everything within the cluster: +ALLOW IPv4 from isolated +ALLOW IPv4 to isolated - # optionally: allow hosts to reach squid proxy for EESSI: - ALLOW IPv4 3128/tcp to +# allow hosts to reach metadata server (e.g. for cloud-init keys): +ALLOW IPv4 80/tcp to 169.254.169.254/32 + +# optionally: allow hosts to reach squid proxy for EESSI: +ALLOW IPv4 3128/tcp to +``` Note that name resolution happens on the hosts, not on the proxy, hence DNS is required for nodes even with a proxy. -For nodes running OpenOndemand, inbound ssh and https are also required +For nodes running OpenOndemand, inbound SSH and https are also required (e.g. in a security group called `isolated-ssh-https`): - ALLOW IPv4 443/tcp from 0.0.0.0/0 - ALLOW IPv4 22/tcp from 0.0.0.0/0 +```text +ALLOW IPv4 443/tcp from 0.0.0.0/0 +ALLOW IPv4 22/tcp from 0.0.0.0/0 +``` If non-default security groups are required, then the OpenTofu variables `login_security_groups` and `nonlogin_security_groups` can be used to set diff --git a/docs/mig.md b/docs/mig.md index 0d52f968c..b8eeae8ad 100644 --- a/docs/mig.md +++ b/docs/mig.md @@ -10,9 +10,9 @@ This page details how to configure Multi Instance GPU (MIG) in Slurm. ## Inventory -Add relevant hosts to the ``vgpu`` group, for example in `environments/$ENV/inventory/groups`: +Add relevant hosts to the `vgpu` group, for example in `environments/$ENV/inventory/groups`: -``` +```yaml [vgpu:children] cuda ``` @@ -23,24 +23,24 @@ Use variables from the [stackhpc.linux.vgpu](https://github.com/stackhpc/ansible For example in: `environments//inventory/group_vars/all/vgpu`: -``` +```yaml --- vgpu_definitions: - - pci_address: "0000:17:00.0" - mig_devices: - "1g.10gb": 4 - "4g.40gb": 1 - - pci_address: "0000:81:00.0" - mig_devices: - "1g.10gb": 4 - "4g.40gb": 1 + - pci_address: "0000:17:00.0" + mig_devices: + "1g.10gb": 4 + "4g.40gb": 1 + - pci_address: "0000:81:00.0" + mig_devices: + "1g.10gb": 4 + "4g.40gb": 1 ``` -The appliance will use the driver installed via the ``cuda`` role. +The appliance will use the driver installed via the `cuda` role. -Use ``lspci`` to determine the PCI addresses e.g: +Use `lspci` to determine the PCI addresses e.g: -``` +```text [root@io-io-gpu-02 ~]# lspci -nn | grep -i nvidia 06:00.0 3D controller [0302]: NVIDIA Corporation GH100 [H100 SXM5 80GB] [10de:2330] (rev a1) 0c:00.0 3D controller [0302]: NVIDIA Corporation GH100 [H100 SXM5 80GB] [10de:2330] (rev a1) @@ -51,7 +51,7 @@ Use ``lspci`` to determine the PCI addresses e.g: The supported profiles can be discovered by consulting the [NVIDIA documentation](https://docs.nvidia.com/datacenter/tesla/mig-user-guide/index.html#supported-mig-profiles) or interactively by running the following on one of the compute nodes with GPU resources: -``` +```text [rocky@io-io-gpu-05 ~]$ sudo nvidia-smi -i 0 -mig 1 Enabled MIG Mode for GPU 00000000:06:00.0 All done. @@ -150,7 +150,7 @@ All done. ## compute_init configuration for slurm triggered rebuild (optional) You only need to configure this if you are using the slurm triggered rebuild -feature. Use the ``vgpu`` metadata option to enable creation of mig devices on +feature. Use the `vgpu` metadata option to enable creation of mig devices on rebuild. ## GRES configuration @@ -160,19 +160,19 @@ do this you need to determine the names of the GPU types as detected by slurm. 
F deploy slurm with the default nodegroup definitions to get a working cluster. Make a temporary copy of slurm.conf: -``` +```text cp /var/spool/slurm/conf-cache/slurm.conf /tmp/ ``` Then create a `/tmp/gres.conf` which enables autodetection: -``` +```text AutoDetect=nvml ``` You will then be able to run: `sudo slurmd -f /tmp/slurm.conf -G` on a compute node where GPU resources exist. An example is shown below: -``` +```text [rocky@io-io-gpu-02 ~]$ sudo slurmd -f /tmp/slurm.conf -G slurmd-io-io-gpu-02: Gres Name=gpu Type=nvidia_h100_80gb_hbm3 Count=1 Index=0 ID=7696487 File=/dev/nvidia0 Links=(null) Flags=HAS_FILE,HAS_TYPE,ENV_NVML,ENV_RSMI,ENV_ONEAPI ,ENV_OPENCL,ENV_DEFAULT @@ -201,24 +201,23 @@ NOTE: If you have configured a Gres= line in slurm.conf already. You may have to GRES resources can then be configured manually. An example is shown below (`environments//inventory/group_vars/all/openhpc.yml`): -``` +```yaml openhpc_partitions: - name: cpu - name: gpu openhpc_nodegroups: - - name: cpu - - name: gpu - gres_autodetect: nvml - gres: - - conf: "gpu:nvidia_h100_80gb_hbm3:2" - - conf: "gpu:nvidia_h100_80gb_hbm3_4g.40gb:2" - - conf: "gpu:nvidia_h100_80gb_hbm3_1g.10gb:6" + - name: cpu + - name: gpu + gres_autodetect: nvml + gres: + - conf: "gpu:nvidia_h100_80gb_hbm3:2" + - conf: "gpu:nvidia_h100_80gb_hbm3_4g.40gb:2" + - conf: "gpu:nvidia_h100_80gb_hbm3_1g.10gb:6" openhpc_config: GresTypes: - gpu - ``` Making sure the types (the identifier after `gpu:`) match those collected with `slurmd -G`. Substrings diff --git a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json index 03b092564..cfb50e987 100644 --- a/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json +++ b/environments/.stackhpc/tofu/cluster_image.auto.tfvars.json @@ -1,6 +1,6 @@ { - "cluster_image": { - "RL8": "openhpc-RL8-250708-1502-1494192e", - "RL9": "openhpc-RL9-250708-1547-1494192e" - } + "cluster_image": { + "RL8": "openhpc-RL8-250708-1502-1494192e", + "RL9": "openhpc-RL9-250708-1547-1494192e" + } } diff --git a/environments/.stackhpc/tofu/main.tf b/environments/.stackhpc/tofu/main.tf index 81d5f27eb..8175b67bf 100644 --- a/environments/.stackhpc/tofu/main.tf +++ b/environments/.stackhpc/tofu/main.tf @@ -83,9 +83,9 @@ module "cluster" { } compute = { standard = { # NB: can't call this default! 
- nodes = ["compute-0", "compute-1"] - flavor = var.other_node_flavor - compute_init_enable = ["compute", "chrony", "etc_hosts", "nfs", "basic_users", "eessi", "tuned", "cacerts", "nhc"] + nodes = ["compute-0", "compute-1"] + flavor = var.other_node_flavor + compute_init_enable = ["compute", "chrony", "etc_hosts", "nfs", "basic_users", "eessi", "tuned", "cacerts", "nhc"] ignore_image_changes = true } # Normally-empty partition for testing: diff --git a/environments/common/inventory/group_vars/all/proxy.yml b/environments/common/inventory/group_vars/all/proxy.yml index d4a2fce6d..4f0b58c9d 100644 --- a/environments/common/inventory/group_vars/all/proxy.yml +++ b/environments/common/inventory/group_vars/all/proxy.yml @@ -28,5 +28,5 @@ proxy_http_proxy: >- {%- endif -%} {{ proxy_http_address }}:{{ proxy_http_port }} {% else %} - + {% endif %} diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/additional.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/additional.tf index 1f7c48ce5..83845ca98 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/additional.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/additional.tf @@ -4,37 +4,37 @@ module "additional" { for_each = var.additional_nodegroups # must be set for group: - nodes = each.value.nodes + nodes = each.value.nodes flavor = each.value.flavor # always taken from top-level value: - cluster_name = var.cluster_name + cluster_name = var.cluster_name cluster_domain_suffix = var.cluster_domain_suffix - key_pair = var.key_pair - environment_root = var.environment_root - + key_pair = var.key_pair + environment_root = var.environment_root + # can be set for group, defaults to top-level value: - image_id = lookup(each.value, "image_id", var.cluster_image_id) - vnic_types = lookup(each.value, "vnic_types", var.vnic_types) + image_id = lookup(each.value, "image_id", var.cluster_image_id) + vnic_types = lookup(each.value, "vnic_types", var.vnic_types) volume_backed_instances = lookup(each.value, "volume_backed_instances", var.volume_backed_instances) - root_volume_size = lookup(each.value, "root_volume_size", var.root_volume_size) - root_volume_type = lookup(each.value, "root_volume_type", var.root_volume_type) - gateway_ip = lookup(each.value, "gateway_ip", var.gateway_ip) - nodename_template = lookup(each.value, "nodename_template", var.cluster_nodename_template) - + root_volume_size = lookup(each.value, "root_volume_size", var.root_volume_size) + root_volume_type = lookup(each.value, "root_volume_type", var.root_volume_type) + gateway_ip = lookup(each.value, "gateway_ip", var.gateway_ip) + nodename_template = lookup(each.value, "nodename_template", var.cluster_nodename_template) + # optionally set for group: networks = concat(var.cluster_networks, lookup(each.value, "extra_networks", [])) # here null means "use module var default" - extra_volumes = lookup(each.value, "extra_volumes", null) - fip_addresses = lookup(each.value, "fip_addresses", null) - fip_network = lookup(each.value, "fip_network", null) - match_ironic_node = lookup(each.value, "match_ironic_node", null) - availability_zone = lookup(each.value, "availability_zone", null) - ip_addresses = lookup(each.value, "ip_addresses", null) - security_group_ids = lookup(each.value, "security_group_ids", [for o in data.openstack_networking_secgroup_v2.nonlogin: o.id]) + extra_volumes = lookup(each.value, "extra_volumes", null) + fip_addresses = lookup(each.value, "fip_addresses", null) + fip_network = lookup(each.value, "fip_network", 
null) + match_ironic_node = lookup(each.value, "match_ironic_node", null) + availability_zone = lookup(each.value, "availability_zone", null) + ip_addresses = lookup(each.value, "ip_addresses", null) + security_group_ids = lookup(each.value, "security_group_ids", [for o in data.openstack_networking_secgroup_v2.nonlogin : o.id]) # can't be set for additional nodes - compute_init_enable = [] + compute_init_enable = [] ignore_image_changes = false # computed diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/control.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/control.tf index b96307fa1..8b4349497 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/control.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/control.tf @@ -37,7 +37,7 @@ resource "openstack_networking_port_v2" "control" { } resource "openstack_compute_instance_v2" "control" { - + name = split(".", local.control_fqdn)[0] image_id = var.cluster_image_id flavor_name = var.control_node_flavor diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/inventory.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/inventory.tf index 6502ee870..fa7108b59 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/inventory.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/inventory.tf @@ -1,6 +1,6 @@ # tflint-ignore: terraform_required_providers resource "local_file" "hosts" { - content = templatefile("${path.module}/inventory.tpl", + content = templatefile("${path.module}/inventory.tpl", { "cluster_name" : var.cluster_name, "cluster_domain_suffix" : var.cluster_domain_suffix diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/login.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/login.tf index 7bac7ba2f..33fc4666c 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/login.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/login.tf @@ -21,7 +21,7 @@ module "login" { root_volume_type = lookup(each.value, "root_volume_type", var.root_volume_type) gateway_ip = lookup(each.value, "gateway_ip", var.gateway_ip) nodename_template = lookup(each.value, "nodename_template", var.cluster_nodename_template) - + # optionally set for group: networks = concat(var.cluster_networks, lookup(each.value, "extra_networks", [])) # here null means "use module var default" diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf index eed15d470..d50ef51aa 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/nodes.tf @@ -128,7 +128,7 @@ resource "openstack_compute_instance_v2" "compute_fixed_image" { resource "openstack_compute_instance_v2" "compute" { for_each = var.ignore_image_changes ? 
[] : toset(var.nodes) - + name = split(".", local.fqdns[each.key])[0] image_id = var.image_id flavor_name = var.flavor diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf index 079be1bdb..f030ca986 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf @@ -72,8 +72,8 @@ variable "extra_volumes" { } variable "security_group_ids" { - type = list(string) - nullable = false + type = list(string) + nullable = false } variable "control_address" { diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf index b0f1e0e0c..91842a133 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/variables.tf @@ -135,9 +135,10 @@ variable "compute" { type = any # can't do any better; TF type constraints can't cope with heterogeneous inner mappings } +# tflint-ignore: terraform_typed_variables variable "additional_nodegroups" { - default = {} - description = <<-EOF + default = {} + description = <<-EOF Mapping defining homogenous groups of nodes for arbitrary purposes. These nodes are not in the compute or login inventory groups so they will not run slurmd. From dedef28ace26e18cea972657198a3ad595745c9b Mon Sep 17 00:00:00 2001 From: Max Norton Date: Tue, 29 Jul 2025 16:54:35 +0100 Subject: [PATCH 06/37] Fix linting issues on the merge of origin/main --- ansible/roles/topology/README.md | 34 +++++++++--------- ansible/roles/topology/defaults/main.yml | 1 - ansible/roles/topology/library/map_hosts.py | 36 ++++++++++--------- ansible/roles/topology/tasks/main.yml | 2 +- ansible/slurm.yml | 4 +-- .../tofu/node_group/variables.tf | 6 ++-- 6 files changed, 43 insertions(+), 40 deletions(-) diff --git a/ansible/roles/topology/README.md b/ansible/roles/topology/README.md index 057134490..6bdeaae32 100644 --- a/ansible/roles/topology/README.md +++ b/ansible/roles/topology/README.md @@ -1,5 +1,4 @@ -topology -======== +# topology Templates out /etc/slurm/topology.conf file based on an OpenStack project for use by Slurm's [topology/tree plugin.](https://slurm.schedmd.com/topology.html) Models @@ -12,22 +11,23 @@ reconfigure an already running cluster after a `ansible/site.yml` run. You will to run the `ansible/adhoc/restart-slurm.yml` playbook for changes to topology.conf to be recognised. -Role Variables --------------- +## Role Variables - `topology_nodes:`: Required list of strs. List of inventory hostnames of nodes to include in topology tree. Must be set to include all compute nodes in Slurm cluster. Default `[]`. - `topology_conf_template`: Optional str. Path to Jinja2 template of topology.conf file. Default `templates/topology.conf.j2` -- `topology_above_rack_topology`: Optionally multiline str. Used to define topology above racks/AZs if - you wish to partition racks further under different logical switches. New switches above should be - defined as [SwitchName lines](https://slurm.schedmd.com/topology.html#hierarchical) referencing - rack Availability Zones under that switch in their `Switches fields`. These switches must themselves - be under a top level switch. 
e.g - ``` - topology_above_rack_topology: | - SwitchName=rack-group-1 Switches=rack-az-1,rack-az-2 - SwitchName=rack-group-2 Switches=rack-az-3,rack-az-4 - SwitchName=top-level Switches=rack-group-1,rack-group-2 - ``` - Defaults to an empty string, which causes all AZs to be put under a - single top level switch. \ No newline at end of file +- `topology_above_rack_topology`: Optionally multiline str. Used to define topology above racks/AZs if + you wish to partition racks further under different logical switches. New switches above should be + defined as [SwitchName lines](https://slurm.schedmd.com/topology.html#hierarchical) referencing + rack Availability Zones under that switch in their `Switches fields`. These switches must themselves + be under a top level switch. e.g + + ```yaml + topology_above_rack_topology: | + SwitchName=rack-group-1 Switches=rack-az-1,rack-az-2 + SwitchName=rack-group-2 Switches=rack-az-3,rack-az-4 + SwitchName=top-level Switches=rack-group-1,rack-group-2 + ``` + + Defaults to an empty string, which causes all AZs to be put under a + single top level switch. diff --git a/ansible/roles/topology/defaults/main.yml b/ansible/roles/topology/defaults/main.yml index 6b6224302..87801e8aa 100644 --- a/ansible/roles/topology/defaults/main.yml +++ b/ansible/roles/topology/defaults/main.yml @@ -5,4 +5,3 @@ topology_nodes: [] topology_conf_template: templates/topology.conf.j2 topology_above_rack_topology: "" - diff --git a/ansible/roles/topology/library/map_hosts.py b/ansible/roles/topology/library/map_hosts.py index 196113261..5c685ef25 100644 --- a/ansible/roles/topology/library/map_hosts.py +++ b/ansible/roles/topology/library/map_hosts.py @@ -1,10 +1,10 @@ -#!/usr/bin/python +#!/usr/bin/python # pylint: disable=missing-module-docstring # Copyright: (c) 2025, StackHPC # Apache 2 License -from ansible.module_utils.basic import AnsibleModule -import openstack +import openstack # pylint: disable=import-error +from ansible.module_utils.basic import AnsibleModule # pylint: disable=import-error DOCUMENTATION = """ --- @@ -47,35 +47,39 @@ - mycluster-compute-1 """ + def min_prefix(uuids, start=4): - """ Take a list of uuids and return the smallest length >= start which keeps them unique """ + """Take a list of uuids and return the smallest length >= start which keeps them unique""" for length in range(start, len(uuids[0])): prefixes = set(uuid[:length] for uuid in uuids) if len(prefixes) == len(uuids): return length + # Fallback to returning the full length + return len(uuids[0]) + -def run_module(): - module_args = dict( - compute_vms=dict(type='list', elements='str', required=True) - ) +def run_module(): # pylint: disable=missing-function-docstring + module_args = {"compute_vms": {"type": "list", "elements": "str", "required": True}} module = AnsibleModule(argument_spec=module_args, supports_check_mode=True) conn = openstack.connection.from_config() - servers = [s for s in conn.compute.servers() if s["name"] in module.params["compute_vms"]] + servers = [ + s for s in conn.compute.servers() if s["name"] in module.params["compute_vms"] + ] topo = {} all_host_ids = [] for s in servers: - az = s['availability_zone'] - host_id = s['host_id'] - if host_id != '': # empty string if e.g. server is shelved + az = s["availability_zone"] + host_id = s["host_id"] + if host_id != "": # empty string if e.g. 
server is shelved all_host_ids.append(host_id) if az not in topo: topo[az] = {} if host_id not in topo[az]: topo[az][host_id] = [] - topo[az][host_id].append(s['name']) + topo[az][host_id].append(s["name"]) uuid_len = min_prefix(list(set(all_host_ids))) @@ -83,14 +87,14 @@ def run_module(): topo[az] = dict((k[:uuid_len], v) for (k, v) in topo[az].items()) result = { - "changed": False, + "changed": False, "topology": topo, } - + module.exit_json(**result) -def main(): +def main(): # pylint: disable=missing-function-docstring run_module() diff --git a/ansible/roles/topology/tasks/main.yml b/ansible/roles/topology/tasks/main.yml index 8debddeab..3872a0cbf 100644 --- a/ansible/roles/topology/tasks/main.yml +++ b/ansible/roles/topology/tasks/main.yml @@ -13,4 +13,4 @@ dest: /etc/slurm/topology.conf owner: root group: root - mode: 0644 + mode: "0644" diff --git a/ansible/slurm.yml b/ansible/slurm.yml index e09a3e495..d6d306e90 100644 --- a/ansible/slurm.yml +++ b/ansible/slurm.yml @@ -60,14 +60,14 @@ tags: - openhpc tasks: - - include_role: + - ansible.builtin.include_role: name: topology # Gated on topology group having compute nodes but role also # needs to run on control and login nodes when: - appliances_mode == 'configure' - groups['topology'] | length > 0 - - include_role: + - ansible.builtin.include_role: name: stackhpc.openhpc tasks_from: "{{ 'runtime.yml' if appliances_mode == 'configure' else 'main.yml' }}" diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf index 74112fcfc..370636074 100644 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/node_group/variables.tf @@ -149,9 +149,9 @@ variable "match_ironic_node" { } variable "availability_zone" { - type = string - description = "Name of availability zone. If undefined, defaults to 'nova' if match_ironic_node is true, deferred to OpenStack otherwise" - default = null + type = string + description = "Name of availability zone. 
If undefined, defaults to 'nova' if match_ironic_node is true, deferred to OpenStack otherwise" + default = null } variable "baremetal_nodes" { From db6fca451c24dd90f5c596a70ce8ac4afdb28046 Mon Sep 17 00:00:00 2001 From: Max Norton Date: Tue, 29 Jul 2025 17:00:58 +0100 Subject: [PATCH 07/37] Use the head ref for workflow concurrency --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b86a9b2d7..2c8890f71 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -16,7 +16,7 @@ on: pull_request: concurrency: - group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.build.image_name }} # to branch/PR + OS + group: ${{ github.workflow }}-${{ github.head_ref }} cancel-in-progress: true jobs: From 86215abaf24184d48219c4fb5664a3f76bff67d5 Mon Sep 17 00:00:00 2001 From: Max Norton Date: Wed, 30 Jul 2025 09:09:13 +0100 Subject: [PATCH 08/37] Output the path filter result of the workflow --- .github/workflows/main.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 2c8890f71..424995849 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -34,7 +34,8 @@ jobs: should_skip: ${{ steps.skip_check.outputs.should_skip }} paths_result: ${{ steps.skip_check.outputs.paths_result }} steps: - - id: skip_check + - name: Build paths result + id: skip_check # For security we use the commit of fkirc/skip-duplicate-actions@v5 uses: fkirc/skip-duplicate-actions@f75f66ce1886f00957d99748a42c724f4330bdcf with: @@ -68,6 +69,10 @@ jobs: trivvyscan: paths: - 'environments/.stackhpc/tofu/cluster_image.auto.tfvars.json' + - name: Echo paths result + run: > + echo '${{ steps.skip_check.outputs.paths_result }}' + | jq -r '.' 
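+      # jq's identity filter '.' pretty-prints its input and -r emits raw
+      # (unquoted) output; this step is only a debugging aid for reading the
+      # filter results in the job log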
extra: name: Test extra build From 87bf78b2ff1d8189bd39da8727447b321d48be12 Mon Sep 17 00:00:00 2001 From: Max Norton Date: Wed, 30 Jul 2025 09:33:25 +0100 Subject: [PATCH 09/37] Tweak github action used to detect changed paths on push/pull request --- .github/workflows/main.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 424995849..4edbb300b 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -40,6 +40,8 @@ jobs: uses: fkirc/skip-duplicate-actions@f75f66ce1886f00957d99748a42c724f4330bdcf with: cancel_others: true + skip_after_successful_duplicate: false + do_not_skip: [ "pull_request", "push", "workflow_dispatch" ] paths_filter: | extra_push: paths: From 20d0874609a490305757e3405f5df155d8b9a92f Mon Sep 17 00:00:00 2001 From: Max Norton Date: Wed, 30 Jul 2025 09:46:59 +0100 Subject: [PATCH 10/37] Tweak github action used to detect changed paths on push/pull request --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 4edbb300b..6bbe3f217 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -41,7 +41,7 @@ jobs: with: cancel_others: true skip_after_successful_duplicate: false - do_not_skip: [ "pull_request", "push", "workflow_dispatch" ] + do_not_skip: '[ "pull_request", "push", "workflow_dispatch" ]' paths_filter: | extra_push: paths: From dbebb2ad402d7d834a210c04b757ad863e32c6af Mon Sep 17 00:00:00 2001 From: Max Norton Date: Wed, 30 Jul 2025 10:02:55 +0100 Subject: [PATCH 11/37] Tweak github action used to detect changed paths on push/pull request --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 6bbe3f217..6b3dc0b4b 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -41,7 +41,7 @@ jobs: with: cancel_others: true skip_after_successful_duplicate: false - do_not_skip: '[ "pull_request", "push", "workflow_dispatch" ]' + #do_not_skip: '[ "pull_request", "push", "workflow_dispatch" ]' paths_filter: | extra_push: paths: From 0457142a1f30b129b169c57a0b0ce7c5aa3542da Mon Sep 17 00:00:00 2001 From: Max Norton Date: Wed, 30 Jul 2025 11:50:30 +0100 Subject: [PATCH 12/37] Tweak github action used to detect changed paths on push/pull request --- .github/workflows/main.yml | 97 +++++++++++++++++++++----------------- 1 file changed, 53 insertions(+), 44 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 6b3dc0b4b..69fcfa30e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -27,62 +27,66 @@ jobs: files_changed: name: Determine files changed needs: lint - # continue-on-error: true # Uncomment once integration is finished runs-on: ubuntu-latest - # Map a step output to a job output + # Map a step output to a job output, this allows other jobs to be gated on the filter results outputs: - should_skip: ${{ steps.skip_check.outputs.should_skip }} - paths_result: ${{ steps.skip_check.outputs.paths_result }} + extra_push: ${{ steps.filter.outputs.extra_push }} + extra_pull_request: ${{ steps.filter.outputs.extra_pull_request }} + stackhpc: ${{ steps.filter.outputs.stackhpc }} + trivvyscan: ${{ steps.filter.outputs.trivvyscan }} steps: - - name: Build paths result - id: skip_check - # For security we use the commit of fkirc/skip-duplicate-actions@v5 - uses: 
fkirc/skip-duplicate-actions@f75f66ce1886f00957d99748a42c724f4330bdcf + - name: Checkout + uses: actions/checkout@v4 + + - name: Paths Filter + # For safety using commit of dorny/paths-filter@v3 + uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 + id: filter with: - cancel_others: true - skip_after_successful_duplicate: false - #do_not_skip: '[ "pull_request", "push", "workflow_dispatch" ]' - paths_filter: | + list-files: 'json' + filters: | extra_push: - paths: - - 'environments/.stackhpc/tofu/cluster_image.auto.tfvars.json' - - 'ansible/roles/doca/**' - - 'ansible/roles/cuda/**' - - 'ansible/roles/slurm_recompile/**' # runs on cuda group - - 'ansible/roles/lustre/**' - - '.github/workflows/extra.yml' + - 'environments/.stackhpc/tofu/cluster_image.auto.tfvars.json' + - 'ansible/roles/doca/**' + - 'ansible/roles/cuda/**' + - 'ansible/roles/slurm_recompile/**' # runs on cuda group + - 'ansible/roles/lustre/**' + - '.github/workflows/extra.yml' extra_pull_request: - paths: - - 'environments/.stackhpc/tofu/cluster_image.auto.tfvars.json' - - 'ansible/roles/doca/**' - - 'ansible/roles/cuda/**' - - 'ansible/roles/lustre/**' - - '.github/workflows/extra.yml' + - 'environments/.stackhpc/tofu/cluster_image.auto.tfvars.json' + - 'ansible/roles/doca/**' + - 'ansible/roles/cuda/**' + - 'ansible/roles/lustre/**' + - '.github/workflows/extra.yml' stackhpc: - paths: - - '**' - - '!dev/**' - - 'dev/setup-env.sh' - - '!docs/**' - - '!README.md' - - '!.gitignore' - - '!.github/workflows/' - - '.github/workflows/stackhpc' + - '**' + - '!dev/**' + - 'dev/setup-env.sh' + - '!docs/**' + - '!README.md' + - '!.gitignore' + - '!.github/workflows/' + - '.github/workflows/stackhpc' trivvyscan: - paths: - - 'environments/.stackhpc/tofu/cluster_image.auto.tfvars.json' - - name: Echo paths result + - 'environments/.stackhpc/tofu/cluster_image.auto.tfvars.json' + + - name: Paths Filter Result run: > - echo '${{ steps.skip_check.outputs.paths_result }}' - | jq -r '.' + echo 'extra_push: ${{ steps.filter.outputs.extra_push }}' | jq -r '.' + echo 'extra_pull_request: ${{ steps.filter.outputs.extra_pull_request }}' | jq -r '.' + echo 'stackhpc: ${{ steps.filter.outputs.stackhpc }}' | jq -r '.' + echo 'trivvyscan: ${{ steps.filter.outputs.trivvyscan }}' | jq -r '.' extra: name: Test extra build needs: files_changed + #if: | + # needs.files_changed.outputs.should_skip != 'true' && + # (( github.event_name != 'pull_request' && !fromJSON(needs.files_changed.outputs.paths_result).extra_push.should_skip ) || + # ( github.event_name == 'pull_request' && !fromJSON(needs.files_changed.outputs.paths_result).extra_pull_request.should_skip )) if: | - needs.files_changed.outputs.should_skip != 'true' && - (( github.event_name == 'pull_request' && !fromJSON(needs.files_changed.outputs.paths_result).extra_push.should_skip ) || - ( github.event_name == 'pull_request' && !fromJSON(needs.files_changed.outputs.paths_result).extra_pull_request.should_skip )) + github.event_name != 'pull_request' && needs.files_changed.outputs.extra_push == 'true' || + github.event_name == 'pull_request' && needs.files_changed.outputs.extra_pull_request == 'true' #uses: ./.github/workflows/extra.yml steps: # TEST - name: Test extra build... 
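
dorny/paths-filter exposes each filter's result as the string 'true' or
'false' rather than a boolean, which is why the job-level conditions above
compare against 'true' explicitly. A minimal gated job in the same style (a
sketch; the job name and echoed message are illustrative) would be:

```yaml
jobs:
  gated:
    needs: files_changed
    # outputs are strings, so test equality with 'true' rather than truthiness
    if: needs.files_changed.outputs.stackhpc == 'true'
    runs-on: ubuntu-latest
    steps:
      - run: echo "stackhpc-relevant paths changed"
```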
@@ -95,8 +99,10 @@ jobs: stackhpc: name: Test deployment and reimage on OpenStack needs: files_changed + #if: | + # needs.files_changed.outputs.should_skip != 'true' && !fromJSON(needs.files_changed.outputs.paths_result).stackhpc.should_skip if: | - needs.files_changed.outputs.should_skip != 'true' && !fromJSON(needs.files_changed.outputs.paths_result).stackhpc.should_skip + needs.files_changed.outputs.stackhpc == 'true' #uses: ./.github/workflows/stackhpc.yml steps: # TEST - name: Test deployment and reimage on OpenStack... @@ -109,9 +115,12 @@ jobs: trivvyscan: name: Trivy scan image for vulnerabilities needs: files_changed + #if: | + # github.event_name == 'pull_request' && + # needs.files_changed.outputs.should_skip != 'true' && !fromJSON(needs.files_changed.outputs.paths_result).trivvyscan.should_skip if: | github.event_name == 'pull_request' && - needs.files_changed.outputs.should_skip != 'true' && !fromJSON(needs.files_changed.outputs.paths_result).trivvyscan.should_skip + needs.files_changed.outputs.trivvyscan == 'true' #uses: ./.github/workflows/trivvyscan.yml steps: # TEST - name: Trivy scan image for vulnerabilities... From 5ce0b00fb4ef085c0c6f7f579a18455b0adb18ec Mon Sep 17 00:00:00 2001 From: Max Norton Date: Wed, 30 Jul 2025 12:06:36 +0100 Subject: [PATCH 13/37] Tweak github action used to detect changed paths on push/pull request --- .github/workflows/main.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 69fcfa30e..d74724003 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -72,9 +72,9 @@ jobs: - name: Paths Filter Result run: > - echo 'extra_push: ${{ steps.filter.outputs.extra_push }}' | jq -r '.' - echo 'extra_pull_request: ${{ steps.filter.outputs.extra_pull_request }}' | jq -r '.' - echo 'stackhpc: ${{ steps.filter.outputs.stackhpc }}' | jq -r '.' + echo 'extra_push: ${{ steps.filter.outputs.extra_push }}' | jq -r '.'; + echo 'extra_pull_request: ${{ steps.filter.outputs.extra_pull_request }}' | jq -r '.'; + echo 'stackhpc: ${{ steps.filter.outputs.stackhpc }}' | jq -r '.'; echo 'trivvyscan: ${{ steps.filter.outputs.trivvyscan }}' | jq -r '.' extra: From 2b53a02f1d7f88ed4ba3231dbe941e945d341283 Mon Sep 17 00:00:00 2001 From: Max Norton Date: Wed, 30 Jul 2025 12:13:43 +0100 Subject: [PATCH 14/37] Tweak github action used to detect changed paths on push/pull request --- .github/workflows/main.yml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d74724003..6a941fef0 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -72,10 +72,11 @@ jobs: - name: Paths Filter Result run: > - echo 'extra_push: ${{ steps.filter.outputs.extra_push }}' | jq -r '.'; - echo 'extra_pull_request: ${{ steps.filter.outputs.extra_pull_request }}' | jq -r '.'; - echo 'stackhpc: ${{ steps.filter.outputs.stackhpc }}' | jq -r '.'; - echo 'trivvyscan: ${{ steps.filter.outputs.trivvyscan }}' | jq -r '.' + echo 'extra_push: ${{ steps.filter.outputs.extra_push }}' + #echo 'extra_push: ${{ steps.filter.outputs.extra_push }}' | jq -r '.'; + #echo 'extra_pull_request: ${{ steps.filter.outputs.extra_pull_request }}' | jq -r '.'; + #echo 'stackhpc: ${{ steps.filter.outputs.stackhpc }}' | jq -r '.'; + #echo 'trivvyscan: ${{ steps.filter.outputs.trivvyscan }}' | jq -r '.' 
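Two shell-level pitfalls drive the churn in the debug step above: a `run: >` folded scalar joins consecutive lines into a single command unless each ends in a semicolon, and splicing `${{ ... }}` JSON straight into the command line is fragile to quoting. A minimal sketch of a step that sidesteps both, assuming the same step id `filter`; the environment variable name is illustrative:

```yaml
      - name: Paths Filter Result
        env:
          # GitHub substitutes the output before the shell starts, so passing it
          # through an environment variable avoids shell-quoting surprises
          EXTRA_PUSH_FILES: ${{ steps.filter.outputs.extra_push_files }}
        run: |
          # a literal block scalar (|) keeps one command per line; no semicolons needed
          echo "$EXTRA_PUSH_FILES" | jq -r '.'
```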
extra: name: Test extra build From df0bb62953a12f9503cee261307e8e55c4cc588c Mon Sep 17 00:00:00 2001 From: Max Norton Date: Wed, 30 Jul 2025 12:38:33 +0100 Subject: [PATCH 15/37] Tweak github action used to detect changed paths on push/pull request --- .github/workflows/main.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 6a941fef0..20b9d71d3 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -72,11 +72,11 @@ jobs: - name: Paths Filter Result run: > - echo 'extra_push: ${{ steps.filter.outputs.extra_push }}' - #echo 'extra_push: ${{ steps.filter.outputs.extra_push }}' | jq -r '.'; - #echo 'extra_pull_request: ${{ steps.filter.outputs.extra_pull_request }}' | jq -r '.'; - #echo 'stackhpc: ${{ steps.filter.outputs.stackhpc }}' | jq -r '.'; - #echo 'trivvyscan: ${{ steps.filter.outputs.trivvyscan }}' | jq -r '.' + echo 'extra_push_files: ${{ steps.filter.outputs.extra_push_files }}'; + echo 'extra_push_files: ${{ steps.filter.outputs.extra_push_files }}' | jq -r '.'; + #echo 'extra_pull_request_files: ${{ steps.filter.outputs.extra_pull_request_files }}' | jq -r '.'; + #echo 'stackhpc_files: ${{ steps.filter.outputs.stackhpc_files }}' | jq -r '.'; + #echo 'trivvyscan_files: ${{ steps.filter.outputs.trivvyscan_files }}' | jq -r '.' extra: name: Test extra build From 55d26e248d978548b97dcc5368c61182a8031efe Mon Sep 17 00:00:00 2001 From: Max Norton Date: Wed, 30 Jul 2025 12:51:25 +0100 Subject: [PATCH 16/37] Tweak github action used to detect changed paths on push/pull request --- .github/workflows/main.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 20b9d71d3..e9eea7267 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -73,10 +73,10 @@ jobs: - name: Paths Filter Result run: > echo 'extra_push_files: ${{ steps.filter.outputs.extra_push_files }}'; - echo 'extra_push_files: ${{ steps.filter.outputs.extra_push_files }}' | jq -r '.'; - #echo 'extra_pull_request_files: ${{ steps.filter.outputs.extra_pull_request_files }}' | jq -r '.'; - #echo 'stackhpc_files: ${{ steps.filter.outputs.stackhpc_files }}' | jq -r '.'; - #echo 'trivvyscan_files: ${{ steps.filter.outputs.trivvyscan_files }}' | jq -r '.' + echo '"extra_push_files" : { ${{ steps.filter.outputs.extra_push_files }} }' | jq -r '.'; + echo '"extra_pull_request_files" : { ${{ steps.filter.outputs.extra_pull_request_files }} }' | jq -r '.'; + echo '"stackhpc_files" : { ${{ steps.filter.outputs.stackhpc_files }} }' | jq -r '.'; + echo '"trivvyscan_files" : { ${{ steps.filter.outputs.trivvyscan_files }} }' | jq -r '.' 
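The `*_files` outputs being probed above exist because the filter step sets `list-files: 'json'`: for each named filter, dorny/paths-filter then also emits a `<filter>_files` output holding a JSON array of the matching paths, which is why it can be handed to jq once the quoting is right. A minimal sketch of consuming one directly, assuming a filter named `trivvyscan` on a step with id `filter`:

```yaml
      - name: Show matched files, one per line
        # '.[]' unpacks the JSON array produced by list-files: 'json'
        run: echo '${{ steps.filter.outputs.trivvyscan_files }}' | jq -r '.[]'
```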
extra: name: Test extra build From 59e2c38efc06ba9bebb8606629e918d952bd25d1 Mon Sep 17 00:00:00 2001 From: Max Norton Date: Wed, 30 Jul 2025 13:00:42 +0100 Subject: [PATCH 17/37] Tweak github action used to detect changed paths on push/pull request --- .github/workflows/main.yml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index e9eea7267..f5680ede7 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -72,11 +72,10 @@ jobs: - name: Paths Filter Result run: > - echo 'extra_push_files: ${{ steps.filter.outputs.extra_push_files }}'; - echo '"extra_push_files" : { ${{ steps.filter.outputs.extra_push_files }} }' | jq -r '.'; - echo '"extra_pull_request_files" : { ${{ steps.filter.outputs.extra_pull_request_files }} }' | jq -r '.'; - echo '"stackhpc_files" : { ${{ steps.filter.outputs.stackhpc_files }} }' | jq -r '.'; - echo '"trivvyscan_files" : { ${{ steps.filter.outputs.trivvyscan_files }} }' | jq -r '.' + echo '{ "extra_push_files": ${{ steps.filter.outputs.extra_push_files }} }' | jq -r '.'; + echo '{ "extra_pull_request_files": ${{ steps.filter.outputs.extra_pull_request_files }} }' | jq -r '.'; + echo '{ "stackhpc_files": ${{ steps.filter.outputs.stackhpc_files }} }' | jq -r '.'; + echo '{ "trivvyscan_files": ${{ steps.filter.outputs.trivvyscan_files }} }' | jq -r '.' extra: name: Test extra build From b965ecb2fba921d5dbdb7dd99dc42c58d93e6858 Mon Sep 17 00:00:00 2001 From: Max Norton Date: Wed, 30 Jul 2025 14:32:09 +0100 Subject: [PATCH 18/37] Tweak github action used to detect changed paths on push/pull request --- .github/workflows/main.yml | 67 ++++++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 28 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index f5680ede7..16c8c2643 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -30,63 +30,78 @@ jobs: runs-on: ubuntu-latest # Map a step output to a job output, this allows other jobs to be gated on the filter results outputs: - extra_push: ${{ steps.filter.outputs.extra_push }} - extra_pull_request: ${{ steps.filter.outputs.extra_pull_request }} - stackhpc: ${{ steps.filter.outputs.stackhpc }} - trivvyscan: ${{ steps.filter.outputs.trivvyscan }} + extra_on_push: ${{ steps.filter_some.outputs.extra_on_push }} + extra_on_pull_request: ${{ steps.filter_some.outputs.extra_on_pull_request }} + stackhpc_some: ${{ steps.filter_some.outputs.stackhpc }} + stackhpc_every: ${{ steps.filter_every.outputs.stackhpc }} + trivvyscan: ${{ steps.filter_some.outputs.trivvyscan }} steps: - name: Checkout uses: actions/checkout@v4 - - name: Paths Filter + - name: Paths matching on any filter # For safety using commit of dorny/paths-filter@v3 uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 - id: filter + id: filter_some with: + predicate-quantifier: 'some' list-files: 'json' filters: | - extra_push: + extra_on_push: - 'environments/.stackhpc/tofu/cluster_image.auto.tfvars.json' - 'ansible/roles/doca/**' - 'ansible/roles/cuda/**' - 'ansible/roles/slurm_recompile/**' # runs on cuda group - 'ansible/roles/lustre/**' - '.github/workflows/extra.yml' - extra_pull_request: + extra_on_pull_request: - 'environments/.stackhpc/tofu/cluster_image.auto.tfvars.json' - 'ansible/roles/doca/**' - 'ansible/roles/cuda/**' - 'ansible/roles/lustre/**' - '.github/workflows/extra.yml' + stackhpc: + #- '**' + #- '!dev/**' + - 'dev/setup-env.sh' + #- '!docs/**' + #- '!README.md' + #- 
'!.gitignore' + #- '!.github/workflows/' + - '.github/workflows/stackhpc' + trivvyscan: + - 'environments/.stackhpc/tofu/cluster_image.auto.tfvars.json' + + - name: Paths matching on every filter + # For safety using commit of dorny/paths-filter@v3 + uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 + id: filter_every + with: + predicate-quantifier: 'every' + list-files: 'json' + filters: | stackhpc: - '**' - '!dev/**' - - 'dev/setup-env.sh' - '!docs/**' - '!README.md' - '!.gitignore' - '!.github/workflows/' - - '.github/workflows/stackhpc' - trivvyscan: - - 'environments/.stackhpc/tofu/cluster_image.auto.tfvars.json' - name: Paths Filter Result run: > - echo '{ "extra_push_files": ${{ steps.filter.outputs.extra_push_files }} }' | jq -r '.'; - echo '{ "extra_pull_request_files": ${{ steps.filter.outputs.extra_pull_request_files }} }' | jq -r '.'; - echo '{ "stackhpc_files": ${{ steps.filter.outputs.stackhpc_files }} }' | jq -r '.'; - echo '{ "trivvyscan_files": ${{ steps.filter.outputs.trivvyscan_files }} }' | jq -r '.' + echo '{ "extra_on_push_files": ${{ steps.filter_some.outputs.extra_on_push_files }} }' | jq -r '.'; + echo '{ "extra_on_pull_request_files": ${{ steps.filter_some.outputs.extra_on_pull_request_files }} }' | jq -r '.'; + echo '{ "stackhpc_some_files": ${{ steps.filter_some.outputs.stackhpc_some_files }} }' | jq -r '.'; + echo '{ "stackhpc_every_files": ${{ steps.filter_every.outputs.stackhpc_every_files }} }' | jq -r '.'; + echo '{ "trivvyscan_files": ${{ steps.filter_some.outputs.trivvyscan_files }} }' | jq -r '.' extra: name: Test extra build needs: files_changed - #if: | - # needs.files_changed.outputs.should_skip != 'true' && - # (( github.event_name != 'pull_request' && !fromJSON(needs.files_changed.outputs.paths_result).extra_push.should_skip ) || - # ( github.event_name == 'pull_request' && !fromJSON(needs.files_changed.outputs.paths_result).extra_pull_request.should_skip )) if: | - github.event_name != 'pull_request' && needs.files_changed.outputs.extra_push == 'true' || - github.event_name == 'pull_request' && needs.files_changed.outputs.extra_pull_request == 'true' + github.event_name != 'pull_request' && needs.files_changed.outputs.extra_on_push == 'true' || + github.event_name == 'pull_request' && needs.files_changed.outputs.extra_on_pull_request == 'true' #uses: ./.github/workflows/extra.yml steps: # TEST - name: Test extra build... @@ -99,10 +114,9 @@ jobs: stackhpc: name: Test deployment and reimage on OpenStack needs: files_changed - #if: | - # needs.files_changed.outputs.should_skip != 'true' && !fromJSON(needs.files_changed.outputs.paths_result).stackhpc.should_skip if: | - needs.files_changed.outputs.stackhpc == 'true' + needs.files_changed.outputs.stackhpc_some == 'true' || + needs.files_changed.outputs.stackhpc_every == 'true' #uses: ./.github/workflows/stackhpc.yml steps: # TEST - name: Test deployment and reimage on OpenStack... 
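The 'some'/'every' split introduced above is the crux of this rework: with `predicate-quantifier: 'every'`, a changed file only counts if it satisfies every pattern in the list, so one catch-all glob plus several negations behaves as "changed anywhere except these paths". A distilled sketch of that exclusion idiom on its own, with a placeholder filter name:

```yaml
      - uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 # v3
        id: filter
        with:
          predicate-quantifier: 'every'
          filters: |
            # matches any changed file that is NOT under dev/ and NOT markdown:
            # the file must match '**' AND both negated patterns
            relevant:
              - '**'
              - '!dev/**'
              - '!**/*.md'
```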
@@ -115,9 +129,6 @@ jobs: trivvyscan: name: Trivy scan image for vulnerabilities needs: files_changed - #if: | - # github.event_name == 'pull_request' && - # needs.files_changed.outputs.should_skip != 'true' && !fromJSON(needs.files_changed.outputs.paths_result).trivvyscan.should_skip if: | github.event_name == 'pull_request' && needs.files_changed.outputs.trivvyscan == 'true' From 5b02436a21ebc5f53b41b641eadfb655eb3b2385 Mon Sep 17 00:00:00 2001 From: Max Norton Date: Wed, 30 Jul 2025 15:19:54 +0100 Subject: [PATCH 19/37] Tweak github action used to detect changed paths on push/pull request --- .github/workflows/main.yml | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 16c8c2643..db6aa02d9 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -32,6 +32,7 @@ jobs: outputs: extra_on_push: ${{ steps.filter_some.outputs.extra_on_push }} extra_on_pull_request: ${{ steps.filter_some.outputs.extra_on_pull_request }} + stackhpc: ${{ toJson(fromJson(steps.filter_some.outputs.stackhpc) || fromJson(steps.filter_every.outputs.stackhpc)) }} stackhpc_some: ${{ steps.filter_some.outputs.stackhpc }} stackhpc_every: ${{ steps.filter_every.outputs.stackhpc }} trivvyscan: ${{ steps.filter_some.outputs.trivvyscan }} @@ -88,14 +89,26 @@ jobs: - '!.gitignore' - '!.github/workflows/' - - name: Paths Filter Result + - name: Paths matching run: > echo '{ "extra_on_push_files": ${{ steps.filter_some.outputs.extra_on_push_files }} }' | jq -r '.'; echo '{ "extra_on_pull_request_files": ${{ steps.filter_some.outputs.extra_on_pull_request_files }} }' | jq -r '.'; - echo '{ "stackhpc_some_files": ${{ steps.filter_some.outputs.stackhpc_some_files }} }' | jq -r '.'; - echo '{ "stackhpc_every_files": ${{ steps.filter_every.outputs.stackhpc_every_files }} }' | jq -r '.'; + echo '{ "stackhpc_some_files": ${{ steps.filter_some.outputs.stackhpc_files }} }' | jq -r '.'; + echo '{ "stackhpc_every_files": ${{ steps.filter_every.outputs.stackhpc_files }} }' | jq -r '.'; echo '{ "trivvyscan_files": ${{ steps.filter_some.outputs.trivvyscan_files }} }' | jq -r '.' 
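The combined `stackhpc` output added above warrants a gloss: job outputs are always strings, so the expression round-trips through JSON. `fromJson('true')` yields a real boolean, `||` then acts as a logical OR, and `toJson(...)` serialises the result back to the string `'true'` or `'false'` for downstream `if:` comparisons. The same pattern in isolation, as a sketch assuming two steps that each emit such a string:

```yaml
    outputs:
      # 'true' if either filter matched; without fromJson the || would operate
      # on non-empty (hence always-truthy) strings rather than booleans
      stackhpc: ${{ toJson(fromJson(steps.filter_some.outputs.stackhpc) || fromJson(steps.filter_every.outputs.stackhpc)) }}
```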
+ debug: + name: Debug paths matching + needs: files_changed + run: > + echo 'extra_on_push: ${{ needs.files_changed.outputs.extra_on_push }}'; + echo 'extra_on_pull_request: ${{ needs.files_changed.outputs.extra_on_pull_request }}'; + echo 'stackhpc: ${{ needs.files_changed.outputs.stackhpc }}'; + echo 'stackhpc_some: ${{ needs.files_changed.outputs.stackhpc_some }}'; + echo 'stackhpc_every: ${{ needs.files_changed.outputs.stackhpc_every }}'; + echo 'trivvyscan: ${{ needs.files_changed.outputs.trivvyscan }}' + runs-on: ubuntu-latest # TEST + extra: name: Test extra build needs: files_changed From 2f698a360512e086aba25efc994a32f0fcfdf5e2 Mon Sep 17 00:00:00 2001 From: Max Norton Date: Wed, 30 Jul 2025 15:28:21 +0100 Subject: [PATCH 20/37] Tweak github action used to detect changed paths on push/pull request --- .github/workflows/main.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index db6aa02d9..13ac74052 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -84,8 +84,7 @@ jobs: stackhpc: - '**' - '!dev/**' - - '!docs/**' - - '!README.md' + - '!**.md' - '!.gitignore' - '!.github/workflows/' From e17401d6b512f362e9ab8db385261824de94871e Mon Sep 17 00:00:00 2001 From: Max Norton Date: Wed, 30 Jul 2025 15:36:15 +0100 Subject: [PATCH 21/37] Tweak github action used to detect changed paths on push/pull request --- .github/workflows/main.yml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 13ac74052..2c22f411c 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -99,13 +99,14 @@ jobs: debug: name: Debug paths matching needs: files_changed - run: > - echo 'extra_on_push: ${{ needs.files_changed.outputs.extra_on_push }}'; - echo 'extra_on_pull_request: ${{ needs.files_changed.outputs.extra_on_pull_request }}'; - echo 'stackhpc: ${{ needs.files_changed.outputs.stackhpc }}'; - echo 'stackhpc_some: ${{ needs.files_changed.outputs.stackhpc_some }}'; - echo 'stackhpc_every: ${{ needs.files_changed.outputs.stackhpc_every }}'; - echo 'trivvyscan: ${{ needs.files_changed.outputs.trivvyscan }}' + steps: + run: > + echo 'extra_on_push: ${{ needs.files_changed.outputs.extra_on_push }}'; + echo 'extra_on_pull_request: ${{ needs.files_changed.outputs.extra_on_pull_request }}'; + echo 'stackhpc: ${{ needs.files_changed.outputs.stackhpc }}'; + echo 'stackhpc_some: ${{ needs.files_changed.outputs.stackhpc_some }}'; + echo 'stackhpc_every: ${{ needs.files_changed.outputs.stackhpc_every }}'; + echo 'trivvyscan: ${{ needs.files_changed.outputs.trivvyscan }}' runs-on: ubuntu-latest # TEST extra: From 642888a1ab0f74861f9f945ae63ed1656b6bf828 Mon Sep 17 00:00:00 2001 From: Max Norton Date: Wed, 30 Jul 2025 15:37:21 +0100 Subject: [PATCH 22/37] Tweak github action used to detect changed paths on push/pull request --- .github/workflows/main.yml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 2c22f411c..d2a2bcf78 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -100,13 +100,14 @@ jobs: name: Debug paths matching needs: files_changed steps: - run: > - echo 'extra_on_push: ${{ needs.files_changed.outputs.extra_on_push }}'; - echo 'extra_on_pull_request: ${{ needs.files_changed.outputs.extra_on_pull_request }}'; - echo 'stackhpc: ${{ needs.files_changed.outputs.stackhpc }}'; - echo 
'stackhpc_some: ${{ needs.files_changed.outputs.stackhpc_some }}'; - echo 'stackhpc_every: ${{ needs.files_changed.outputs.stackhpc_every }}'; - echo 'trivvyscan: ${{ needs.files_changed.outputs.trivvyscan }}' + - name: Debug paths matching + run: > + echo 'extra_on_push: ${{ needs.files_changed.outputs.extra_on_push }}'; + echo 'extra_on_pull_request: ${{ needs.files_changed.outputs.extra_on_pull_request }}'; + echo 'stackhpc: ${{ needs.files_changed.outputs.stackhpc }}'; + echo 'stackhpc_some: ${{ needs.files_changed.outputs.stackhpc_some }}'; + echo 'stackhpc_every: ${{ needs.files_changed.outputs.stackhpc_every }}'; + echo 'trivvyscan: ${{ needs.files_changed.outputs.trivvyscan }}' runs-on: ubuntu-latest # TEST extra: From c35c930652cba5481c0740783bdd0fe201623f83 Mon Sep 17 00:00:00 2001 From: Max Norton Date: Wed, 30 Jul 2025 15:50:05 +0100 Subject: [PATCH 23/37] Tweak github action used to detect changed paths on push/pull request --- .github/workflows/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d2a2bcf78..52d7feb51 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -64,7 +64,7 @@ jobs: stackhpc: #- '**' #- '!dev/**' - - 'dev/setup-env.sh' + - 'dev/setup-env.sh___TMP_WILL_NOT_MATCH' #- '!docs/**' #- '!README.md' #- '!.gitignore' @@ -84,7 +84,7 @@ jobs: stackhpc: - '**' - '!dev/**' - - '!**.md' + - '!**/*.md' - '!.gitignore' - '!.github/workflows/' From 1e165bb3718a046b520d85e93d1c90aa28287301 Mon Sep 17 00:00:00 2001 From: Max Norton Date: Wed, 30 Jul 2025 15:59:09 +0100 Subject: [PATCH 24/37] Tweak github action used to detect changed paths on push/pull request --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 52d7feb51..2e1892371 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -82,7 +82,7 @@ jobs: list-files: 'json' filters: | stackhpc: - - '**' + - '**___TMP_WILL_NOT_MATCH' - '!dev/**' - '!**/*.md' - '!.gitignore' From c046991d2b196491237d218bf04ef2189c1c9f1c Mon Sep 17 00:00:00 2001 From: Max Norton Date: Wed, 30 Jul 2025 16:43:55 +0100 Subject: [PATCH 25/37] Tweak github action used to detect changed paths on push/pull request --- .github/workflows/main.yml | 54 ++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 31 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 2e1892371..d982c81fb 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -33,18 +33,18 @@ jobs: extra_on_push: ${{ steps.filter_some.outputs.extra_on_push }} extra_on_pull_request: ${{ steps.filter_some.outputs.extra_on_pull_request }} stackhpc: ${{ toJson(fromJson(steps.filter_some.outputs.stackhpc) || fromJson(steps.filter_every.outputs.stackhpc)) }} - stackhpc_some: ${{ steps.filter_some.outputs.stackhpc }} - stackhpc_every: ${{ steps.filter_every.outputs.stackhpc }} trivvyscan: ${{ steps.filter_some.outputs.trivvyscan }} steps: - name: Checkout uses: actions/checkout@v4 - - name: Paths matching on any filter - # For safety using commit of dorny/paths-filter@v3 + - name: Paths matching on any filter rule + # For safety use the commit of dorny/paths-filter@v3 uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 id: filter_some with: + # Filter changed files, 'some' means the file is matched if any one of the filter rules match. 
+ # Processing is different from 'paths' and 'paths_exclude', see note below. predicate-quantifier: 'some' list-files: 'json' filters: | @@ -62,33 +62,40 @@ - 'ansible/roles/lustre/**' - '.github/workflows/extra.yml' stackhpc: - #- '**' - #- '!dev/**' - - 'dev/setup-env.sh___TMP_WILL_NOT_MATCH' - #- '!docs/**' - #- '!README.md' - #- '!.gitignore' - #- '!.github/workflows/' + - 'dev/setup-env.sh' - '.github/workflows/stackhpc' trivvyscan: - 'environments/.stackhpc/tofu/cluster_image.auto.tfvars.json' - - name: Paths matching on every filter - # For safety using commit of dorny/paths-filter@v3 + - name: Paths matching on every filter rule + # For safety use the commit of dorny/paths-filter@v3 uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 id: filter_every with: + # Filter changed files, 'every' means the file is matched only if it matches all filter rules. + # Processing is different from 'paths' and 'paths_exclude', see note below. predicate-quantifier: 'every' list-files: 'json' filters: | stackhpc: - '**' - '!dev/**' - '!**/*.md' - '!.gitignore' - '!.github/workflows/' - - name: Paths Filter Result + - name: Paths matched output + # NOTE: This is a debug step, it prints the paths that matched the filters + # It's useful because dorny/paths-filter doesn't work like the conventional 'paths' and 'paths_exclude' + # We can't do the following: + # paths: + # - '**' + # - '!dev/**' + # - 'dev/setup-env.sh' + # + # Which would include all files whilst removing all "dev/" files except "dev/setup-env.sh". + # Instead the 'some' stackhpc filter ensures we include "dev/setup-env.sh" - if changed, + # the 'every' stackhpc filter ensures we don't include any other "dev/**" files. run: > echo '{ "extra_on_push_files": ${{ steps.filter_some.outputs.extra_on_push_files }} }' | jq -r '.'; echo '{ "extra_on_pull_request_files": ${{ steps.filter_some.outputs.extra_on_pull_request_files }} }' | jq -r '.'; @@ -96,20 +103,6 @@ echo '{ "stackhpc_every_files": ${{ steps.filter_every.outputs.stackhpc_files }} }' | jq -r '.'; echo '{ "trivvyscan_files": ${{ steps.filter_some.outputs.trivvyscan_files }} }' | jq -r '.' - debug: - name: Debug paths matching - needs: files_changed - steps: - - name: Debug paths matching - run: > - echo 'extra_on_push: ${{ needs.files_changed.outputs.extra_on_push }}'; - echo 'extra_on_pull_request: ${{ needs.files_changed.outputs.extra_on_pull_request }}'; - echo 'stackhpc: ${{ needs.files_changed.outputs.stackhpc }}'; - echo 'stackhpc_some: ${{ needs.files_changed.outputs.stackhpc_some }}'; - echo 'stackhpc_every: ${{ needs.files_changed.outputs.stackhpc_every }}'; - echo 'trivvyscan: ${{ needs.files_changed.outputs.trivvyscan }}' - runs-on: ubuntu-latest # TEST - extra: name: Test extra build needs: files_changed @@ -129,8 +122,7 @@ name: Test deployment and reimage on OpenStack needs: files_changed if: | - needs.files_changed.outputs.stackhpc_some == 'true' || - needs.files_changed.outputs.stackhpc_every == 'true' + needs.files_changed.outputs.stackhpc == 'true' #uses: ./.github/workflows/stackhpc.yml steps: # TEST - name: Test deployment and reimage on OpenStack... 
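For contrast with the two-filter workaround described in the note above: GitHub's native workflow-level `paths:` filtering does support re-including a file from an excluded directory, because later patterns override earlier ones; it is only dorny/paths-filter's per-filter predicate that lacks this. A sketch of the equivalent native trigger, for reference (triggering at the workflow level rather than gating individual jobs):

```yaml
on:
  push:
    paths:
      - '**'
      - '!dev/**'          # exclude the dev tree...
      - 'dev/setup-env.sh' # ...then re-include one file (later patterns win)
```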
From b29f846c931e40af3953ca24e1c82503ccd18518 Mon Sep 17 00:00:00 2001 From: Max Norton Date: Wed, 30 Jul 2025 16:54:59 +0100 Subject: [PATCH 26/37] Tweak github action used to detect changed paths on push/pull request --- .github/workflows/main.yml | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index d982c81fb..484c6e68e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -63,7 +63,7 @@ jobs: - '.github/workflows/extra.yml' stackhpc: - 'dev/setup-env.sh' - - '.github/workflows/stackhpc' + - '.github/workflows/stackhpc.yml' trivvyscan: - 'environments/.stackhpc/tofu/cluster_image.auto.tfvars.json' @@ -110,13 +110,14 @@ jobs: github.event_name != 'pull_request' && needs.files_changed.outputs.extra_on_push == 'true' || github.event_name == 'pull_request' && needs.files_changed.outputs.extra_on_pull_request == 'true' #uses: ./.github/workflows/extra.yml - steps: # TEST + #secrets: inherit + # TEST - remove from here and uncomment the above two lines + steps: - name: Test extra build... uses: jakejarvis/wait-action@master with: time: '120s' - runs-on: ubuntu-latest # TEST - #secrets: inherit + runs-on: ubuntu-latest stackhpc: name: Test deployment and reimage on OpenStack @@ -124,13 +125,14 @@ jobs: if: | needs.files_changed.outputs.stackhpc == 'true' #uses: ./.github/workflows/stackhpc.yml - steps: # TEST + #secrets: inherit + # TEST - remove from here and uncomment the above two lines + steps: - name: Test deployment and reimage on OpenStack... uses: jakejarvis/wait-action@master with: time: '120s' - runs-on: ubuntu-latest # TEST - #secrets: inherit + runs-on: ubuntu-latest trivvyscan: name: Trivy scan image for vulnerabilities @@ -139,10 +141,11 @@ jobs: github.event_name == 'pull_request' && needs.files_changed.outputs.trivvyscan == 'true' #uses: ./.github/workflows/trivvyscan.yml - steps: # TEST + #secrets: inherit + # TEST - remove from here and uncomment the above two lines + steps: - name: Trivy scan image for vulnerabilities... 
uses: jakejarvis/wait-action@master with: time: '120s' - runs-on: ubuntu-latest # TEST - #secrets: inherit + runs-on: ubuntu-latest \ No newline at end of file From 4531d421c553e3f4fa3e92917f10e199cc13f69b Mon Sep 17 00:00:00 2001 From: Max Norton Date: Wed, 30 Jul 2025 17:04:57 +0100 Subject: [PATCH 27/37] Tweak github action used to detect changed paths on push/pull request --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 484c6e68e..dc1d4d518 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -82,7 +82,7 @@ jobs: - '!dev/**' - '!**/*.md' - '!.gitignore' - - '!.github/workflows/' + - '!.github/workflows/**' - name: Paths matched output # NOTE: This is a debug step, it prints the paths that matched the filters From fc6e6e739da3401abb6f638cf6a14cff66aa6e04 Mon Sep 17 00:00:00 2001 From: Max Norton Date: Thu, 31 Jul 2025 10:09:47 +0100 Subject: [PATCH 28/37] Tweak github action used to detect changed paths on push/pull request --- .github/workflows/main.yml | 84 +++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 38 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index dc1d4d518..b82a89755 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -30,21 +30,55 @@ jobs: runs-on: ubuntu-latest # Map a step output to a job output, this allows other jobs to be gated on the filter results outputs: - extra_on_push: ${{ steps.filter_some.outputs.extra_on_push }} - extra_on_pull_request: ${{ steps.filter_some.outputs.extra_on_pull_request }} - stackhpc: ${{ toJson(fromJson(steps.filter_some.outputs.stackhpc) || fromJson(steps.filter_every.outputs.stackhpc)) }} - trivvyscan: ${{ steps.filter_some.outputs.trivvyscan }} + extra_on_push: ${{ steps.filter_on_some.outputs.extra_on_push }} + extra_on_pull_request: ${{ steps.filter_on_some.outputs.extra_on_pull_request }} + # The 'stackhpc' output will be 'true' if either of the two stackhpc filters below matched + stackhpc: ${{ toJson(fromJson(steps.filter_on_every.outputs.stackhpc) || fromJson(steps.filter_on_some.outputs.stackhpc)) }} + trivvyscan: ${{ steps.filter_on_some.outputs.trivvyscan }} steps: - name: Checkout uses: actions/checkout@v4 + # NOTE: We're detecting the changed files within a job so that we can gate execution of other jobs. + # We use dorny/paths-filter which doesn't work like the conventional 'paths' and 'paths_exclude', + # we can't do the following: + # paths: + # - '**' + # - '!dev/**' + # - 'dev/setup-env.sh' + # + # Which would include all files whilst removing all "dev/" files except "dev/setup-env.sh". + # We have to use two filters: + # * first filter includes all changed files and removes "dev/" files + # * second filter explicitly adds 'dev/setup-env.sh' + # We use the logical OR of the filters outputs to gate jobs. + + - name: Paths matching on every filter rule + # For safety use the commit of dorny/paths-filter@v3 + uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 + id: filter_on_every + with: + # Filter changed files, 'every' means the file is matched only if it matches all filter rules. + # NOTE: currently seeing: Warning: Unexpected input(s) 'predicate-quantifier', valid inputs are.. + # this can be ignored, filtering works as expected. 
+ predicate-quantifier: 'every' + list-files: 'json' + filters: | + stackhpc: + - '**' + - '!dev/**' + - '!**/*.md' + - '!.gitignore' + - '!.github/workflows/**' + - name: Paths matching on any filter rule # For safety use the commit of dorny/paths-filter@v3 uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 - id: filter_some + id: filter_on_some with: # Filter changed files, 'some' means the file is matched if any one of the filter rules match. - # Processing is different from 'paths' and 'paths_exclude', see note below. + # NOTE: currently seeing: Warning: Unexpected input(s) 'predicate-quantifier', valid inputs are.. + # this can be ignored, filtering works as expected. predicate-quantifier: 'some' list-files: 'json' filters: | @@ -67,41 +101,15 @@ jobs: trivvyscan: - 'environments/.stackhpc/tofu/cluster_image.auto.tfvars.json' - - name: Paths matching on every filter rule - # For safety use the commit of dorny/paths-filter@v3 - uses: dorny/paths-filter@de90cc6fb38fc0963ad72b210f1f284cd68cea36 - id: filter_every - with: - # Filter changed files, 'every' means the file is matched only if matches all filter rules. - # Processing is different from 'paths' and 'paths_exclude', see note below. - predicate-quantifier: 'every' - list-files: 'json' - filters: | - stackhpc: - - '**' - - '!dev/**' - - '!**/*.md' - - '!.gitignore' - - '!.github/workflows/**' - - name: Paths matched output - # NOTE: This is a debug step, it prints the paths that matched the filters + # NOTE: This is a debug step, it shows what files were matched by the filters. # It's useful because dorny/paths-filter doesn't work like the conventional 'paths' and 'paths_exclude' - # We can't do the following: - # paths: - # - '**' - # - '!dev/**' - # - 'dev/setup-env.sh' - # - # Which would include all files whilst removing all "dev/" files except "dev/setup-env.sh". - # Instead the 'some' stackhpc filter ensures we include "dev/setup-env.sh" - if changed, - # the 'every' stackhpc filter ensures we don't include any other "dev/**" files. run: > - echo '{ "extra_on_push_files": ${{ steps.filter_some.outputs.extra_on_push_files }} }' | jq -r '.'; - echo '{ "extra_on_pull_request_files": ${{ steps.filter_some.outputs.extra_on_pull_request_files }} }' | jq -r '.'; - echo '{ "stackhpc_some_files": ${{ steps.filter_some.outputs.stackhpc_files }} }' | jq -r '.'; - echo '{ "stackhpc_every_files": ${{ steps.filter_every.outputs.stackhpc_files }} }' | jq -r '.'; - echo '{ "trivvyscan_files": ${{ steps.filter_some.outputs.trivvyscan_files }} }' | jq -r '.' + echo '{ "extra_on_push_files": ${{ steps.filter_on_some.outputs.extra_on_push_files }} }' | jq -r '.'; + echo '{ "extra_on_pull_request_files": ${{ steps.filter_on_some.outputs.extra_on_pull_request_files }} }' | jq -r '.'; + echo '{ "stackhpc_every_files": ${{ steps.filter_on_every.outputs.stackhpc_files }} }' | jq -r '.'; + echo '{ "stackhpc_some_files": ${{ steps.filter_on_some.outputs.stackhpc_files }} }' | jq -r '.'; + echo '{ "trivvyscan_files": ${{ steps.filter_on_some.outputs.trivvyscan_files }} }' | jq -r '.' 
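One detail worth making explicit about the gating this series converges on: `needs.<job>.outputs.*` values are strings, and in a GitHub expression any non-empty string, including `'false'`, is truthy, so conditions must compare against `'true'` rather than use the output bare. A minimal sketch of a gated consumer, assuming the `files_changed` job above:

```yaml
  stackhpc:
    name: Test deployment and reimage on OpenStack
    needs: files_changed
    # a bare `if: needs.files_changed.outputs.stackhpc` would also fire when
    # the output is the string 'false', hence the explicit comparison
    if: needs.files_changed.outputs.stackhpc == 'true'
    runs-on: ubuntu-latest
    steps:
      - run: echo "stackhpc-relevant paths changed"
```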
extra: name: Test extra build From 7ed88660b95765d003f485ff6382d9f47a4c6bfa Mon Sep 17 00:00:00 2001 From: Max Norton Date: Fri, 1 Aug 2025 23:51:21 +0100 Subject: [PATCH 29/37] Troubleshooting: ansible.builtin.user --- ansible/bootstrap.yml | 4 ++-- environments/.caas/hooks/pre.yml | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index ebe7d424a..09b6df20f 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -88,7 +88,7 @@ when: - item.enable | default(true) | bool - "'group' in item" - become_method: "ansible.builtin.sudo" + become_method: ansible.builtin.sudo # Need to change working directory otherwise we try to switch back to non-existent directory. become_flags: "-i" - name: Add system users @@ -96,7 +96,7 @@ name: "{{ item.user }}" loop: "{{ appliances_local_users }}" when: item.enable | default(true) | bool - become_method: "ansible.builtin.sudo" + become_method: ansible.builtin.sudo # Need to change working directory otherwise we try to switch back to non-existent directory. become_flags: "-i" - name: Reset ssh connection to allow user changes to affect ansible_user diff --git a/environments/.caas/hooks/pre.yml b/environments/.caas/hooks/pre.yml index 052e6e561..755ed6a09 100644 --- a/environments/.caas/hooks/pre.yml +++ b/environments/.caas/hooks/pre.yml @@ -41,9 +41,8 @@ gather_facts: false tasks: - name: Set up Ansible user - ansible.builtin.user: - name: "{{ (appliances_local_users_default | selectattr('user.name', 'eq', appliances_local_users_ansible_user_name))[0]['user'] }}" - become_method: "ansible.builtin.sudo" + ansible.builtin.user: "{{ (appliances_local_users_default | selectattr('user.name', 'eq', appliances_local_users_ansible_user_name))[0]['user'] }}" + become_method: ansible.builtin.sudo # Need to change working directory otherwise we try to switch back to non-existent directory. become_flags: "-i" become: true @@ -53,6 +52,7 @@ tasks: - name: Reset persistent SSH connections ansible.builtin.meta: reset_connection + - hosts: localhost gather_facts: false become: false From bdb0bbfd64bfd3c089934ac38ffb48d28bfa71e7 Mon Sep 17 00:00:00 2001 From: Max Norton Date: Mon, 4 Aug 2025 09:06:03 +0100 Subject: [PATCH 30/37] Troubleshooting: debugging temporarily added --- ansible/validate.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ansible/validate.yml b/ansible/validate.yml index 5e838f53d..c0af6b7c8 100644 --- a/ansible/validate.yml +++ b/ansible/validate.yml @@ -34,6 +34,11 @@ {{ req }} {% endfor %} + _requirements_current: + .{{ _requirements_current }}. + _requirements_installed: + .{{ _requirements_installed }}. + Run dev/setup-env.sh to fix this. vars: # note difference filter requires lists, so need to rearrange yaml from files. From 7f8e61a42a5935742284ee72b3843b0a3e1fc0ca Mon Sep 17 00:00:00 2001 From: Max Norton Date: Mon, 4 Aug 2025 21:25:18 +0100 Subject: [PATCH 31/37] Shift pylint invalid-name linting beyond the Python shebang line --- dev/delete-cluster.py | 3 ++- .../{{cookiecutter.environment}}/tofu/baremetal-node-list.py | 3 ++- .../tofu/read-inventory-secrets.py | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/dev/delete-cluster.py b/dev/delete-cluster.py index b8f24b13a..f329e7413 100755 --- a/dev/delete-cluster.py +++ b/dev/delete-cluster.py @@ -1,4 +1,5 @@ -#!/usr/bin/env python # pylint: disable=invalid-name +#!/usr/bin/env python +# pylint: disable=invalid-name """ Delete infrastructure for a cluster without using Terraform. 
Useful for CI clusters. diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/baremetal-node-list.py b/environments/skeleton/{{cookiecutter.environment}}/tofu/baremetal-node-list.py index d07e59fb4..c1747ece4 100755 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/baremetal-node-list.py +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/baremetal-node-list.py @@ -1,4 +1,5 @@ -#!/usr/bin/env python # pylint: disable=invalid-name +#!/usr/bin/env python +# pylint: disable=invalid-name """opentofu external data program to list baremetal nodes Example usage: diff --git a/environments/skeleton/{{cookiecutter.environment}}/tofu/read-inventory-secrets.py b/environments/skeleton/{{cookiecutter.environment}}/tofu/read-inventory-secrets.py index 3728f5604..85ac0a973 100755 --- a/environments/skeleton/{{cookiecutter.environment}}/tofu/read-inventory-secrets.py +++ b/environments/skeleton/{{cookiecutter.environment}}/tofu/read-inventory-secrets.py @@ -1,4 +1,5 @@ -#!/usr/bin/env python # pylint: disable=invalid-name +#!/usr/bin/env python +# pylint: disable=invalid-name """opentofu external data program to load inventory string variables from a (possibly vault-encrypted) secrets file. From de0b35f8a77ba73887a5c4db1c6938ed9dde3bd7 Mon Sep 17 00:00:00 2001 From: Max Norton Date: Mon, 11 Aug 2025 11:37:03 +0100 Subject: [PATCH 32/37] Temporarily disable the ansible galaxy requirements validation --- ansible/validate.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/ansible/validate.yml b/ansible/validate.yml index c0af6b7c8..ce04dc6c9 100644 --- a/ansible/validate.yml +++ b/ansible/validate.yml @@ -46,6 +46,7 @@ _requirements_current: "{{ (lookup('file', _requirements_path) | from_yaml).values() | flatten }}" # below produced by dev/setup-env.sh - gives empty list if file is missing: _requirements_installed: "{{ ((lookup('file', _requirements_path + '.last', errors='ignore') or '{}') | from_yaml ).values() | flatten }}" + when: 0 > 1 - name: Ensure control node is in inventory hosts: all From 66f0e7eaa2d0da9f1eb447cc78a836c303f49349 Mon Sep 17 00:00:00 2001 From: Max Norton Date: Mon, 11 Aug 2025 13:51:54 +0100 Subject: [PATCH 33/37] Reverting changes made to ansible.builtin.user and ansible.builtin.group where the name parameter was added. Reverting to ansible.builtin.group: because args aren't an expected label: groupadd: '{'name': 'grafana', 'gid': 979}' is not a valid group name --- ansible/bootstrap.yml | 6 ++---- ansible/fatimage.yml | 3 +-- ansible/roles/alertmanager/tasks/install.yml | 3 +-- ansible/roles/basic_users/tasks/main.yml | 6 ++---- ansible/roles/podman/tasks/configure.yml | 3 +-- 5 files changed, 7 insertions(+), 14 deletions(-) diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 09b6df20f..5783630e3 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -82,8 +82,7 @@ search_string: HOSTNAME=$(/usr/bin/hostnamectl --transient 2>/dev/null) || \ state: absent - name: Add system user groups - ansible.builtin.group: - name: "{{ item.group }}" + ansible.builtin.group: "{{ item.group }}" loop: "{{ appliances_local_users }}" when: - item.enable | default(true) | bool - "'group' in item" become_method: ansible.builtin.sudo # Need to change working directory otherwise we try to switch back to non-existent directory. 
become_flags: "-i" - name: Add system users - ansible.builtin.user: - name: "{{ item.user }}" + ansible.builtin.user: "{{ item.user }}" loop: "{{ appliances_local_users }}" when: item.enable | default(true) | bool become_method: ansible.builtin.sudo diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index f4abc6044..c76623382 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -202,8 +202,7 @@ # can't run cloudalchemy.prometheus/tasks/install.yml as it triggers a unit start # so below is a partial extraction of this: - name: Create prometheus system group - ansible.builtin.group: - name: prometheus + ansible.builtin.group: prometheus system: true state: present diff --git a/ansible/roles/alertmanager/tasks/install.yml b/ansible/roles/alertmanager/tasks/install.yml index f1cb9cd4b..f88c4e76d 100644 --- a/ansible/roles/alertmanager/tasks/install.yml +++ b/ansible/roles/alertmanager/tasks/install.yml @@ -1,7 +1,6 @@ --- - name: Create alertmanager system user - ansible.builtin.user: - name: "{{ alertmanager_system_user }}" + ansible.builtin.user: "{{ alertmanager_system_user }}" system: true create_home: false diff --git a/ansible/roles/basic_users/tasks/main.yml b/ansible/roles/basic_users/tasks/main.yml index 4df4e78ef..1f01f69c1 100644 --- a/ansible/roles/basic_users/tasks/main.yml +++ b/ansible/roles/basic_users/tasks/main.yml @@ -18,13 +18,11 @@ - basic_users_override_sssd | bool - name: Create groups - ansible.builtin.group: - name: "{{ item }}" + ansible.builtin.group: "{{ item }}" loop: "{{ basic_users_groups }}" - name: Create users - ansible.builtin.user: - name: "{{ basic_users_userdefaults | combine(item) | filter_user_params() | combine(_disable_homedir) }}" + ansible.builtin.user: "{{ basic_users_userdefaults | combine(item) | filter_user_params() | combine(_disable_homedir) }}" loop: "{{ basic_users_users }}" loop_control: label: "{{ item.name }}" diff --git a/ansible/roles/podman/tasks/configure.yml b/ansible/roles/podman/tasks/configure.yml index d2a7804aa..62f42cda6 100644 --- a/ansible/roles/podman/tasks/configure.yml +++ b/ansible/roles/podman/tasks/configure.yml @@ -39,8 +39,7 @@ - name: Reset ssh connection to allow user changes to affect 'current login user' ansible.builtin.meta: reset_connection - name: Ensure podman users exist - ansible.builtin.user: - name: "{{ item }}" + ansible.builtin.user: "{{ item }}" with_items: "{{ podman_users }}" register: podman_user_info become: true From ef9147d3827b0b42b38f36e23e228d4058ccf545 Mon Sep 17 00:00:00 2001 From: Max Norton Date: Mon, 11 Aug 2025 14:13:22 +0100 Subject: [PATCH 34/37] Arguments are dicts not labels --- ansible/roles/mysql/tasks/configure.yml | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/ansible/roles/mysql/tasks/configure.yml b/ansible/roles/mysql/tasks/configure.yml index dca4bde08..d57723a43 100644 --- a/ansible/roles/mysql/tasks/configure.yml +++ b/ansible/roles/mysql/tasks/configure.yml @@ -30,11 +30,9 @@ delay: 2 - name: Ensure mysql databases created - community.mysql.mysql_db: - name: "{{ item }}" + community.mysql.mysql_db: "{{ item }}" loop: "{{ mysql_databases}}" - name: Ensure mysql users present - community.mysql.mysql_user: - name: "{{ item }}" + community.mysql.mysql_user: "{{ item }}" loop: "{{ mysql_users }}" From 6a698b2c5b6ed423e961ceaf0594fb957f6de6a1 Mon Sep 17 00:00:00 2001 From: Max Norton Date: Tue, 12 Aug 2025 17:04:35 +0100 Subject: [PATCH 35/37] Preserve file permissions on .ssh directory contents --- 
ansible/roles/persist_hostkeys/tasks/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ansible/roles/persist_hostkeys/tasks/main.yml b/ansible/roles/persist_hostkeys/tasks/main.yml index 014feadb7..139281ad0 100644 --- a/ansible/roles/persist_hostkeys/tasks/main.yml +++ b/ansible/roles/persist_hostkeys/tasks/main.yml @@ -41,7 +41,7 @@ ansible.builtin.copy: content: "{{ item.content | b64decode }}" dest: "/etc/ssh/{{ item.source | regex_search('[^/]+$') }}" - mode: "0644" + mode: "preserve" loop: "{{ _slurp_keys.results }}" - ansible.builtin.meta: reset_connection From 32326a921bbea194eb112e527bada05baccaee84 Mon Sep 17 00:00:00 2001 From: Max Norton Date: Tue, 12 Aug 2025 18:24:59 +0100 Subject: [PATCH 36/37] Wherever we use become_user set become: true, keeps the linter happy and maintains functionality --- ansible/roles/basic_users/tasks/main.yml | 12 ++++++++---- ansible/roles/compute_init/files/compute-init.yml | 6 ++++-- ansible/roles/filebeat/tasks/install.yml | 3 ++- ansible/roles/mysql/tasks/install.yml | 3 ++- ansible/roles/opensearch/tasks/install.yml | 3 ++- 5 files changed, 18 insertions(+), 9 deletions(-) diff --git a/ansible/roles/basic_users/tasks/main.yml b/ansible/roles/basic_users/tasks/main.yml index 1f01f69c1..b6a9a1641 100644 --- a/ansible/roles/basic_users/tasks/main.yml +++ b/ansible/roles/basic_users/tasks/main.yml @@ -79,7 +79,8 @@ owner: "{{ item.name }}" group: "{{ item.name }}" mode: u=rwX,go= - # become_user: "{{ item.name }}" # Commenting out as become_user does not imply become: true + become: true + become_user: "{{ item.name }}" loop: "{{ basic_users_users }}" loop_control: label: "{{ item.name }}" @@ -95,7 +96,8 @@ comment: "{{ item.ssh_key_comment | default(item.name) }}" vars: _ssh_key_type: "{{ item.ssh_key_type | default('ed25519') }}" - # become_user: "{{ item.name }}" # Commenting out as become_user does not imply become: true + become: true + become_user: "{{ item.name }}" loop: "{{ basic_users_users }}" loop_control: label: "{{ item.name }}" @@ -112,7 +114,8 @@ manage_dir: false key: "{{ item.public_key }}" path: ~/.ssh/authorized_keys - # become_user: "{{ item.item.name }}" # Commenting out as become_user does not imply become: true + become: true + become_user: "{{ item.item.name }}" loop: "{{ _cluster_ssh_keypair.results }}" loop_control: label: "{{ item.item.name }}" @@ -129,7 +132,8 @@ manage_dir: false key: "{{ item.public_key }}" path: ~/.ssh/authorized_keys - # become_user: "{{ item.name }}" # Commenting out as become_user does not imply become: true + become: true + become_user: "{{ item.name }}" loop: "{{ basic_users_users }}" loop_control: label: "{{ item.name }}" diff --git a/ansible/roles/compute_init/files/compute-init.yml b/ansible/roles/compute_init/files/compute-init.yml index a2c554c99..ebd71cd78 100644 --- a/ansible/roles/compute_init/files/compute-init.yml +++ b/ansible/roles/compute_init/files/compute-init.yml @@ -84,7 +84,8 @@ - ansible.builtin.meta: end_play - name: Check if hostvars exist - # become_user: slurm # Commenting out as become_user does not imply become: true + become: true + become_user: slurm ansible.builtin.stat: path: "/mnt/cluster/hostvars/{{ ansible_hostname }}/hostvars.yml" register: hostvars_stat @@ -99,7 +100,8 @@ - ansible.builtin.meta: end_play - name: Sync /mnt/cluster to /var/tmp - # become_user: slurm # Commenting out as become_user does not imply become: true + become: true + become_user: slurm ansible.posix.synchronize: src: "/mnt/cluster/" dest: "/var/tmp/cluster/" diff 
--git a/ansible/roles/filebeat/tasks/install.yml b/ansible/roles/filebeat/tasks/install.yml index eaf621cc5..74c3b09c2 100644 --- a/ansible/roles/filebeat/tasks/install.yml +++ b/ansible/roles/filebeat/tasks/install.yml @@ -11,7 +11,8 @@ containers.podman.podman_image: name: "docker.elastic.co/beats/filebeat-oss" tag: "{{ filebeat_version }}" - # become_user: "{{ filebeat_podman_user }}" # Commenting out as become_user does not imply become: true + become: true + become_user: "{{ filebeat_podman_user }}" - name: Reload filebeat unit file # noqa: no-changed-when ansible.builtin.command: systemctl daemon-reload # noqa: command-instead-of-module diff --git a/ansible/roles/mysql/tasks/install.yml b/ansible/roles/mysql/tasks/install.yml index a3c66d758..0a108d279 100644 --- a/ansible/roles/mysql/tasks/install.yml +++ b/ansible/roles/mysql/tasks/install.yml @@ -21,4 +21,5 @@ containers.podman.podman_image: name: docker.io/library/mysql tag: "{{ mysql_tag }}" - # become_user: "{{ mysql_podman_user }}" # Commenting out as become_user does not imply become: true + become: true + become_user: "{{ mysql_podman_user }}" diff --git a/ansible/roles/opensearch/tasks/install.yml b/ansible/roles/opensearch/tasks/install.yml index f9bc1f8d0..0ca5ebd4f 100644 --- a/ansible/roles/opensearch/tasks/install.yml +++ b/ansible/roles/opensearch/tasks/install.yml @@ -20,7 +20,8 @@ containers.podman.podman_image: name: docker.io/opensearchproject/opensearch tag: "{{ opensearch_version }}" - # become_user: "{{ opensearch_podman_user }}" # Commenting out as become_user does not imply become: true + become: true + become_user: "{{ opensearch_podman_user }}" - name: Reload opensearch unit file # noqa: no-changed-when ansible.builtin.command: systemctl daemon-reload # noqa: command-instead-of-module From 5a55e6cd6135c00b9195f573e87b3bb704eb5312 Mon Sep 17 00:00:00 2001 From: Max Norton Date: Thu, 14 Aug 2025 07:38:09 +0100 Subject: [PATCH 37/37] Fix linting on merge of origin/main --- .github/bin/get-s3-image.sh | 2 +- ansible/bootstrap.yml | 4 +- ansible/fatimage.yml | 7 +- ansible/portal.yml | 12 +- ansible/roles/alertmanager/README.md | 1 + ansible/roles/alertmanager/tasks/install.yml | 3 +- ansible/roles/basic_users/tasks/main.yml | 4 +- ansible/roles/dnf_repos/defaults/main.yml | 2 +- ansible/roles/hpctests/tasks/build-hpl.yml | 1 + ansible/roles/hpctests/tasks/source-hpl.yml | 5 +- ansible/roles/lustre/README.md | 3 +- ansible/roles/mysql/tasks/configure.yml | 4 +- .../openondemand/tasks/codeserver_compute.yml | 9 +- .../openondemand/tasks/rstudio_compute.yml | 7 +- ansible/roles/podman/tasks/configure.yml | 2 +- .../{{cookiecutter.environment}}/tofu/main.tf | 32 +++--- docs/alerting.md | 2 +- docs/experimental/isolated-clusters.md | 104 +++++++++--------- docs/openondemand.md | 10 +- docs/production.md | 45 ++++---- docs/upgrades.md | 13 ++- environments/.caas/hooks/pre.yml | 2 +- environments/README.md | 5 +- .../inventory/group_vars/all/openondemand.yml | 19 ++-- environments/site/tofu/additional.tf | 4 +- environments/site/tofu/compute.tf | 4 +- environments/site/tofu/login.tf | 8 +- environments/site/tofu/node_group/nodes.tf | 8 +- .../site/tofu/node_group/variables.tf | 18 +-- environments/site/tofu/variables.tf | 18 +-- 30 files changed, 188 insertions(+), 170 deletions(-) diff --git a/.github/bin/get-s3-image.sh b/.github/bin/get-s3-image.sh index 7c9c46d0d..dc0c81655 100755 --- a/.github/bin/get-s3-image.sh +++ b/.github/bin/get-s3-image.sh @@ -17,7 +17,7 @@ if [ -n "$image_exists" ]; then else echo "Image 
$image_name not found in OpenStack. Getting it from S3." - wget https://leafcloud.store/swift/v1/AUTH_f39848421b2747148400ad8eeae8d536/$bucket_name/$image_name --progress=dot:giga + wget "https://leafcloud.store/swift/v1/AUTH_f39848421b2747148400ad8eeae8d536/$bucket_name/$image_name" --progress=dot:giga echo "Uploading image $image_name to OpenStack..." openstack image create --file "$image_name" --disk-format qcow2 "$image_name" --progress diff --git a/ansible/bootstrap.yml b/ansible/bootstrap.yml index 5783630e3..b21e78e86 100644 --- a/ansible/bootstrap.yml +++ b/ansible/bootstrap.yml @@ -82,7 +82,7 @@ search_string: HOSTNAME=$(/usr/bin/hostnamectl --transient 2>/dev/null) || \ state: absent - name: Add system user groups - ansible.builtin.group: "{{ item.group }}" + ansible.builtin.group: "{{ item.group }}" # noqa: args[module] loop: "{{ appliances_local_users }}" when: - item.enable | default(true) | bool @@ -91,7 +91,7 @@ # Need to change working directory otherwise we try to switch back to non-existent directory. become_flags: "-i" - name: Add system users - ansible.builtin.user: "{{ item.user }}" + ansible.builtin.user: "{{ item.user }}" # noqa: args[module] loop: "{{ appliances_local_users }}" when: item.enable | default(true) | bool become_method: ansible.builtin.sudo diff --git a/ansible/fatimage.yml b/ansible/fatimage.yml index 2747a4a44..58960eb6d 100644 --- a/ansible/fatimage.yml +++ b/ansible/fatimage.yml @@ -192,7 +192,7 @@ when: "'alertmanager' in group_names" - name: Download HPL source - include_role: + ansible.builtin.include_role: name: hpctests tasks_from: source-hpl.yml @@ -207,7 +207,8 @@ # can't run cloudalchemy.prometheus/tasks/install.yml as it triggers a unit start # so below is a partial extraction of this: - name: Create prometheus system group - ansible.builtin.group: prometheus + ansible.builtin.group: + name: prometheus system: true state: present @@ -270,7 +271,7 @@ - name: Add support for NVIDIA GPU auto detection to Slurm hosts: slurm_recompile - become: yes + become: true tasks: - name: Recompile slurm ansible.builtin.import_role: diff --git a/ansible/portal.yml b/ansible/portal.yml index 8766b1a81..361a603f5 100644 --- a/ansible/portal.yml +++ b/ansible/portal.yml @@ -43,10 +43,10 @@ tags: - openondemand - openondemand_rstudio - become: yes - gather_facts: yes + become: true + gather_facts: true tasks: - - import_role: + - ansible.builtin.import_role: name: openondemand tasks_from: rstudio_compute.yml when: appliances_mode != 'configure' # is run during build @@ -55,10 +55,10 @@ tags: - openondemand - openondemand_codeserver - become: yes - gather_facts: yes + become: true + gather_facts: true tasks: - - import_role: + - ansible.builtin.import_role: name: openondemand tasks_from: codeserver_compute.yml when: appliances_mode != 'configure' # is run during build diff --git a/ansible/roles/alertmanager/README.md b/ansible/roles/alertmanager/README.md index be0ce7c4c..900e0e194 100644 --- a/ansible/roles/alertmanager/README.md +++ b/ansible/roles/alertmanager/README.md @@ -15,6 +15,7 @@ Alertmanager is enabled by default on the `control` node in the `site` environment's `inventory/groups` file. In general usage may only require: + - Enabling the Slack integration (see section below). - Possibly setting `alertmanager_web_external_url`. 
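The `alertmanager_web_external_url` variable mentioned in the bullet above would typically be overridden through the environment's inventory; a minimal sketch, assuming the appliance's usual group_vars layout (the file path and URL here are illustrative, not prescribed by the role):

```yaml
# environments/site/inventory/group_vars/all/alertmanager.yml (illustrative path)
alertmanager_web_external_url: "https://alertmanager.cluster.example.org"
```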
diff --git a/ansible/roles/alertmanager/tasks/install.yml b/ansible/roles/alertmanager/tasks/install.yml
index f88c4e76d..f1cb9cd4b 100644
--- a/ansible/roles/alertmanager/tasks/install.yml
+++ b/ansible/roles/alertmanager/tasks/install.yml
@@ -1,6 +1,7 @@
 ---
 - name: Create alertmanager system user
-  ansible.builtin.user: "{{ alertmanager_system_user }}"
+  ansible.builtin.user:
+    name: "{{ alertmanager_system_user }}"
     system: true
     create_home: false
diff --git a/ansible/roles/basic_users/tasks/main.yml b/ansible/roles/basic_users/tasks/main.yml
index b6a9a1641..cd01430e4 100644
--- a/ansible/roles/basic_users/tasks/main.yml
+++ b/ansible/roles/basic_users/tasks/main.yml
@@ -18,11 +18,11 @@
     - basic_users_override_sssd | bool
 
 - name: Create groups
-  ansible.builtin.group: "{{ item }}"
+  ansible.builtin.group: "{{ item }}" # noqa: args[module]
   loop: "{{ basic_users_groups }}"
 
 - name: Create users
-  ansible.builtin.user: "{{ basic_users_userdefaults | combine(item) | filter_user_params() | combine(_disable_homedir) }}"
+  ansible.builtin.user: "{{ basic_users_userdefaults | combine(item) | filter_user_params() | combine(_disable_homedir) }}" # noqa: args[module]
   loop: "{{ basic_users_users }}"
   loop_control:
     label: "{{ item.name }}"
diff --git a/ansible/roles/dnf_repos/defaults/main.yml b/ansible/roles/dnf_repos/defaults/main.yml
index dca4cd4ab..edaf986c5 100644
--- a/ansible/roles/dnf_repos/defaults/main.yml
+++ b/ansible/roles/dnf_repos/defaults/main.yml
@@ -49,7 +49,7 @@ dnf_repos_openhpc_repolist:
     base_url: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.openhpc_updates[ansible_distribution_major_version] | appliances_repo_to_subpath }}"
 
 dnf_repos_extra_repolist: []
-dnf_repos_repolist: "{{ dnf_repos_default_repolist + (dnf_repos_openhpc_repolist if (openhpc_install_type | default('ohpc')) == 'ohpc' else []) + dnf_repos_extra_repolist }}"
+dnf_repos_repolist: "{{ dnf_repos_default_repolist + (dnf_repos_openhpc_repolist if (openhpc_install_type | default('ohpc')) == 'ohpc' else []) + dnf_repos_extra_repolist }}" # noqa: yaml[line-length]
 
 dnf_repos_epel_baseurl: "{{ dnf_repos_pulp_content_url }}/{{ appliances_pulp_repos.epel[ansible_distribution_major_version] | appliances_repo_to_subpath }}"
 dnf_repos_epel_description: "epel"
diff --git a/ansible/roles/hpctests/tasks/build-hpl.yml b/ansible/roles/hpctests/tasks/build-hpl.yml
index e8682fb0d..7339d9b7e 100644
--- a/ansible/roles/hpctests/tasks/build-hpl.yml
+++ b/ansible/roles/hpctests/tasks/build-hpl.yml
@@ -20,6 +20,7 @@
     src: "{{ hpctests_hpl_srcdir }}/setup/Make.Linux_PII_CBLAS"
     dest: "{{ hpctests_hpl_srcdir }}/Make.{{ hpctests_hpl_arch }}"
     remote_src: true
+    mode: "0644"
 
 - name: Modify make file
   ansible.builtin.replace:
diff --git a/ansible/roles/hpctests/tasks/source-hpl.yml b/ansible/roles/hpctests/tasks/source-hpl.yml
index 43585d3e6..60832405c 100644
--- a/ansible/roles/hpctests/tasks/source-hpl.yml
+++ b/ansible/roles/hpctests/tasks/source-hpl.yml
@@ -1,7 +1,7 @@
 ---
 - name: Make directory
-  file:
+  ansible.builtin.file:
    path: "/opt/hpl"
    state: directory
    owner: root
@@ -9,7 +9,8 @@
   mode: '0755'
 
 - name: Download HPL tarball
-  get_url:
+  # checkov:skip=CKV2_ANSIBLE_2: "Ensure that HTTPS url is used with get_url"
+  ansible.builtin.get_url:
    url: "http://www.netlib.org/benchmark/hpl/hpl-{{ hpctests_hpl_version }}.tar.gz"
    dest: "/opt/hpl/hpl-{{ hpctests_hpl_version }}.tar.gz"
    owner: root
diff --git a/ansible/roles/lustre/README.md b/ansible/roles/lustre/README.md
index 289a341ef..56e6b3a88 100644
--- a/ansible/roles/lustre/README.md
+++ b/ansible/roles/lustre/README.md
@@ -21,8 +21,9 @@ The following variables control configuration of Lustre clients.
 - `lustre_mount_options`. Optional default mount options. Default values are systemd defaults from [Lustre client docs](http://wiki.lustre.org/Mounting_a_Lustre_File_System_on_Client_Nodes).
 
 The following variables control the package build and install:
+
 - `lustre_version`: Optional str. Version of lustre to build, default `2.15.7`
-- `lustre_repo`: Optional str. URL for Lustre repo. Default is `git://git.whamcloud.com/fs/lustre-release`.git.
+- `lustre_repo`: Optional str. URL for Lustre repository. Default is `git://git.whamcloud.com/fs/lustre-release.git`.
 - `lustre_build_packages`: Optional list. Prerequisite packages required to build Lustre. See `defaults/main.yml`.
 - `lustre_build_dir`: Optional str. Path to build lustre at, default `/tmp/lustre-release`.
 - `lustre_configure_opts`: Optional list. Options to `./configure` command. Default builds client rpms supporting Mellanox OFED, without support for GSS keys.
diff --git a/ansible/roles/mysql/tasks/configure.yml b/ansible/roles/mysql/tasks/configure.yml
index d57723a43..7bf9cb3b5 100644
--- a/ansible/roles/mysql/tasks/configure.yml
+++ b/ansible/roles/mysql/tasks/configure.yml
@@ -30,9 +30,9 @@
     delay: 2
 
 - name: Ensure mysql databases created
-  community.mysql.mysql_db: "{{ item }}"
+  community.mysql.mysql_db: "{{ item }}" # noqa: args[module]
   loop: "{{ mysql_databases}}"
 
 - name: Ensure mysql users present
-  community.mysql.mysql_user: "{{ item }}"
+  community.mysql.mysql_user: "{{ item }}" # noqa: args[module]
   loop: "{{ mysql_users }}"
diff --git a/ansible/roles/openondemand/tasks/codeserver_compute.yml b/ansible/roles/openondemand/tasks/codeserver_compute.yml
index 7b39bf75d..6f178c5b9 100644
--- a/ansible/roles/openondemand/tasks/codeserver_compute.yml
+++ b/ansible/roles/openondemand/tasks/codeserver_compute.yml
@@ -1,24 +1,25 @@
 - name: Download Code Server RPM
   ansible.builtin.get_url:
-    url: "https://github.com/coder/code-server/releases/download/v{{ openondemand_code_server_version }}/code-server-{{ openondemand_code_server_version }}-amd64.rpm"
+    url: "https://github.com/coder/code-server/releases/download/v{{ openondemand_code_server_version }}/code-server-{{ openondemand_code_server_version }}-amd64.rpm" # noqa: yaml[line-length]
     dest: /tmp/code-server.rpm
     mode: '0644'
 
 - name: Install Code Server
+  # checkov:skip=CKV2_ANSIBLE_4: "Ensure that packages with untrusted or missing GPG signatures are not used by dnf"
   ansible.builtin.dnf:
     name: /tmp/code-server.rpm
     state: present
-    disable_gpg_check: yes
+    disable_gpg_check: true
 
 - name: Create module directory for Code Server
   ansible.builtin.file:
     path: /opt/ohpc/pub/modulefiles/code-server
     state: directory
     mode: '0755'
-    recurse: yes
+    recurse: true
 
 - name: Create modulefile for Code Server
-  copy:
+  ansible.builtin.copy:
     dest: "/opt/ohpc/pub/modulefiles/code-server/{{ openondemand_code_server_version }}"
     mode: "0644"
     content: |
diff --git a/ansible/roles/openondemand/tasks/rstudio_compute.yml b/ansible/roles/openondemand/tasks/rstudio_compute.yml
index 99dd83a9b..8cb3c911d 100644
--- a/ansible/roles/openondemand/tasks/rstudio_compute.yml
+++ b/ansible/roles/openondemand/tasks/rstudio_compute.yml
@@ -9,22 +9,23 @@
 
 - name: Download RStudio Server RPM
   ansible.builtin.get_url:
-    url: "https://download2.rstudio.org/server/rhel{{ ansible_distribution_major_version }}/x86_64/rstudio-server-rhel-{{ openondemand_rstudio_version }}-x86_64.rpm"
+    url: "https://download2.rstudio.org/server/rhel{{ ansible_distribution_major_version }}/x86_64/rstudio-server-rhel-{{ openondemand_rstudio_version }}-x86_64.rpm" # noqa: yaml[line-length]
     dest: /tmp/rstudio-server.rpm
     mode: '0644'
 
 - name: Install RStudio Server
+  # checkov:skip=CKV2_ANSIBLE_4: "Ensure that packages with untrusted or missing GPG signatures are not used by dnf"
   ansible.builtin.dnf:
     name: /tmp/rstudio-server.rpm
     state: present
-    disable_gpg_check: yes
+    disable_gpg_check: true
 
 - name: Create module directory for RStudio Server
   ansible.builtin.file:
     path: /opt/ohpc/pub/modulefiles/rstudio-server
     state: directory
     mode: '0755'
-    recurse: yes
+    recurse: true
 
 - name: Write modulefile for RStudio Server
   ansible.builtin.copy:
diff --git a/ansible/roles/podman/tasks/configure.yml b/ansible/roles/podman/tasks/configure.yml
index 62f42cda6..962712f74 100644
--- a/ansible/roles/podman/tasks/configure.yml
+++ b/ansible/roles/podman/tasks/configure.yml
@@ -39,7 +39,7 @@
 - name: Reset ssh connection to allow user changes to affect 'current login user'
   ansible.builtin.meta: reset_connection
 - name: Ensure podman users exist
-  ansible.builtin.user: "{{ item }}"
+  ansible.builtin.user: "{{ item }}" # noqa: args[module]
   with_items: "{{ podman_users }}"
   register: podman_user_info
   become: true
diff --git a/cookiecutter/{{cookiecutter.environment}}/tofu/main.tf b/cookiecutter/{{cookiecutter.environment}}/tofu/main.tf
index 9aa447595..abbcf9459 100644
--- a/cookiecutter/{{cookiecutter.environment}}/tofu/main.tf
+++ b/cookiecutter/{{cookiecutter.environment}}/tofu/main.tf
@@ -1,21 +1,23 @@
+# tflint-ignore: terraform_required_version
+
 variable "environment_root" {
-  type = string
-  description = "Path to environment root, automatically set by activate script"
+  type        = string
+  description = "Path to environment root, automatically set by activate script"
 }
 
 module "cluster" {
-  source = "../../site/tofu/"
-  environment_root = var.environment_root
+  source           = "../../site/tofu/"
+  environment_root = var.environment_root
 
-  # Environment specific variables
-  # Note that some of the variables below may need to be moved to the site environment
-  # defaults e.g cluster_networks should be in site if your staging and prod
-  # environments use the same networks
-  cluster_name =
-  cluster_image_id =
-  control_node_flavor =
-  cluster_networks =
-  key_pair =
-  login =
-  compute =
+  # Environment specific variables
+  # Note that some of the variables below may need to be moved to the site environment
+  # defaults e.g. cluster_networks should be in site if your staging and prod
+  # environments use the same networks
+  cluster_name = null
+  cluster_image_id = null
+  control_node_flavor = null
+  cluster_networks = null
+  key_pair = null
+  login = null
+  compute = null
 }
diff --git a/docs/alerting.md b/docs/alerting.md
index ea2740068..38bfb051a 100644
--- a/docs/alerting.md
+++ b/docs/alerting.md
@@ -21,7 +21,7 @@ must be configured to generate notifications.
 
 ## Enabling alertmanager
 
 1. Ensure both the `prometheus` and `alertmanager` servers are deployed on the
-control node - these are deployed by default in the site environment's groups:
+   control node - these are deployed by default in the site environment's groups:
 
    ```ini
   # environments/site/groups:
diff --git a/docs/experimental/isolated-clusters.md b/docs/experimental/isolated-clusters.md
index dfdc453cd..5cf5a7b38 100644
--- a/docs/experimental/isolated-clusters.md
+++ b/docs/experimental/isolated-clusters.md
@@ -11,11 +11,11 @@ all "default" features, i.e. roles/groups which are enabled either in the
 The full list of features and whether they are functional on such an
 "isolated" network is shown in the table below. Note that:
 
-- Using [EESSI](https://www.eessi.io/docs/) necessarily requires outbound
-  network access for the CernVM File System. However this can be provided
-  via an authenticated proxy. While the proxy configuration on the cluster node
-  is readable by all users, this proxy could be limited via acls to only provide
-  access to EESSI's CVMFS Stratum 1 servers.
+- Using [EESSI](https://www.eessi.io/docs/) necessarily requires outbound
+  network access for the CernVM File System. However this can be provided
+  via an authenticated proxy. While the proxy configuration on the cluster node
+  is readable by all users, this proxy could be limited via ACLs to only provide
+  access to EESSI's CVMFS Stratum 1 servers.
 
 ## Support by feature for isolated networks
 
@@ -25,53 +25,53 @@ See above for definition of "Default" features.
 
 In the "Isolated?" column:
 - "N": Known not to work.
 - "?": Not investigated at present.
 
-| Inventory group/role | Default? | Isolated? |
-| ----------------------| -------- | ------------------------ |
-| alertmanager | Y | Y |
-| ansible_init | Y | Y |
-| basic_users | Y | Y |
-| block_devices | Y | No (depreciated) |
-| cacerts | - | Y |
-| chrony | - | Y |
-| compute_init | - | Y |
-| cuda | - | ? |
-| eessi | Y | Y - see above |
-| etc_hosts | Y | Y |
-| extra_packages | - | No |
-| fail2ban | Y | Y |
-| filebeat | Y | Y |
-| firewalld | Y | Y |
-| freeipa_client | - | Y - image build required |
-| gateway | n/a | n/a - build only |
-| grafana | Y | Y |
-| hpctests | Y | Y |
-| k3s_agent | - | ? |
-| k3s_server | - | ? |
-| k9s | - | ? |
-| lustre | - | ? |
-| manila | Y | Y |
-| MySQL | Y | Y |
-| nfs | Y | Y |
-| nhc | Y | Y |
-| node_exporter | Y | Y |
-| openhpc | Y | Y |
-| openondemand | Y | Y |
-| openondemand_desktop | Y | Y |
-| openondemand_jupyter | Y | Y |
-| opensearch | Y | Y |
-| podman | Y | Y |
-| persist_hostkeys | Y | Y |
-| prometheus | Y | Y |
-| proxy | - | Y |
-| resolv_conf | - | ? |
-| slurm_exporter | Y | Y |
-| slurm_stats | Y | Y |
-| squid | - | ? |
-| sshd | - | ? |
-| sssd | - | ? |
-| systemd | Y | Y |
-| tuned | - | Y |
-| update | - | No |
+| Inventory group/role | Default? | Isolated?                |
+| -------------------- | -------- | ------------------------ |
+| alertmanager         | Y        | Y                        |
+| ansible_init         | Y        | Y                        |
+| basic_users          | Y        | Y                        |
+| block_devices        | Y        | No (deprecated)          |
+| cacerts              | -        | Y                        |
+| chrony               | -        | Y                        |
+| compute_init         | -        | Y                        |
+| cuda                 | -        | ?                        |
+| eessi                | Y        | Y - see above            |
+| etc_hosts            | Y        | Y                        |
+| extra_packages       | -        | No                       |
+| fail2ban             | Y        | Y                        |
+| filebeat             | Y        | Y                        |
+| firewalld            | Y        | Y                        |
+| freeipa_client       | -        | Y - image build required |
+| gateway              | n/a      | n/a - build only         |
+| grafana              | Y        | Y                        |
+| hpctests             | Y        | Y                        |
+| k3s_agent            | -        | ?                        |
+| k3s_server           | -        | ?                        |
+| k9s                  | -        | ?                        |
+| lustre               | -        | ?                        |
+| manila               | Y        | Y                        |
+| MySQL                | Y        | Y                        |
+| nfs                  | Y        | Y                        |
+| nhc                  | Y        | Y                        |
+| node_exporter        | Y        | Y                        |
+| openhpc              | Y        | Y                        |
+| openondemand         | Y        | Y                        |
+| openondemand_desktop | Y        | Y                        |
+| openondemand_jupyter | Y        | Y                        |
+| opensearch           | Y        | Y                        |
+| podman               | Y        | Y                        |
+| persist_hostkeys     | Y        | Y                        |
+| prometheus           | Y        | Y                        |
+| proxy                | -        | Y                        |
+| resolv_conf          | -        | ?                        |
+| slurm_exporter       | Y        | Y                        |
+| slurm_stats          | Y        | Y                        |
+| squid                | -        | ?                        |
+| sshd                 | -        | ?                        |
+| sssd                 | -        | ?                        |
+| systemd              | Y        | Y                        |
+| tuned                | -        | Y                        |
+| update               | -        | No                       |
 
 ## Image build
diff --git a/docs/openondemand.md b/docs/openondemand.md
index 51bb9c7ce..cd33cd54a 100644
--- a/docs/openondemand.md
+++ b/docs/openondemand.md
@@ -24,22 +24,24 @@ For examples of all of the above see the `smslabs-example` environment in this r
 To enable the Open OnDemand server, add a single host to the `openondemand` inventory group. Generally, this should be a node in the `login` group, as Open OnDemand must be able to access Slurm commands.
 
-To enable compute nodes for virtual desktops, Jupyter notebooks, RStudio, VSCode, or MATLAB (accessed through the Open OnDemand portal), add nodes/groups to the `openondemand_desktop`, `openondemand_jupyter`, `openondemand_rstudio`, `openondemand_codeserver`, and `openondemand_matlab` inventory groups respectively. These may be all or a subset of the `compute` group.
+To enable compute nodes for virtual desktops, Jupyter notebooks, RStudio, Visual Studio Code, or MATLAB (accessed through the Open OnDemand portal), add nodes/groups to the `openondemand_desktop`, `openondemand_jupyter`, `openondemand_rstudio`, `openondemand_codeserver`, and `openondemand_matlab` inventory groups respectively. These may be all or a subset of the `compute` group.
 
 The above functionality is configured by running the `ansible/portal.yml` playbook. This is automatically run as part of `ansible/site.yml`.
 
 ## MATLAB
 
-*NB* Due to licensing, the MATLAB batch connect app requires a MATLAB intallation to be present on the relevant compute nodes. The MATLAB app is therefore disabled by default, and must be enabled by setting `openondemand_matlab_partition` in e.g. `environments/site/inventory/group_vars/all/openondemand.yml` to the name of the partition where MATLAB is available.
+
+_NB_ Due to licensing, the MATLAB batch connect app requires a MATLAB installation to be present on the relevant compute nodes. The MATLAB app is therefore disabled by default, and must be enabled by setting `openondemand_matlab_partition` in e.g. `environments/site/inventory/group_vars/all/openondemand.yml` to the name of the partition where MATLAB is available.
 
 An Lmod modulefile also needs to be available on compute nodes - this is not provided by the appliance. See e.g. `roles/openondemand/tasks/rstudio_compute.yml` for an example. The modulefile must be named `matlab/$MATLAB_VERSION`, where the version matches the `openondemand_matlab_version` variable. This variable is set to empty in the role default so must be defined in `environments/site/inventory/group_vars/all/openondemand.yml`.
 
 As MATLAB requires a remote desktop, the TurboVNC and Xfce Desktop packages and configuration from the "openondemand_desktop" app will be automatically applied to nodes where the MATLAB app is enabled.
-# Default configuration
+## Default configuration
 
 See the [ansible/roles/openondemand/README.md](../ansible/roles/openondemand/README.md) for more details on the variables described below.
 
 The following variables have been given default values to allow Open OnDemand to work in a newly created environment without additional configuration, but generally should be overridden in `environments/site/inventory/group_vars/all/` with site-specific values:
+
 - `openondemand_servername` - this must be defined for both `openondemand`
   and `grafana` hosts (when Grafana is enabled). The default is `ansible_host`
   (i.e. the IP address) of the first host in the `openondemand` group. For production
diff --git a/docs/production.md b/docs/production.md
index 2929b94ac..984465db7 100644
--- a/docs/production.md
+++ b/docs/production.md
@@ -8,8 +8,9 @@ production-ready deployments.
   requires instance deletion/recreation.
 
 - At least two environments should be created using cookiecutter, which will
   derive from the `site` base environment:
-  - `production`: production environment
-  - `staging`: staging environment
+
+  - `production`: production environment
+  - `staging`: staging environment
 
   A `dev` environment should also be created if considered required, or this can be left until later.
@@ -27,19 +28,19 @@
   and referenced from the `site` and `production` environments, e.g.:
 
   ```yaml
-  # environments/production/hooks/pre.yml:
-  - name: Import parent hook
-    import_playbook: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/../site/hooks/pre.yml"
+# environments/production/hooks/pre.yml:
+- name: Import parent hook
+  import_playbook: "{{ lookup('env', 'APPLIANCES_ENVIRONMENT_ROOT') }}/../site/hooks/pre.yml"
   ```
 
 - When setting OpenTofu configurations:
-
-  - Environment-specific variables (`cluster_name`) should be hardcoded
-    as arguments into the cluster module block at `environments/$ENV/tofu/main.tf`.
-  - Environment-independent variables (e.g. maybe `cluster_net` if the
-    same is used for staging and production) should be set as *defaults*
-    in `environments/site/tofu/variables.tf`, and then don't need to
-    be passed in to the module.
+
+  - Environment-specific variables (`cluster_name`) should be hardcoded
+    as arguments into the cluster module block at `environments/$ENV/tofu/main.tf`.
+  - Environment-independent variables (e.g. maybe `cluster_net` if the
+    same is used for staging and production) should be set as _defaults_
+    in `environments/site/tofu/variables.tf`, and then don't need to
+    be passed in to the module.
 
 - Vault-encrypt secrets. Running the `generate-passwords.yml` playbook creates
   a secrets file at `environments/$ENV/inventory/group_vars/all/secrets.yml`.
@@ -107,13 +108,14 @@
   set the "attach" options and run `tofu apply` again - this should show there
   are no changes planned.
 
 - Consider whether Prometheus storage configuration is required. By default:
+
   - A 200GB state volume is provisioned (but see above)
   - The common environment [sets](../environments/common/inventory/group_vars/all/prometheus.yml)
     a maximum retention of 100 GB and 31 days
-  These may or may not be appropriate depending on the number of nodes, the
-  scrape interval, and other uses of the state volume (primarily the `slurmctld`
-  state and the `slurmdbd` database). See [docs/monitoring-and-logging](./monitoring-and-logging.md)
-  for more options.
+
+  These may or may not be appropriate depending on the number of nodes, the
+  scrape interval, and other uses of the state volume (primarily the `slurmctld`
+  state and the `slurmdbd` database). See [docs/monitoring-and-logging](./monitoring-and-logging.md)
+  for more options.
 
 - Configure Open OnDemand - see [specific documentation](openondemand.md) which
   notes specific variables required.
@@ -128,11 +130,12 @@
   the OpenTofu `login` definition.
 
 - Consider enabling topology aware scheduling. This is currently only supported if your cluster does not include any baremetal nodes. This can be enabled by:
-  1. Creating Availability Zones in your OpenStack project for each physical rack
-  2. Setting the `availability_zone` fields of compute groups in your OpenTofu configuration
-  3. Adding the `compute` group as a child of `topology` in `environments/$ENV/inventory/groups`
-  4. (Optional) If you are aware of the physical topology of switches above the rack-level, override `topology_above_rack_topology` in your groups vars
-     (see [topology docs](../ansible/roles/topology/README.md) for more detail)
+
+  1. Creating Availability Zones in your OpenStack project for each physical rack
+  2. Setting the `availability_zone` fields of compute groups in your OpenTofu configuration
+  3. Adding the `compute` group as a child of `topology` in `environments/$ENV/inventory/groups`
+  4. (Optional) If you are aware of the physical topology of switches above the rack-level, override `topology_above_rack_topology` in your groups vars
+     (see [topology docs](../ansible/roles/topology/README.md) for more detail)
 
 - Consider whether mapping of baremetal nodes to ironic nodes is required. See
   [PR 485](https://github.com/stackhpc/ansible-slurm-appliance/pull/485).
diff --git a/docs/upgrades.md b/docs/upgrades.md
index 9330b643c..5cf4bbebd 100644
--- a/docs/upgrades.md
+++ b/docs/upgrades.md
@@ -48,11 +48,12 @@
 It is possible this will introduce merge conflicts; fix these following the usual
 prompts. Generally merge conflicts should only exist where functionality which was
 added for your site (not in a hook) has subsequently been merged upstream.
 
-   Note that if upgrading from a release prior to v2.3, you will likely have merge conflicts
-   with existing site OpenTofu configurations in `environments/site/tofu`. Generally
-   - Changes to `default` values in `environments/site/tofu.variables.tf` should be rejected.
-   - All other changes to the OpenTofu configuration should be accepted, unless they overwrite
-     site-specific additional resources.
+Note that if upgrading from a release prior to v2.3, you will likely have merge conflicts
+with existing site OpenTofu configurations in `environments/site/tofu`. Generally
+
+- Changes to `default` values in `environments/site/tofu/variables.tf` should be rejected.
+- All other changes to the OpenTofu configuration should be accepted, unless they overwrite
+  site-specific additional resources.
 
 1. Push this branch and create a PR:
 
@@ -67,7 +68,7 @@ git push
    necessary to use new functionality or where functionality has been upstreamed as
    above. Note that the upstream `environments/site/inventory/groups` file contains
   all possible groups which can be used to enable features. This will be updated when pulling changes
-from the StackHPC repo, and any new groups should be enabled/disabled as required for
+from the StackHPC repository, and any new groups should be enabled/disabled as required for
   your site. Make changes as necessary.
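As a sketch of the groups mechanism referred to above (group and host names are illustrative only, not taken from the repository): a feature is enabled by giving its group members in `environments/site/inventory/groups`, typically by nesting an existing group under it:

```ini
# environments/site/inventory/groups (illustrative sketch)

# An empty group leaves the feature disabled:
[lustre]

# Adding children enables the feature for those hosts:
[lustre:children]
compute
```

This is why pulling an updated upstream groups file is safe by default: newly added feature groups arrive empty, and each site opts in explicitly.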
diff --git a/environments/.caas/hooks/pre.yml b/environments/.caas/hooks/pre.yml
index 755ed6a09..3b50899a0 100644
--- a/environments/.caas/hooks/pre.yml
+++ b/environments/.caas/hooks/pre.yml
@@ -40,7 +40,7 @@
 - hosts: login
   gather_facts: false
   tasks:
-    - name: Set up Ansible user
+    - name: Set up Ansible user # noqa: args[module]
       ansible.builtin.user: "{{ (appliances_local_users_default | selectattr('user.name', 'eq', appliances_local_users_ansible_user_name))[0]['user'] }}"
       become_method: ansible.builtin.sudo
       # Need to change working directory otherwise we try to switch back to non-existent directory.
diff --git a/environments/README.md b/environments/README.md
index 0eb9d41cc..94a66e1e4 100644
--- a/environments/README.md
+++ b/environments/README.md
@@ -32,15 +32,14 @@ for usage instructions for that component.
 
 ### common
 
 Shared configuration for all environments. This is not
-intended to be used as a standalone environment, hence the README does *not* detail
-how to provision the infrastructure. This environment should not be edited, except as part of upstreaming new features or bug fixes.
+intended to be used as a standalone environment, hence the README does _not_ detail
+how to provision the infrastructure. This environment should not be edited, except as part of upstreaming new features or bugfixes.
 
 ## site
 
 Provides the base configuration for all subsequent `cookiecutter` created environments,
 including OpenTofu configurations for infrastructure. In general, most local customisations
 should be made by adding to this environment.
-
 ## Defining an environment
 
 To define an environment using cookiecutter:
diff --git a/environments/common/inventory/group_vars/all/openondemand.yml b/environments/common/inventory/group_vars/all/openondemand.yml
index 12e1551bf..af7554a7d 100644
--- a/environments/common/inventory/group_vars/all/openondemand.yml
+++ b/environments/common/inventory/group_vars/all/openondemand.yml
@@ -189,7 +189,7 @@ openondemand_apps_rstudio_default:
     description: Request a RStudio server
     cluster: slurm
     attributes:
-      bc_queue: 
+      bc_queue:
         value: "{{ openondemand_rstudio_partition | default(none) }}"
       rstudio_module:
         label: RStudio module
@@ -198,10 +198,11 @@
         help: Choose your RStudio module
         widget: select
         options:
-          - ["RStudio v{{ openondemand_rstudio_version }}", "rstudio-server/{{ openondemand_rstudio_version }}}"]
+          - "RStudio v{{ openondemand_rstudio_version }}"
+          - "rstudio-server/{{ openondemand_rstudio_version }}"
       extra_modules_script:
         label: Extra modules script
-        help: If you'd like to load additional modules alongside RStudio-Server, put the 'module load ...' commands into a text file (one 'module load...' per line) and specify its path here
+        help: If you'd like to load additional modules alongside RStudio-Server, put the 'module load ...' commands into a text file (one 'module load...' per line) and specify its path here # noqa: yaml[line-length]
         widget: text_field
         required: false
       cores:
@@ -218,7 +219,7 @@
         label: RAM in GB
         help: How much RAM to reserve for your session. NB Ensure this is within the maximum allowed by your chosen partition
         min: 4
-        max: 700 
+        max: 700
         step: 1
         value: 4
         cachable: true
@@ -287,7 +288,8 @@
         help: Choose your MATLAB module
         widget: select
         options:
-          - ["MATLAB v{{ openondemand_matlab_version }}", "matlab/{{ openondemand_matlab_version }}"]
+          - "MATLAB v{{ openondemand_matlab_version }}"
+          - "matlab/{{ openondemand_matlab_version }}"
       cores:
         label: Number of CPU cores
         help: How many CPU cores to reserve for your session. NB Ensure this is within the maximum allowed by your chosen partition.
@@ -302,7 +304,7 @@
         label: RAM in GB
         help: How much RAM to reserve for your session. NB Ensure this is within the maximum allowed by your chosen partition
         min: 4
-        max: 700 
+        max: 700
         step: 1
         value: 4
         cachable: true
@@ -347,7 +349,8 @@
         help: Choose your Code Server module
         widget: select
         options:
-          - ["Code Server v{{ openondemand_code_server_version}}", "code-server/{{ openondemand_code_server_version }}"]
+          - "Code Server v{{ openondemand_code_server_version }}"
+          - "code-server/{{ openondemand_code_server_version }}"
       bc_queue:
         value: "{{ openondemand_codeserver_partition | default(none) }}"
       cores:
@@ -364,7 +367,7 @@
         label: RAM in GB
         help: How much RAM to reserve for your session. NB Ensure this is within the maximum allowed by your chosen partition
         min: 4
-        max: 700 
+        max: 700
         step: 1
         value: 4
         cachable: true
diff --git a/environments/site/tofu/additional.tf b/environments/site/tofu/additional.tf
index db415890e..57a6a7414 100644
--- a/environments/site/tofu/additional.tf
+++ b/environments/site/tofu/additional.tf
@@ -13,7 +13,7 @@ module "additional" {
   key_pair         = var.key_pair
   environment_root = var.environment_root
   config_drive     = var.config_drive
-  
+
   # can be set for group, defaults to top-level value:
   image_id   = lookup(each.value, "image_id", var.cluster_image_id)
   vnic_types = lookup(each.value, "vnic_types", var.vnic_types)
@@ -32,7 +32,7 @@
   match_ironic_node = lookup(each.value, "match_ironic_node", null)
   availability_zone = lookup(each.value, "availability_zone", null)
   ip_addresses      = lookup(each.value, "ip_addresses", null)
-  security_group_ids = lookup(each.value, "security_group_ids", [for o in data.openstack_networking_secgroup_v2.nonlogin: o.id])
+  security_group_ids = lookup(each.value, "security_group_ids", [for o in data.openstack_networking_secgroup_v2.nonlogin : o.id])
 
   additional_cloud_config      = lookup(each.value, "additional_cloud_config", var.additional_cloud_config)
   additional_cloud_config_vars = lookup(each.value, "additional_cloud_config_vars", var.additional_cloud_config_vars)
diff --git a/environments/site/tofu/compute.tf b/environments/site/tofu/compute.tf
index 9ef05b88b..8519d6dca 100644
--- a/environments/site/tofu/compute.tf
+++ b/environments/site/tofu/compute.tf
@@ -13,7 +13,7 @@ module "compute" {
   key_pair         = var.key_pair
   environment_root = var.environment_root
   config_drive     = var.config_drive
-  
+
   # can be set for group, defaults to top-level value:
   image_id   = lookup(each.value, "image_id", var.cluster_image_id)
   vnic_types = lookup(each.value, "vnic_types", var.vnic_types)
@@ -65,5 +65,5 @@
     "additional_cloud_config",
     "additional_cloud_config_vars"
   ]
-  
+
 }
diff --git a/environments/site/tofu/login.tf b/environments/site/tofu/login.tf
index 7f4c28629..d61582367 100644
--- a/environments/site/tofu/login.tf
+++ b/environments/site/tofu/login.tf
@@ -13,7 +13,7 @@ module "login" {
   key_pair         = var.key_pair
   environment_root = var.environment_root
   config_drive     = var.config_drive
-  
+
   # can be set for group, defaults to top-level value:
   image_id   = lookup(each.value, "image_id", var.cluster_image_id)
   vnic_types = lookup(each.value, "vnic_types", var.vnic_types)
@@ -24,7 +24,7 @@
   nodename_template            = lookup(each.value, "nodename_template", var.cluster_nodename_template)
   additional_cloud_config      = lookup(each.value, "additional_cloud_config", var.additional_cloud_config)
   additional_cloud_config_vars = lookup(each.value, "additional_cloud_config_vars", var.additional_cloud_config_vars)
-  
+
   # optionally set for group:
   networks = concat(var.cluster_networks, lookup(each.value, "extra_networks", []))
   # here null means "use module var default"
@@ -43,7 +43,7 @@
   # not using openstack_compute_instance_v2.control.access_ip_v4 to avoid
   # updates to node metadata on deletion/recreation of the control node:
   control_address = openstack_networking_port_v2.control[var.cluster_networks[0].network].all_fixed_ips[0]
-  security_group_ids = lookup(each.value, "security_group_ids", [for o in data.openstack_networking_secgroup_v2.login: o.id])
+  security_group_ids = lookup(each.value, "security_group_ids", [for o in data.openstack_networking_secgroup_v2.login : o.id])
   baremetal_nodes = data.external.baremetal_nodes.result
 
   # input dict validation:
@@ -70,5 +70,5 @@
     "additional_cloud_config_vars",
     "security_group_ids"
   ]
-  
+
 }
diff --git a/environments/site/tofu/node_group/nodes.tf b/environments/site/tofu/node_group/nodes.tf
index ba623b367..7e6ffd829 100644
--- a/environments/site/tofu/node_group/nodes.tf
+++ b/environments/site/tofu/node_group/nodes.tf
@@ -33,10 +33,10 @@ resource "openstack_blockstorage_volume_v3" "compute" {
 
   for_each = local.all_compute_volumes
 
-  name = "${var.cluster_name}-${each.key}"
-  description = "Compute node ${each.value.node} volume ${each.value.volume}"
-  size = var.extra_volumes[each.value.volume].size
-  volume_type = var.extra_volumes[each.value.volume].volume_type
+  name        = "${var.cluster_name}-${each.key}"
+  description = "Compute node ${each.value.node} volume ${each.value.volume}"
+  size        = var.extra_volumes[each.value.volume].size
+  volume_type = var.extra_volumes[each.value.volume].volume_type
 }
 
 resource "openstack_compute_volume_attach_v2" "compute" {
diff --git a/environments/site/tofu/node_group/variables.tf b/environments/site/tofu/node_group/variables.tf
index c043409be..2f4bdd2da 100644
--- a/environments/site/tofu/node_group/variables.tf
+++ b/environments/site/tofu/node_group/variables.tf
@@ -65,11 +65,11 @@ variable "extra_volumes" {
   EOF
   type = map(
     object({
-      size = number
+      size        = number
       volume_type = optional(string)
     })
   )
-  default = {}
+  default  = {}
   nullable = false
 }
@@ -195,17 +195,17 @@
 }
 
 variable "config_drive" {
-  type = bool
+  type = bool
 }
 
 variable "additional_cloud_config" {
-  type = string
-  default = ""
-  nullable = false
+  type     = string
+  default  = ""
+  nullable = false
 }
 
 variable "additional_cloud_config_vars" {
-  type = map(any)
-  default = {}
-  nullable = false
+  type     = map(any)
+  default  = {}
+  nullable = false
 }
diff --git a/environments/site/tofu/variables.tf b/environments/site/tofu/variables.tf
index 0847cd3b2..98d4eac54 100644
--- a/environments/site/tofu/variables.tf
+++ b/environments/site/tofu/variables.tf
@@ -316,28 +316,28 @@ variable "cluster_nodename_template" {
 }
 
 variable "config_drive" {
-  description = <<-EOT
+  description = <<-EOT
     Whether to enable Nova config drives on all nodes, which will attach a
     drive containing information usually provided through the metadata
     service.
   EOT
-  type = bool
-  default = null
+  type    = bool
+  default = null
 }
 
 variable "additional_cloud_config" {
-  description = <<-EOT
+  description = <<-EOT
     Multiline string to be appended to the node's cloud-init cloud-config
     user-data. Must be in yaml format and not include the #cloud-config or
     any other user-data headers. See
     https://cloudinit.readthedocs.io/en/latest/explanation/format.html#cloud-config-data.
     Can be a templatestring parameterised by `additional_cloud_config_vars`.
 
     The `boot-cmd`, `fqdn` and `mounts` modules must not be specified.
   EOT
-  type = string
-  default = ""
+  type    = string
+  default = ""
 }
 
 variable "additional_cloud_config_vars" {
-  description = "Map of values passed to the `additional_cloud_config` templatestring"
-  type = map(any)
-  default = {}
+  description = "Map of values passed to the `additional_cloud_config` templatestring"
+  type        = map(any)
+  default     = {}
 }