From 5932fa637645e4ec19f88aaf71ab9bfd61f5d979 Mon Sep 17 00:00:00 2001 From: Xaver Stiensmeier Date: Mon, 2 Feb 2026 13:38:46 +0100 Subject: [PATCH 1/6] fixed gres and included installation of NVIDIA drivers --- .../roles/bibigrid/tasks/001-apt.yaml | 23 +++++++++++++++++++ .../roles/bibigrid/templates/slurm/gres.j2 | 10 ++++---- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/bibigrid/resources/playbook/roles/bibigrid/tasks/001-apt.yaml b/bibigrid/resources/playbook/roles/bibigrid/tasks/001-apt.yaml index dec9e5b8..12c5a3ca 100644 --- a/bibigrid/resources/playbook/roles/bibigrid/tasks/001-apt.yaml +++ b/bibigrid/resources/playbook/roles/bibigrid/tasks/001-apt.yaml @@ -66,5 +66,28 @@ repo: 'deb https://apt.bi.denbi.de/repos/apt/{{ ansible_distribution_release | lower }} {{ ansible_distribution_release | lower }} main' notify: Apt update +- name: Install Nvidia Drivers + block: + - name: Install ubuntu-drivers-common + apt: + name: ubuntu-drivers-common + state: present + update_cache: yes + + - name: Install recommended NVIDIA driver + command: ubuntu-drivers install + args: + creates: /usr/bin/nvidia-smi + register: ubuntu_drivers_result + + - name: Reboot after NVIDIA driver installation + reboot: + reboot_timeout: 900 + pre_reboot_delay: 5 + post_reboot_delay: 20 + test_command: nvidia-smi + when: ubuntu_drivers_result.changed + when: flavor.gres is defined + - name: Flush handlers meta: flush_handlers diff --git a/bibigrid/resources/playbook/roles/bibigrid/templates/slurm/gres.j2 b/bibigrid/resources/playbook/roles/bibigrid/templates/slurm/gres.j2 index 7168728b..e9d45ad0 100644 --- a/bibigrid/resources/playbook/roles/bibigrid/templates/slurm/gres.j2 +++ b/bibigrid/resources/playbook/roles/bibigrid/templates/slurm/gres.j2 @@ -1,8 +1,8 @@ # GRES CONFIG -{% set device_index = 0 %} +{% set ns = namespace(device_index=0) %} {% for gres in flavor.gres %} - {% for i in range(gres.count | int) %} -Name={{ gres.name }} Type={{ gres.type }} 
File=/dev/nvidia{{ device_index }} - {% set device_index = device_index + 1 %} - {% endfor %} +{% for i in range(gres.count | int) %} +Name={{ gres.name }} Type={{ gres.type }} File=/dev/nvidia{{ ns.device_index }} +{% set ns.device_index = ns.device_index + 1 %} +{% endfor %} {% endfor %} \ No newline at end of file From 6eeda65895a335a043bdf62fa7a65ade899fd4d3 Mon Sep 17 00:00:00 2001 From: Xaver Stiensmeier Date: Mon, 2 Feb 2026 17:33:46 +0100 Subject: [PATCH 2/6] Fixed gres, zabbix repository and logging Untested. Includes installation for nvidia --- bibigrid/core/actions/create.py | 9 ++++++--- bibigrid/core/actions/terminate.py | 5 +++-- bibigrid/core/utility/validate_configuration.py | 9 +++++---- bibigrid/resources/cloud_node_requirements.yaml | 4 +--- .../playbook/roles/bibigrid/handlers/main.yml | 7 +++++++ .../playbook/roles/bibigrid/tasks/001-apt.yaml | 15 ++++----------- tests/rest_tests/rest_tests.py | 3 ++- 7 files changed, 28 insertions(+), 24 deletions(-) diff --git a/bibigrid/core/actions/create.py b/bibigrid/core/actions/create.py index 856f1763..98c68c72 100644 --- a/bibigrid/core/actions/create.py +++ b/bibigrid/core/actions/create.py @@ -117,6 +117,7 @@ def generate_keypair(self): # upload keyfiles for provider in self.providers: provider.create_keypair(name=self.key_name, public_key=public_key) + self.log.debug("Keypair generated - METHOD") def delete_old_vars(self): """ @@ -338,13 +339,14 @@ def add_volume_device_info_to_instance(self, provider, server, instance): server_volume["id"] == volume["id"]), None) if not server_volume: raise RuntimeError( - f"Created server {server['name']} doesn't have attached volume {volume['name']}.") + f"Created server {server['name']} doesn't have attached volume {volume.get('name')} " + f"(volume_id:{volume.get('id')}).") device = server_volume.get("device") final_volumes.append({**volume, "device": device}) self.log.debug(f"Added Configuration: Instance {server['name']} has volume {volume['name']} " - f"as 
device {device} that is going to be mounted to " - f"{volume.get('mountPoint')}") + f"(volume_id:{volume.get('id')}) " + f"as device {device} (Mount Point {volume.get('mountPoint', 'Will Not Be Mounted')})") self.write_remote.append( ({"volumes": final_volumes}, os.path.join(a_rp.HOST_VARS_FOLDER_REMOTE, f"{server['name']}.yaml"), @@ -529,6 +531,7 @@ def create(self): # pylint: disable=too-many-branches,too-many-statements self.log.info("%s not found. Creating folder.", folder) os.mkdir(folder) self.generate_keypair() + self.log.debug("Keypair generated") self.delete_old_vars() self.prepare_configurations() self.create_defaults() diff --git a/bibigrid/core/actions/terminate.py b/bibigrid/core/actions/terminate.py index 95fc96fb..2dea741b 100644 --- a/bibigrid/core/actions/terminate.py +++ b/bibigrid/core/actions/terminate.py @@ -168,8 +168,9 @@ def delete_security_groups(provider, cluster_id, security_groups, log, timeout=5 while not tmp_success: try: tmp_success = provider.delete_security_group(security_group_name) - except ConflictException: - log.info(f"ConflictException on deletion attempt on {provider.cloud_specification['identifier']}.") + except ConflictException as exc: + log.info(f"ConflictException on deletion attempt on {provider.cloud_specification['identifier']}: " + f"{exc}") tmp_success = False if tmp_success: break diff --git a/bibigrid/core/utility/validate_configuration.py b/bibigrid/core/utility/validate_configuration.py index e1a7d798..a448bbac 100644 --- a/bibigrid/core/utility/validate_configuration.py +++ b/bibigrid/core/utility/validate_configuration.py @@ -370,15 +370,16 @@ def check_instance_type_image_combination(self, instance_type, instance_image, p def _check_volume(self, provider, volume, count): success = True if volume.get("exists"): - if volume.get("name"): - volume_object = provider.get_volume_by_id_or_name(volume["name"]) + volume_name_or_id = volume.get("id", volume.get("name")) + if volume_name_or_id: + volume_object = 
provider.get_volume_by_id_or_name(volume_name_or_id) if volume_object: self.log.debug( - f"Found volume {volume['name']} on cloud " + f"Found volume {volume_name_or_id} on cloud " f"{provider.cloud_specification['identifier']}.") else: self.log.warning( - f"Couldn't find volume {volume['name']} on cloud " + f"Couldn't find volume {volume_name_or_id} on cloud " f"{provider.cloud_specification['identifier']}. " "No size added to resource requirements dict." ) diff --git a/bibigrid/resources/cloud_node_requirements.yaml b/bibigrid/resources/cloud_node_requirements.yaml index ca918c4d..136bd867 100644 --- a/bibigrid/resources/cloud_node_requirements.yaml +++ b/bibigrid/resources/cloud_node_requirements.yaml @@ -1,6 +1,4 @@ os_distro: ubuntu: os_versions: - - "24.04" - - "22.04" - - "20.04" \ No newline at end of file + - "24.04" \ No newline at end of file diff --git a/bibigrid/resources/playbook/roles/bibigrid/handlers/main.yml b/bibigrid/resources/playbook/roles/bibigrid/handlers/main.yml index 0c6e6eae..e5d0d966 100644 --- a/bibigrid/resources/playbook/roles/bibigrid/handlers/main.yml +++ b/bibigrid/resources/playbook/roles/bibigrid/handlers/main.yml @@ -78,3 +78,10 @@ - name: Apt update apt: update_cache: "yes" + +- name: Nvidia drivers installed + reboot: + reboot_timeout: 900 + pre_reboot_delay: 5 + post_reboot_delay: 20 + test_command: nvidia-smi diff --git a/bibigrid/resources/playbook/roles/bibigrid/tasks/001-apt.yaml b/bibigrid/resources/playbook/roles/bibigrid/tasks/001-apt.yaml index 12c5a3ca..5fae97fc 100644 --- a/bibigrid/resources/playbook/roles/bibigrid/tasks/001-apt.yaml +++ b/bibigrid/resources/playbook/roles/bibigrid/tasks/001-apt.yaml @@ -47,6 +47,7 @@ state: "present" - name: Add zabbix repositories # noqa: line-length + when: enable_zabbix|default(false)|bool apt: # yamllint disable-line deb: "https://repo.zabbix.com/zabbix/7.0/ubuntu/pool/main/z/zabbix-release/zabbix-release_latest_7.0+{{ ansible_distribution | lower }}{{ 
ansible_distribution_version }}_all.deb" @@ -67,27 +68,19 @@ notify: Apt update - name: Install Nvidia Drivers + when: flavor.gres is defined block: - name: Install ubuntu-drivers-common apt: name: ubuntu-drivers-common state: present - update_cache: yes + update_cache: true - name: Install recommended NVIDIA driver command: ubuntu-drivers install args: creates: /usr/bin/nvidia-smi - register: ubuntu_drivers_result - - - name: Reboot after NVIDIA driver installation - reboot: - reboot_timeout: 900 - pre_reboot_delay: 5 - post_reboot_delay: 20 - test_command: nvidia-smi - when: ubuntu_drivers_result.changed - when: flavor.gres is defined + notify: Nvidia drivers installed - name: Flush handlers meta: flush_handlers diff --git a/tests/rest_tests/rest_tests.py b/tests/rest_tests/rest_tests.py index 3d217b37..1e47faaf 100644 --- a/tests/rest_tests/rest_tests.py +++ b/tests/rest_tests/rest_tests.py @@ -14,7 +14,7 @@ with open(os.path.join(ROOT_PATH, "resources/tests/rest_test.json"), 'r', encoding='utf-8') as file: configurations_json = json.load(file) -CLUSTER_ID = "123456789123456" +CLUSTER_ID = "bibigrid1234567" # "a2ternativetest" client = TestClient(app) # Read the cloud_node_requirements YAML file and load it into a dictionary @@ -97,6 +97,7 @@ def test_state(state, do_assert=False): while test_state("starting"): time.sleep(10) test_state("running", do_assert=True) + input("Waiting...") test_terminate_cluster() while test_state("running"): time.sleep(2) From f1dd1de24d56270f4e265e8cdbcd45838a1d1024 Mon Sep 17 00:00:00 2001 From: Xaver Stiensmeier Date: Tue, 3 Feb 2026 10:51:49 +0100 Subject: [PATCH 3/6] removed pausing line --- tests/rest_tests/rest_tests.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/rest_tests/rest_tests.py b/tests/rest_tests/rest_tests.py index 1e47faaf..8758c4e5 100644 --- a/tests/rest_tests/rest_tests.py +++ b/tests/rest_tests/rest_tests.py @@ -97,7 +97,6 @@ def test_state(state, do_assert=False): while test_state("starting"): 
time.sleep(10) test_state("running", do_assert=True) - input("Waiting...") test_terminate_cluster() while test_state("running"): time.sleep(2) From 28c0ee3d74a7d806c8c04015718b06bfe43f99e6 Mon Sep 17 00:00:00 2001 From: Xaver Stiensmeier Date: Tue, 3 Feb 2026 13:26:38 +0100 Subject: [PATCH 4/6] support new rest schema --- bibigrid/models/configuration.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bibigrid/models/configuration.py b/bibigrid/models/configuration.py index fbc41151..e77661e7 100644 --- a/bibigrid/models/configuration.py +++ b/bibigrid/models/configuration.py @@ -201,6 +201,8 @@ class ConfigurationsModel(StrictModel): def split_master_and_other(cls, values): if isinstance(values, list): values = {"configurations": values} + if values.get("master"): + return values configs = values.get("configurations") if not configs: raise ValueError("Configurations list cannot be empty") From 896e9045b0c798683ca27dc55e87d000ed5465b9 Mon Sep 17 00:00:00 2001 From: Xaver Stiensmeier Date: Thu, 5 Feb 2026 14:07:48 +0100 Subject: [PATCH 5/6] updated gres and volume documentation --- documentation/markdown/bibigrid_feature_list.md | 5 +++-- documentation/markdown/features/configuration.md | 2 +- documentation/markdown/features/gres.md | 10 ++++++++++ 3 files changed, 14 insertions(+), 3 deletions(-) create mode 100644 documentation/markdown/features/gres.md diff --git a/documentation/markdown/bibigrid_feature_list.md b/documentation/markdown/bibigrid_feature_list.md index 3ec6b35d..7a9972dc 100644 --- a/documentation/markdown/bibigrid_feature_list.md +++ b/documentation/markdown/bibigrid_feature_list.md @@ -3,9 +3,9 @@ | Name | Purpose | |:----------------------------------------------------------------:|:---------------------------------------------------------------------------------------------------------------------------:| | [Version](features/version.md) | Returns BiBiGrid's version for opening issues and the like | -| [Terminate 
Cluster](features/terminate_cluster.md) | Terminates the cluster specified by cluster-id i.e. removes key, application credentials, servers and floating-ips. | +| [Terminate](features/terminate_cluster.md) | Terminates the cluster specified by cluster-id i.e. removes key, application credentials, servers and floating-ips. | | [Create](features/create.md) | Creates the cluster specified by the configuration. | -| [List Clusters](features/list_clusters.md) | Shows info of all clusters if no cluster-id is specified. Otherwise the cluster-id's cluster will be shown in great detail. | +| [List](features/list_clusters.md) | Shows info of all clusters if no cluster-id is specified. Otherwise the cluster-id's cluster will be shown in great detail. | | [Check](features/check.md) | Checks if given configuration is valid and necessary security measures are taken. | | [Web IDE](features/ide.md) | Connects to running IDE of cluster-id's cluster. Requires that given cluster was setup with an ide. | | [Update](features/update.md) | Updates the master's playbook and runs that playbook for the master. Requires that no job is running and no workers up. | @@ -16,5 +16,6 @@ | [BiBiGrid Cluster Commands](features/cluster_commands.md) | Short useful commands to get information on the cluster | | [Other Configurations](features/other_configurations.md) | Info about custom `ansible.cfg` and `slurm.conf` | | [BiBiGrid REST API](features/bibigrid_rest.md) | Info about custom how to run BiBiGrid REST. | +| [GPU Nodes](features/gres.md) | Info about gpu nodes. 
| ![](../images/actions.jpg) \ No newline at end of file diff --git a/documentation/markdown/features/configuration.md b/documentation/markdown/features/configuration.md index bc2b7e91..5e04831b 100644 --- a/documentation/markdown/features/configuration.md +++ b/documentation/markdown/features/configuration.md @@ -238,7 +238,7 @@ workerInstance: volumes: # optional - name: volumeName snapshot: snapshotName # optional; to create volume from - # one or none of these + # one or none of these three # permanent: False # semiPermanent: False # exists: False diff --git a/documentation/markdown/features/gres.md b/documentation/markdown/features/gres.md new file mode 100644 index 00000000..0c345550 --- /dev/null +++ b/documentation/markdown/features/gres.md @@ -0,0 +1,10 @@ +# GPU Nodes (GRES) + +Currently, BiBiGrid can only handle GPU nodes with NVIDIA graphics cards. +BiBiGrid will [install NVIDIA drivers](../../../bibigrid/resources/playbook/roles/bibigrid/tasks/001-apt.yaml) +and set up the [gres.conf](https://slurm.schedmd.com/gres.conf.html) files and adjust +the [slurm.conf](https://slurm.schedmd.com/slurm.conf.html#OPT_Gres_1) accordingly. + +For this, the gres information is stored in the group vars. + +You can check the installation by executing `nvidia-smi` or `sinfo -N -o "%N %G"` (which lists the gres). 
\ No newline at end of file From 213994ceda457b8a6ac2f5a345379a7fb8d008fc Mon Sep 17 00:00:00 2001 From: Xaver Stiensmeier Date: Thu, 19 Feb 2026 15:55:50 +0100 Subject: [PATCH 6/6] removed unnecessary log line removed a log line with little semantic information #705 --- bibigrid/core/actions/create.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bibigrid/core/actions/create.py b/bibigrid/core/actions/create.py index 98c68c72..e644fb22 100644 --- a/bibigrid/core/actions/create.py +++ b/bibigrid/core/actions/create.py @@ -117,7 +117,6 @@ def generate_keypair(self): # upload keyfiles for provider in self.providers: provider.create_keypair(name=self.key_name, public_key=public_key) - self.log.debug("Keypair generated - METHOD") def delete_old_vars(self): """