diff --git a/bibigrid/core/actions/create.py b/bibigrid/core/actions/create.py index 856f1763..e644fb22 100644 --- a/bibigrid/core/actions/create.py +++ b/bibigrid/core/actions/create.py @@ -338,13 +338,14 @@ def add_volume_device_info_to_instance(self, provider, server, instance): server_volume["id"] == volume["id"]), None) if not server_volume: raise RuntimeError( - f"Created server {server['name']} doesn't have attached volume {volume['name']}.") + f"Created server {server['name']} doesn't have attached volume {volume.get('name')} " + f"(volume_id:{volume.get('id')}).") device = server_volume.get("device") final_volumes.append({**volume, "device": device}) self.log.debug(f"Added Configuration: Instance {server['name']} has volume {volume['name']} " - f"as device {device} that is going to be mounted to " - f"{volume.get('mountPoint')}") + f"(volume_id:{volume.get('id')}) " + f"as device {device} (Mount Point {volume.get('mountPoint', 'Will Not Be Mounted')})") self.write_remote.append( ({"volumes": final_volumes}, os.path.join(a_rp.HOST_VARS_FOLDER_REMOTE, f"{server['name']}.yaml"), @@ -529,6 +530,7 @@ def create(self): # pylint: disable=too-many-branches,too-many-statements self.log.info("%s not found. 
Creating folder.", folder) os.mkdir(folder) self.generate_keypair() + self.log.debug("Keypair generated") self.delete_old_vars() self.prepare_configurations() self.create_defaults() diff --git a/bibigrid/core/actions/terminate.py b/bibigrid/core/actions/terminate.py index 95fc96fb..2dea741b 100644 --- a/bibigrid/core/actions/terminate.py +++ b/bibigrid/core/actions/terminate.py @@ -168,8 +168,9 @@ def delete_security_groups(provider, cluster_id, security_groups, log, timeout=5 while not tmp_success: try: tmp_success = provider.delete_security_group(security_group_name) - except ConflictException: - log.info(f"ConflictException on deletion attempt on {provider.cloud_specification['identifier']}.") + except ConflictException as exc: + log.info(f"ConflictException on deletion attempt on {provider.cloud_specification['identifier']}: " + f"{exc}") tmp_success = False if tmp_success: break diff --git a/bibigrid/core/utility/validate_configuration.py b/bibigrid/core/utility/validate_configuration.py index e1a7d798..a448bbac 100644 --- a/bibigrid/core/utility/validate_configuration.py +++ b/bibigrid/core/utility/validate_configuration.py @@ -370,15 +370,16 @@ def check_instance_type_image_combination(self, instance_type, instance_image, p def _check_volume(self, provider, volume, count): success = True if volume.get("exists"): - if volume.get("name"): - volume_object = provider.get_volume_by_id_or_name(volume["name"]) + volume_name_or_id = volume.get("id", volume.get("name")) + if volume_name_or_id: + volume_object = provider.get_volume_by_id_or_name(volume_name_or_id) if volume_object: self.log.debug( - f"Found volume {volume['name']} on cloud " + f"Found volume {volume_name_or_id} on cloud " f"{provider.cloud_specification['identifier']}.") else: self.log.warning( - f"Couldn't find volume {volume['name']} on cloud " + f"Couldn't find volume {volume_name_or_id} on cloud " f"{provider.cloud_specification['identifier']}. " "No size added to resource requirements dict." 
) diff --git a/bibigrid/models/configuration.py b/bibigrid/models/configuration.py index fbc41151..e77661e7 100644 --- a/bibigrid/models/configuration.py +++ b/bibigrid/models/configuration.py @@ -201,6 +201,8 @@ class ConfigurationsModel(StrictModel): def split_master_and_other(cls, values): if isinstance(values, list): values = {"configurations": values} + if values.get("master"): + return values configs = values.get("configurations") if not configs: raise ValueError("Configurations list cannot be empty") diff --git a/bibigrid/resources/cloud_node_requirements.yaml b/bibigrid/resources/cloud_node_requirements.yaml index ca918c4d..136bd867 100644 --- a/bibigrid/resources/cloud_node_requirements.yaml +++ b/bibigrid/resources/cloud_node_requirements.yaml @@ -1,6 +1,4 @@ os_distro: ubuntu: os_versions: - - "24.04" - - "22.04" - - "20.04" \ No newline at end of file + - "24.04" \ No newline at end of file diff --git a/bibigrid/resources/playbook/roles/bibigrid/handlers/main.yml b/bibigrid/resources/playbook/roles/bibigrid/handlers/main.yml index 0c6e6eae..e5d0d966 100644 --- a/bibigrid/resources/playbook/roles/bibigrid/handlers/main.yml +++ b/bibigrid/resources/playbook/roles/bibigrid/handlers/main.yml @@ -78,3 +78,10 @@ - name: Apt update apt: update_cache: "yes" + +- name: Nvidia drivers installed + reboot: + reboot_timeout: 900 + pre_reboot_delay: 5 + post_reboot_delay: 20 + test_command: nvidia-smi diff --git a/bibigrid/resources/playbook/roles/bibigrid/tasks/001-apt.yaml b/bibigrid/resources/playbook/roles/bibigrid/tasks/001-apt.yaml index dec9e5b8..5fae97fc 100644 --- a/bibigrid/resources/playbook/roles/bibigrid/tasks/001-apt.yaml +++ b/bibigrid/resources/playbook/roles/bibigrid/tasks/001-apt.yaml @@ -47,6 +47,7 @@ state: "present" - name: Add zabbix repositories # noqa: line-length + when: enable_zabbix|default(false)|bool apt: # yamllint disable-line deb: "https://repo.zabbix.com/zabbix/7.0/ubuntu/pool/main/z/zabbix-release/zabbix-release_latest_7.0+{{ 
ansible_distribution | lower }}{{ ansible_distribution_version }}_all.deb" @@ -66,5 +67,20 @@ repo: 'deb https://apt.bi.denbi.de/repos/apt/{{ ansible_distribution_release | lower }} {{ ansible_distribution_release | lower }} main' notify: Apt update +- name: Install Nvidia Drivers + when: flavor.gres is defined + block: + - name: Install ubuntu-drivers-common + apt: + name: ubuntu-drivers-common + state: present + update_cache: true + + - name: Install recommended NVIDIA driver + command: ubuntu-drivers install + args: + creates: /usr/bin/nvidia-smi + notify: Nvidia drivers installed + - name: Flush handlers meta: flush_handlers diff --git a/bibigrid/resources/playbook/roles/bibigrid/templates/slurm/gres.j2 b/bibigrid/resources/playbook/roles/bibigrid/templates/slurm/gres.j2 index 7168728b..e9d45ad0 100644 --- a/bibigrid/resources/playbook/roles/bibigrid/templates/slurm/gres.j2 +++ b/bibigrid/resources/playbook/roles/bibigrid/templates/slurm/gres.j2 @@ -1,8 +1,8 @@ # GRES CONFIG -{% set device_index = 0 %} +{% set ns = namespace(device_index=0) %} {% for gres in flavor.gres %} - {% for i in range(gres.count | int) %} -Name={{ gres.name }} Type={{ gres.type }} File=/dev/nvidia{{ device_index }} - {% set device_index = device_index + 1 %} - {% endfor %} +{% for i in range(gres.count | int) %} +Name={{ gres.name }} Type={{ gres.type }} File=/dev/nvidia{{ ns.device_index }} +{% set ns.device_index = ns.device_index + 1 %} +{% endfor %} {% endfor %} \ No newline at end of file diff --git a/documentation/markdown/bibigrid_feature_list.md b/documentation/markdown/bibigrid_feature_list.md index 3ec6b35d..7a9972dc 100644 --- a/documentation/markdown/bibigrid_feature_list.md +++ b/documentation/markdown/bibigrid_feature_list.md @@ -3,9 +3,9 @@ | Name | Purpose | |:----------------------------------------------------------------:|:---------------------------------------------------------------------------------------------------------------------------:| | 
[Version](features/version.md) | Returns BiBiGrid's version for opening issues and the like | -| [Terminate Cluster](features/terminate_cluster.md) | Terminates the cluster specified by cluster-id i.e. removes key, application credentials, servers and floating-ips. | +| [Terminate](features/terminate_cluster.md) | Terminates the cluster specified by cluster-id i.e. removes key, application credentials, servers and floating-ips. | | [Create](features/create.md) | Creates the cluster specified by the configuration. | -| [List Clusters](features/list_clusters.md) | Shows info of all clusters if no cluster-id is specified. Otherwise the cluster-id's cluster will be shown in great detail. | +| [List](features/list_clusters.md) | Shows info of all clusters if no cluster-id is specified. Otherwise the cluster-id's cluster will be shown in great detail. | | [Check](features/check.md) | Checks if given configuration is valid and necessary security measures are taken. | | [Web IDE](features/ide.md) | Connects to running IDE of cluster-id's cluster. Requires that given cluster was setup with an ide. | | [Update](features/update.md) | Updates the master's playbook and runs that playbook for the master. Requires that no job is running and no workers up. | @@ -16,5 +16,6 @@ | [BiBiGrid Cluster Commands](features/cluster_commands.md) | Short useful commands to get information on the cluster | | [Other Configurations](features/other_configurations.md) | Info about custom `ansible.cfg` and `slurm.conf` | | [BiBiGrid REST API](features/bibigrid_rest.md) | Info about custom how to run BiBiGrid REST. | +| [GPU Nodes](features/gres.md) | Info about gpu nodes. 
| ![](../images/actions.jpg) \ No newline at end of file diff --git a/documentation/markdown/features/configuration.md b/documentation/markdown/features/configuration.md index bc2b7e91..5e04831b 100644 --- a/documentation/markdown/features/configuration.md +++ b/documentation/markdown/features/configuration.md @@ -238,7 +238,7 @@ workerInstance: volumes: # optional - name: volumeName snapshot: snapshotName # optional; to create volume from - # one or none of these + # one or none of these three # permanent: False # semiPermanent: False # exists: False diff --git a/documentation/markdown/features/gres.md b/documentation/markdown/features/gres.md new file mode 100644 index 00000000..0c345550 --- /dev/null +++ b/documentation/markdown/features/gres.md @@ -0,0 +1,10 @@ +# GPU Nodes (GRES) + +Currently, BiBiGrid can only handle GPU nodes with NVIDIA graphics cards. +BiBiGrid will [install NVIDIA drivers](../../../bibigrid/resources/playbook/roles/bibigrid/tasks/001-apt.yaml) +and set up the [gres.conf](https://slurm.schedmd.com/gres.conf.html) files and adjust +the [slurm.conf](https://slurm.schedmd.com/slurm.conf.html#OPT_Gres_1) accordingly. + +For this, the gres information is stored in the group vars. + +You can check the installation by executing `nvidia-smi` or `sinfo -N -o "%N %G"` (which lists the gres). \ No newline at end of file diff --git a/tests/rest_tests/rest_tests.py b/tests/rest_tests/rest_tests.py index 3d217b37..8758c4e5 100644 --- a/tests/rest_tests/rest_tests.py +++ b/tests/rest_tests/rest_tests.py @@ -14,7 +14,7 @@ with open(os.path.join(ROOT_PATH, "resources/tests/rest_test.json"), 'r', encoding='utf-8') as file: configurations_json = json.load(file) -CLUSTER_ID = "123456789123456" +CLUSTER_ID = "bibigrid1234567" # "a2ternativetest" client = TestClient(app) # Read the cloud_node_requirements YAML file and load it into a dictionary