Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions bibigrid/core/actions/create.py
Original file line number Diff line number Diff line change
Expand Up @@ -338,13 +338,14 @@ def add_volume_device_info_to_instance(self, provider, server, instance):
server_volume["id"] == volume["id"]), None)
if not server_volume:
raise RuntimeError(
f"Created server {server['name']} doesn't have attached volume {volume['name']}.")
f"Created server {server['name']} doesn't have attached volume {volume.get('name')} "
f"(volume_id:{volume.get('id')}).")
device = server_volume.get("device")
final_volumes.append({**volume, "device": device})

self.log.debug(f"Added Configuration: Instance {server['name']} has volume {volume['name']} "
f"as device {device} that is going to be mounted to "
f"{volume.get('mountPoint')}")
f"(volume_id:{volume.get('id')}) "
f"as device {device} (Mount Point {volume.get('mountPoint', 'Will Not Be Mounted')})")

self.write_remote.append(
({"volumes": final_volumes}, os.path.join(a_rp.HOST_VARS_FOLDER_REMOTE, f"{server['name']}.yaml"),
Expand Down Expand Up @@ -529,6 +530,7 @@ def create(self): # pylint: disable=too-many-branches,too-many-statements
self.log.info("%s not found. Creating folder.", folder)
os.mkdir(folder)
self.generate_keypair()
self.log.debug("Keypair generated")
self.delete_old_vars()
self.prepare_configurations()
self.create_defaults()
Expand Down
5 changes: 3 additions & 2 deletions bibigrid/core/actions/terminate.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,8 +168,9 @@ def delete_security_groups(provider, cluster_id, security_groups, log, timeout=5
while not tmp_success:
try:
tmp_success = provider.delete_security_group(security_group_name)
except ConflictException:
log.info(f"ConflictException on deletion attempt on {provider.cloud_specification['identifier']}.")
except ConflictException as exc:
log.info(f"ConflictException on deletion attempt on {provider.cloud_specification['identifier']}: "
f"{exc}")
tmp_success = False
if tmp_success:
break
Expand Down
9 changes: 5 additions & 4 deletions bibigrid/core/utility/validate_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,15 +370,16 @@ def check_instance_type_image_combination(self, instance_type, instance_image, p
def _check_volume(self, provider, volume, count):
success = True
if volume.get("exists"):
if volume.get("name"):
volume_object = provider.get_volume_by_id_or_name(volume["name"])
volume_name_or_id = volume.get("id", volume.get("name"))
if volume_name_or_id:
volume_object = provider.get_volume_by_id_or_name(volume_name_or_id)
if volume_object:
self.log.debug(
f"Found volume {volume['name']} on cloud "
f"Found volume {volume_name_or_id} on cloud "
f"{provider.cloud_specification['identifier']}.")
else:
self.log.warning(
f"Couldn't find volume {volume['name']} on cloud "
f"Couldn't find volume {volume_name_or_id} on cloud "
f"{provider.cloud_specification['identifier']}. "
"No size added to resource requirements dict."
)
Expand Down
2 changes: 2 additions & 0 deletions bibigrid/models/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,8 @@ class ConfigurationsModel(StrictModel):
def split_master_and_other(cls, values):
if isinstance(values, list):
values = {"configurations": values}
if values.get("master"):
return values
configs = values.get("configurations")
if not configs:
raise ValueError("Configurations list cannot be empty")
Expand Down
4 changes: 1 addition & 3 deletions bibigrid/resources/cloud_node_requirements.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
os_distro:
ubuntu:
os_versions:
- "24.04"
- "22.04"
- "20.04"
- "24.04"
7 changes: 7 additions & 0 deletions bibigrid/resources/playbook/roles/bibigrid/handlers/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,10 @@
- name: Apt update
apt:
update_cache: "yes"

- name: Nvidia drivers installed
reboot:
reboot_timeout: 900
pre_reboot_delay: 5
post_reboot_delay: 20
test_command: nvidia-smi
16 changes: 16 additions & 0 deletions bibigrid/resources/playbook/roles/bibigrid/tasks/001-apt.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
state: "present"

- name: Add zabbix repositories # noqa: line-length
when: enable_zabbix|default(false)|bool
apt:
# yamllint disable-line
deb: "https://repo.zabbix.com/zabbix/7.0/ubuntu/pool/main/z/zabbix-release/zabbix-release_latest_7.0+{{ ansible_distribution | lower }}{{ ansible_distribution_version }}_all.deb"
Expand All @@ -66,5 +67,20 @@
repo: 'deb https://apt.bi.denbi.de/repos/apt/{{ ansible_distribution_release | lower }} {{ ansible_distribution_release | lower }} main'
notify: Apt update

- name: Install Nvidia Drivers
when: flavor.gres is defined
block:
- name: Install ubuntu-drivers-common
apt:
name: ubuntu-drivers-common
state: present
update_cache: true

- name: Install recommended NVIDIA driver
command: ubuntu-drivers install
args:
creates: /usr/bin/nvidia-smi
notify: Nvidia drivers installed

- name: Flush handlers
meta: flush_handlers
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# GRES CONFIG
{% set device_index = 0 %}
{% set ns = namespace(device_index=0) %}
{% for gres in flavor.gres %}
{% for i in range(gres.count | int) %}
Name={{ gres.name }} Type={{ gres.type }} File=/dev/nvidia{{ device_index }}
{% set device_index = device_index + 1 %}
{% endfor %}
{% for i in range(gres.count | int) %}
Name={{ gres.name }} Type={{ gres.type }} File=/dev/nvidia{{ ns.device_index }}
{% set ns.device_index = ns.device_index + 1 %}
{% endfor %}
{% endfor %}
5 changes: 3 additions & 2 deletions documentation/markdown/bibigrid_feature_list.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
| Name | Purpose |
|:----------------------------------------------------------------:|:---------------------------------------------------------------------------------------------------------------------------:|
| [Version](features/version.md) | Returns BiBiGrid's version for opening issues and the like |
| [Terminate Cluster](features/terminate_cluster.md) | Terminates the cluster specified by cluster-id i.e. removes key, application credentials, servers and floating-ips. |
| [Terminate](features/terminate_cluster.md) | Terminates the cluster specified by cluster-id i.e. removes key, application credentials, servers and floating-ips. |
| [Create](features/create.md) | Creates the cluster specified by the configuration. |
| [List Clusters](features/list_clusters.md) | Shows info of all clusters if no cluster-id is specified. Otherwise the cluster-id's cluster will be shown in great detail. |
| [List](features/list_clusters.md) | Shows info of all clusters if no cluster-id is specified. Otherwise the cluster-id's cluster will be shown in great detail. |
| [Check](features/check.md) | Checks if given configuration is valid and necessary security measures are taken. |
| [Web IDE](features/ide.md) | Connects to running IDE of cluster-id's cluster. Requires that the given cluster was set up with an IDE. |
| [Update](features/update.md) | Updates the master's playbook and runs that playbook for the master. Requires that no job is running and no workers up. |
Expand All @@ -16,5 +16,6 @@
| [BiBiGrid Cluster Commands](features/cluster_commands.md) | Short useful commands to get information on the cluster |
| [Other Configurations](features/other_configurations.md) | Info about custom `ansible.cfg` and `slurm.conf` |
| [BiBiGrid REST API](features/bibigrid_rest.md) | Info about how to run BiBiGrid REST. |
| [GPU Nodes](features/gres.md) | Info about GPU nodes. |

![](../images/actions.jpg)
2 changes: 1 addition & 1 deletion documentation/markdown/features/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ workerInstance:
volumes: # optional
- name: volumeName
snapshot: snapshotName # optional; to create volume from
# one or none of these
# one or none of these three
# permanent: False
# semiPermanent: False
# exists: False
Expand Down
10 changes: 10 additions & 0 deletions documentation/markdown/features/gres.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# GPU Nodes (GRES)

Currently, BiBiGrid can only handle GPU nodes with NVIDIA graphics cards.
BiBiGrid will [install NVIDIA drivers](../../../bibigrid/resources/playbook/roles/bibigrid/tasks/001-apt.yaml)
and set up the [gres.conf](https://slurm.schedmd.com/gres.conf.html) files and adjust
the [slurm.conf](https://slurm.schedmd.com/slurm.conf.html#OPT_Gres_1) accordingly.

For this the gres information is stored in the group vars.

You can check the installation by executing `nvidia-smi` or `sinfo -N -o "%N %G"` (which lists the gres).
2 changes: 1 addition & 1 deletion tests/rest_tests/rest_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
with open(os.path.join(ROOT_PATH, "resources/tests/rest_test.json"), 'r', encoding='utf-8') as file:
configurations_json = json.load(file)

CLUSTER_ID = "123456789123456"
CLUSTER_ID = "bibigrid1234567" # "a2ternativetest"

client = TestClient(app)
# Read the cloud_node_requirements YAML file and load it into a dictionary
Expand Down