Skip to content

Commit 0900d20

Browse files
committed
Merge pull request #60 from mgermain/helios_support
Added support for helios gpu cluster.
2 parents 37d07fe + 5ccc4bb commit 0900d20

File tree

4 files changed

+93
-10
lines changed

4 files changed

+93
-10
lines changed

smartdispatch/config/helios.json

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
{
2+
"maint": {
3+
"ram": 128,
4+
"modules": ["cuda/6.0.37"],
5+
"cores": 20,
6+
"max_walltime": "12:00:00",
7+
"gpus": 8,
8+
"nodes": 15
9+
}
10+
}

smartdispatch/job_generator.py

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,8 @@ def job_generator_factory(queue, commands, command_params={}, cluster_name=None)
1111
return GuilliminJobGenerator(queue, commands, command_params)
1212
elif cluster_name == "mammouth":
1313
return MammouthJobGenerator(queue, commands, command_params)
14+
elif cluster_name == "helios":
15+
return HeliosJobGenerator(queue, commands, command_params)
1416

1517
return JobGenerator(queue, commands, command_params)
1618

@@ -80,11 +82,23 @@ def write_pbs_files(self, pbs_dir="./"):
8082

8183
return pbs_filenames
8284

85+
def generate_pbs_with_account_name_from_env(self, environment_variable_name):
    """Generate the PBS files and tag each one with an account name.

    The account name is the last path component of the value stored in
    the environment variable `environment_variable_name` (e.g. a value
    of "/rap/group/" gives the account name "group"). It is passed to
    every generated PBS through its `A` option.

    Parameters
    ----------
    environment_variable_name : str
        Name of the environment variable holding the account path.

    Returns
    -------
    list
        The PBS objects produced by `JobGenerator.generate_pbs`, each
        updated with the account option.

    Raises
    ------
    ValueError
        If the environment variable is undefined or empty.
    """
    pbs_list = JobGenerator.generate_pbs(self)

    # A single lookup covers both the unset and the empty case. An empty
    # value must be rejected too: realpath("") resolves to the current
    # working directory, which would silently yield a bogus account name.
    account_path = os.environ.get(environment_variable_name)
    if not account_path:
        raise ValueError("Undefined environment variable: ${}. Please, provide your account name!".format(environment_variable_name))

    # realpath normalizes the path (e.g. drops a trailing slash) so that
    # basename returns the last real component instead of ''.
    account_name = os.path.basename(os.path.realpath(account_path))
    for pbs in pbs_list:
        pbs.add_options(A=account_name)

    return pbs_list
96+
8397

8498
class MammouthJobGenerator(JobGenerator):
8599

86-
def generate_pbs(self, *args, **kwargs):
87-
pbs_list = JobGenerator.generate_pbs(self, *args, **kwargs)
100+
def generate_pbs(self):
101+
pbs_list = JobGenerator.generate_pbs(self)
88102

89103
if self.queue.name.endswith("@mp2"):
90104
for pbs in pbs_list:
@@ -95,14 +109,23 @@ def generate_pbs(self, *args, **kwargs):
95109

96110
class GuilliminJobGenerator(JobGenerator):
97111

98-
def generate_pbs(self, *args, **kwargs):
99-
pbs_list = JobGenerator.generate_pbs(self, *args, **kwargs)
112+
def generate_pbs(self):
113+
return self.generate_pbs_with_account_name_from_env('HOME_GROUP')
114+
100115

101-
if 'HOME_GROUP' not in os.environ:
102-
raise ValueError("Undefined environment variable: $HOME_GROUP. Please, provide your account name if on Guillimin!")
116+
# https://wiki.calculquebec.ca/w/Ex%C3%A9cuter_une_t%C3%A2che#tab=tab6
117+
class HeliosJobGenerator(JobGenerator):
    """Job generator adapting the PBS files to the Helios GPU cluster."""

    def generate_pbs(self):
        """Generate PBS files that comply with the Helios submission rules.

        The account name is taken from the `RAP` environment variable,
        the `ppn` option is stripped from the `nodes` resource, and an
        odd number of requested GPUs is rounded up to the next even one.

        Returns
        -------
        list
            The adjusted PBS objects.

        Raises
        ------
        ValueError
            If the `RAP` environment variable is not usable (raised by
            `generate_pbs_with_account_name_from_env`).
        """
        pbs_list = self.generate_pbs_with_account_name_from_env('RAP')

        for pbs in pbs_list:
            # Remove forbidden ppn option. Default is 5 cores per 2 gpu.
            nodes = re.sub(r":ppn=[0-9]+", "", pbs.resources['nodes'])

            # Nb of GPUs has to be a multiple of 2. A request that names no
            # GPU count is left alone instead of failing on a missing match.
            gpus_match = re.search(r"gpus=([0-9]+)", nodes)
            if gpus_match is not None:
                nb_gpus = int(gpus_match.group(1))
                if nb_gpus % 2 != 0:
                    nodes = re.sub(r"gpus=[0-9]+", "gpus={0}".format(nb_gpus + 1), nodes)

            pbs.resources['nodes'] = nodes

        return pbs_list

smartdispatch/tests/test_job_generator.py

Lines changed: 50 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
from nose.tools import assert_true, assert_equal, assert_raises
1+
from nose.tools import assert_true, assert_false, assert_equal, assert_raises
22

33
import os
44
from smartdispatch.queue import Queue
5-
from smartdispatch.job_generator import JobGenerator, GuilliminJobGenerator, MammouthJobGenerator
5+
from smartdispatch.job_generator import JobGenerator, GuilliminJobGenerator, MammouthJobGenerator, HeliosJobGenerator
66
from smartdispatch.job_generator import job_generator_factory
77
import unittest
88
import tempfile
@@ -112,6 +112,7 @@ def test_generate_pbs(self):
112112

113113

114114
class TestMammouthQueue(unittest.TestCase):
115+
115116
def setUp(self):
116117
self.commands = ["echo 1", "echo 2", "echo 3", "echo 4"]
117118
self.queue = Queue("qtest@mp2", "mammouth")
@@ -122,6 +123,48 @@ def test_generate_pbs(self):
122123
assert_true("ppn=1" in job_generator.generate_pbs()[0].__str__())
123124

124125

126+
class TestHeliosQueue(unittest.TestCase):
    """Checks the PBS files produced by HeliosJobGenerator."""

    def setUp(self):
        self.commands = ["echo 1", "echo 2", "echo 3", "echo 4"]
        self.queue = Queue("maint", "helios")

        self.env_val = 'RAP'

        # Remember the pre-existing value (None when unset) so tearDown can
        # put the environment back exactly as it was.
        self.bak_env_val = os.environ.get(self.env_val)
        os.environ[self.env_val] = "/rap/group/"

        self.job_generator = HeliosJobGenerator(self.queue, self.commands)

    def tearDown(self):
        if self.bak_env_val is not None:
            os.environ[self.env_val] = self.bak_env_val
        else:
            # The variable did not exist before the test: remove the fake
            # value so it cannot leak into other tests. It may already be
            # gone if the test deleted it itself.
            os.environ.pop(self.env_val, None)

    def test_generate_pbs_invalid_group(self):
        del os.environ[self.env_val]

        assert_raises(ValueError, self.job_generator.generate_pbs)

    def test_generate_pbs_valid_group(self):
        pbs = self.job_generator.generate_pbs()[0]

        assert_equal(pbs.options['-A'], "group")

    def test_generate_pbs_ppn_is_absent(self):
        assert_false("ppn=" in str(self.job_generator.generate_pbs()[0]))

    def test_generate_pbs_even_nb_commands(self):
        assert_true("gpus=4" in str(self.job_generator.generate_pbs()[0]))

    def test_generate_pbs_odd_nb_commands(self):
        # Five commands would request five GPUs; Helios requires an even count.
        commands = ["echo 1", "echo 2", "echo 3", "echo 4", "echo 5"]
        job_generator = HeliosJobGenerator(self.queue, commands)

        assert_true("gpus=6" in str(job_generator.generate_pbs()[0]))
167+
125168
def test_job_generator_factory():
126169
queue = {"queue_name": "qtest"}
127170
commands = []
@@ -131,6 +174,11 @@ def test_job_generator_factory():
131174
job_generator = job_generator_factory(queue, commands, cluster_name="mammouth")
132175
assert_true(isinstance(job_generator, MammouthJobGenerator))
133176

177+
job_generator = job_generator_factory(queue, commands, cluster_name="helios")
178+
assert_true(isinstance(job_generator, HeliosJobGenerator))
179+
134180
job_generator = job_generator_factory(queue, commands, cluster_name=None)
135181
assert_true(isinstance(job_generator, JobGenerator))
136182
assert_true(not isinstance(job_generator, GuilliminJobGenerator))
183+
assert_true(not isinstance(job_generator, MammouthJobGenerator))
184+
assert_true(not isinstance(job_generator, HeliosJobGenerator))

smartdispatch/utils.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,4 +74,6 @@ def detect_cluster():
7474
cluster_name = "mammouth"
7575
elif server_name.split('.')[-1] == 'guil':
7676
cluster_name = "guillimin"
77+
elif server_name.split('.')[-1] == 'helios':
78+
cluster_name = "helios"
7779
return cluster_name

0 commit comments

Comments
 (0)