From 2245f6d1893b22f259e90045e135337a1db8e45d Mon Sep 17 00:00:00 2001 From: luccab user Date: Thu, 2 Oct 2025 19:07:00 +0000 Subject: [PATCH 1/3] adding num gpus and partition to output --- clusterscope/cli.py | 2 ++ clusterscope/cluster_info.py | 24 +++++++++++++++++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/clusterscope/cli.py b/clusterscope/cli.py index 1fba727..96d1cc4 100644 --- a/clusterscope/cli.py +++ b/clusterscope/cli.py @@ -207,6 +207,7 @@ def slurm(num_gpus: int, num_tasks_per_node: int, output_format: str, partition: unified_info = UnifiedInfo(partition=partition) job_requirements = unified_info.get_task_resource_requirements( + partition=partition, num_gpus=num_gpus, num_tasks_per_node=num_tasks_per_node, ) @@ -242,6 +243,7 @@ def array(num_gpus_per_task: int, output_format: str, partition: str): """Generate job requirements for an array job.""" unified_info = UnifiedInfo(partition=partition) job_requirements = unified_info.get_array_job_requirements( + partition=partition, num_gpus_per_task=num_gpus_per_task, ) diff --git a/clusterscope/cluster_info.py b/clusterscope/cluster_info.py index 346930e..1b03e3e 100644 --- a/clusterscope/cluster_info.py +++ b/clusterscope/cluster_info.py @@ -25,6 +25,8 @@ class ResourceShape(NamedTuple): cpu_cores: int memory: str tasks_per_node: int + gpus_per_node: int + slurm_partition: str def to_json(self) -> str: """Convert ResourceShape to JSON format. @@ -39,6 +41,8 @@ def to_json(self) -> str: "memory": self.memory, "tasks_per_node": self.tasks_per_node, "mem_gb": mem_gb, + "gpus_per_node": self.gpus_per_node, + "slurm_partition": self.slurm_partition, } return json.dumps(data, indent=2) @@ -53,6 +57,8 @@ def to_sbatch(self) -> str: f"#SBATCH --cpus-per-task={self.cpu_cores}", f"#SBATCH --mem={self.memory}", f"#SBATCH --ntasks-per-node={self.tasks_per_node}", + f"#SBATCH --gres=gpu:{self.gpus_per_node}", + f"#SBATCH --partition={self.slurm_partition}", ] return "\n".join(lines) @@ -67,6 +73,8 @@ def to_srun(self) -> str: f"--cpus-per-task={self.cpu_cores}", f"--mem={self.memory}", f"--ntasks-per-node={self.tasks_per_node}", + f"--gres=gpu:{self.gpus_per_node}", + f"--partition={self.slurm_partition}", ] return " ".join(cmd_parts) @@ -79,9 +87,11 @@ def to_submitit(self) -> str: mem_gb = parse_memory_to_gb(self.memory) params = { + "slurm_partition": self.slurm_partition, "cpus_per_task": self.cpu_cores, "mem_gb": mem_gb, "tasks_per_node": self.tasks_per_node, + "gpus_per_node": self.gpus_per_node, } return json.dumps(params, indent=2) @@ -222,7 +232,7 @@ def get_total_gpus_per_node(self) -> int: return max(total_gpus, 1) # Ensure at least 1 to avoid division by zero def get_task_resource_requirements( - self, num_gpus: int, num_tasks_per_node: int = 1 + self, partition: str, num_gpus: int, num_tasks_per_node: int = 1 ) -> ResourceShape: """Calculate resource requirements for better GPU packing based on node's GPU configuration. @@ -285,12 +295,16 @@ def get_task_resource_requirements( sbatch_memory = f"{required_ram_gb:.0f}G" return ResourceShape( + slurm_partition=partition, cpu_cores=sbatch_cpu_cores, memory=sbatch_memory, tasks_per_node=num_tasks_per_node, + gpus_per_node=num_gpus, ) - def get_array_job_requirements(self, num_gpus_per_task: int) -> ResourceShape: + def get_array_job_requirements( + self, partition: str, num_gpus_per_task: int + ) -> ResourceShape: """Calculate resource requirements for array jobs with optimal GPU packing. For array jobs, each array element gets its own resource allocation. @@ -349,7 +363,11 @@ def get_array_job_requirements(self, num_gpus_per_task: int) -> ResourceShape: # Array jobs always have 1 task per array element return ResourceShape( - cpu_cores=sbatch_cpu_cores, memory=sbatch_memory, tasks_per_node=1 + slurm_partition=partition, + cpu_cores=sbatch_cpu_cores, + memory=sbatch_memory, + tasks_per_node=1, + gpus_per_node=num_gpus_per_task, ) From bb352b98411a2e1cfb5bb73c335336a2bf6286a8 Mon Sep 17 00:00:00 2001 From: luccab user Date: Thu, 2 Oct 2025 19:24:07 +0000 Subject: [PATCH 2/3] tests --- tests/test_cluster_info.py | 96 ++++++++++++++------ tests/test_resource_shape.py | 169 +++++++++++++++++++---------------- 2 files changed, 163 insertions(+), 102 deletions(-) diff --git a/tests/test_cluster_info.py b/tests/test_cluster_info.py index da2b26e..58edfed 100644 --- a/tests/test_cluster_info.py +++ b/tests/test_cluster_info.py @@ -717,7 +717,9 @@ def test_get_task_resource_requirements_single_gpu_8gpu_node( mock_cpus.return_value = 192 mock_mem.return_value = 1843200 # 1.8TB in MB - result = self.unified_info.get_task_resource_requirements(num_gpus=1) + result = self.unified_info.get_task_resource_requirements( + partition="test_partition", num_gpus=1 + ) self.assertEqual(result.cpu_cores, 24) # 192/8 = 24 self.assertEqual(result.memory, "225G") # 1843200/8/1024 = 225GB @@ -734,7 +736,9 @@ def test_get_task_resource_requirements_four_gpus_8gpu_node( mock_cpus.return_value = 192 mock_mem.return_value = 1843200 - result = self.unified_info.get_task_resource_requirements(num_gpus=4) + result = self.unified_info.get_task_resource_requirements( + partition="test_partition", num_gpus=4 + ) self.assertEqual(result.cpu_cores, 96) # 192/8*4 = 96 self.assertEqual(result.memory, "900G") # 1843200/8*4/1024 = 900GB @@ -751,7 +755,9 @@ def test_get_task_resource_requirements_full_node_8gpu( mock_cpus.return_value = 192 mock_mem.return_value = 1843200 - result = self.unified_info.get_task_resource_requirements(num_gpus=8) + result = self.unified_info.get_task_resource_requirements( + partition="test_partition", num_gpus=8 + ) self.assertEqual(result.cpu_cores, 192) # All CPUs self.assertEqual(result.memory, "1800G") # All memory: 1843200/1024 = 1800GB @@ -769,12 +775,16 @@ def test_get_task_resource_requirements_4gpu_node_configuration( mock_mem.return_value = 524288 # 512GB in MB # Test 1 GPU on 4-GPU node - result = self.unified_info.get_task_resource_requirements(num_gpus=1) + result = self.unified_info.get_task_resource_requirements( + partition="test_partition", num_gpus=1 + ) self.assertEqual(result.cpu_cores, 16) # 64/4 = 16 self.assertEqual(result.memory, "128G") # 524288/4/1024 = 128GB # Test full 4-GPU node - result = self.unified_info.get_task_resource_requirements(num_gpus=4) + result = self.unified_info.get_task_resource_requirements( + partition="test_partition", num_gpus=4 + ) self.assertEqual(result.cpu_cores, 64) # All CPUs self.assertEqual(result.memory, "512G") # All memory @@ -790,7 +800,7 @@ def test_getResRequirements_with_multiple_tasks_per_node( mock_mem.return_value = 1843200 result = self.unified_info.get_task_resource_requirements( - num_gpus=4, num_tasks_per_node=2 + partition="test_partition", num_gpus=4, num_tasks_per_node=2 ) self.assertEqual(result.cpu_cores, 48) # (192/8*4)/2 = 48 per task @@ -808,7 +818,9 @@ def test_get_task_resource_requirements_memory_terabyte_format( mock_cpus.return_value = 192 mock_mem.return_value = 8388608 # 8TB in MB - result = self.unified_info.get_task_resource_requirements(num_gpus=8) + result = self.unified_info.get_task_resource_requirements( + partition="test_partition", num_gpus=8 + ) self.assertEqual(result.memory, "8192G") # 8388608/1024 = 8192GB @@ -823,7 +835,9 @@ def test_get_task_resource_requirements_cpu_rounding_up( mock_cpus.return_value = 191 # Odd number that doesn't divide evenly mock_mem.return_value = 1843200 - result = self.unified_info.get_task_resource_requirements(num_gpus=1) + result = self.unified_info.get_task_resource_requirements( + partition="test_partition", num_gpus=1 + ) # 191/8 = 23.875, should round up to 24 self.assertEqual(result.cpu_cores, 24) @@ -835,12 +849,16 @@ def test_getResRequirements_invalid_num_gpus(self, mock_total_gpus): # Test zero GPUs with self.assertRaises(ValueError) as context: - self.unified_info.get_task_resource_requirements(num_gpus=0) + self.unified_info.get_task_resource_requirements( + partition="test_partition", num_gpus=0 + ) self.assertIn("num_gpus must be between 1 and 8", str(context.exception)) # Test more than max GPUs with self.assertRaises(ValueError) as context: - self.unified_info.get_task_resource_requirements(num_gpus=9) + self.unified_info.get_task_resource_requirements( + partition="test_partition", num_gpus=9 + ) self.assertIn("num_gpus must be between 1 and 8", str(context.exception)) @patch.object(UnifiedInfo, "get_total_gpus_per_node") @@ -850,7 +868,7 @@ def test_getResRequirements_invalid_num_tasks_per_node(self, mock_total_gpus): with self.assertRaises(ValueError) as context: self.unified_info.get_task_resource_requirements( - num_gpus=1, num_tasks_per_node=0 + partition="test_partition", num_gpus=1, num_tasks_per_node=0 ) self.assertIn("num_tasks_per_node must be at least 1", str(context.exception)) @@ -865,7 +883,9 @@ def test_getArrayJobRequirements_single_gpu( mock_cpus.return_value = 192 mock_mem.return_value = 1843200 - result = self.unified_info.get_array_job_requirements(num_gpus_per_task=1) + result = self.unified_info.get_array_job_requirements( + partition="test_partition", num_gpus_per_task=1 + ) self.assertEqual(result.cpu_cores, 24) # 192/8 = 24 self.assertEqual(result.memory, "225G") # 1843200/8/1024 = 225GB @@ -882,7 +902,9 @@ def test_getArrayJobRequirements_multiple_gpus( mock_cpus.return_value = 192 mock_mem.return_value = 1843200 - result = self.unified_info.get_array_job_requirements(num_gpus_per_task=4) + result = self.unified_info.get_array_job_requirements( + partition="test_partition", num_gpus_per_task=4 + ) self.assertEqual(result.cpu_cores, 96) # 192/8*4 = 96 self.assertEqual(result.memory, "900G") # 1843200/8*4/1024 = 900GB @@ -899,7 +921,9 @@ def test_getArrayJobRequirements_full_node( mock_cpus.return_value = 192 mock_mem.return_value = 1843200 - result = self.unified_info.get_array_job_requirements(num_gpus_per_task=8) + result = self.unified_info.get_array_job_requirements( + partition="test_partition", num_gpus_per_task=8 + ) self.assertEqual(result.cpu_cores, 192) # All CPUs self.assertEqual( @@ -918,7 +942,9 @@ def test_getArrayJobRequirements_4gpu_node( mock_cpus.return_value = 64 mock_mem.return_value = 524288 - result = self.unified_info.get_array_job_requirements(num_gpus_per_task=2) + result = self.unified_info.get_array_job_requirements( + partition="test_partition", num_gpus_per_task=2 + ) self.assertEqual(result.cpu_cores, 32) # 64/4*2 = 32 self.assertEqual(result.memory, "256G") # 524288/4*2/1024 = 256GB @@ -931,14 +957,18 @@ def test_getArrayJobRequirements_invalid_num_gpus_per_task(self, mock_total_gpus # Test zero GPUs with self.assertRaises(ValueError) as context: - self.unified_info.get_array_job_requirements(num_gpus_per_task=0) + self.unified_info.get_array_job_requirements( + partition="test_partition", num_gpus_per_task=0 + ) self.assertIn( "num_gpus_per_task must be between 1 and 8", str(context.exception) ) # Test more than max GPUs with self.assertRaises(ValueError) as context: - self.unified_info.get_array_job_requirements(num_gpus_per_task=9) + self.unified_info.get_array_job_requirements( + partition="test_partition", num_gpus_per_task=9 + ) self.assertIn( "num_gpus_per_task must be between 1 and 8", str(context.exception) ) @@ -988,36 +1018,48 @@ def test_dynamic_gpu_boundary_validation( mock_mem.return_value = 524288 # Valid requests for 4-GPU node - result = self.unified_info.get_task_resource_requirements(num_gpus=1) + result = self.unified_info.get_task_resource_requirements( + partition="test_partition", num_gpus=1 + ) self.assertIsInstance(result, ResourceShape) - result = self.unified_info.get_task_resource_requirements(num_gpus=4) + result = self.unified_info.get_task_resource_requirements( + partition="test_partition", num_gpus=4 + ) self.assertIsInstance(result, ResourceShape) # Invalid request (more than available) with self.assertRaises(ValueError) as context: - self.unified_info.get_task_resource_requirements(num_gpus=5) + self.unified_info.get_task_resource_requirements( + partition="test_partition", num_gpus=5 + ) self.assertIn("num_gpus must be between 1 and 4", str(context.exception)) # Test with 16-GPU node mock_total_gpus.return_value = 16 # Should now accept up to 16 GPUs - result = self.unified_info.get_task_resource_requirements(num_gpus=16) + result = self.unified_info.get_task_resource_requirements( + partition="test_partition", num_gpus=16 + ) self.assertIsInstance(result, ResourceShape) # But not 17 with self.assertRaises(ValueError) as context: - self.unified_info.get_task_resource_requirements(num_gpus=17) + self.unified_info.get_task_resource_requirements( + partition="test_partition", num_gpus=17 + ) self.assertIn("num_gpus must be between 1 and 16", str(context.exception)) def test_resource_shape_namedtuple(self): """Test ResourceShape NamedTuple structure.""" - resource = ResourceShape(cpu_cores=24, memory="225G", tasks_per_node=1) - - self.assertEqual(resource.cpu_cores, 24) - self.assertEqual(resource.memory, "225G") - self.assertEqual(resource.tasks_per_node, 1) + resource = ResourceShape( + slurm_partition="test", + gpus_per_node=4, + cpu_cores=24, + memory="225G", + tasks_per_node=1, + ) # Test that it's immutable (characteristic of NamedTuple) with self.assertRaises(AttributeError): diff --git a/tests/test_resource_shape.py b/tests/test_resource_shape.py index 5402ddb..8b4eaf4 100644 --- a/tests/test_resource_shape.py +++ b/tests/test_resource_shape.py @@ -9,17 +9,29 @@ from clusterscope.cluster_info import ResourceShape from clusterscope.parser import parse_memory_to_gb +TEST_CONFIGS = [ + # (partition, gpus_per_node, cpu_cores, memory, tasks_per_node, expected_mem_gb) + ("test_partition", 4, 24, "225G", 1, 225), + ("test_partition", 4, 64, "1T", 2, 1024), + ("test_partition", 4, 8, "32G", 1, 32), + ("test_partition", 8, 128, "4T", 1, 4096), + ("test_partition", 8, 1, "1G", 1, 1), + ("test_partition", 8, 256, "16T", 4, 16384), +] + class TestResourceShape(unittest.TestCase): """Test cases for ResourceShape class and its to_X methods.""" def test_resource_shape_creation(self): """Test ResourceShape creation and basic properties.""" - resource = ResourceShape(cpu_cores=24, memory="225G", tasks_per_node=1) - - self.assertEqual(resource.cpu_cores, 24) - self.assertEqual(resource.memory, "225G") - self.assertEqual(resource.tasks_per_node, 1) + resource = ResourceShape( + slurm_partition="test_partition", + gpus_per_node=1, + cpu_cores=24, + memory="225G", + tasks_per_node=1, + ) # Test immutability (NamedTuple characteristic) with self.assertRaises(AttributeError): @@ -44,35 +56,39 @@ def test_memory_parsing(self): for memory_str, expected_gb in test_cases: with self.subTest(memory=memory_str, expected=expected_gb): resource = ResourceShape( - cpu_cores=8, memory=memory_str, tasks_per_node=1 + slurm_partition="test_partition", + gpus_per_node=1, + cpu_cores=8, + memory=memory_str, + tasks_per_node=1, ) self.assertEqual(parse_memory_to_gb(resource.memory), expected_gb) def test_to_json(self): """Test to_json format method with various configurations.""" - test_configs = [ - # (cpu_cores, memory, tasks_per_node, expected_mem_gb) - (24, "225G", 1, 225), - (64, "1T", 2, 1024), - (8, "32G", 1, 32), - (128, "4T", 1, 4096), - (1, "1G", 1, 1), # Minimum values - (256, "16T", 4, 16384), # Large values - ] - for cpu_cores, memory, tasks_per_node, expected_mem_gb in test_configs: - with self.subTest(config=f"{cpu_cores}cpu_{memory}_{tasks_per_node}tasks"): + for ( + partition, + gpus_per_node, + cpu_cores, + memory, + tasks_per_node, + expected_mem_gb, + ) in TEST_CONFIGS: + with self.subTest( + config=f"{gpus_per_node}gpu_{cpu_cores}cpu_{memory}_{tasks_per_node}tasks" + ): resource = ResourceShape( - cpu_cores=cpu_cores, memory=memory, tasks_per_node=tasks_per_node + slurm_partition=partition, + gpus_per_node=gpus_per_node, + cpu_cores=cpu_cores, + memory=memory, + tasks_per_node=tasks_per_node, ) - result = json.loads(resource.to_json()) - # Verify all required keys are present - required_keys = {"cpu_cores", "memory", "tasks_per_node", "mem_gb"} - self.assertEqual(set(result.keys()), required_keys) - - # Verify values + self.assertEqual(result["slurm_partition"], partition) + self.assertEqual(result["gpus_per_node"], gpus_per_node) self.assertEqual(result["cpu_cores"], cpu_cores) self.assertEqual(result["memory"], memory) self.assertEqual(result["tasks_per_node"], tasks_per_node) @@ -80,90 +96,93 @@ def test_to_json(self): def test_to_sbatch(self): """Test to_sbatch format method with various configurations.""" - test_configs = [ - (24, "225G", 1), - (64, "1T", 2), - (8, "32G", 1), - (128, "4T", 1), - (1, "1G", 1), # Minimum values - (256, "16T", 4), # Large values - ] - for cpu_cores, memory, tasks_per_node in test_configs: - with self.subTest(config=f"{cpu_cores}cpu_{memory}_{tasks_per_node}tasks"): + for ( + partition, + gpus_per_node, + cpu_cores, + memory, + tasks_per_node, + _, + ) in TEST_CONFIGS: + with self.subTest( + config=f"{gpus_per_node}gpu_{cpu_cores}cpu_{memory}_{tasks_per_node}tasks" + ): resource = ResourceShape( - cpu_cores=cpu_cores, memory=memory, tasks_per_node=tasks_per_node + slurm_partition=partition, + gpus_per_node=gpus_per_node, + cpu_cores=cpu_cores, + memory=memory, + tasks_per_node=tasks_per_node, ) - result = resource.to_sbatch() lines = result.split("\n") - # Verify shebang self.assertEqual(lines[0], "#!/bin/bash") - - # Verify SBATCH directives are present sbatch_lines = [line for line in lines if line.startswith("#SBATCH")] - self.assertEqual(len(sbatch_lines), 3) # cpus, mem, ntasks - - # Verify specific directives self.assertIn(f"#SBATCH --cpus-per-task={cpu_cores}", result) self.assertIn(f"#SBATCH --mem={memory}", result) self.assertIn(f"#SBATCH --ntasks-per-node={tasks_per_node}", result) + self.assertIn(f"#SBATCH --partition={partition}", result) + self.assertIn(f"#SBATCH --gres=gpu:{gpus_per_node}", result) def test_to_srun(self): """Test to_srun format method with various configurations.""" - test_configs = [ - (24, "225G", 1), - (64, "1T", 2), - (8, "32G", 1), - (128, "4T", 1), - (1, "1G", 1), # Minimum values - (256, "16T", 4), # Large values - ] - for cpu_cores, memory, tasks_per_node in test_configs: - with self.subTest(config=f"{cpu_cores}cpu_{memory}_{tasks_per_node}tasks"): + for ( + partition, + gpus_per_node, + cpu_cores, + memory, + tasks_per_node, + _, + ) in TEST_CONFIGS: + with self.subTest( + config=f"{gpus_per_node}gpu_{cpu_cores}cpu_{memory}_{tasks_per_node}tasks" + ): resource = ResourceShape( - cpu_cores=cpu_cores, memory=memory, tasks_per_node=tasks_per_node + slurm_partition=partition, + gpus_per_node=gpus_per_node, + cpu_cores=cpu_cores, + memory=memory, + tasks_per_node=tasks_per_node, ) result = resource.to_srun() - expected_command = f"srun --cpus-per-task={cpu_cores} --mem={memory} --ntasks-per-node={tasks_per_node}" - - self.assertEqual(result, expected_command) - # Verify command structure parts = result.split() self.assertEqual(parts[0], "srun") self.assertIn(f"--cpus-per-task={cpu_cores}", result) self.assertIn(f"--mem={memory}", result) self.assertIn(f"--ntasks-per-node={tasks_per_node}", result) + self.assertIn(f"--partition={partition}", result) + self.assertIn(f"--gres=gpu:{gpus_per_node}", result) def test_to_submitit(self): """Test to_submitit format method with various configurations.""" - test_configs = [ - # (cpu_cores, memory, tasks_per_node, expected_mem_gb) - (24, "225G", 1, 225), - (64, "1T", 2, 1024), - (8, "32G", 1, 32), - (128, "4T", 1, 4096), - (1, "1G", 1, 1), # Minimum values - (256, "16T", 4, 16384), # Large values - ] - for cpu_cores, memory, tasks_per_node, expected_mem_gb in test_configs: - with self.subTest(config=f"{cpu_cores}cpu_{memory}_{tasks_per_node}tasks"): + for ( + partition, + gpus_per_node, + cpu_cores, + memory, + tasks_per_node, + expected_mem_gb, + ) in TEST_CONFIGS: + with self.subTest( + config=f"{gpus_per_node}gpu_{cpu_cores}cpu_{memory}_{tasks_per_node}tasks" + ): resource = ResourceShape( - cpu_cores=cpu_cores, memory=memory, tasks_per_node=tasks_per_node + slurm_partition=partition, + gpus_per_node=gpus_per_node, + cpu_cores=cpu_cores, + memory=memory, + tasks_per_node=tasks_per_node, ) - result = json.loads(resource.to_submitit()) - # Verify all required keys are present - required_keys = {"cpus_per_task", "mem_gb", "tasks_per_node"} - self.assertEqual(set(result.keys()), required_keys) - - # Verify values + self.assertEqual(result["slurm_partition"], partition) + self.assertEqual(result["gpus_per_node"], gpus_per_node) self.assertEqual(result["cpus_per_task"], cpu_cores) self.assertEqual(result["mem_gb"], expected_mem_gb) self.assertEqual(result["tasks_per_node"], tasks_per_node) From e23117ff1dd8cb98c66b0489387efb0479c2b414 Mon Sep 17 00:00:00 2001 From: luccab user Date: Thu, 2 Oct 2025 21:29:31 +0000 Subject: [PATCH 3/3] adding optional args --- clusterscope/cli.py | 31 ++++++++++++++++++++++++- clusterscope/cluster_info.py | 44 ++++++++++++++++++++++++------------ 2 files changed, 60 insertions(+), 15 deletions(-) diff --git a/clusterscope/cli.py b/clusterscope/cli.py index 96d1cc4..4bbcb43 100644 --- a/clusterscope/cli.py +++ b/clusterscope/cli.py @@ -195,7 +195,33 @@ def task(): default="json", help="Format to output the job requirements in", ) -def slurm(num_gpus: int, num_tasks_per_node: int, output_format: str, partition: str): +@click.option( + "--account", + type=str, + default=None, + help="SLURM account to charge resources to (optional)", +) +@click.option( + "--qos", + type=str, + default=None, + help="Quality of Service (QoS) specification for the job (optional)", +) +@click.option( + "--time", + type=str, + default=None, + help="Time limit for the job (format: HH:MM:SS or days-HH:MM:SS, optional)", +) +def slurm( + num_gpus: int, + num_tasks_per_node: int, + output_format: str, + partition: str, + account: str, + qos: str, + time: str, +): """Generate job requirements for a task of a Slurm job.""" partitions = get_partition_info() partition_names = [p.name for p in partitions] @@ -210,6 +236,9 @@ def slurm(num_gpus: int, num_tasks_per_node: int, output_format: str, partition: partition=partition, num_gpus=num_gpus, num_tasks_per_node=num_tasks_per_node, + account=account, + qos=qos, + time=time, ) # Route to the correct format method based on CLI option diff --git a/clusterscope/cluster_info.py b/clusterscope/cluster_info.py index 1b03e3e..51ee28b 100644 --- a/clusterscope/cluster_info.py +++ b/clusterscope/cluster_info.py @@ -27,6 +27,9 @@ class ResourceShape(NamedTuple): tasks_per_node: int gpus_per_node: int slurm_partition: str + account: Optional[str] = None + qos: Optional[str] = None + time: Optional[str] = None def to_json(self) -> str: """Convert ResourceShape to JSON format. @@ -34,16 +37,8 @@ def to_json(self) -> str: Returns: str: JSON representation of the resource requirements """ - mem_gb = parse_memory_to_gb(self.memory) - - data = { - "cpu_cores": self.cpu_cores, - "memory": self.memory, - "tasks_per_node": self.tasks_per_node, - "mem_gb": mem_gb, - "gpus_per_node": self.gpus_per_node, - "slurm_partition": self.slurm_partition, - } + data = {k: v for k, v in self._asdict().items() if v is not None} + data["mem_gb"] = parse_memory_to_gb(data["memory"]) return json.dumps(data, indent=2) def to_sbatch(self) -> str: @@ -60,6 +55,10 @@ def to_sbatch(self) -> str: f"#SBATCH --gres=gpu:{self.gpus_per_node}", f"#SBATCH --partition={self.slurm_partition}", ] + for attr_name in ["account", "qos", "time"]: + value = getattr(self, attr_name) + if value is not None: + lines.append(f"#SBATCH --{attr_name}={value}") return "\n".join(lines) def to_srun(self) -> str: @@ -76,6 +75,10 @@ def to_srun(self) -> str: f"--gres=gpu:{self.gpus_per_node}", f"--partition={self.slurm_partition}", ] + for attr_name in ["account", "qos", "time"]: + value = getattr(self, attr_name) + if value is not None: + cmd_parts.append(f"--{attr_name}={value}") return " ".join(cmd_parts) def to_submitit(self) -> str: @@ -87,12 +90,20 @@ def to_submitit(self) -> str: mem_gb = parse_memory_to_gb(self.memory) params = { - "slurm_partition": self.slurm_partition, "cpus_per_task": self.cpu_cores, "mem_gb": mem_gb, - "tasks_per_node": self.tasks_per_node, - "gpus_per_node": self.gpus_per_node, } + for attr_name in [ + "slurm_partition", + "tasks_per_node", + "gpus_per_node", + "account", + "qos", + "time", + ]: + value = getattr(self, attr_name) + if value is not None: + params[attr_name] = value return json.dumps(params, indent=2) @@ -232,7 +243,11 @@ def get_total_gpus_per_node(self) -> int: return max(total_gpus, 1) # Ensure at least 1 to avoid division by zero def get_task_resource_requirements( - self, partition: str, num_gpus: int, num_tasks_per_node: int = 1 + self, + partition: str, + num_gpus: int, + num_tasks_per_node: int = 1, + **kwargs, ) -> ResourceShape: """Calculate resource requirements for better GPU packing based on node's GPU configuration. @@ -300,6 +315,7 @@ def get_task_resource_requirements( memory=sbatch_memory, tasks_per_node=num_tasks_per_node, gpus_per_node=num_gpus, + **kwargs, ) def get_array_job_requirements(