From dfd0e874c41496896b52973db46209d8e05634f1 Mon Sep 17 00:00:00 2001 From: Matt Smith Date: Tue, 3 Mar 2026 10:22:36 -0500 Subject: [PATCH 1/4] -j flag to y --- components/scheduler/src/sge.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/scheduler/src/sge.rs b/components/scheduler/src/sge.rs index bec9069..502c847 100644 --- a/components/scheduler/src/sge.rs +++ b/components/scheduler/src/sge.rs @@ -8,7 +8,7 @@ use tera::Tera; const DEFAULT_TEMPLATE: &str = r#"#!/bin/bash #$ -N {{job_name}} #$ -V -#$ -j oe +#$ -j y #$ -o {{log_path}} {% if parallel -%}#$ -pe orte {{num_mpi_cpus}}{% endif %} From 861e258b4f7fe31d0f0c33b71d7fd6b878e712d6 Mon Sep 17 00:00:00 2001 From: Matt Smith Date: Wed, 4 Mar 2026 11:38:04 -0500 Subject: [PATCH 2/4] nonmem version sort and qsub node spin up warning catch --- components/config/src/nonmem.rs | 58 ++++++++++++++++++++++++++++++++- components/scheduler/src/lib.rs | 39 +++++++++++++++++----- 2 files changed, 88 insertions(+), 9 deletions(-) diff --git a/components/config/src/nonmem.rs b/components/config/src/nonmem.rs index b41b8ef..6a6e1a0 100644 --- a/components/config/src/nonmem.rs +++ b/components/config/src/nonmem.rs @@ -1,3 +1,4 @@ +use std::cmp::Ordering; use std::collections::HashMap; use std::path::{Path, PathBuf}; use std::process::Command; @@ -10,6 +11,36 @@ use which::which; const KNOWN_NONMEM_FOLDERS: [&str; 2] = ["/opt/nonmem", "/opt/NONMEM"]; +fn parse_nonmem_version_name(name: &str) -> Option<(u32, Vec<&str>)> { + let rest = name.strip_prefix("nm")?; + let digit_count = rest.chars().take_while(|c| c.is_ascii_digit()).count(); + if digit_count == 0 { + return None; + } + + let version_number = rest[..digit_count].parse::().ok()?; + let suffix = rest[digit_count..].trim_start_matches('_'); + let suffix_parts = if suffix.is_empty() { + Vec::new() + } else { + suffix.split('_').collect() + }; + + Some((version_number, suffix_parts)) +} + +fn cmp_nonmem_version_names(a: &str, b: &str) -> Ordering { + match (parse_nonmem_version_name(a), parse_nonmem_version_name(b)) { + (Some((a_num, a_suffix)), Some((b_num, b_suffix))) => b_num + .cmp(&a_num) + .then_with(|| a_suffix.cmp(&b_suffix)) + .then_with(|| a.cmp(b)), + (Some(_), None) => Ordering::Less, + (None, Some(_)) => Ordering::Greater, + (None, None) => a.cmp(b), + } +} + fn resolve_path_from_config_dir(path: Option<&PathBuf>, config_dir: &Path) -> Option { path.map(|p| { if p.is_relative() { @@ -432,7 +463,7 @@ impl NonmemConfig { } config.versions = find_nonmem_versions()?; let mut versions = config.versions.keys().collect::>(); - versions.sort(); + versions.sort_by(|a, b| cmp_nonmem_version_names(a, b)); config.default_version = versions[0].clone(); Ok(config) @@ -646,4 +677,29 @@ mod tests { let nonmem = config.nonmem.expect("Should have nonmem config"); assert_eq!(nonmem.files_to_copy().len(), 0); } + + #[test] + fn test_nonmem_versions_sorted_latest_first_with_suffix_specificity() { + let mut versions = [ + "nm74gf_nmfe", + "nm75", + "nm73gf", + "nm74gf", + "nm73gf_nmfe", + "nm76", + ]; + versions.sort_by(|a, b| cmp_nonmem_version_names(a, b)); + + assert_eq!( + versions, + [ + "nm76", + "nm75", + "nm74gf", + "nm74gf_nmfe", + "nm73gf", + "nm73gf_nmfe" + ] + ); + } } diff --git a/components/scheduler/src/lib.rs b/components/scheduler/src/lib.rs index 3349ad0..535832f 100644 --- a/components/scheduler/src/lib.rs +++ b/components/scheduler/src/lib.rs @@ -271,10 +271,14 @@ impl SchedulerType { || format!("failed to execute {cmd_name} command for model {m:?}",), )?; if !output.status.success() { - bail!( - "{cmd_name} failed: {}", - String::from_utf8_lossy(&output.stderr) - ); + let stderr = String::from_utf8_lossy(&output.stderr); + let sge_queuing_warning = stderr.contains("Unable to run job") + && stderr.contains("not allowed to run in any queue") + && stderr.contains("has been submitted"); + if !sge_queuing_warning { + bail!("{cmd_name} failed: {stderr}"); + } + log::warn!("{cmd_name} reported a warning but the job was submitted: {stderr}"); } let stdout = String::from_utf8_lossy(&output.stdout); @@ -284,10 +288,29 @@ impl SchedulerType { num.parse() .map_err(|e| anyhow!("Failed to parse job ID '{stdout}': {e}"))? } - SchedulerType::Sge(_) => stdout - .trim() - .parse() - .map_err(|e| anyhow!("Failed to parse job ID '{stdout}': {e}"))?, + SchedulerType::Sge(_) => { + if !stdout.trim().is_empty() { + stdout + .trim() + .parse() + .map_err(|e| anyhow!("Failed to parse job ID '{stdout}': {e}"))? + } else { + let stderr = String::from_utf8_lossy(&output.stderr); + let job_id_str = stderr + .lines() + .find_map(|line| { + line.trim() + .strip_prefix("Your job ") + .and_then(|rest| rest.split_whitespace().next()) + }) + .ok_or_else(|| { + anyhow!("Failed to find job ID in SGE output: {stderr}") + })?; + job_id_str + .parse() + .map_err(|e| anyhow!("Failed to parse job ID '{job_id_str}': {e}"))? + } + } }; out.push((m, job_id)); From 1864e4b0568bff92d0012fc8f5867dc694fbe9c7 Mon Sep 17 00:00:00 2001 From: Matt Smith Date: Wed, 4 Mar 2026 13:52:58 -0500 Subject: [PATCH 3/4] bump version number --- Cargo.lock | 2 +- Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 365553b..882fd59 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -787,7 +787,7 @@ dependencies = [ [[package]] name = "pharos" -version = "0.3.0" +version = "0.3.1" dependencies = [ "anyhow", "clap", diff --git a/Cargo.toml b/Cargo.toml index 61d34fe..3c143b3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "pharos" -version = "0.3.0" +version = "0.3.1" edition = "2024" [[bin]] From f26d578f3cbdb06e21624e075e756e1dafdea7c3 Mon Sep 17 00:00:00 2001 From: Matt Smith Date: Wed, 4 Mar 2026 16:19:46 -0500 Subject: [PATCH 4/4] added comments for qsub failure. Updated comparison to remove suffix sorting. --- components/config/src/nonmem.rs | 28 ++++++++++++---------------- components/scheduler/src/lib.rs | 18 ++++++++++++++++-- 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/components/config/src/nonmem.rs b/components/config/src/nonmem.rs index 6a6e1a0..0c4fb8f 100644 --- a/components/config/src/nonmem.rs +++ b/components/config/src/nonmem.rs @@ -11,30 +11,22 @@ use which::which; const KNOWN_NONMEM_FOLDERS: [&str; 2] = ["/opt/nonmem", "/opt/NONMEM"]; -fn parse_nonmem_version_name(name: &str) -> Option<(u32, Vec<&str>)> { +fn parse_nonmem_version_name(name: &str) -> Option { let rest = name.strip_prefix("nm")?; let digit_count = rest.chars().take_while(|c| c.is_ascii_digit()).count(); if digit_count == 0 { return None; } - let version_number = rest[..digit_count].parse::().ok()?; - let suffix = rest[digit_count..].trim_start_matches('_'); - let suffix_parts = if suffix.is_empty() { - Vec::new() - } else { - suffix.split('_').collect() - }; - - Some((version_number, suffix_parts)) + rest[..digit_count].parse::().ok() } fn cmp_nonmem_version_names(a: &str, b: &str) -> Ordering { - match (parse_nonmem_version_name(a), parse_nonmem_version_name(b)) { - (Some((a_num, a_suffix)), Some((b_num, b_suffix))) => b_num - .cmp(&a_num) - .then_with(|| a_suffix.cmp(&b_suffix)) - .then_with(|| a.cmp(b)), + let a_num = parse_nonmem_version_name(a); + let b_num = parse_nonmem_version_name(b); + + match (a_num, b_num) { + (Some(a_num), Some(b_num)) => b_num.cmp(&a_num).then_with(|| a.cmp(b)), (Some(_), None) => Ordering::Less, (None, Some(_)) => Ordering::Greater, (None, None) => a.cmp(b), @@ -679,10 +671,12 @@ mod tests { } #[test] - fn test_nonmem_versions_sorted_latest_first_with_suffix_specificity() { + fn test_nonmem_versions_sorted_latest_first_with_lexical_fallback() { let mut versions = [ "nm74gf_nmfe", "nm75", + "nm74", + "nm73", "nm73gf", "nm74gf", "nm73gf_nmfe", @@ -695,8 +689,10 @@ mod tests { [ "nm76", "nm75", + "nm74", "nm74gf", "nm74gf_nmfe", + "nm73", "nm73gf", "nm73gf_nmfe" ] diff --git a/components/scheduler/src/lib.rs b/components/scheduler/src/lib.rs index 535832f..97db9ad 100644 --- a/components/scheduler/src/lib.rs +++ b/components/scheduler/src/lib.rs @@ -270,10 +270,19 @@ impl SchedulerType { .with_context( || format!("failed to execute {cmd_name} command for model {m:?}",), )?; + + // If SGE does not have a compute node ready during job submission + // qsub fails and gives this error: + // + // Unable to run job: warning: job is not allowed to run in any queue + // Your job ("") has been submitted + // Exiting. + // + // Checking stderr for this message to prevent bail! on non-zero exit code from qsub if !output.status.success() { let stderr = String::from_utf8_lossy(&output.stderr); - let sge_queuing_warning = stderr.contains("Unable to run job") - && stderr.contains("not allowed to run in any queue") + let sge_queuing_warning = stderr.contains("Unable to run job: warning") + && stderr.contains("job is not allowed to run in any queue") && stderr.contains("has been submitted"); if !sge_queuing_warning { bail!("{cmd_name} failed: {stderr}"); @@ -289,12 +298,17 @@ impl SchedulerType { .map_err(|e| anyhow!("Failed to parse job ID '{stdout}': {e}"))? } SchedulerType::Sge(_) => { + // If qsub failed due to no compute nodes being available, + // the job ID is not printed in stdout but rather in the + // error message given to stderr (see error message template above). if !stdout.trim().is_empty() { stdout .trim() .parse() .map_err(|e| anyhow!("Failed to parse job ID '{stdout}': {e}"))? } else { + // Need to isolate job ID from + // Your job ("") has been submitted let stderr = String::from_utf8_lossy(&output.stderr); let job_id_str = stderr .lines()