From 3ab408ff029de78aae36a51bb51062e65d7ca6e3 Mon Sep 17 00:00:00 2001 From: "Navid C. Constantinou" Date: Thu, 24 Jul 2025 09:31:53 +1000 Subject: [PATCH 1/6] add some debug statements --- test/test_distributed_hydrostatic_model.jl | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/test/test_distributed_hydrostatic_model.jl b/test/test_distributed_hydrostatic_model.jl index 933bd8be002..c338073c5dc 100644 --- a/test/test_distributed_hydrostatic_model.jl +++ b/test/test_distributed_hydrostatic_model.jl @@ -148,11 +148,19 @@ for arch in archs @test all(isapprox(cp, cs; atol, rtol)) @test all(isapprox(ηp, ηs; atol, rtol)) end + if arch.local_rank == 0 + @info " done distributed solid body rotation" + @info " Testing CATKE with $(ranks(arch)) ranks on $(typeof(grid).name.wrapper)" + end # CATKE works only with synchronized communication at the moment arch = synchronized(arch) closure = CATKEVerticalDiffusivity() + if arch.local_rank == 0 + @info " Testing CATKE with $(ranks(arch)) ranks" + end + # "s" for "serial" computation, "p" for parallel ms = rotation_with_shear_test(global_underlying_grid, closure) mp = rotation_with_shear_test(underlying_grid, closure) @@ -188,4 +196,3 @@ for arch in archs end end end - From 632935678249024f8bf2c172da58d8ce362360f9 Mon Sep 17 00:00:00 2001 From: "Navid C. Constantinou" Date: Thu, 24 Jul 2025 15:39:02 +1000 Subject: [PATCH 2/6] Update test_distributed_hydrostatic_model.jl --- test/test_distributed_hydrostatic_model.jl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/test_distributed_hydrostatic_model.jl b/test/test_distributed_hydrostatic_model.jl index c338073c5dc..ddb52d3527d 100644 --- a/test/test_distributed_hydrostatic_model.jl +++ b/test/test_distributed_hydrostatic_model.jl @@ -147,10 +147,10 @@ for arch in archs @test all(isapprox(wp, ws; atol, rtol)) @test all(isapprox(cp, cs; atol, rtol)) @test all(isapprox(ηp, ηs; atol, rtol)) - end - if arch.local_rank == 0 - @info " done distributed solid body rotation" - @info " Testing CATKE with $(ranks(arch)) ranks on $(typeof(grid).name.wrapper)" + + if arch.local_rank == 0 + @info " done distributed solid body rotation" + end end # CATKE works only with synchronized communication at the moment From d41b057c03c480f8c3b75512837f46bf41e19178 Mon Sep 17 00:00:00 2001 From: Simone Silvestri Date: Thu, 24 Jul 2025 15:31:45 +0200 Subject: [PATCH 3/6] impose a larger timeout --- .buildkite/distributed/pipeline.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.buildkite/distributed/pipeline.yml b/.buildkite/distributed/pipeline.yml index 3af0ef8c9a3..da9b054a5c6 100644 --- a/.buildkite/distributed/pipeline.yml +++ b/.buildkite/distributed/pipeline.yml @@ -95,6 +95,7 @@ steps: env: TEST_GROUP: "distributed_hydrostatic_model" TEST_ARCHITECTURE: "CPU" + timeout_in_minutes: 1440 commands: - "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'" agents: @@ -110,6 +111,7 @@ steps: env: TEST_GROUP: "distributed_hydrostatic_model" TEST_ARCHITECTURE: "GPU" + timeout_in_minutes: 1440 commands: - "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'" agents: From 32b48a8bcdc828bf6265a3c901ee8198f180de0f Mon Sep 17 00:00:00 2001 From: Simone Silvestri Date: Fri, 25 Jul 2025 09:19:27 +0200 Subject: [PATCH 4/6] Update pipeline.yml --- .buildkite/distributed/pipeline.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/distributed/pipeline.yml b/.buildkite/distributed/pipeline.yml index da9b054a5c6..0cc1bcf626f 100644 --- a/.buildkite/distributed/pipeline.yml +++ b/.buildkite/distributed/pipeline.yml @@ -95,9 +95,9 @@ steps: env: TEST_GROUP: "distributed_hydrostatic_model" TEST_ARCHITECTURE: "CPU" - timeout_in_minutes: 1440 commands: - "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'" + timeout_in_minutes: 1440 agents: slurm_mem: 50G slurm_ntasks: 4 @@ -111,9 +111,9 @@ steps: env: TEST_GROUP: "distributed_hydrostatic_model" TEST_ARCHITECTURE: "GPU" - timeout_in_minutes: 1440 commands: - "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'" + timeout_in_minutes: 1440 agents: slurm_mem: 80G # Apparently the GPU tests require more memory slurm_ntasks: 4 From 461b5f48a87a07e8bd248dd80db71acccaece648 Mon Sep 17 00:00:00 2001 From: "Navid C. Constantinou" Date: Sat, 26 Jul 2025 07:14:48 +1000 Subject: [PATCH 5/6] Update test_distributed_hydrostatic_model.jl --- test/test_distributed_hydrostatic_model.jl | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/test_distributed_hydrostatic_model.jl b/test/test_distributed_hydrostatic_model.jl index ddb52d3527d..bf9d4a43662 100644 --- a/test/test_distributed_hydrostatic_model.jl +++ b/test/test_distributed_hydrostatic_model.jl @@ -147,10 +147,6 @@ for arch in archs @test all(isapprox(wp, ws; atol, rtol)) @test all(isapprox(cp, cs; atol, rtol)) @test all(isapprox(ηp, ηs; atol, rtol)) - - if arch.local_rank == 0 - @info " done distributed solid body rotation" - end end # CATKE works only with synchronized communication at the moment From 9201794f9b8c6624eef000980c54aeaafa3dbf7d Mon Sep 17 00:00:00 2001 From: "Navid C. Constantinou" Date: Sun, 27 Jul 2025 10:31:14 +1000 Subject: [PATCH 6/6] Update pipeline.yml --- .buildkite/distributed/pipeline.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/distributed/pipeline.yml b/.buildkite/distributed/pipeline.yml index 0cc1bcf626f..cfb326827ca 100644 --- a/.buildkite/distributed/pipeline.yml +++ b/.buildkite/distributed/pipeline.yml @@ -115,7 +115,7 @@ steps: - "srun julia -O0 --color=yes --project -e 'using Pkg; Pkg.test()'" timeout_in_minutes: 1440 agents: - slurm_mem: 80G # Apparently the GPU tests require more memory + slurm_mem: 100G # Apparently the GPU tests require more memory slurm_ntasks: 4 slurm_gpus_per_task: 1 retry: