From 962bd2126a24c1f64d885d0329146fb76753bcc2 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Thu, 30 Apr 2026 12:16:31 -0600 Subject: [PATCH 1/4] Remove dead code in ERA5 module MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Drop the unused `ERA5_single_level_accumulated_variables` Set in `ERA5_single_levels.jl` — it was defined with a comment about CDS step types but never wired up to anything (no references in src/, ext/, test/, or examples/). - Drop the unused `Day` from the `using Dates: ...` import in `ERA5.jl` (only `DateTime`, `Month`, `Hour` are referenced). - Drop the redundant `metadata_path` from the `import` block — already brought in via `using NumericalEarth.DataWrangling: metadata_path`, and the ERA5 module never extends it (the `import` form is for extending only). - Delete the 1-arg `metadata_prefix(::ERA5Metadata)` / `(::ERA5PressureMetadata)` overloads. They duplicate the logic of the 4-arg form `metadata_prefix(dataset, name, date, region)` (which is what `metadata_filename` actually calls). Update the metadata_prefix unit tests to call the 4-arg form directly. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/DataWrangling/ERA5/ERA5.jl | 3 +- .../ERA5/ERA5_pressure_levels.jl | 29 ------------ src/DataWrangling/ERA5/ERA5_single_levels.jl | 44 ------------------- test/test_cds_downloading.jl | 17 +++---- 4 files changed, 8 insertions(+), 85 deletions(-) diff --git a/src/DataWrangling/ERA5/ERA5.jl b/src/DataWrangling/ERA5/ERA5.jl index 610655d52..580fdcc0d 100644 --- a/src/DataWrangling/ERA5/ERA5.jl +++ b/src/DataWrangling/ERA5/ERA5.jl @@ -16,7 +16,7 @@ using Oceananigans.Fields: Center, set! using Oceananigans: Field, fill_halo_regions!, CPU using NumericalEarth.DataWrangling: Metadata, Metadatum, metadata_path, native_grid, InverseGravity, download_dataset using Dates -using Dates: DateTime, Day, Month, Hour +using Dates: DateTime, Month, Hour import NumericalEarth.DataWrangling: all_dates, @@ -31,7 +31,6 @@ import NumericalEarth.DataWrangling: inpainted_metadata_path, available_variables, retrieve_data, - metadata_path, is_three_dimensional, reversed_vertical_axis, reversed_latitude_axis, diff --git a/src/DataWrangling/ERA5/ERA5_pressure_levels.jl b/src/DataWrangling/ERA5/ERA5_pressure_levels.jl index 3be6eadb1..a92e6a593 100644 --- a/src/DataWrangling/ERA5/ERA5_pressure_levels.jl +++ b/src/DataWrangling/ERA5/ERA5_pressure_levels.jl @@ -126,35 +126,6 @@ function retrieve_data(metadata::ERA5PressureMetadatum) return reverse(data, dims=2) # Latitude is stored from 90°N → 90°S end -##### -##### Metadata filename construction -##### - -function metadata_prefix(md::ERA5PressureMetadata) - var = ERA5PL_dataset_variable_names[md.name] - dataset = dataset_name(md.dataset) - start_date = start_date_str(md.dates) - end_date = end_date_str(md.dates) - bbox = md.region - - if !isnothing(bbox) - w, e = bbox_strs(bbox.longitude) - s, n = bbox_strs(bbox.latitude) - suffix = string(w, e, s, n) - else - suffix = "" - end - - if start_date == end_date - prefix = string(var, "_", dataset, "_", start_date, suffix) - else - prefix = string(var, "_", dataset, "_", start_date, "_", end_date, suffix) - end - prefix = colon2dash(prefix) - prefix = underscore_spaces(prefix) - return prefix -end - ##### ##### Pressure-level vertical coordinate ##### diff --git a/src/DataWrangling/ERA5/ERA5_single_levels.jl b/src/DataWrangling/ERA5/ERA5_single_levels.jl index 5c00626da..04694894f 100644 --- a/src/DataWrangling/ERA5/ERA5_single_levels.jl +++ b/src/DataWrangling/ERA5/ERA5_single_levels.jl @@ -10,22 +10,6 @@ const ERA5_wave_variables = Set([ :significant_wave_height, :mean_wave_period, :mean_wave_direction, ]) -# Mean rate / accumulated variables (CDS "step type" = accum). -# All other single-level variables are instantaneous. -# See ECMWF ERA5 documentation, Tables 3 and 4: -# https://confluence.ecmwf.int/display/CKB/ERA5%3A+data+documentation#ERA5:datadocumentation-Meanrates/fluxesandaccumulations -const ERA5_single_level_accumulated_variables = Set([ - :total_precipitation, - :mean_surface_sensible_heat_flux, - :mean_surface_latent_heat_flux, - :mean_surface_momentum_flux_x, - :mean_surface_momentum_flux_y, - :downwelling_shortwave_radiation, - :downwelling_longwave_radiation, - :evaporation, - :mean_evaporation_rate, -]) - ##### ##### ERA5 single-level data availability ##### @@ -152,31 +136,3 @@ function retrieve_data(metadata::ERA5Metadatum) return reshape(data_2d, size(data_2d, 1), size(data_2d, 2), 1) end -##### -##### Metadata filename construction -##### - -function metadata_prefix(md::ERA5Metadata) - var = ERA5_dataset_variable_names[md.name] - dataset = dataset_name(md.dataset) - start_date = start_date_str(md.dates) - end_date = end_date_str(md.dates) - bbox = md.region - - if !isnothing(bbox) - w, e = bbox_strs(bbox.longitude) - s, n = bbox_strs(bbox.latitude) - suffix = string(w, e, s, n) - else - suffix = "" - end - - if start_date == end_date - prefix = string(var, "_", dataset, "_", start_date, suffix) - else - prefix = string(var, "_", dataset, "_", start_date, "_", end_date, suffix) - end - prefix = colon2dash(prefix) - prefix = underscore_spaces(prefix) - return prefix -end diff --git a/test/test_cds_downloading.jl b/test/test_cds_downloading.jl index e1bd366f1..7ff816347 100644 --- a/test/test_cds_downloading.jl +++ b/test/test_cds_downloading.jl @@ -173,10 +173,10 @@ start_date = DateTime(2005, 2, 16, 12) @testset "ERA5 single-level metadata_prefix" begin ds = ERA5HourlySingleLevel() + mp = NumericalEarth.DataWrangling.ERA5.metadata_prefix - # Single-date metadatum, with region: prefix should not duplicate the date - md_single = Metadatum(:temperature; dataset=ds, region, date=start_date) - prefix_single = NumericalEarth.DataWrangling.ERA5.metadata_prefix(md_single) + # Single-date with region: prefix should not duplicate the date + prefix_single = mp(ds, :temperature, start_date, region) @test occursin("2m_temperature", prefix_single) @test occursin("ERA5HourlySingleLevel", prefix_single) @test occursin("2005-02-16", prefix_single) @@ -189,17 +189,14 @@ start_date = DateTime(2005, 2, 16, 12) @test !occursin(":", prefix_single) # colons replaced by dashes @test !occursin(" ", prefix_single) # spaces replaced by underscores - # Single-date metadatum, no region: suffix should be empty - md_no_region = Metadatum(:temperature; dataset=ds, date=start_date) - prefix_no_region = NumericalEarth.DataWrangling.ERA5.metadata_prefix(md_no_region) + # Single-date, no region: suffix should be empty + prefix_no_region = mp(ds, :temperature, start_date, nothing) @test !occursin("0.0", prefix_no_region) @test !occursin("nothing", prefix_no_region) - # Multi-date metadata: prefix should include both start and end dates + # Multi-date: prefix should include both start and end dates end_date = start_date + Hour(2) - md_multi = Metadata(:temperature; dataset=ds, region, - dates=start_date:Hour(1):end_date) - prefix_multi = NumericalEarth.DataWrangling.ERA5.metadata_prefix(md_multi) + prefix_multi = mp(ds, :temperature, start_date:Hour(1):end_date, region) @test occursin("2005-02-16T12", prefix_multi) @test occursin("2005-02-16T14", prefix_multi) end From f66246c455d519b81bbdb91bb2bdc23af6f0cff5 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Fri, 1 May 2026 14:18:25 -0600 Subject: [PATCH 2/4] Standardize download_dataset return types Make download_dataset consistently return paths instead of nothing: String for single-file Metadatum, Vector{String} for multi-file Metadata. ERA5 helpers now return their already-computed path lists rather than re-deriving them via throwaway Metadatum construction. Closes #201. Co-Authored-By: Claude Opus 4.7 (1M context) --- ext/NumericalEarthCDSAPIExt.jl | 44 ++++++++++--------- ext/NumericalEarthWOAExt.jl | 2 +- src/DataWrangling/ECCO/ECCO.jl | 2 +- src/DataWrangling/EN4/EN4.jl | 2 +- src/DataWrangling/JRA55/JRA55_metadata.jl | 2 +- .../OSPapa/OSPapa_flux_observations.jl | 4 +- 6 files changed, 30 insertions(+), 26 deletions(-) diff --git a/ext/NumericalEarthCDSAPIExt.jl b/ext/NumericalEarthCDSAPIExt.jl index 9b979a491..66f49be34 100644 --- a/ext/NumericalEarthCDSAPIExt.jl +++ b/ext/NumericalEarthCDSAPIExt.jl @@ -139,12 +139,15 @@ function download_dataset(metadata::ERA5Metadata; skip_existing=true, cleanup=tr dates = metadata.dates isa AbstractVector ? metadata.dates : [metadata.dates] grouped = _group_by_calendar_day(dates) + paths = String[] for day in sort(collect(keys(grouped))) - download_era5_day(metadata.name, metadata.dataset, grouped[day]; - region = metadata.region, - dir = metadata.dir, - skip_existing, cleanup) + append!(paths, download_era5_day(metadata.name, metadata.dataset, grouped[day]; + region = metadata.region, + dir = metadata.dir, + skip_existing, cleanup)) end + + return paths end """ @@ -160,16 +163,15 @@ function _group_by_calendar_day(datetimes) end function download_era5_day(name, dataset, day_dates; - region, dir, skip_existing, cleanup) + region, dir, skip_existing, cleanup) - MDatum = NumericalEarth.DataWrangling.Metadatum - meta_path = NumericalEarth.DataWrangling.metadata_path + meta_filename = NumericalEarth.DataWrangling.metadata_filename - all_pairs = [(dt, meta_path(MDatum(name; dataset, date=dt, region, dir))) + all_pairs = [(dt, joinpath(dir, meta_filename(dataset, name, dt, region))) for dt in day_dates] pending = skip_existing ? filter(((_, p),) -> !isfile(p), all_pairs) : all_pairs - isempty(pending) && return nothing + isempty(pending) && return [p for (_, p) in all_pairs] sorted_dts = sort(unique([dt for (dt, _) in pending])) hours_str = [lpad(string(Dates.hour(dt)), 2, '0') * ":00" for dt in sorted_dts] @@ -210,7 +212,7 @@ function download_era5_day(name, dataset, day_dates; cleanup && rm(tmp_path; force=true) end - return nothing + return [p for (_, p) in all_pairs] end ##### @@ -223,10 +225,11 @@ end Download multiple ERA5 pressure-level variables for each date in `metadata`. """ function download_dataset(names::Vector{Symbol}, metadata::ERA5PressureMetadata; kwargs...) + paths = String[] for metadatum in metadata - download_dataset(names, metadatum; kwargs...) + append!(paths, download_dataset(names, metadatum; kwargs...)) end - return nothing + return paths end """ @@ -327,11 +330,13 @@ function download_dataset(names::Vector{Symbol}, grouped = _group_by_calendar_day(datetimes) + paths = String[] for day in sort(collect(keys(grouped))) - download_era5_multivar_day(names, dataset, grouped[day]; region, dir, skip_existing, cleanup) + append!(paths, download_era5_multivar_day(names, dataset, grouped[day]; + region, dir, skip_existing, cleanup)) end - return nothing + return paths end function download_dataset(name::Symbol, @@ -345,16 +350,15 @@ function download_dataset(name::Symbol, end function download_era5_multivar_day(names, dataset, day_dates; - region, dir, skip_existing, cleanup) + region, dir, skip_existing, cleanup) - MDatum = NumericalEarth.DataWrangling.Metadatum - meta_path = NumericalEarth.DataWrangling.metadata_path + meta_filename = NumericalEarth.DataWrangling.metadata_filename - all_triples = [(name, dt, meta_path(MDatum(name; dataset, date=dt, region, dir))) + all_triples = [(name, dt, joinpath(dir, meta_filename(dataset, name, dt, region))) for name in names for dt in day_dates] pending = skip_existing ? filter(((_, _, p),) -> !isfile(p), all_triples) : all_triples - isempty(pending) && return nothing + isempty(pending) && return [p for (_, _, p) in all_triples] cds_vars = unique([cds_varnames(dataset)[name] for (name, _, _) in pending]) sorted_dts = sort(unique([dt for (_, dt, _) in pending])) @@ -396,7 +400,7 @@ function download_era5_multivar_day(names, dataset, day_dates; cleanup && rm(tmp_path; force=true) end - return nothing + return [p for (_, _, p) in all_triples] end ##### diff --git a/ext/NumericalEarthWOAExt.jl b/ext/NumericalEarthWOAExt.jl index 13d7a0b6c..d65e6a93b 100644 --- a/ext/NumericalEarthWOAExt.jl +++ b/ext/NumericalEarthWOAExt.jl @@ -54,7 +54,7 @@ function download_dataset(metadata::Metadata{<:WOAClimatology}; skip_existing=tr cp(source, linkpath) end - return nothing + return metadata_path(metadata) end end # module diff --git a/src/DataWrangling/ECCO/ECCO.jl b/src/DataWrangling/ECCO/ECCO.jl index 576e81014..da65d30e1 100644 --- a/src/DataWrangling/ECCO/ECCO.jl +++ b/src/DataWrangling/ECCO/ECCO.jl @@ -338,7 +338,7 @@ function download_dataset(metadata::ECCOMetadata) end end - return nothing + return metadata_path(metadata) end function inpainted_metadata_filename(metadata::ECCOMetadatum) diff --git a/src/DataWrangling/EN4/EN4.jl b/src/DataWrangling/EN4/EN4.jl index 851401439..9b2b2f041 100644 --- a/src/DataWrangling/EN4/EN4.jl +++ b/src/DataWrangling/EN4/EN4.jl @@ -229,7 +229,7 @@ function download_dataset(metadata::Metadata{<:EN4Monthly}) end end - return nothing + return metadata_path(metadata) end end # Module diff --git a/src/DataWrangling/JRA55/JRA55_metadata.jl b/src/DataWrangling/JRA55/JRA55_metadata.jl index 53f1fa58c..27f7e4fad 100644 --- a/src/DataWrangling/JRA55/JRA55_metadata.jl +++ b/src/DataWrangling/JRA55/JRA55_metadata.jl @@ -201,5 +201,5 @@ function download_dataset(metadata::JRA55Metadata) end end - return nothing + return metadata_path(metadata) end diff --git a/src/DataWrangling/OSPapa/OSPapa_flux_observations.jl b/src/DataWrangling/OSPapa/OSPapa_flux_observations.jl index 75cad9f9d..ad9d11ccc 100644 --- a/src/DataWrangling/OSPapa/OSPapa_flux_observations.jl +++ b/src/DataWrangling/OSPapa/OSPapa_flux_observations.jl @@ -96,7 +96,7 @@ build_filename(::OSPapaFluxHourly, name, dates::AbstractArray, region) = function download_dataset(md::OSPapaFluxMetadata) uniform_path = joinpath(md.dir, metadata_filename(md)) - isfile(uniform_path) && return nothing + isfile(uniform_path) && return uniform_path if !(md.dates isa AbstractArray) error("OSPapaFluxHourly uniform cache $(uniform_path) is missing; " * @@ -107,7 +107,7 @@ function download_dataset(md::OSPapaFluxMetadata) end_date = last(md.dates) raw_path = download_ospapa_flux(; start_date, end_date, dir=md.dir) _write_uniform_flux_file(raw_path, uniform_path, start_date, end_date) - return nothing + return uniform_path end function _write_uniform_flux_file(raw_path, uniform_path, start_date, end_date) From 57ff2f7b736ac17c50a5d755f22f6a52f55a915c Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Fri, 1 May 2026 14:24:28 -0600 Subject: [PATCH 3/4] Update CDS skip-existing tests for new helper return types download_era5_day and download_era5_multivar_day now return path vectors instead of nothing on the skip-existing short-circuit; update the assertions accordingly. Co-Authored-By: Claude Opus 4.7 (1M context) --- test/test_cds_downloading.jl | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/test/test_cds_downloading.jl b/test/test_cds_downloading.jl index 7ff816347..9670c60c5 100644 --- a/test/test_cds_downloading.jl +++ b/test/test_cds_downloading.jl @@ -576,25 +576,25 @@ end @testset "single-variable multi-date (download_era5_day)" begin # All hours of date1, date2 already on disk ds_sl = ERA5HourlySingleLevel() - for dt in (date1, date2) - touch_expected(:temperature, ds_sl, dt) - end + expected = [touch_expected(:temperature, ds_sl, dt) for dt in (date1, date2)] - # Returns nothing without raising — the early-return guard fires - @test CDSExt.download_era5_day(:temperature, ds_sl, [date1, date2]; - region, dir=tmp, - skip_existing=true, cleanup=true) === nothing + # Returns the existing paths without raising — the early-return guard fires + result = CDSExt.download_era5_day(:temperature, ds_sl, [date1, date2]; + region, dir=tmp, + skip_existing=true, cleanup=true) + @test result isa Vector{String} + @test Set(result) == Set(expected) end @testset "multi-variable multi-date (download_era5_multivar_day)" begin ds_sl = ERA5HourlySingleLevel() - for name in names, dt in (date1, date2) - touch_expected(name, ds_sl, dt) - end + expected = [touch_expected(name, ds_sl, dt) for name in names for dt in (date1, date2)] - @test CDSExt.download_era5_multivar_day(names, ds_sl, [date1, date2]; - region, dir=tmp, - skip_existing=true, cleanup=true) === nothing + result = CDSExt.download_era5_multivar_day(names, ds_sl, [date1, date2]; + region, dir=tmp, + skip_existing=true, cleanup=true) + @test result isa Vector{String} + @test Set(result) == Set(expected) end end end From 17e3565928c6bfd9784f410a046073764c904ac0 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Fri, 1 May 2026 15:31:29 -0600 Subject: [PATCH 4/4] Test multi-day path collection in ERA5 download_dataset wrappers The existing skip-existing tests only exercised single-calendar-day helper calls. Add tests that span two calendar days for the three parent dispatches (ERA5Metadata, ERA5PressureMetadata + names, and names + dataset + datetimes) so a regression that drops paths from one day group would fail loudly. Co-Authored-By: Claude Opus 4.7 (1M context) --- test/test_cds_downloading.jl | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/test/test_cds_downloading.jl b/test/test_cds_downloading.jl index 9670c60c5..98d52c324 100644 --- a/test/test_cds_downloading.jl +++ b/test/test_cds_downloading.jl @@ -596,6 +596,41 @@ end @test result isa Vector{String} @test Set(result) == Set(expected) end + + # Dates spanning two calendar days — exercises the parents' + # path-collection across multiple `_group_by_calendar_day` groups. + # Catches regressions that drop or overwrite paths from one group. + date_day1 = DateTime(2005, 2, 16, 12) + date_day2 = DateTime(2005, 2, 17, 6) + + @testset "ERA5Metadata parent (multi-day)" begin + ds_sl = ERA5HourlySingleLevel() + expected = [touch_expected(:temperature, ds_sl, dt) for dt in (date_day1, date_day2)] + meta = Metadata(:temperature; dataset=ds_sl, dates=[date_day1, date_day2], region, dir=tmp) + + result = download_dataset(meta; skip_existing=true) + @test result isa Vector{String} + @test Set(result) == Set(expected) + end + + @testset "ERA5PressureMetadata parent (multi-day, multi-name)" begin + expected = [touch_expected(name, ds_pl, dt) for name in names for dt in (date_day1, date_day2)] + meta = Metadata(:temperature; dataset=ds_pl, dates=[date_day1, date_day2], region, dir=tmp) + + result = download_dataset(names, meta; skip_existing=true) + @test result isa Vector{String} + @test Set(result) == Set(expected) + end + + @testset "names + dataset + datetimes convenience overload (multi-day)" begin + ds_sl = ERA5HourlySingleLevel() + expected = [touch_expected(name, ds_sl, dt) for name in names for dt in (date_day1, date_day2)] + + result = download_dataset(names, ds_sl, [date_day1, date_day2]; + region, dir=tmp, skip_existing=true, cleanup=true) + @test result isa Vector{String} + @test Set(result) == Set(expected) + end end end