From 4078d8c5c0064a1efe5dc461b4986022fe8af256 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Tue, 3 Mar 2026 11:08:15 -0700 Subject: [PATCH 01/53] Implement ERA5PressureDataset, ERA5HourlyPressureLevels, ERA5MonthlyPressureLevels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Updated exports to include ERA5HourlyPressureLevels, ERA5MonthlyPressureLevels, ERA5_all_pressure_levels, pressure_field - Added using Statistics, Oceananigans.Fields.CenterField/interior, Oceananigans.BoundaryConditions.fill_halo_regions!, native_grid, InverseGravity to imports - Extended import block to include is_three_dimensional, reversed_vertical_axis, conversion_units - Added ERA5_all_pressure_levels constant (37 standard hPa levels) - Added ERA5PressureDataset, ERA5HourlyPressureLevels, ERA5MonthlyPressureLevels with keyword constructors - Added ERA5PressureMetadata{D} and ERA5PressureMetadatum type aliases - Added Base.size, all_dates, is_three_dimensional, reversed_vertical_axis dispatches - Added ERA5PL_dataset_variable_names and ERA5PL_netcdf_variable_names dicts (15 variables each) - Added available_variables, dataset_variable_name, netcdf_variable_name, conversion_units dispatches - Added retrieve_data(::ERA5PressureMetadatum) — reads 4D NetCDF, reverses vertical axis - Added metadata_prefix(::ERA5PressureMetadata) — uses ERA5PL_dataset_variable_names for filename construction - Added _std_atm_geopotential_height, _std_atm_z_interfaces, z_interfaces(::ERA5PressureMetadata) - Added pressure_field and mean_geopotential_heights --- src/DataWrangling/ERA5/ERA5.jl | 216 ++++++++++++++++++++++++++++++++- 1 file changed, 213 insertions(+), 3 deletions(-) diff --git a/src/DataWrangling/ERA5/ERA5.jl b/src/DataWrangling/ERA5/ERA5.jl index b69d68d2f..112b76297 100644 --- a/src/DataWrangling/ERA5/ERA5.jl +++ b/src/DataWrangling/ERA5/ERA5.jl @@ -1,13 +1,16 @@ module ERA5 export ERA5Hourly, ERA5Monthly +export ERA5HourlyPressureLevels, ERA5MonthlyPressureLevels, ERA5_all_pressure_levels, pressure_field using NCDatasets using Printf using Scratch +using Statistics -using Oceananigans.Fields: Center -using NumericalEarth.DataWrangling: Metadata, Metadatum, metadata_path +using Oceananigans.Fields: Center, CenterField, interior +using Oceananigans.BoundaryConditions: fill_halo_regions! +using NumericalEarth.DataWrangling: Metadata, Metadatum, metadata_path, native_grid, InverseGravity using Dates using Dates: DateTime, Day, Month, Hour @@ -24,7 +27,10 @@ import NumericalEarth.DataWrangling: inpainted_metadata_path, available_variables, retrieve_data, - metadata_path + metadata_path, + is_three_dimensional, + reversed_vertical_axis, + conversion_units import Base: eltype @@ -48,6 +54,37 @@ struct ERA5Monthly <: ERA5Dataset end dataset_name(::ERA5Hourly) = "ERA5Hourly" dataset_name(::ERA5Monthly) = "ERA5Monthly" +##### +##### ERA5 pressure-level datasets +##### + +const ERA5_all_pressure_levels = [1, 2, 3, 5, 7, 10, 20, 30, 50, 70, 100, 125, 150, + 175, 200, 225, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 775, 800, + 825, 850, 875, 900, 925, 950, 975, 1000] + +abstract type ERA5PressureDataset <: ERA5Dataset end + +struct ERA5HourlyPressureLevels <: ERA5PressureDataset + levels :: Vector{Int} +end +ERA5HourlyPressureLevels(; levels=ERA5_all_pressure_levels) = ERA5HourlyPressureLevels(levels) + +struct ERA5MonthlyPressureLevels <: ERA5PressureDataset + levels :: Vector{Int} +end +ERA5MonthlyPressureLevels(; levels=ERA5_all_pressure_levels) = ERA5MonthlyPressureLevels(levels) + +dataset_name(::ERA5HourlyPressureLevels) = "ERA5HourlyPressureLevels" +dataset_name(::ERA5MonthlyPressureLevels) = "ERA5MonthlyPressureLevels" + +Base.size(ds::ERA5PressureDataset, variable) = (1440, 721, length(ds.levels)) + +all_dates(::ERA5HourlyPressureLevels, var) = range(DateTime("1940-01-01"), stop=DateTime("2024-12-31"), step=Hour(1)) +all_dates(::ERA5MonthlyPressureLevels, var) = range(DateTime("1940-01-01"), stop=DateTime("2024-12-01"), step=Month(1)) + +const ERA5PressureMetadata{D} = Metadata{<:ERA5PressureDataset, D} +const ERA5PressureMetadatum = Metadatum{<:ERA5PressureDataset} + # Wave variables are on a 0.5° grid (720×361), atmospheric variables on 0.25° (1440×721) const ERA5_wave_variables = Set([ :eastward_stokes_drift, :northward_stokes_drift, @@ -72,6 +109,12 @@ const ERA5Metadatum = Metadatum{<:ERA5Dataset} # ERA5 is a spatially 2D dataset (atmospheric surface variables) is_three_dimensional(::ERA5Metadata) = false +# ERA5 pressure-level data is 3D +is_three_dimensional(::ERA5PressureMetadata) = true + +# ERA5 stores pressure levels top-to-bottom; reverse to match Oceananigans bottom-to-top +reversed_vertical_axis(::ERA5PressureDataset) = true + # Variable name mappings from NumericalEarth names to ERA5/CDS API variable names ERA5_dataset_variable_names = Dict( :temperature => "2m_temperature", @@ -126,6 +169,53 @@ ERA5_netcdf_variable_names = Dict( netcdf_variable_name(metadata::ERA5Metadata) = ERA5_netcdf_variable_names[metadata.name] +##### +##### ERA5 pressure-level variable name mappings +##### + +ERA5PL_dataset_variable_names = Dict( + :temperature => "temperature", + :eastward_velocity => "u_component_of_wind", + :northward_velocity => "v_component_of_wind", + :vertical_velocity => "vertical_velocity", + :geopotential => "geopotential", + :geopotential_height => "geopotential", + :specific_humidity => "specific_humidity", + :relative_humidity => "relative_humidity", + :vorticity => "vorticity", + :divergence => "divergence", + :potential_vorticity => "potential_vorticity", + :ozone_mass_mixing_ratio => "ozone_mass_mixing_ratio", + :fraction_of_cloud_cover => "fraction_of_cloud_cover", + :specific_cloud_liquid_water_content => "specific_cloud_liquid_water_content", + :specific_cloud_ice_water_content => "specific_cloud_ice_water_content", +) + +ERA5PL_netcdf_variable_names = Dict( + :temperature => "t", + :eastward_velocity => "u", + :northward_velocity => "v", + :vertical_velocity => "w", + :geopotential => "z", + :geopotential_height => "z", + :specific_humidity => "q", + :relative_humidity => "r", + :vorticity => "vo", + :divergence => "d", + :potential_vorticity => "pv", + :ozone_mass_mixing_ratio => "o3", + :fraction_of_cloud_cover => "cc", + :specific_cloud_liquid_water_content => "clwc", + :specific_cloud_ice_water_content => "ciwc", +) + +available_variables(::ERA5PressureDataset) = ERA5PL_dataset_variable_names +dataset_variable_name(md::ERA5PressureMetadata) = ERA5PL_dataset_variable_names[md.name] +netcdf_variable_name(md::ERA5PressureMetadata) = ERA5PL_netcdf_variable_names[md.name] + +conversion_units(md::ERA5PressureMetadatum) = + md.name == :geopotential_height ? InverseGravity() : nothing + """ retrieve_data(metadata::ERA5Metadatum) @@ -158,6 +248,22 @@ function retrieve_data(metadata::ERA5Metadatum) return reshape(data_2d, size(data_2d, 1), size(data_2d, 2), 1) end +""" + retrieve_data(metadata::ERA5PressureMetadatum) + +Retrieve ERA5 pressure-level data from a NetCDF file. +Returns a 3D array (lon, lat, level) with levels ordered bottom-to-top +(highest pressure at k=1, lowest pressure at k=Nz). +""" +function retrieve_data(metadata::ERA5PressureMetadatum) + path = metadata_path(metadata) + name = netcdf_variable_name(metadata) + ds = NCDatasets.Dataset(path) + data = ds[name][:, :, :, 1] # (lon, lat, pressure_level, time=1) + close(ds) + return reverse(data, dims=3) # flip from ERA5 top-to-bottom to Oceananigans bottom-to-top +end + ##### ##### Metadata filename construction ##### @@ -186,6 +292,27 @@ function bbox_strs(c) return first, second end +function metadata_prefix(metadata::ERA5PressureMetadata) + var = ERA5PL_dataset_variable_names[metadata.name] + dataset = dataset_name(metadata.dataset) + start_date = start_date_str(metadata.dates) + end_date = end_date_str(metadata.dates) + bbox = metadata.bounding_box + + if !isnothing(bbox) + w, e = bbox_strs(bbox.longitude) + s, n = bbox_strs(bbox.latitude) + suffix = string(w, e, s, n) + else + suffix = "" + end + + prefix = string(var, "_", dataset, "_", start_date, "_", end_date, suffix) + prefix = colon2dash(prefix) + prefix = underscore_spaces(prefix) + return prefix +end + function metadata_prefix(metadata::ERA5Metadata) var = ERA5_dataset_variable_names[metadata.name] dataset = dataset_name(metadata.dataset) @@ -240,5 +367,88 @@ z_interfaces(::ERA5Metadata) = (0, 1) # ERA5 data is stored as Float32 eltype(::ERA5Metadata) = Float32 +##### +##### Pressure-level vertical coordinate +##### + +# Standard atmosphere height (m) for a given pressure (hPa) +_std_atm_geopotential_height(p_hPa) = (287.05 * 288.15 / 9.80665) * log(1013.25 / p_hPa) + +# Build z-interfaces (Nz+1 values) from pressure levels. +# Levels may be in any order; output is sorted so k=1 is highest pressure (lowest altitude). +function _std_atm_z_interfaces(levels) + sorted_levels = sort(levels, rev=true) # highest pressure first → k=1 is bottom + heights = _std_atm_geopotential_height.(Float64.(sorted_levels)) + Nz = length(heights) + + interfaces = Vector{Float64}(undef, Nz + 1) + + if Nz == 1 + interfaces[1] = max(0.0, heights[1] - 500.0) + interfaces[2] = heights[1] + 500.0 + return interfaces + end + + interfaces[1] = max(0.0, heights[1] - (heights[2] - heights[1]) / 2) + for k in 2:Nz + interfaces[k] = (heights[k-1] + heights[k]) / 2 + end + interfaces[Nz+1] = heights[Nz] + (heights[Nz] - heights[Nz-1]) / 2 + + return interfaces +end + +z_interfaces(metadata::ERA5PressureMetadata) = _std_atm_z_interfaces(metadata.dataset.levels) + +##### +##### pressure_field — synthetic pressure coordinate field +##### + +""" + pressure_field(metadata::ERA5PressureMetadatum, arch=CPU(); halo=(3,3,3)) + +Return a `CenterField` on the native grid of `metadata` filled with the pressure +value (hPa) at each vertical level. Levels are ordered bottom-to-top (k=1 is the +highest pressure level). +""" +function pressure_field(metadata::ERA5PressureMetadatum, arch=CPU(); halo=(3,3,3)) + grid = native_grid(metadata, arch; halo) + field = CenterField(grid) + reversed_levels = sort(metadata.dataset.levels, rev=true) # highest pressure → k=1 + for (k, p) in enumerate(reversed_levels) + interior(field)[:, :, k] .= Float32(p) + end + fill_halo_regions!(field) + return field +end + +##### +##### mean_geopotential_heights — data-derived static z-coordinate +##### + +""" + mean_geopotential_heights(metadata::ERA5PressureMetadata; arch=CPU()) + +Compute spatially and temporally averaged geopotential heights (m) for each +pressure level in `metadata`. This provides more accurate z-coordinates than +the standard-atmosphere fallback used by `z_interfaces`. + +Downloads the `:geopotential` field for every date in `metadata`, divides by g, +averages over the horizontal domain and all dates, and returns one representative +height per pressure level in bottom-to-top order (k=1 is highest pressure). +""" +function mean_geopotential_heights(metadata::ERA5PressureMetadata; arch=CPU()) + geo_metadata = Metadata(:geopotential; dataset=metadata.dataset, + dates=metadata.dates, bounding_box=metadata.bounding_box, + dir=metadata.dir) + heights = zeros(length(metadata.dataset.levels)) + for geo_datum in geo_metadata + data = retrieve_data(geo_datum) ./ Float32(9.80665) # Φ → Z (m) + heights .+= dropdims(mean(data; dims=(1, 2)); dims=(1, 2)) + end + heights ./= length(geo_metadata) + return heights +end + end # module ERA5 From 54a2a0c4d74435116f936ab18de2876c5315e3f0 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Tue, 3 Mar 2026 11:11:21 -0700 Subject: [PATCH 02/53] Add ERA5*PressureLevels exports --- src/DataWrangling/DataWrangling.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/DataWrangling/DataWrangling.jl b/src/DataWrangling/DataWrangling.jl index 6bbe3dccd..9db7553e0 100644 --- a/src/DataWrangling/DataWrangling.jl +++ b/src/DataWrangling/DataWrangling.jl @@ -8,7 +8,7 @@ export Metadata, Metadatum, ECCOMetadatum, EN4Metadatum, all_dates, first_date, export metadata_time_step, metadata_epoch export LinearlyTaperedPolarMask export DatasetRestoring -export ERA5Hourly, ERA5Monthly +export ERA5Hourly, ERA5Monthly, ERA5HourlyPressureLevels, ERA5MonthlyPressureLevels using Oceananigans using Downloads From 8fede163943b4764e19147a476ae2f5d2b8c6ee4 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Tue, 3 Mar 2026 11:12:44 -0700 Subject: [PATCH 03/53] Add InverseGravity for geopotential height calc --- src/DataWrangling/metadata.jl | 1 + src/DataWrangling/metadata_field.jl | 1 + 2 files changed, 2 insertions(+) diff --git a/src/DataWrangling/metadata.jl b/src/DataWrangling/metadata.jl index b4de44791..09e90c2a5 100644 --- a/src/DataWrangling/metadata.jl +++ b/src/DataWrangling/metadata.jl @@ -297,6 +297,7 @@ struct NanomolePerKilogram end struct NanomolePerLiter end struct InverseSign end +struct InverseGravity end struct GramPerKilogramMinus35 end # Salinity anomaly struct MilliliterPerLiter end # Sometimes for disssolved_oxygen diff --git a/src/DataWrangling/metadata_field.jl b/src/DataWrangling/metadata_field.jl index 7107ca8be..846e6dcc0 100644 --- a/src/DataWrangling/metadata_field.jl +++ b/src/DataWrangling/metadata_field.jl @@ -248,6 +248,7 @@ end @inline convert_units(C::FT, ::Union{NanomolePerLiter, NanomolePerKilogram}) where FT = C * convert(FT, 1e-6) @inline convert_units(C::FT, ::MilliliterPerLiter) where FT = C / convert(FT, 22.3916) @inline convert_units(C::FT, ::GramPerKilogramMinus35) where FT = C + convert(FT, 35) +@inline convert_units(Φ::FT, ::InverseGravity) where FT = Φ / convert(FT, 9.80665) ##### From 13e4fab5f83da180004793af52ffc58d1b1ad600 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Tue, 3 Mar 2026 11:16:04 -0700 Subject: [PATCH 04/53] Add ERA5 pressure-levels downloader --- ext/NumericalEarthCDSAPIExt.jl | 50 ++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/ext/NumericalEarthCDSAPIExt.jl b/ext/NumericalEarthCDSAPIExt.jl index b450dabe9..52f54d301 100644 --- a/ext/NumericalEarthCDSAPIExt.jl +++ b/ext/NumericalEarthCDSAPIExt.jl @@ -8,6 +8,7 @@ using Oceananigans.DistributedComputations: @root using Dates using NumericalEarth.DataWrangling.ERA5: ERA5Metadata, ERA5Metadatum, ERA5_dataset_variable_names +using NumericalEarth.DataWrangling.ERA5: ERA5PressureMetadata, ERA5PressureMetadatum, ERA5PL_dataset_variable_names import NumericalEarth.DataWrangling: download_dataset @@ -90,6 +91,55 @@ function download_dataset(meta::ERA5Metadatum; skip_existing=true) return output_path end +##### +##### ERA5 pressure-level download +##### + +function download_dataset(metadata::ERA5PressureMetadata; kwargs...) + paths = Array{String}(undef, length(metadata)) + for (m, metadatum) in enumerate(metadata) + paths[m] = download_dataset(metadatum; kwargs...) + end + return paths +end + +function download_dataset(meta::ERA5PressureMetadatum; skip_existing=true) + output_directory = meta.dir + output_filename = NumericalEarth.DataWrangling.metadata_filename(meta) + output_path = joinpath(output_directory, output_filename) + + skip_existing && isfile(output_path) && return output_path + mkpath(output_directory) + + variable_name = ERA5PL_dataset_variable_names[meta.name] + date = meta.dates + year = string(Dates.year(date)) + month = lpad(string(Dates.month(date)), 2, '0') + day = lpad(string(Dates.day(date)), 2, '0') + hour = lpad(string(Dates.hour(date)), 2, '0') * ":00" + + request = Dict( + "product_type" => ["reanalysis"], + "variable" => [variable_name], + "pressure_level" => [string(p) for p in meta.dataset.levels], + "year" => [year], + "month" => [month], + "day" => [day], + "time" => [hour], + "data_format" => "netcdf", + "download_format" => "unarchived", + ) + + area = build_era5_area(meta.bounding_box) + isnothing(area) || (request["area"] = area) + + @root begin + CDSAPI.retrieve("reanalysis-era5-pressure-levels", request, output_path) + end + + return output_path +end + ##### ##### Area/bounding box utilities ##### From afdd2272af285545628595c6274265d2b1d04957 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Tue, 3 Mar 2026 11:20:28 -0700 Subject: [PATCH 05/53] Add CDS tests, verified 95/95 pass --- test/test_cds_downloading.jl | 114 ++++++++++++++++++++++++++++++++++- 1 file changed, 113 insertions(+), 1 deletion(-) diff --git a/test/test_cds_downloading.jl b/test/test_cds_downloading.jl index 3e5d8fa9e..6fb4c62ca 100644 --- a/test/test_cds_downloading.jl +++ b/test/test_cds_downloading.jl @@ -5,6 +5,9 @@ using NCDatasets using NumericalEarth.DataWrangling.ERA5 using NumericalEarth.DataWrangling.ERA5: ERA5Hourly, ERA5Monthly, ERA5_dataset_variable_names +using NumericalEarth.DataWrangling.ERA5: ERA5HourlyPressureLevels, ERA5MonthlyPressureLevels, + ERA5_all_pressure_levels, ERA5PL_dataset_variable_names, + pressure_field using NumericalEarth.DataWrangling: metadata_path, download_dataset using Dates @@ -127,13 +130,53 @@ start_date = DateTime(2005, 2, 16, 12) @testset "ERA5 Monthly dataset" begin monthly_dataset = ERA5Monthly() @test monthly_dataset isa ERA5Monthly - + # Test that all_dates returns a valid range dates = NumericalEarth.DataWrangling.all_dates(monthly_dataset, :temperature) @test first(dates) == DateTime("1940-01-01") @test step(dates) == Month(1) end + @testset "ERA5HourlyPressureLevels construction and metadata" begin + # Default constructor uses all 37 standard levels + ds_full = ERA5HourlyPressureLevels() + @test ds_full isa ERA5HourlyPressureLevels + @test length(ds_full.levels) == 37 + @test Base.size(ds_full, :temperature) == (1440, 721, 37) + + # Subset constructor + ds_sub = ERA5HourlyPressureLevels(levels=[850, 500]) + @test Base.size(ds_sub, :temperature) == (1440, 721, 2) + + # Monthly variant + ds_monthly = ERA5MonthlyPressureLevels() + @test ds_monthly isa ERA5MonthlyPressureLevels + + # Metadatum size propagates Nz correctly + meta = Metadatum(:temperature; dataset=ds_sub, bounding_box=bounding_box, date=start_date) + Nx, Ny, Nz, Nt = size(meta) + @test Nz == 2 + @test NumericalEarth.DataWrangling.ERA5.is_three_dimensional(meta) == true + + # Variable name lookups + @test ERA5PL_dataset_variable_names[:temperature] == "temperature" + @test ERA5PL_dataset_variable_names[:geopotential_height] == "geopotential" + end + + @testset "ERA5 pressure-level z_interfaces (standard atmosphere)" begin + levels_2 = [850, 500] + z = NumericalEarth.DataWrangling.ERA5._std_atm_z_interfaces(levels_2) + @test length(z) == 3 # Nz+1 interfaces + @test issorted(z) # monotonically increasing with altitude + # 850 hPa ≈ 1457 m, 500 hPa ≈ 5575 m + @test z[1] < 1457.0 < z[2] < 5575.0 < z[3] + + # Single level + z1 = NumericalEarth.DataWrangling.ERA5._std_atm_z_interfaces([500]) + @test length(z1) == 2 + @test z1[1] < z1[2] + end + for arch in test_architectures A = typeof(arch) @@ -195,4 +238,73 @@ start_date = DateTime(2005, 2, 16, 12) isfile(inpainted_path) && rm(inpainted_path; force=true) end end + + @testset "ERA5 pressure-level download and Field on CPU" begin + arch = CPU() + ds_pl = ERA5HourlyPressureLevels(levels=[850, 500]) + + @testset "Download and 3D Field" begin + meta = Metadatum(:temperature; dataset=ds_pl, bounding_box, date=start_date) + filepath = metadata_path(meta) + isfile(filepath) && rm(filepath; force=true) + + download_dataset(meta) + @test isfile(filepath) + + # Verify the NetCDF has a pressure_level dimension and the right variable + ds_nc = NCDataset(filepath) + @test haskey(ds_nc, "t") + @test haskey(ds_nc, "pressure_level") || haskey(ds_nc, "level") + close(ds_nc) + + f = Field(meta, arch) + @test f isa Field + Nx, Ny, Nz = size(f) + @test Nz == 2 + + @allowscalar begin + @test !all(iszero, interior(f)) + # Temperature at these levels should be in a plausible range (K) + @test all(x -> 180 < x < 340, filter(!isnan, vec(interior(f)))) + end + + rm(filepath; force=true) + inpainted_path = NumericalEarth.DataWrangling.inpainted_metadata_path(meta) + isfile(inpainted_path) && rm(inpainted_path; force=true) + end + + @testset "Geopotential height conversion" begin + meta_z = Metadatum(:geopotential_height; dataset=ds_pl, bounding_box, date=start_date) + filepath = metadata_path(meta_z) + isfile(filepath) && rm(filepath; force=true) + + download_dataset(meta_z) + fz = Field(meta_z, arch) + + @allowscalar begin + max_z = maximum(filter(!isnan, vec(interior(fz)))) + # 500 hPa geopotential height ≈ 5500 m + @test 4000 < max_z < 7000 + end + + rm(filepath; force=true) + inpainted_path = NumericalEarth.DataWrangling.inpainted_metadata_path(meta_z) + isfile(inpainted_path) && rm(inpainted_path; force=true) + end + + @testset "pressure_field" begin + meta = Metadatum(:temperature; dataset=ds_pl, bounding_box, date=start_date) + pf = pressure_field(meta, arch) + @test pf isa Field + Nx, Ny, Nz = size(pf) + @test Nz == 2 + + @allowscalar begin + # k=1 should be 850 hPa (highest pressure, lowest altitude) + @test interior(pf)[1, 1, 1] ≈ 850.0f0 + # k=2 should be 500 hPa + @test interior(pf)[1, 1, 2] ≈ 500.0f0 + end + end + end end From c09f6daf8695e6f80d28b07306f0fcebcf29cf79 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Tue, 3 Mar 2026 13:00:41 -0700 Subject: [PATCH 06/53] Fix imports --- src/DataWrangling/ERA5/ERA5.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/DataWrangling/ERA5/ERA5.jl b/src/DataWrangling/ERA5/ERA5.jl index 112b76297..e6c688fd3 100644 --- a/src/DataWrangling/ERA5/ERA5.jl +++ b/src/DataWrangling/ERA5/ERA5.jl @@ -8,8 +8,8 @@ using Printf using Scratch using Statistics -using Oceananigans.Fields: Center, CenterField, interior -using Oceananigans.BoundaryConditions: fill_halo_regions! +using Oceananigans.Fields: Center +using Oceananigans: CenterField, interior, fill_halo_regions!, CPU using NumericalEarth.DataWrangling: Metadata, Metadatum, metadata_path, native_grid, InverseGravity using Dates using Dates: DateTime, Day, Month, Hour From 9c4893705bcb9bf61e1fdc4f525a342c0373dfcd Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Tue, 3 Mar 2026 13:34:45 -0700 Subject: [PATCH 07/53] Add day, hour to date_str for filenaming --- src/DataWrangling/ERA5/ERA5.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/DataWrangling/ERA5/ERA5.jl b/src/DataWrangling/ERA5/ERA5.jl index e6c688fd3..a7664625d 100644 --- a/src/DataWrangling/ERA5/ERA5.jl +++ b/src/DataWrangling/ERA5/ERA5.jl @@ -271,7 +271,9 @@ end function date_str(date::DateTime) y = Dates.year(date) m = lpad(Dates.month(date), 2, '0') - return "$(y)-$(m)" + d = lpad(Dates.day(date), 2, '0') + h = lpad(Dates.hour(date), 2, '0') + return "$(y)-$(m)-$(d)T$(h)" end start_date_str(date::DateTime) = date_str(date) From aec39c801d75d58ba86e881b3fc9ef5dc6302ea2 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Tue, 3 Mar 2026 15:55:54 -0700 Subject: [PATCH 08/53] Allow simultaneous download of multiple variables --- ext/NumericalEarthCDSAPIExt.jl | 174 ++++++++++++++++++++++++++++++++- 1 file changed, 173 insertions(+), 1 deletion(-) diff --git a/ext/NumericalEarthCDSAPIExt.jl b/ext/NumericalEarthCDSAPIExt.jl index 52f54d301..025e8b0c8 100644 --- a/ext/NumericalEarthCDSAPIExt.jl +++ b/ext/NumericalEarthCDSAPIExt.jl @@ -2,16 +2,26 @@ module NumericalEarthCDSAPIExt using NumericalEarth using CDSAPI +using NCDatasets using Oceananigans using Oceananigans.DistributedComputations: @root using Dates using NumericalEarth.DataWrangling.ERA5: ERA5Metadata, ERA5Metadatum, ERA5_dataset_variable_names -using NumericalEarth.DataWrangling.ERA5: ERA5PressureMetadata, ERA5PressureMetadatum, ERA5PL_dataset_variable_names +using NumericalEarth.DataWrangling.ERA5: ERA5PressureDataset, + ERA5PressureMetadata, ERA5PressureMetadatum, + ERA5PL_dataset_variable_names, ERA5PL_netcdf_variable_names import NumericalEarth.DataWrangling: download_dataset +# Coordinate / dimension variables to propagate into each split file +const ERA5PL_COORD_VARS = Set(["longitude", "latitude", + "pressure_level", "level", + "time", "valid_time", + "expver", # experiment version: ==1 for final data, ==5 preliminary 5-day data + "number"]) # ensemble member + """ download_dataset(metadata::ERA5Metadata; kwargs...) @@ -140,6 +150,168 @@ function download_dataset(meta::ERA5PressureMetadatum; skip_existing=true) return output_path end +##### +##### Multi-variable ERA5 pressure-level download +##### + +""" + download_dataset(names::Vector{Symbol}, metadata::ERA5PressureMetadata; kwargs...) + +Download multiple ERA5 pressure-level variables for each date in `metadata`. +""" +function download_dataset(names::Vector{Symbol}, metadata::ERA5PressureMetadata; kwargs...) + for metadatum in metadata + download_dataset(names, metadatum; kwargs...) + end + return nothing +end + +""" + download_dataset(names::Vector{Symbol}, meta::ERA5PressureMetadatum; skip_existing=true) + +Download multiple ERA5 pressure-level variables for a single date in one CDS API request. + +The multi-variable NetCDF returned by CDS is split into individual per-variable files, each +stored at the path returned by `metadata_path(Metadatum(name; dataset=meta.dataset, ...))`. +This keeps subsequent calls to `Field` and `retrieve_data` unchanged. +""" +function download_dataset(names::Vector{Symbol}, meta::ERA5PressureMetadatum; skip_existing=true) + # Build (symbol → output path) pairs using the standard single-variable filename scheme + name_path_pairs = [(name, NumericalEarth.DataWrangling.metadata_path( + NumericalEarth.DataWrangling.Metadatum(name; + dataset = meta.dataset, + bounding_box = meta.bounding_box, + date = meta.dates, + dir = meta.dir))) + for name in names] + + # Determine which output files are missing + pending = if skip_existing + [(n, p) for (n, p) in name_path_pairs if !isfile(p)] + else + name_path_pairs + end + + isempty(pending) && return [path for (_, path) in name_path_pairs] + + # Unique CDS variable names (:geopotential and :geopotential_height share one CDS field) + cds_vars = unique([ERA5PL_dataset_variable_names[name] for (name, _) in pending]) + + date = meta.dates + year = string(Dates.year(date)) + month = lpad(string(Dates.month(date)), 2, '0') + day = lpad(string(Dates.day(date)), 2, '0') + hour = lpad(string(Dates.hour(date)), 2, '0') * ":00" + + request = Dict( + "product_type" => ["reanalysis"], + "variable" => cds_vars, + "pressure_level" => [string(p) for p in meta.dataset.levels], + "year" => [year], + "month" => [month], + "day" => [day], + "time" => [hour], + "data_format" => "netcdf", + "download_format" => "unarchived", + ) + + area = build_era5_area(meta.bounding_box) + isnothing(area) || (request["area"] = area) + + mkpath(meta.dir) + tmp_path = joinpath(meta.dir, "_tmp_multi_$(year)$(month)$(day)T$(hour[1:2]).nc") + + nc_name_path_pairs = [(ERA5PL_netcdf_variable_names[name], path) for (name, path) in pending] + + @root begin + CDSAPI.retrieve("reanalysis-era5-pressure-levels", request, tmp_path) + _split_era5pl_nc(tmp_path, nc_name_path_pairs) + rm(tmp_path; force=true) + end + + return [path for (_, path) in name_path_pairs] +end + +""" + download_dataset(names::Vector{Symbol}, dataset::ERA5PressureDataset; + date, bounding_box=nothing, dir=default_download_directory(dataset)) + +Download multiple ERA5 pressure-level variables without requiring a dummy `Metadatum`. + +# Example +```julia +ds = ERA5HourlyPressureLevels(levels=[850, 500]) +download_dataset([:temperature, :geopotential_height, :eastward_velocity], ds; + date=DateTime(2020, 6, 15, 0), bounding_box=bbox) +``` +""" +function download_dataset(names::Vector{Symbol}, dataset::ERA5PressureDataset; + date, + bounding_box = nothing, + dir = NumericalEarth.DataWrangling.default_download_directory(dataset)) + meta = NumericalEarth.DataWrangling.Metadatum(first(names); dataset, date, bounding_box, dir) + return download_dataset(names, meta) +end + +""" + _split_era5pl_nc(src_path, nc_name_path_pairs) + +Split a multi-variable ERA5 pressure-level NetCDF at `src_path` into individual +per-variable files. `nc_name_path_pairs` is a vector of `(nc_varname, dst_path)` tuples. + +Each output file contains all coordinate/dimension variables plus the one data variable. +Global attributes and dimension definitions are preserved. Raw data (including scale/offset +encoding) is copied verbatim so that downstream readers decode values identically. +""" +function _split_era5pl_nc(src_path, nc_name_path_pairs) + NCDatasets.Dataset(src_path, "r") do src + for (nc_varname, dst_path) in nc_name_path_pairs + NCDatasets.Dataset(dst_path, "c") do dst + # Define dimensions + unlimited = NCDatasets.unlimited(src) + for (dname, dlen) in src.dim + NCDatasets.defDim(dst, dname, dname in unlimited ? Inf : dlen) + end + + # Copy global attributes + for (k, v) in src.attrib + dst.attrib[k] = v + end + + # Copy coordinate variables and the one requested data variable + for (vname, var) in src + (vname in ERA5PL_COORD_VARS || vname == nc_varname) || continue + _ncvar_copy!(dst, var, vname) + end + end + end + end +end + +function _ncvar_copy!(dst, src_var, vname) + dims = NCDatasets.dimnames(src_var) + T = eltype(src_var.var) # raw storage type; eltype(src_var) gives CF-decoded DateTime + attribs = src_var.attrib + fill_val = haskey(attribs, "_FillValue") ? attribs["_FillValue"] : nothing + + dst_var = if isnothing(fill_val) + NCDatasets.defVar(dst, vname, T, dims) + else + NCDatasets.defVar(dst, vname, T, dims; fillvalue=fill_val) + end + + # Copy variable attributes (skip _FillValue — handled by defVar above) + for (k, v) in attribs + k == "_FillValue" && continue + dst_var.attrib[k] = v + end + + # Copy raw encoded data, bypassing scale/offset transforms + dst_var.var[:] = src_var.var[:] + + return nothing +end + ##### ##### Area/bounding box utilities ##### From 03efebcf613657f0474b77ffbaf693e595bfc253 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Tue, 3 Mar 2026 16:25:43 -0700 Subject: [PATCH 09/53] Aggregate hourly downloads into API requests per calendar day --- ext/NumericalEarthCDSAPIExt.jl | 140 +++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) diff --git a/ext/NumericalEarthCDSAPIExt.jl b/ext/NumericalEarthCDSAPIExt.jl index 025e8b0c8..390191bc8 100644 --- a/ext/NumericalEarthCDSAPIExt.jl +++ b/ext/NumericalEarthCDSAPIExt.jl @@ -312,6 +312,146 @@ function _ncvar_copy!(dst, src_var, vname) return nothing end +""" + download_dataset(names::Vector{Symbol}, dataset::ERA5PressureDataset, dates::AbstractVector; + bounding_box=nothing, dir=default_download_directory(dataset), skip_existing=true) + +Download multiple ERA5 pressure-level variables for a vector of `dates`, issuing one CDS API +request per unique calendar day (which may contain multiple hours). + +# Example +```julia +ds = ERA5HourlyPressureLevels(levels=[850, 500]) +dates = DateTime(2020, 6, 15, 0) : Hour(6) : DateTime(2020, 6, 16, 18) +download_dataset([:temperature, :geopotential_height], ds, collect(dates); bounding_box=bbox) +``` +""" +function download_dataset(names::Vector{Symbol}, dataset::ERA5PressureDataset, + dates::AbstractVector; + bounding_box = nothing, + dir = NumericalEarth.DataWrangling.default_download_directory(dataset), + skip_existing = true) + + grouped = Dict(d => filter(dt -> Dates.Date(dt) == d, dates) + for d in unique(Dates.Date.(dates))) + + for day in sort(collect(keys(grouped))) + _download_era5pl_day(names, dataset, grouped[day]; bounding_box, dir, skip_existing) + end + + return nothing +end + +function _download_era5pl_day(names, dataset, day_dates; + bounding_box, dir, skip_existing) + + MDatum = NumericalEarth.DataWrangling.Metadatum + meta_path = NumericalEarth.DataWrangling.metadata_path + + # All (name, dt, output path) triples for this day + all_triples = [(name, dt, meta_path(MDatum(name; dataset, date=dt, bounding_box, dir))) + for name in names for dt in day_dates] + + pending = skip_existing ? filter(((_, _, p),) -> !isfile(p), all_triples) : all_triples + isempty(pending) && return nothing + + # Unique CDS variable names and hours needed + cds_vars = unique([ERA5PL_dataset_variable_names[name] for (name, _, _) in pending]) + sorted_dts = sort(unique([dt for (_, dt, _) in pending])) + hours_str = [lpad(string(Dates.hour(dt)), 2, '0') * ":00" for dt in sorted_dts] + dt_to_tidx = Dict(dt => i for (i, dt) in enumerate(sorted_dts)) + + dt = first(sorted_dts) + year = string(Dates.year(dt)) + month = lpad(string(Dates.month(dt)), 2, '0') + day = lpad(string(Dates.day(dt)), 2, '0') + + request = Dict( + "product_type" => ["reanalysis"], + "variable" => cds_vars, + "pressure_level" => [string(p) for p in dataset.levels], + "year" => [year], + "month" => [month], + "day" => [day], + "time" => hours_str, + "data_format" => "netcdf", + "download_format" => "unarchived", + ) + + area = build_era5_area(bounding_box) + isnothing(area) || (request["area"] = area) + + mkpath(dir) + tmp_path = joinpath(dir, "_tmp_multi_$(year)$(month)$(day).nc") + nc_triples = [(ERA5PL_netcdf_variable_names[name], dt_to_tidx[dt], path) + for (name, dt, path) in pending] + + @root begin + CDSAPI.retrieve("reanalysis-era5-pressure-levels", request, tmp_path) + _split_era5pl_nc_multistep(tmp_path, nc_triples) + rm(tmp_path; force=true) + end + + return nothing +end + +function _split_era5pl_nc_multistep(src_path, nc_varname_tidx_path_triples) + time_dimnames = Set(["time", "valid_time"]) + + NCDatasets.Dataset(src_path, "r") do src + unlimited = NCDatasets.unlimited(src) + + for (nc_varname, tidx, dst_path) in nc_varname_tidx_path_triples + NCDatasets.Dataset(dst_path, "c") do dst + # Collapse the time dimension to 1; keep all others + for (dname, dlen) in src.dim + out_len = dname in time_dimnames ? 1 : + dname in unlimited ? Inf : dlen + NCDatasets.defDim(dst, dname, out_len) + end + + for (k, v) in src.attrib + dst.attrib[k] = v + end + + for (vname, var) in src + (vname in ERA5PL_COORD_VARS || vname == nc_varname) || continue + _ncvar_copy_tslice!(dst, var, vname, tidx, time_dimnames) + end + end + end + end +end + +function _ncvar_copy_tslice!(dst, src_var, vname, tidx, time_dimnames) + dims = NCDatasets.dimnames(src_var) + T = eltype(src_var.var) + attribs = src_var.attrib + fill_val = haskey(attribs, "_FillValue") ? attribs["_FillValue"] : nothing + + dst_var = isnothing(fill_val) ? + NCDatasets.defVar(dst, vname, T, dims) : + NCDatasets.defVar(dst, vname, T, dims; fillvalue=fill_val) + + for (k, v) in attribs + k == "_FillValue" && continue + dst_var.attrib[k] = v + end + + # Slice time dimension; keep all others intact + has_time = any(d -> d in time_dimnames, dims) + if has_time + idx = ntuple(ndims(src_var.var)) do i + dims[i] in time_dimnames ? (tidx:tidx) : Colon() + end + dst_var.var[:] = src_var.var[idx...] + else + dst_var.var[:] = src_var.var[:] + end + + return nothing +end + ##### ##### Area/bounding box utilities ##### From f1de9e55c9174d303ac6fbb85c6ba11d7b61ec53 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Tue, 3 Mar 2026 20:06:55 -0700 Subject: [PATCH 10/53] Cleanup file naming --- src/DataWrangling/ERA5/ERA5.jl | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/DataWrangling/ERA5/ERA5.jl b/src/DataWrangling/ERA5/ERA5.jl index a7664625d..13e6723da 100644 --- a/src/DataWrangling/ERA5/ERA5.jl +++ b/src/DataWrangling/ERA5/ERA5.jl @@ -309,7 +309,11 @@ function metadata_prefix(metadata::ERA5PressureMetadata) suffix = "" end - prefix = string(var, "_", dataset, "_", start_date, "_", end_date, suffix) + if start_date == end_date + prefix = string(var, "_", dataset, "_", start_date, suffix) + else + prefix = string(var, "_", dataset, "_", start_date, "_", end_date, suffix) + end prefix = colon2dash(prefix) prefix = underscore_spaces(prefix) return prefix @@ -330,7 +334,11 @@ function metadata_prefix(metadata::ERA5Metadata) suffix = "" end - prefix = string(var, "_", dataset, "_", start_date, "_", end_date, suffix) + if start_date == end_date + prefix = string(var, "_", dataset, "_", start_date, suffix) + else + prefix = string(var, "_", dataset, "_", start_date, "_", end_date, suffix) + end prefix = colon2dash(prefix) prefix = underscore_spaces(prefix) return prefix From 916614e9d55aab470e02fbd6aa27db4a2e78b403 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Tue, 3 Mar 2026 23:22:19 -0700 Subject: [PATCH 11/53] Cleanup, clarify --- src/DataWrangling/ERA5/ERA5.jl | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/src/DataWrangling/ERA5/ERA5.jl b/src/DataWrangling/ERA5/ERA5.jl index 13e6723da..3c1b270fd 100644 --- a/src/DataWrangling/ERA5/ERA5.jl +++ b/src/DataWrangling/ERA5/ERA5.jl @@ -371,7 +371,7 @@ location(::ERA5Metadata) = (Center, Center, Center) longitude_interfaces(::ERA5Metadata) = (0, 360) latitude_interfaces(::ERA5Metadata) = (-90, 90) -# ERA5 is a 2D surface dataset, so z is a single level at the surface +# ERA5 single-levels (2-D) data product z_interfaces(::ERA5Metadata) = (0, 1) # ERA5 data is stored as Float32 @@ -381,25 +381,38 @@ eltype(::ERA5Metadata) = Float32 ##### Pressure-level vertical coordinate ##### -# Standard atmosphere height (m) for a given pressure (hPa) -_std_atm_geopotential_height(p_hPa) = (287.05 * 288.15 / 9.80665) * log(1013.25 / p_hPa) +const ERA5_gravitational_acceleration = 9.80665 + +# International Standard Atmosphere height (m) for a given pressure (hPa) +function standard_atmosphere_geopotential_height(p) + g = ERA5_gravitational_acceleration + T⁰ = 288.15 # K + p⁰ = 1013.25 # hPa + Rᵈ = 287.0528 # J/(kg-K) + + return (Rᵈ * T⁰ / g) * log(p⁰ / p) +end # Build z-interfaces (Nz+1 values) from pressure levels. # Levels may be in any order; output is sorted so k=1 is highest pressure (lowest altitude). -function _std_atm_z_interfaces(levels) +function standard_atmosphere_z_interfaces(levels) + @info """ + Calculating z-interfaces based on International Standard Atmosphere... + For greater accuracy, use `mean_geopotential_heights`! + """ sorted_levels = sort(levels, rev=true) # highest pressure first → k=1 is bottom - heights = _std_atm_geopotential_height.(Float64.(sorted_levels)) + heights = standard_atmosphere_geopotential_height.(Float64.(sorted_levels)) Nz = length(heights) interfaces = Vector{Float64}(undef, Nz + 1) if Nz == 1 - interfaces[1] = max(0.0, heights[1] - 500.0) - interfaces[2] = heights[1] + 500.0 + interfaces[1] = max(0, heights[1] - 500) + interfaces[2] = heights[1] + 500 return interfaces end - interfaces[1] = max(0.0, heights[1] - (heights[2] - heights[1]) / 2) + interfaces[1] = max(0, heights[1] - (heights[2] - heights[1]) / 2) for k in 2:Nz interfaces[k] = (heights[k-1] + heights[k]) / 2 end @@ -408,7 +421,8 @@ function _std_atm_z_interfaces(levels) return interfaces end -z_interfaces(metadata::ERA5PressureMetadata) = _std_atm_z_interfaces(metadata.dataset.levels) +# ERA5 pressure-levels (3-D) data product +z_interfaces(metadata::ERA5PressureMetadata) = standard_atmosphere_z_interfaces(metadata.dataset.levels) ##### ##### pressure_field — synthetic pressure coordinate field @@ -453,7 +467,7 @@ function mean_geopotential_heights(metadata::ERA5PressureMetadata; arch=CPU()) dir=metadata.dir) heights = zeros(length(metadata.dataset.levels)) for geo_datum in geo_metadata - data = retrieve_data(geo_datum) ./ Float32(9.80665) # Φ → Z (m) + data = retrieve_data(geo_datum) ./ Float32(ERA5_gravitational_acceleration) # Φ → Z (m) heights .+= dropdims(mean(data; dims=(1, 2)); dims=(1, 2)) end heights ./= length(geo_metadata) From 8f42deed11a1d6e983843a5f9cc0cca25798ca9e Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Thu, 5 Mar 2026 10:45:43 -0700 Subject: [PATCH 12/53] Allow one or more variables to be downloaded, at one or more datetimes, without explicitly defining metadata --- ext/NumericalEarthCDSAPIExt.jl | 51 ++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 14 deletions(-) diff --git a/ext/NumericalEarthCDSAPIExt.jl b/ext/NumericalEarthCDSAPIExt.jl index 390191bc8..cb13a4af0 100644 --- a/ext/NumericalEarthCDSAPIExt.jl +++ b/ext/NumericalEarthCDSAPIExt.jl @@ -233,10 +233,14 @@ function download_dataset(names::Vector{Symbol}, meta::ERA5PressureMetadatum; sk end """ - download_dataset(names::Vector{Symbol}, dataset::ERA5PressureDataset; - date, bounding_box=nothing, dir=default_download_directory(dataset)) + download_dataset(names::Union{Symbol, Vector{Symbol}}, + dataset::ERA5PressureDataset, + datetime; + bounding_box=nothing, + dir=default_download_directory(dataset)) -Download multiple ERA5 pressure-level variables without requiring a dummy `Metadatum`. +Download one or more ERA5 pressure-level variables at a single datetime, without requiring a dummy +`Metadatum`. # Example ```julia @@ -245,14 +249,19 @@ download_dataset([:temperature, :geopotential_height, :eastward_velocity], ds; date=DateTime(2020, 6, 15, 0), bounding_box=bbox) ``` """ -function download_dataset(names::Vector{Symbol}, dataset::ERA5PressureDataset; - date, +function download_dataset(names::Vector{Symbol}, dataset::ERA5PressureDataset, datetime; bounding_box = nothing, dir = NumericalEarth.DataWrangling.default_download_directory(dataset)) - meta = NumericalEarth.DataWrangling.Metadatum(first(names); dataset, date, bounding_box, dir) + meta = NumericalEarth.DataWrangling.Metadatum(first(names); dataset, datetime, bounding_box, dir) return download_dataset(names, meta) end +function download_dataset(name::Symbol, dataset::ERA5PressureDataset, datetime; + bounding_box = nothing, + dir = NumericalEarth.DataWrangling.default_download_directory(dataset)) + return download_dataset([name], dataset, datetime; bounding_box, dir) +end + """ _split_era5pl_nc(src_path, nc_name_path_pairs) @@ -313,10 +322,14 @@ function _ncvar_copy!(dst, src_var, vname) end """ - download_dataset(names::Vector{Symbol}, dataset::ERA5PressureDataset, dates::AbstractVector; - bounding_box=nothing, dir=default_download_directory(dataset), skip_existing=true) - -Download multiple ERA5 pressure-level variables for a vector of `dates`, issuing one CDS API + download_dataset(names::Union{Symbol, Vector{Symbol}}, + dataset::ERA5PressureDataset, + datetimes::AbstractVector; + bounding_box=nothing, + dir=default_download_directory(dataset), + skip_existing=true) + +Download one or more ERA5 pressure-level variables for a vector of `datetimes`, issuing one CDS API request per unique calendar day (which may contain multiple hours). # Example @@ -326,14 +339,15 @@ dates = DateTime(2020, 6, 15, 0) : Hour(6) : DateTime(2020, 6, 16, 18) download_dataset([:temperature, :geopotential_height], ds, collect(dates); bounding_box=bbox) ``` """ -function download_dataset(names::Vector{Symbol}, dataset::ERA5PressureDataset, - dates::AbstractVector; +function download_dataset(names::Vector{Symbol}, + dataset::ERA5PressureDataset, + datetimes::AbstractVector; bounding_box = nothing, dir = NumericalEarth.DataWrangling.default_download_directory(dataset), skip_existing = true) - grouped = Dict(d => filter(dt -> Dates.Date(dt) == d, dates) - for d in unique(Dates.Date.(dates))) + grouped = Dict(d => filter(dt -> Dates.Date(dt) == d, datetimes) + for d in unique(Dates.Date.(datetimes))) for day in sort(collect(keys(grouped))) _download_era5pl_day(names, dataset, grouped[day]; bounding_box, dir, skip_existing) @@ -342,6 +356,15 @@ function download_dataset(names::Vector{Symbol}, dataset::ERA5PressureDataset, return nothing end +function download_dataset(name::Symbol, + dataset::ERA5PressureDataset, + datetimes::AbstractVector; + bounding_box = nothing, + dir = NumericalEarth.DataWrangling.default_download_directory(dataset), + skip_existing = true) + return download_dataset([name], dataset, datetimes; bounding_box, dir, skip_existing) +end + function _download_era5pl_day(names, dataset, day_dates; bounding_box, dir, skip_existing) From 3cac1f8a79c32101c6bea91d3254f87d5be0d67f Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Thu, 5 Mar 2026 15:39:51 -0700 Subject: [PATCH 13/53] Add cleanup flag for multi-datetime downloads Combined CDSAPI requests will return a single combined netcdf; set cleanup=false to keep the "_tmp_multi__*" files --- ext/NumericalEarthCDSAPIExt.jl | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/ext/NumericalEarthCDSAPIExt.jl b/ext/NumericalEarthCDSAPIExt.jl index cb13a4af0..afbdcd8ac 100644 --- a/ext/NumericalEarthCDSAPIExt.jl +++ b/ext/NumericalEarthCDSAPIExt.jl @@ -344,13 +344,14 @@ function download_dataset(names::Vector{Symbol}, datetimes::AbstractVector; bounding_box = nothing, dir = NumericalEarth.DataWrangling.default_download_directory(dataset), - skip_existing = true) + skip_existing = true, + cleanup = true) grouped = Dict(d => filter(dt -> Dates.Date(dt) == d, datetimes) for d in unique(Dates.Date.(datetimes))) for day in sort(collect(keys(grouped))) - _download_era5pl_day(names, dataset, grouped[day]; bounding_box, dir, skip_existing) + _download_era5pl_day(names, dataset, grouped[day]; bounding_box, dir, skip_existing, cleanup) end return nothing @@ -361,12 +362,13 @@ function download_dataset(name::Symbol, datetimes::AbstractVector; bounding_box = nothing, dir = NumericalEarth.DataWrangling.default_download_directory(dataset), - skip_existing = true) - return download_dataset([name], dataset, datetimes; bounding_box, dir, skip_existing) + skip_existing = true, + cleanup = true) + return download_dataset([name], dataset, datetimes; bounding_box, dir, skip_existing, cleanup) end function _download_era5pl_day(names, dataset, day_dates; - bounding_box, dir, skip_existing) + bounding_box, dir, skip_existing, cleanup) MDatum = NumericalEarth.DataWrangling.Metadatum meta_path = NumericalEarth.DataWrangling.metadata_path @@ -412,7 +414,7 @@ function _download_era5pl_day(names, dataset, day_dates; @root begin CDSAPI.retrieve("reanalysis-era5-pressure-levels", request, tmp_path) _split_era5pl_nc_multistep(tmp_path, nc_triples) - rm(tmp_path; force=true) + cleanup && rm(tmp_path; force=true) end return nothing From c56c6fce435598965ed58011258ad3c41a130fbb Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Mon, 9 Mar 2026 14:17:53 -0700 Subject: [PATCH 14/53] Fix silly Claude vertical-level flip; no inpainting needed for ERA5 --- src/DataWrangling/ERA5/ERA5.jl | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/DataWrangling/ERA5/ERA5.jl b/src/DataWrangling/ERA5/ERA5.jl index 3c1b270fd..d8e464de7 100644 --- a/src/DataWrangling/ERA5/ERA5.jl +++ b/src/DataWrangling/ERA5/ERA5.jl @@ -20,6 +20,7 @@ import NumericalEarth.DataWrangling: all_dates, dataset_variable_name, default_download_directory, + default_inpainting, longitude_interfaces, latitude_interfaces, z_interfaces, @@ -112,8 +113,8 @@ is_three_dimensional(::ERA5Metadata) = false # ERA5 pressure-level data is 3D is_three_dimensional(::ERA5PressureMetadata) = true -# ERA5 stores pressure levels top-to-bottom; reverse to match Oceananigans bottom-to-top -reversed_vertical_axis(::ERA5PressureDataset) = true +# ERA5 stores pressure levels bottom-to-top +reversed_vertical_axis(::ERA5PressureDataset) = false # Variable name mappings from NumericalEarth names to ERA5/CDS API variable names ERA5_dataset_variable_names = Dict( @@ -216,6 +217,8 @@ netcdf_variable_name(md::ERA5PressureMetadata) = ERA5PL_netcdf_variable_names[m conversion_units(md::ERA5PressureMetadatum) = md.name == :geopotential_height ? InverseGravity() : nothing +default_inpainting(md::ERA5PressureMetadatum) = nothing + """ retrieve_data(metadata::ERA5Metadatum) @@ -261,7 +264,7 @@ function retrieve_data(metadata::ERA5PressureMetadatum) ds = NCDatasets.Dataset(path) data = ds[name][:, :, :, 1] # (lon, lat, pressure_level, time=1) close(ds) - return reverse(data, dims=3) # flip from ERA5 top-to-bottom to Oceananigans bottom-to-top + return data end ##### From e552ccbc97c8cb1da59bbe7c261992dbaa2a0b87 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Mon, 9 Mar 2026 15:17:03 -0700 Subject: [PATCH 15/53] Fix handling of ERA5 latitude grid Note: inpainting is now turned OFF for 2D data -- the reanalysis data should be complete and turning on inpainting would artificially fill in data that should be masked (e.g., ocean quantities over land) Tested for 2D (single levels) and 3D (pressure levels) --- src/DataWrangling/ERA5/ERA5.jl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/DataWrangling/ERA5/ERA5.jl b/src/DataWrangling/ERA5/ERA5.jl index d8e464de7..1fa7cb3c2 100644 --- a/src/DataWrangling/ERA5/ERA5.jl +++ b/src/DataWrangling/ERA5/ERA5.jl @@ -217,6 +217,7 @@ netcdf_variable_name(md::ERA5PressureMetadata) = ERA5PL_netcdf_variable_names[m conversion_units(md::ERA5PressureMetadatum) = md.name == :geopotential_height ? InverseGravity() : nothing +default_inpainting(md::ERA5Metadatum) = nothing default_inpainting(md::ERA5PressureMetadatum) = nothing """ @@ -245,6 +246,9 @@ function retrieve_data(metadata::ERA5Metadatum) end close(ds) + + # Latitude is stored from 90°N → 90°S + data_2d = reverse(data_2d, dims=2) # Add singleton z-dimension for 3D field compatibility # Return as (Nx, Ny, 1) @@ -264,7 +268,7 @@ function retrieve_data(metadata::ERA5PressureMetadatum) ds = NCDatasets.Dataset(path) data = ds[name][:, :, :, 1] # (lon, lat, pressure_level, time=1) close(ds) - return data + return reverse(data, dims=2) # Latitude is stored from 90°N → 90°S end ##### From fce1404b85b895246dfd213b178c29c4a7ba0669 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Mon, 9 Mar 2026 15:39:08 -0700 Subject: [PATCH 16/53] Rename original "ERA5*" metadata to "ERA5SingleLevels*" to disambiguate from ERA5PressureLevels* --- examples/ERA5_winds_and_stokes_drift.jl | 4 ++-- src/DataWrangling/DataWrangling.jl | 2 +- src/DataWrangling/ERA5/ERA5.jl | 17 ++++++++++------- test/test_cds_downloading.jl | 10 +++++----- 4 files changed, 18 insertions(+), 15 deletions(-) diff --git a/examples/ERA5_winds_and_stokes_drift.jl b/examples/ERA5_winds_and_stokes_drift.jl index b45be444c..f58ebdb87 100644 --- a/examples/ERA5_winds_and_stokes_drift.jl +++ b/examples/ERA5_winds_and_stokes_drift.jl @@ -16,7 +16,7 @@ using NumericalEarth using NumericalEarth.DataWrangling: Metadatum -using NumericalEarth.DataWrangling.ERA5: ERA5Hourly +using NumericalEarth.DataWrangling.ERA5: ERA5SingleLevelsHourly using CDSAPI using Oceananigans @@ -29,7 +29,7 @@ using Dates # while ocean wave variables (Stokes drift) live on a 0.5° grid (720×361). # We define metadata for each variable at a single date. -dataset = ERA5Hourly() +dataset = ERA5SingleLevelsHourly() date = DateTime(2020, 1, 15, 12) # January 15, 2020 at 12:00 UTC u_stokes_meta = Metadatum(:eastward_stokes_drift; dataset, date) diff --git a/src/DataWrangling/DataWrangling.jl b/src/DataWrangling/DataWrangling.jl index bd1ceb4cd..142eb47b2 100644 --- a/src/DataWrangling/DataWrangling.jl +++ b/src/DataWrangling/DataWrangling.jl @@ -9,7 +9,7 @@ export WOAClimatology, WOAAnnual, WOAMonthly export metadata_time_step, metadata_epoch export LinearlyTaperedPolarMask export DatasetRestoring -export ERA5Hourly, ERA5Monthly, ERA5HourlyPressureLevels, ERA5MonthlyPressureLevels +export ERA5SingleLevelsHourly, ERA5SingleLevelsMonthly, ERA5HourlyPressureLevels, ERA5MonthlyPressureLevels using Oceananigans using Downloads diff --git a/src/DataWrangling/ERA5/ERA5.jl b/src/DataWrangling/ERA5/ERA5.jl index 1fa7cb3c2..5c41e08e4 100644 --- a/src/DataWrangling/ERA5/ERA5.jl +++ b/src/DataWrangling/ERA5/ERA5.jl @@ -1,6 +1,9 @@ module ERA5 -export ERA5Hourly, ERA5Monthly +# 2-D data +export ERA5SingleLevelsHourly, ERA5SingleLevelsMonthly + +# 3-D data export ERA5HourlyPressureLevels, ERA5MonthlyPressureLevels, ERA5_all_pressure_levels, pressure_field using NCDatasets @@ -49,11 +52,11 @@ abstract type ERA5Dataset end default_download_directory(::ERA5Dataset) = download_ERA5_cache -struct ERA5Hourly <: ERA5Dataset end -struct ERA5Monthly <: ERA5Dataset end +struct ERA5SingleLevelsHourly <: ERA5Dataset end +struct ERA5SingleLevelsMonthly <: ERA5Dataset end -dataset_name(::ERA5Hourly) = "ERA5Hourly" -dataset_name(::ERA5Monthly) = "ERA5Monthly" +dataset_name(::ERA5SingleLevelsHourly) = "ERA5SingleLevelsHourly" +dataset_name(::ERA5SingleLevelsMonthly) = "ERA5SingleLevelsMonthly" ##### ##### ERA5 pressure-level datasets @@ -101,8 +104,8 @@ function Base.size(::ERA5Dataset, variable) end # ERA5 reanalysis data available from 1940 to present (we use a practical range here) -all_dates(::ERA5Hourly, var) = range(DateTime("1940-01-01"), stop=DateTime("2024-12-31"), step=Hour(1)) -all_dates(::ERA5Monthly, var) = range(DateTime("1940-01-01"), stop=DateTime("2024-12-01"), step=Month(1)) +all_dates(::ERA5SingleLevelsHourly, var) = range(DateTime("1940-01-01"), stop=DateTime("2024-12-31"), step=Hour(1)) +all_dates(::ERA5SingleLevelsMonthly, var) = range(DateTime("1940-01-01"), stop=DateTime("2024-12-01"), step=Month(1)) const ERA5Metadata{D} = Metadata{<:ERA5Dataset, D} const ERA5Metadatum = Metadatum{<:ERA5Dataset} diff --git a/test/test_cds_downloading.jl b/test/test_cds_downloading.jl index c51958657..63b725b7a 100644 --- a/test/test_cds_downloading.jl +++ b/test/test_cds_downloading.jl @@ -5,7 +5,7 @@ using Dates using NCDatasets using NumericalEarth.DataWrangling.ERA5 -using NumericalEarth.DataWrangling.ERA5: ERA5Hourly, ERA5Monthly, ERA5_dataset_variable_names +using NumericalEarth.DataWrangling.ERA5: ERA5SingleLevelsHourly, ERA5SingleLevelsMonthly, ERA5_dataset_variable_names using NumericalEarth.DataWrangling.ERA5: ERA5HourlyPressureLevels, ERA5MonthlyPressureLevels, ERA5_all_pressure_levels, ERA5PL_dataset_variable_names, pressure_field @@ -17,7 +17,7 @@ start_date = DateTime(2005, 2, 16, 12) @testset "ERA5 data downloading and utilities" begin @info "Testing ERA5 downloading and NetCDF file verification..." - dataset = ERA5Hourly() + dataset = ERA5SingleLevelsHourly() # Use a small bounding box to reduce download time bounding_box = NumericalEarth.DataWrangling.BoundingBox(longitude=(0, 5), latitude=(40, 45)) @@ -88,7 +88,7 @@ start_date = DateTime(2005, 2, 16, 12) # Test metadata properties @test metadatum.name == :temperature - @test metadatum.dataset isa ERA5Hourly + @test metadatum.dataset isa ERA5SingleLevelsHourly @test metadatum.dates == start_date @test metadatum.bounding_box == bounding_box @@ -127,8 +127,8 @@ start_date = DateTime(2005, 2, 16, 12) end @testset "ERA5 Monthly dataset" begin - monthly_dataset = ERA5Monthly() - @test monthly_dataset isa ERA5Monthly + monthly_dataset = ERA5SingleLevelsMonthly() + @test monthly_dataset isa ERA5SingleLevelsMonthly # Test that all_dates returns a valid range dates = NumericalEarth.DataWrangling.all_dates(monthly_dataset, :temperature) From 4643ffa34092286020a7c7a3b7ba6ead152c009b Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Tue, 10 Mar 2026 12:34:49 -0700 Subject: [PATCH 17/53] Fix ERA5 grid creation --- examples/ERA5_winds_and_stokes_drift.jl | 3 ++- src/DataWrangling/ERA5/ERA5.jl | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/ERA5_winds_and_stokes_drift.jl b/examples/ERA5_winds_and_stokes_drift.jl index f58ebdb87..871359b1f 100644 --- a/examples/ERA5_winds_and_stokes_drift.jl +++ b/examples/ERA5_winds_and_stokes_drift.jl @@ -42,10 +42,11 @@ v_wind_meta = Metadatum(:northward_velocity; dataset, date) # We build a single `LatitudeLongitudeGrid` and use `set!` to download # and interpolate all four variables onto it. -grid = LatitudeLongitudeGrid(size = (1440, 721, 1), +grid = LatitudeLongitudeGrid(size = (1440, 720, 1), longitude = (0, 360), latitude = (-90, 90), z = (0, 1)) +@show grid u_stokes = CenterField(grid) v_stokes = CenterField(grid) diff --git a/src/DataWrangling/ERA5/ERA5.jl b/src/DataWrangling/ERA5/ERA5.jl index 5c41e08e4..1fadac59d 100644 --- a/src/DataWrangling/ERA5/ERA5.jl +++ b/src/DataWrangling/ERA5/ERA5.jl @@ -81,7 +81,7 @@ ERA5MonthlyPressureLevels(; levels=ERA5_all_pressure_levels) = ERA5MonthlyPressu dataset_name(::ERA5HourlyPressureLevels) = "ERA5HourlyPressureLevels" dataset_name(::ERA5MonthlyPressureLevels) = "ERA5MonthlyPressureLevels" -Base.size(ds::ERA5PressureDataset, variable) = (1440, 721, length(ds.levels)) +Base.size(ds::ERA5PressureDataset, variable) = (1440, 720, length(ds.levels)) all_dates(::ERA5HourlyPressureLevels, var) = range(DateTime("1940-01-01"), stop=DateTime("2024-12-31"), step=Hour(1)) all_dates(::ERA5MonthlyPressureLevels, var) = range(DateTime("1940-01-01"), stop=DateTime("2024-12-01"), step=Month(1)) @@ -377,8 +377,8 @@ inpainted_metadata_path(metadata::ERA5Metadata) = joinpath(metadata.dir, inpaint location(::ERA5Metadata) = (Center, Center, Center) -# ERA5 global coverage: 0-360 longitude, -90 to 90 latitude at 0.25 degree resolution -longitude_interfaces(::ERA5Metadata) = (0, 360) +# ERA5 global coverage: 0-359.75 longitude, -90 to 90 latitude at 0.25 degree resolution +longitude_interfaces(::ERA5Metadata) = (-0.125, 359.875) latitude_interfaces(::ERA5Metadata) = (-90, 90) # ERA5 single-levels (2-D) data product From 9089c991c47777bb0a21adc890c0f7653f329b79 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Tue, 10 Mar 2026 14:24:38 -0700 Subject: [PATCH 18/53] Fix bbox handling Clipped field will match downloaded bounding box, tested with ERA5 --- src/DataWrangling/metadata_field.jl | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/DataWrangling/metadata_field.jl b/src/DataWrangling/metadata_field.jl index 061684dc9..758728452 100644 --- a/src/DataWrangling/metadata_field.jl +++ b/src/DataWrangling/metadata_field.jl @@ -8,11 +8,16 @@ restrict(::Nothing, interfaces, N) = interfaces, N # TODO support stretched native grids function restrict(bbox_interfaces, interfaces, N) - Δ = interfaces[2] - interfaces[1] - rΔ = bbox_interfaces[2] - bbox_interfaces[1] - ϵ = rΔ / Δ + LΔ = interfaces[2] - interfaces[1] + Δ = LΔ / N + grid_interfaces = (bbox_interfaces[1] - Δ/2, + bbox_interfaces[2] + Δ/2) + + rΔ = grid_interfaces[2] - grid_interfaces[1] + ϵ = rΔ / LΔ rN = ceil(Int, ϵ * N) # Round up to ensure bounding box is covered - return bbox_interfaces, rN + + return grid_interfaces, rN end """ @@ -188,9 +193,12 @@ function set_metadata_field!(field, data, metadatum) arch = architecture(grid) Nx, Ny, Nz = size(metadatum) + mangling = if size(data, 2) == Ny-1 + @info "Shifting field southward" ShiftSouth() elseif size(data, 2) == Ny+1 + @info "Averaging field in north-south dir" AverageNorthSouth() else nothing From cf2b8e0f7e2aa14ea9a9584afe63bd2ab6027cfd Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Tue, 10 Mar 2026 15:36:51 -0700 Subject: [PATCH 19/53] Use standard units for pressure_levels --- ext/NumericalEarthCDSAPIExt.jl | 12 +++++++++--- src/DataWrangling/ERA5/ERA5.jl | 8 +++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/ext/NumericalEarthCDSAPIExt.jl b/ext/NumericalEarthCDSAPIExt.jl index afbdcd8ac..bc8cddcdc 100644 --- a/ext/NumericalEarthCDSAPIExt.jl +++ b/ext/NumericalEarthCDSAPIExt.jl @@ -128,10 +128,12 @@ function download_dataset(meta::ERA5PressureMetadatum; skip_existing=true) day = lpad(string(Dates.day(date)), 2, '0') hour = lpad(string(Dates.hour(date)), 2, '0') * ":00" + p_hPa = [round(Int, p * 1e-2) for p in meta.dataset.levels] + request = Dict( "product_type" => ["reanalysis"], "variable" => [variable_name], - "pressure_level" => [string(p) for p in meta.dataset.levels], + "pressure_level" => [string(p) for p in p_hPa], "year" => [year], "month" => [month], "day" => [day], @@ -203,10 +205,12 @@ function download_dataset(names::Vector{Symbol}, meta::ERA5PressureMetadatum; sk day = lpad(string(Dates.day(date)), 2, '0') hour = lpad(string(Dates.hour(date)), 2, '0') * ":00" + p_hPa = [round(Int, p * 1e-2) for p in meta.dataset.levels] + request = Dict( "product_type" => ["reanalysis"], "variable" => cds_vars, - "pressure_level" => [string(p) for p in meta.dataset.levels], + "pressure_level" => [string(p) for p in p_hPa], "year" => [year], "month" => [month], "day" => [day], @@ -391,10 +395,12 @@ function _download_era5pl_day(names, dataset, day_dates; month = lpad(string(Dates.month(dt)), 2, '0') day = lpad(string(Dates.day(dt)), 2, '0') + p_hPa = [round(Int, p * 1e-2) for p in dataset.levels] + request = Dict( "product_type" => ["reanalysis"], "variable" => cds_vars, - "pressure_level" => [string(p) for p in dataset.levels], + "pressure_level" => [string(p) for p in p_hPa], "year" => [year], "month" => [month], "day" => [day], diff --git a/src/DataWrangling/ERA5/ERA5.jl b/src/DataWrangling/ERA5/ERA5.jl index 1fadac59d..5fe770263 100644 --- a/src/DataWrangling/ERA5/ERA5.jl +++ b/src/DataWrangling/ERA5/ERA5.jl @@ -4,7 +4,8 @@ module ERA5 export ERA5SingleLevelsHourly, ERA5SingleLevelsMonthly # 3-D data -export ERA5HourlyPressureLevels, ERA5MonthlyPressureLevels, ERA5_all_pressure_levels, pressure_field +export ERA5HourlyPressureLevels, ERA5MonthlyPressureLevels, ERA5_all_pressure_levels, pressure_field, hPa +export standard_atmosphere_z_interfaces, mean_geopotential_z_interfaces using NCDatasets using Printf @@ -62,9 +63,10 @@ dataset_name(::ERA5SingleLevelsMonthly) = "ERA5SingleLevelsMonthly" ##### ERA5 pressure-level datasets ##### +const hPa = 100.0 const ERA5_all_pressure_levels = [1, 2, 3, 5, 7, 10, 20, 30, 50, 70, 100, 125, 150, 175, 200, 225, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 775, 800, - 825, 850, 875, 900, 925, 950, 975, 1000] + 825, 850, 875, 900, 925, 950, 975, 1000]*hPa abstract type ERA5PressureDataset <: ERA5Dataset end @@ -397,7 +399,7 @@ const ERA5_gravitational_acceleration = 9.80665 function standard_atmosphere_geopotential_height(p) g = ERA5_gravitational_acceleration T⁰ = 288.15 # K - p⁰ = 1013.25 # hPa + p⁰ = 1013.25 * hPa Rᵈ = 287.0528 # J/(kg-K) return (Rᵈ * T⁰ / g) * log(p⁰ / p) From 4139a6e25e4aacf6ec3afa32c8e5e1da9ebdc7b2 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Tue, 10 Mar 2026 16:01:28 -0700 Subject: [PATCH 20/53] No special treatment of lower z_interface, allow z < 0 --- src/DataWrangling/ERA5/ERA5.jl | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/src/DataWrangling/ERA5/ERA5.jl b/src/DataWrangling/ERA5/ERA5.jl index 5fe770263..1a49bf952 100644 --- a/src/DataWrangling/ERA5/ERA5.jl +++ b/src/DataWrangling/ERA5/ERA5.jl @@ -419,16 +419,15 @@ function standard_atmosphere_z_interfaces(levels) interfaces = Vector{Float64}(undef, Nz + 1) if Nz == 1 - interfaces[1] = max(0, heights[1] - 500) - interfaces[2] = heights[1] + 500 - return interfaces - end - - interfaces[1] = max(0, heights[1] - (heights[2] - heights[1]) / 2) - for k in 2:Nz - interfaces[k] = (heights[k-1] + heights[k]) / 2 + interfaces[1] = heights[1] - 0.5 + interfaces[2] = heights[1] + 0.5 + else + interfaces[1] = heights[1] - (heights[2] - heights[1]) / 2 + for k in 2:Nz + interfaces[k] = (heights[k-1] + heights[k]) / 2 + end + interfaces[Nz+1] = heights[Nz] + (heights[Nz] - heights[Nz-1]) / 2 end - interfaces[Nz+1] = heights[Nz] + (heights[Nz] - heights[Nz-1]) / 2 return interfaces end From c45200a9153ddfea4d6a6e5fd88992165eb7ad17 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Tue, 10 Mar 2026 18:31:53 -0700 Subject: [PATCH 21/53] Fix defaults --- src/DataWrangling/ERA5/ERA5.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/DataWrangling/ERA5/ERA5.jl b/src/DataWrangling/ERA5/ERA5.jl index 1a49bf952..f2e97d32d 100644 --- a/src/DataWrangling/ERA5/ERA5.jl +++ b/src/DataWrangling/ERA5/ERA5.jl @@ -219,11 +219,11 @@ available_variables(::ERA5PressureDataset) = ERA5PL_dataset_variable_names dataset_variable_name(md::ERA5PressureMetadata) = ERA5PL_dataset_variable_names[md.name] netcdf_variable_name(md::ERA5PressureMetadata) = ERA5PL_netcdf_variable_names[md.name] -conversion_units(md::ERA5PressureMetadatum) = +conversion_units(md::ERA5PressureMetadata) = md.name == :geopotential_height ? InverseGravity() : nothing -default_inpainting(md::ERA5Metadatum) = nothing -default_inpainting(md::ERA5PressureMetadatum) = nothing +default_inpainting(md::ERA5Metadata) = nothing +default_inpainting(md::ERA5PressureMetadata) = nothing """ retrieve_data(metadata::ERA5Metadatum) From 18cba4c3951c13ee4fb93244d13bbb4fd7bf9b38 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Tue, 10 Mar 2026 18:50:25 -0700 Subject: [PATCH 22/53] Implement mean_geopotential_z_interfaces; make mutable to allow z update rename levels -> pressure_levels for clarity --- ext/NumericalEarthCDSAPIExt.jl | 10 +++--- src/DataWrangling/ERA5/ERA5.jl | 62 +++++++++++++++++++++++----------- test/test_cds_downloading.jl | 6 ++-- 3 files changed, 51 insertions(+), 27 deletions(-) diff --git a/ext/NumericalEarthCDSAPIExt.jl b/ext/NumericalEarthCDSAPIExt.jl index bc8cddcdc..59e9bdbe9 100644 --- a/ext/NumericalEarthCDSAPIExt.jl +++ b/ext/NumericalEarthCDSAPIExt.jl @@ -128,7 +128,7 @@ function download_dataset(meta::ERA5PressureMetadatum; skip_existing=true) day = lpad(string(Dates.day(date)), 2, '0') hour = lpad(string(Dates.hour(date)), 2, '0') * ":00" - p_hPa = [round(Int, p * 1e-2) for p in meta.dataset.levels] + p_hPa = [round(Int, p * 1e-2) for p in meta.dataset.pressure_levels] request = Dict( "product_type" => ["reanalysis"], @@ -205,7 +205,7 @@ function download_dataset(names::Vector{Symbol}, meta::ERA5PressureMetadatum; sk day = lpad(string(Dates.day(date)), 2, '0') hour = lpad(string(Dates.hour(date)), 2, '0') * ":00" - p_hPa = [round(Int, p * 1e-2) for p in meta.dataset.levels] + p_hPa = [round(Int, p * 1e-2) for p in meta.dataset.pressure_levels] request = Dict( "product_type" => ["reanalysis"], @@ -248,7 +248,7 @@ Download one or more ERA5 pressure-level variables at a single datetime, without # Example ```julia -ds = ERA5HourlyPressureLevels(levels=[850, 500]) +ds = ERA5HourlyPressureLevels(pressure_levels=[850, 500] .* hPa) download_dataset([:temperature, :geopotential_height, :eastward_velocity], ds; date=DateTime(2020, 6, 15, 0), bounding_box=bbox) ``` @@ -338,7 +338,7 @@ request per unique calendar day (which may contain multiple hours). # Example ```julia -ds = ERA5HourlyPressureLevels(levels=[850, 500]) +ds = ERA5HourlyPressureLevels(pressure_levels=[850, 500] .* hPa) dates = DateTime(2020, 6, 15, 0) : Hour(6) : DateTime(2020, 6, 16, 18) download_dataset([:temperature, :geopotential_height], ds, collect(dates); bounding_box=bbox) ``` @@ -395,7 +395,7 @@ function _download_era5pl_day(names, dataset, day_dates; month = lpad(string(Dates.month(dt)), 2, '0') day = lpad(string(Dates.day(dt)), 2, '0') - p_hPa = [round(Int, p * 1e-2) for p in dataset.levels] + p_hPa = [round(Int, p * 1e-2) for p in dataset.pressure_levels] request = Dict( "product_type" => ["reanalysis"], diff --git a/src/DataWrangling/ERA5/ERA5.jl b/src/DataWrangling/ERA5/ERA5.jl index f2e97d32d..b02dad505 100644 --- a/src/DataWrangling/ERA5/ERA5.jl +++ b/src/DataWrangling/ERA5/ERA5.jl @@ -70,20 +70,26 @@ const ERA5_all_pressure_levels = [1, 2, 3, 5, 7, 10, 20, 30, 50, 70, 100, 125, 1 abstract type ERA5PressureDataset <: ERA5Dataset end -struct ERA5HourlyPressureLevels <: ERA5PressureDataset - levels :: Vector{Int} +mutable struct ERA5HourlyPressureLevels <: ERA5PressureDataset + pressure_levels :: Vector{Float64} + z :: Union{Nothing, Vector{Float64}} + ERA5HourlyPressureLevels(pressure_levels, z=nothing) = new(sort(pressure_levels, rev=true), z) end -ERA5HourlyPressureLevels(; levels=ERA5_all_pressure_levels) = ERA5HourlyPressureLevels(levels) +ERA5HourlyPressureLevels(; pressure_levels=ERA5_all_pressure_levels, z=nothing) = + ERA5HourlyPressureLevels(pressure_levels, z) -struct ERA5MonthlyPressureLevels <: ERA5PressureDataset - levels :: Vector{Int} +mutable struct ERA5MonthlyPressureLevels <: ERA5PressureDataset + pressure_levels :: Vector{Float64} + z :: Union{Nothing, Vector{Float64}} + ERA5MonthlyPressureLevels(pressure_levels, z=nothing) = new(sort(pressure_levels, rev=true), z) end -ERA5MonthlyPressureLevels(; levels=ERA5_all_pressure_levels) = ERA5MonthlyPressureLevels(levels) +ERA5MonthlyPressureLevels(; pressure_levels=ERA5_all_pressure_levels, z=nothing) = + ERA5MonthlyPressureLevels(pressure_levels, z) dataset_name(::ERA5HourlyPressureLevels) = "ERA5HourlyPressureLevels" dataset_name(::ERA5MonthlyPressureLevels) = "ERA5MonthlyPressureLevels" -Base.size(ds::ERA5PressureDataset, variable) = (1440, 720, length(ds.levels)) +Base.size(ds::ERA5PressureDataset, variable) = (1440, 720, length(ds.pressure_levels)) all_dates(::ERA5HourlyPressureLevels, var) = range(DateTime("1940-01-01"), stop=DateTime("2024-12-31"), step=Hour(1)) all_dates(::ERA5MonthlyPressureLevels, var) = range(DateTime("1940-01-01"), stop=DateTime("2024-12-01"), step=Month(1)) @@ -408,10 +414,9 @@ end # Build z-interfaces (Nz+1 values) from pressure levels. # Levels may be in any order; output is sorted so k=1 is highest pressure (lowest altitude). function standard_atmosphere_z_interfaces(levels) - @info """ - Calculating z-interfaces based on International Standard Atmosphere... - For greater accuracy, use `mean_geopotential_heights`! - """ + @info "Calculating z-interfaces based on the International Standard Atmosphere... \ + For greater accuracy, download the :geopotential field and \ + set `dataset.z = mean_geopotential_z_interfaces(metadata)`" sorted_levels = sort(levels, rev=true) # highest pressure first → k=1 is bottom heights = standard_atmosphere_geopotential_height.(Float64.(sorted_levels)) Nz = length(heights) @@ -433,7 +438,10 @@ function standard_atmosphere_z_interfaces(levels) end # ERA5 pressure-levels (3-D) data product -z_interfaces(metadata::ERA5PressureMetadata) = standard_atmosphere_z_interfaces(metadata.dataset.levels) +function z_interfaces(metadata::ERA5PressureMetadata) + z = metadata.dataset.z + return isnothing(z) ? standard_atmosphere_z_interfaces(metadata.dataset.pressure_levels) : z +end ##### ##### pressure_field — synthetic pressure coordinate field @@ -449,7 +457,7 @@ highest pressure level). function pressure_field(metadata::ERA5PressureMetadatum, arch=CPU(); halo=(3,3,3)) grid = native_grid(metadata, arch; halo) field = CenterField(grid) - reversed_levels = sort(metadata.dataset.levels, rev=true) # highest pressure → k=1 + reversed_levels = sort(metadata.dataset.pressure_levels, rev=true) # highest pressure → k=1 for (k, p) in enumerate(reversed_levels) interior(field)[:, :, k] .= Float32(p) end @@ -462,27 +470,43 @@ end ##### """ - mean_geopotential_heights(metadata::ERA5PressureMetadata; arch=CPU()) + mean_geopotential_heights(metadata::ERA5PressureMetadata; interfaces=false) Compute spatially and temporally averaged geopotential heights (m) for each -pressure level in `metadata`. This provides more accurate z-coordinates than -the standard-atmosphere fallback used by `z_interfaces`. +pressure level in `metadata`. Downloads the `:geopotential` field for every date in `metadata`, divides by g, averages over the horizontal domain and all dates, and returns one representative height per pressure level in bottom-to-top order (k=1 is highest pressure). """ -function mean_geopotential_heights(metadata::ERA5PressureMetadata; arch=CPU()) +function mean_geopotential_heights(metadata::ERA5PressureMetadata) geo_metadata = Metadata(:geopotential; dataset=metadata.dataset, dates=metadata.dates, bounding_box=metadata.bounding_box, dir=metadata.dir) - heights = zeros(length(metadata.dataset.levels)) + heights = zeros(length(metadata.dataset.pressure_levels)) + # average over time for geo_datum in geo_metadata data = retrieve_data(geo_datum) ./ Float32(ERA5_gravitational_acceleration) # Φ → Z (m) + # average over horizontal dims heights .+= dropdims(mean(data; dims=(1, 2)); dims=(1, 2)) end heights ./= length(geo_metadata) - return heights + + return sort(heights) +end + +function mean_geopotential_z_interfaces(metadata::ERA5PressureMetadata) + return stagger(mean_geopotential_heights(metadata)) +end + +function stagger(zc::AbstractVector) + # heights are ascending (k=1 = highest pressure = lowest altitude, + # consistent with retrieve_data's reverse() and Oceananigans bottom-to-top + # convention); bottom and top interfaces are extrapolated + zf = (zc[1:end-1] .+ zc[2:end]) / 2 # Nz-1 interior interfaces + pushfirst!(zf, zc[1] - (zf[1] - zc[1])) # bottom interface + push!(zf, zc[end] + (zc[end] - zf[end])) # top interface + return zf end end # module ERA5 diff --git a/test/test_cds_downloading.jl b/test/test_cds_downloading.jl index 63b725b7a..304d17079 100644 --- a/test/test_cds_downloading.jl +++ b/test/test_cds_downloading.jl @@ -140,11 +140,11 @@ start_date = DateTime(2005, 2, 16, 12) # Default constructor uses all 37 standard levels ds_full = ERA5HourlyPressureLevels() @test ds_full isa ERA5HourlyPressureLevels - @test length(ds_full.levels) == 37 + @test length(ds_full.pressure_levels) == 37 @test Base.size(ds_full, :temperature) == (1440, 721, 37) # Subset constructor - ds_sub = ERA5HourlyPressureLevels(levels=[850, 500]) + ds_sub = ERA5HourlyPressureLevels(pressure_levels=[850, 500] .* hPa) @test Base.size(ds_sub, :temperature) == (1440, 721, 2) # Monthly variant @@ -240,7 +240,7 @@ start_date = DateTime(2005, 2, 16, 12) @testset "ERA5 pressure-level download and Field on CPU" begin arch = CPU() - ds_pl = ERA5HourlyPressureLevels(levels=[850, 500]) + ds_pl = ERA5HourlyPressureLevels(pressure_levels=[850, 500] .* hPa) @testset "Download and 3D Field" begin meta = Metadatum(:temperature; dataset=ds_pl, bounding_box, date=start_date) From ae0f0b17e8baeef2b7cd277e44bc2ed6fa5711cd Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Tue, 10 Mar 2026 23:28:08 -0700 Subject: [PATCH 23/53] Organize ERA5 code into single-level and pressure-levels source files --- src/DataWrangling/ERA5/ERA5.jl | 410 +----------------- .../ERA5/ERA5_pressure_levels.jl | 257 +++++++++++ src/DataWrangling/ERA5/ERA5_single_levels.jl | 150 +++++++ 3 files changed, 420 insertions(+), 397 deletions(-) create mode 100644 src/DataWrangling/ERA5/ERA5_pressure_levels.jl create mode 100644 src/DataWrangling/ERA5/ERA5_single_levels.jl diff --git a/src/DataWrangling/ERA5/ERA5.jl b/src/DataWrangling/ERA5/ERA5.jl index b02dad505..f790a7ced 100644 --- a/src/DataWrangling/ERA5/ERA5.jl +++ b/src/DataWrangling/ERA5/ERA5.jl @@ -53,237 +53,27 @@ abstract type ERA5Dataset end default_download_directory(::ERA5Dataset) = download_ERA5_cache -struct ERA5SingleLevelsHourly <: ERA5Dataset end -struct ERA5SingleLevelsMonthly <: ERA5Dataset end - -dataset_name(::ERA5SingleLevelsHourly) = "ERA5SingleLevelsHourly" -dataset_name(::ERA5SingleLevelsMonthly) = "ERA5SingleLevelsMonthly" - -##### -##### ERA5 pressure-level datasets -##### - -const hPa = 100.0 -const ERA5_all_pressure_levels = [1, 2, 3, 5, 7, 10, 20, 30, 50, 70, 100, 125, 150, - 175, 200, 225, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 775, 800, - 825, 850, 875, 900, 925, 950, 975, 1000]*hPa - -abstract type ERA5PressureDataset <: ERA5Dataset end - -mutable struct ERA5HourlyPressureLevels <: ERA5PressureDataset - pressure_levels :: Vector{Float64} - z :: Union{Nothing, Vector{Float64}} - ERA5HourlyPressureLevels(pressure_levels, z=nothing) = new(sort(pressure_levels, rev=true), z) -end -ERA5HourlyPressureLevels(; pressure_levels=ERA5_all_pressure_levels, z=nothing) = - ERA5HourlyPressureLevels(pressure_levels, z) - -mutable struct ERA5MonthlyPressureLevels <: ERA5PressureDataset - pressure_levels :: Vector{Float64} - z :: Union{Nothing, Vector{Float64}} - ERA5MonthlyPressureLevels(pressure_levels, z=nothing) = new(sort(pressure_levels, rev=true), z) -end -ERA5MonthlyPressureLevels(; pressure_levels=ERA5_all_pressure_levels, z=nothing) = - ERA5MonthlyPressureLevels(pressure_levels, z) - -dataset_name(::ERA5HourlyPressureLevels) = "ERA5HourlyPressureLevels" -dataset_name(::ERA5MonthlyPressureLevels) = "ERA5MonthlyPressureLevels" - -Base.size(ds::ERA5PressureDataset, variable) = (1440, 720, length(ds.pressure_levels)) - -all_dates(::ERA5HourlyPressureLevels, var) = range(DateTime("1940-01-01"), stop=DateTime("2024-12-31"), step=Hour(1)) -all_dates(::ERA5MonthlyPressureLevels, var) = range(DateTime("1940-01-01"), stop=DateTime("2024-12-01"), step=Month(1)) - -const ERA5PressureMetadata{D} = Metadata{<:ERA5PressureDataset, D} -const ERA5PressureMetadatum = Metadatum{<:ERA5PressureDataset} - -# Wave variables are on a 0.5° grid (720×361), atmospheric variables on 0.25° (1440×721) -const ERA5_wave_variables = Set([ - :eastward_stokes_drift, :northward_stokes_drift, - :significant_wave_height, :mean_wave_period, :mean_wave_direction, -]) - -function Base.size(::ERA5Dataset, variable) - if variable in ERA5_wave_variables - return (720, 361, 1) - else - return (1440, 721, 1) - end -end - -# ERA5 reanalysis data available from 1940 to present (we use a practical range here) -all_dates(::ERA5SingleLevelsHourly, var) = range(DateTime("1940-01-01"), stop=DateTime("2024-12-31"), step=Hour(1)) -all_dates(::ERA5SingleLevelsMonthly, var) = range(DateTime("1940-01-01"), stop=DateTime("2024-12-01"), step=Month(1)) - const ERA5Metadata{D} = Metadata{<:ERA5Dataset, D} const ERA5Metadatum = Metadatum{<:ERA5Dataset} -# ERA5 is a spatially 2D dataset (atmospheric surface variables) -is_three_dimensional(::ERA5Metadata) = false - -# ERA5 pressure-level data is 3D -is_three_dimensional(::ERA5PressureMetadata) = true - -# ERA5 stores pressure levels bottom-to-top -reversed_vertical_axis(::ERA5PressureDataset) = false - -# Variable name mappings from NumericalEarth names to ERA5/CDS API variable names -ERA5_dataset_variable_names = Dict( - :temperature => "2m_temperature", - :dewpoint_temperature => "2m_dewpoint_temperature", - :eastward_velocity => "10m_u_component_of_wind", - :northward_velocity => "10m_v_component_of_wind", - :surface_pressure => "surface_pressure", - :mean_sea_level_pressure => "mean_sea_level_pressure", - :total_precipitation => "total_precipitation", - :sea_surface_temperature => "sea_surface_temperature", - :downwelling_shortwave_radiation => "surface_solar_radiation_downwards", - :downwelling_longwave_radiation => "surface_thermal_radiation_downwards", - :total_cloud_cover => "total_cloud_cover", - :evaporation => "evaporation", - :specific_humidity => "specific_humidity", - :eastward_stokes_drift => "u_component_stokes_drift", - :northward_stokes_drift => "v_component_stokes_drift", - :significant_wave_height => "significant_height_of_combined_wind_waves_and_swell", - :mean_wave_period => "mean_wave_period", - :mean_wave_direction => "mean_wave_direction", -) - -# Variables available for download -ERA5_variable_names = keys(ERA5_dataset_variable_names) - -available_variables(::ERA5Dataset) = ERA5_dataset_variable_names - -dataset_variable_name(metadata::ERA5Metadata) = ERA5_dataset_variable_names[metadata.name] - -# NetCDF short variable names (what's actually in the downloaded files) -# These differ from the CDS API variable names above -ERA5_netcdf_variable_names = Dict( - :temperature => "t2m", - :dewpoint_temperature => "d2m", - :eastward_velocity => "u10", - :northward_velocity => "v10", - :surface_pressure => "sp", - :mean_sea_level_pressure => "msl", - :total_precipitation => "tp", - :sea_surface_temperature => "sst", - :downwelling_shortwave_radiation => "ssrd", - :downwelling_longwave_radiation => "strd", - :total_cloud_cover => "tcc", - :evaporation => "e", - :specific_humidity => "q", - :eastward_stokes_drift => "ust", - :northward_stokes_drift => "vst", - :significant_wave_height => "swh", - :mean_wave_period => "mwp", - :mean_wave_direction => "mwd", -) - -netcdf_variable_name(metadata::ERA5Metadata) = ERA5_netcdf_variable_names[metadata.name] - ##### -##### ERA5 pressure-level variable name mappings +##### Grid interfaces ##### -ERA5PL_dataset_variable_names = Dict( - :temperature => "temperature", - :eastward_velocity => "u_component_of_wind", - :northward_velocity => "v_component_of_wind", - :vertical_velocity => "vertical_velocity", - :geopotential => "geopotential", - :geopotential_height => "geopotential", - :specific_humidity => "specific_humidity", - :relative_humidity => "relative_humidity", - :vorticity => "vorticity", - :divergence => "divergence", - :potential_vorticity => "potential_vorticity", - :ozone_mass_mixing_ratio => "ozone_mass_mixing_ratio", - :fraction_of_cloud_cover => "fraction_of_cloud_cover", - :specific_cloud_liquid_water_content => "specific_cloud_liquid_water_content", - :specific_cloud_ice_water_content => "specific_cloud_ice_water_content", -) - -ERA5PL_netcdf_variable_names = Dict( - :temperature => "t", - :eastward_velocity => "u", - :northward_velocity => "v", - :vertical_velocity => "w", - :geopotential => "z", - :geopotential_height => "z", - :specific_humidity => "q", - :relative_humidity => "r", - :vorticity => "vo", - :divergence => "d", - :potential_vorticity => "pv", - :ozone_mass_mixing_ratio => "o3", - :fraction_of_cloud_cover => "cc", - :specific_cloud_liquid_water_content => "clwc", - :specific_cloud_ice_water_content => "ciwc", -) - -available_variables(::ERA5PressureDataset) = ERA5PL_dataset_variable_names -dataset_variable_name(md::ERA5PressureMetadata) = ERA5PL_dataset_variable_names[md.name] -netcdf_variable_name(md::ERA5PressureMetadata) = ERA5PL_netcdf_variable_names[md.name] - -conversion_units(md::ERA5PressureMetadata) = - md.name == :geopotential_height ? InverseGravity() : nothing - -default_inpainting(md::ERA5Metadata) = nothing -default_inpainting(md::ERA5PressureMetadata) = nothing - -""" - retrieve_data(metadata::ERA5Metadatum) - -Retrieve ERA5 data from NetCDF file according to `metadata`. -ERA5 is 2D surface data, so we return a 2D array with an added singleton z-dimension. -""" -function retrieve_data(metadata::ERA5Metadatum) - path = metadata_path(metadata) - name = netcdf_variable_name(metadata) - - ds = NCDatasets.Dataset(path) - - # ERA5 is 2D + time, we take the first time step - # Data shape is typically (lon, lat) or (lon, lat, time) - raw_data = ds[name] - ndim = ndims(raw_data) - - if ndim == 2 - data_2d = raw_data[:, :] - elseif ndim == 3 - data_2d = raw_data[:, :, 1] - else - error("Unexpected ERA5 data dimensions: $ndim") - end - - close(ds) +location(::ERA5Metadata) = (Center, Center, Center) - # Latitude is stored from 90°N → 90°S - data_2d = reverse(data_2d, dims=2) - - # Add singleton z-dimension for 3D field compatibility - # Return as (Nx, Ny, 1) - return reshape(data_2d, size(data_2d, 1), size(data_2d, 2), 1) -end +# ERA5 global coverage: 0-359.75 longitude, -90 to 90 latitude at 0.25 degree resolution +longitude_interfaces(::ERA5Metadata) = (-0.125, 359.875) +latitude_interfaces(::ERA5Metadata) = (-90, 90) -""" - retrieve_data(metadata::ERA5PressureMetadatum) +# ERA5 single-levels (2-D) data product +z_interfaces(::ERA5Metadata) = (0, 1) -Retrieve ERA5 pressure-level data from a NetCDF file. -Returns a 3D array (lon, lat, level) with levels ordered bottom-to-top -(highest pressure at k=1, lowest pressure at k=Nz). -""" -function retrieve_data(metadata::ERA5PressureMetadatum) - path = metadata_path(metadata) - name = netcdf_variable_name(metadata) - ds = NCDatasets.Dataset(path) - data = ds[name][:, :, :, 1] # (lon, lat, pressure_level, time=1) - close(ds) - return reverse(data, dims=2) # Latitude is stored from 90°N → 90°S -end +# ERA5 data is stored as Float32 +eltype(::ERA5Metadata) = Float32 ##### -##### Metadata filename construction +##### Shared filename utilities ##### function date_str(date::DateTime) @@ -312,56 +102,6 @@ function bbox_strs(c) return first, second end -function metadata_prefix(metadata::ERA5PressureMetadata) - var = ERA5PL_dataset_variable_names[metadata.name] - dataset = dataset_name(metadata.dataset) - start_date = start_date_str(metadata.dates) - end_date = end_date_str(metadata.dates) - bbox = metadata.bounding_box - - if !isnothing(bbox) - w, e = bbox_strs(bbox.longitude) - s, n = bbox_strs(bbox.latitude) - suffix = string(w, e, s, n) - else - suffix = "" - end - - if start_date == end_date - prefix = string(var, "_", dataset, "_", start_date, suffix) - else - prefix = string(var, "_", dataset, "_", start_date, "_", end_date, suffix) - end - prefix = colon2dash(prefix) - prefix = underscore_spaces(prefix) - return prefix -end - -function metadata_prefix(metadata::ERA5Metadata) - var = ERA5_dataset_variable_names[metadata.name] - dataset = dataset_name(metadata.dataset) - start_date = start_date_str(metadata.dates) - end_date = end_date_str(metadata.dates) - bbox = metadata.bounding_box - - if !isnothing(bbox) - w, e = bbox_strs(bbox.longitude) - s, n = bbox_strs(bbox.latitude) - suffix = string(w, e, s, n) - else - suffix = "" - end - - if start_date == end_date - prefix = string(var, "_", dataset, "_", start_date, suffix) - else - prefix = string(var, "_", dataset, "_", start_date, "_", end_date, suffix) - end - prefix = colon2dash(prefix) - prefix = underscore_spaces(prefix) - return prefix -end - function metadata_filename(metadata::ERA5Metadatum) prefix = metadata_prefix(metadata) return string(prefix, ".nc") @@ -380,134 +120,10 @@ end inpainted_metadata_path(metadata::ERA5Metadata) = joinpath(metadata.dir, inpainted_metadata_filename(metadata)) ##### -##### Grid interfaces +##### Single-level and pressure-level specifics ##### -location(::ERA5Metadata) = (Center, Center, Center) - -# ERA5 global coverage: 0-359.75 longitude, -90 to 90 latitude at 0.25 degree resolution -longitude_interfaces(::ERA5Metadata) = (-0.125, 359.875) -latitude_interfaces(::ERA5Metadata) = (-90, 90) - -# ERA5 single-levels (2-D) data product -z_interfaces(::ERA5Metadata) = (0, 1) - -# ERA5 data is stored as Float32 -eltype(::ERA5Metadata) = Float32 - -##### -##### Pressure-level vertical coordinate -##### - -const ERA5_gravitational_acceleration = 9.80665 - -# International Standard Atmosphere height (m) for a given pressure (hPa) -function standard_atmosphere_geopotential_height(p) - g = ERA5_gravitational_acceleration - T⁰ = 288.15 # K - p⁰ = 1013.25 * hPa - Rᵈ = 287.0528 # J/(kg-K) - - return (Rᵈ * T⁰ / g) * log(p⁰ / p) -end - -# Build z-interfaces (Nz+1 values) from pressure levels. -# Levels may be in any order; output is sorted so k=1 is highest pressure (lowest altitude). -function standard_atmosphere_z_interfaces(levels) - @info "Calculating z-interfaces based on the International Standard Atmosphere... \ - For greater accuracy, download the :geopotential field and \ - set `dataset.z = mean_geopotential_z_interfaces(metadata)`" - sorted_levels = sort(levels, rev=true) # highest pressure first → k=1 is bottom - heights = standard_atmosphere_geopotential_height.(Float64.(sorted_levels)) - Nz = length(heights) - - interfaces = Vector{Float64}(undef, Nz + 1) - - if Nz == 1 - interfaces[1] = heights[1] - 0.5 - interfaces[2] = heights[1] + 0.5 - else - interfaces[1] = heights[1] - (heights[2] - heights[1]) / 2 - for k in 2:Nz - interfaces[k] = (heights[k-1] + heights[k]) / 2 - end - interfaces[Nz+1] = heights[Nz] + (heights[Nz] - heights[Nz-1]) / 2 - end - - return interfaces -end - -# ERA5 pressure-levels (3-D) data product -function z_interfaces(metadata::ERA5PressureMetadata) - z = metadata.dataset.z - return isnothing(z) ? standard_atmosphere_z_interfaces(metadata.dataset.pressure_levels) : z -end - -##### -##### pressure_field — synthetic pressure coordinate field -##### - -""" - pressure_field(metadata::ERA5PressureMetadatum, arch=CPU(); halo=(3,3,3)) - -Return a `CenterField` on the native grid of `metadata` filled with the pressure -value (hPa) at each vertical level. Levels are ordered bottom-to-top (k=1 is the -highest pressure level). -""" -function pressure_field(metadata::ERA5PressureMetadatum, arch=CPU(); halo=(3,3,3)) - grid = native_grid(metadata, arch; halo) - field = CenterField(grid) - reversed_levels = sort(metadata.dataset.pressure_levels, rev=true) # highest pressure → k=1 - for (k, p) in enumerate(reversed_levels) - interior(field)[:, :, k] .= Float32(p) - end - fill_halo_regions!(field) - return field -end - -##### -##### mean_geopotential_heights — data-derived static z-coordinate -##### - -""" - mean_geopotential_heights(metadata::ERA5PressureMetadata; interfaces=false) - -Compute spatially and temporally averaged geopotential heights (m) for each -pressure level in `metadata`. - -Downloads the `:geopotential` field for every date in `metadata`, divides by g, -averages over the horizontal domain and all dates, and returns one representative -height per pressure level in bottom-to-top order (k=1 is highest pressure). -""" -function mean_geopotential_heights(metadata::ERA5PressureMetadata) - geo_metadata = Metadata(:geopotential; dataset=metadata.dataset, - dates=metadata.dates, bounding_box=metadata.bounding_box, - dir=metadata.dir) - heights = zeros(length(metadata.dataset.pressure_levels)) - # average over time - for geo_datum in geo_metadata - data = retrieve_data(geo_datum) ./ Float32(ERA5_gravitational_acceleration) # Φ → Z (m) - # average over horizontal dims - heights .+= dropdims(mean(data; dims=(1, 2)); dims=(1, 2)) - end - heights ./= length(geo_metadata) - - return sort(heights) -end - -function mean_geopotential_z_interfaces(metadata::ERA5PressureMetadata) - return stagger(mean_geopotential_heights(metadata)) -end - -function stagger(zc::AbstractVector) - # heights are ascending (k=1 = highest pressure = lowest altitude, - # consistent with retrieve_data's reverse() and Oceananigans bottom-to-top - # convention); bottom and top interfaces are extrapolated - zf = (zc[1:end-1] .+ zc[2:end]) / 2 # Nz-1 interior interfaces - pushfirst!(zf, zc[1] - (zf[1] - zc[1])) # bottom interface - push!(zf, zc[end] + (zc[end] - zf[end])) # top interface - return zf -end +include("ERA5_single_levels.jl") +include("ERA5_pressure_levels.jl") end # module ERA5 - diff --git a/src/DataWrangling/ERA5/ERA5_pressure_levels.jl b/src/DataWrangling/ERA5/ERA5_pressure_levels.jl new file mode 100644 index 000000000..c9feb2dae --- /dev/null +++ b/src/DataWrangling/ERA5/ERA5_pressure_levels.jl @@ -0,0 +1,257 @@ +abstract type ERA5PressureDataset <: ERA5Dataset end + +# ERA5PressureMetadata is a subtype of ERA5Metadata +const ERA5PressureMetadata{D} = Metadata{<:ERA5PressureDataset, D} +const ERA5PressureMetadatum = Metadatum{<:ERA5PressureDataset} + +mutable struct ERA5HourlyPressureLevels <: ERA5PressureDataset + pressure_levels :: Vector{Float64} + z :: Union{Nothing, Vector{Float64}} + ERA5HourlyPressureLevels(pressure_levels, z=nothing) = new(sort(pressure_levels, rev=true), z) +end +ERA5HourlyPressureLevels(; pressure_levels=ERA5_all_pressure_levels, z=nothing) = + ERA5HourlyPressureLevels(pressure_levels, z) + +mutable struct ERA5MonthlyPressureLevels <: ERA5PressureDataset + pressure_levels :: Vector{Float64} + z :: Union{Nothing, Vector{Float64}} + ERA5MonthlyPressureLevels(pressure_levels, z=nothing) = new(sort(pressure_levels, rev=true), z) +end +ERA5MonthlyPressureLevels(; pressure_levels=ERA5_all_pressure_levels, z=nothing) = + ERA5MonthlyPressureLevels(pressure_levels, z) + +dataset_name(::ERA5HourlyPressureLevels) = "ERA5HourlyPressureLevels" +dataset_name(::ERA5MonthlyPressureLevels) = "ERA5MonthlyPressureLevels" + +##### +##### ERA5 pressure-level data availability +##### + +# ERA5 reanalysis data available from 1940 to present (we use a practical range here) +all_dates(::ERA5HourlyPressureLevels, var) = range(DateTime("1940-01-01"), stop=DateTime("2024-12-31"), step=Hour(1)) +all_dates(::ERA5MonthlyPressureLevels, var) = range(DateTime("1940-01-01"), stop=DateTime("2024-12-01"), step=Month(1)) + +# ERA5 pressure-level data is a spatially 3-D dataset +is_three_dimensional(::ERA5PressureMetadata) = true + +const hPa = 100.0 +const ERA5_all_pressure_levels = [1, 2, 3, 5, 7, 10, 20, 30, 50, 70, 100, 125, 150, + 175, 200, 225, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 775, 800, + 825, 850, 875, 900, 925, 950, 975, 1000]*hPa + +# ERA5 stores pressure levels bottom-to-top +reversed_vertical_axis(::ERA5PressureDataset) = false + +Base.size(ds::ERA5PressureDataset, variable) = (1440, 720, length(ds.pressure_levels)) + +##### +##### ERA5 pressure-level variable name mappings +##### + +ERA5PL_dataset_variable_names = Dict( + :temperature => "temperature", + :eastward_velocity => "u_component_of_wind", + :northward_velocity => "v_component_of_wind", + :vertical_velocity => "vertical_velocity", + :geopotential => "geopotential", + :geopotential_height => "geopotential", + :specific_humidity => "specific_humidity", + :relative_humidity => "relative_humidity", + :vorticity => "vorticity", + :divergence => "divergence", + :potential_vorticity => "potential_vorticity", + :ozone_mass_mixing_ratio => "ozone_mass_mixing_ratio", + :fraction_of_cloud_cover => "fraction_of_cloud_cover", + :specific_cloud_liquid_water_content => "specific_cloud_liquid_water_content", + :specific_cloud_ice_water_content => "specific_cloud_ice_water_content", +) + +# NetCDF short variable names (what's actually in the downloaded files) +# These differ from the CDS API variable names above +ERA5PL_netcdf_variable_names = Dict( + :temperature => "t", + :eastward_velocity => "u", + :northward_velocity => "v", + :vertical_velocity => "w", + :geopotential => "z", + :geopotential_height => "z", + :specific_humidity => "q", + :relative_humidity => "r", + :vorticity => "vo", + :divergence => "d", + :potential_vorticity => "pv", + :ozone_mass_mixing_ratio => "o3", + :fraction_of_cloud_cover => "cc", + :specific_cloud_liquid_water_content => "clwc", + :specific_cloud_ice_water_content => "ciwc", +) + +# Variables available for download +available_variables(::ERA5PressureDataset) = ERA5PL_dataset_variable_names +dataset_variable_name(md::ERA5PressureMetadata) = ERA5PL_dataset_variable_names[md.name] +netcdf_variable_name(md::ERA5PressureMetadata) = ERA5PL_netcdf_variable_names[md.name] + +conversion_units(md::ERA5PressureMetadata) = + md.name == :geopotential_height ? InverseGravity() : nothing + +default_inpainting(md::ERA5PressureMetadata) = nothing + +""" + retrieve_data(metadata::ERA5PressureMetadatum) + +Retrieve ERA5 pressure-level data from a NetCDF file. +Returns a 3D array (lon, lat, level) with levels ordered bottom-to-top +(highest pressure at k=1, lowest pressure at k=Nz). +""" +function retrieve_data(metadata::ERA5PressureMetadatum) + path = metadata_path(metadata) + name = netcdf_variable_name(metadata) + ds = NCDatasets.Dataset(path) + data = ds[name][:, :, :, 1] # (lon, lat, pressure_level, time=1) + close(ds) + return reverse(data, dims=2) # Latitude is stored from 90°N → 90°S +end + +##### +##### Metadata filename construction +##### + +function metadata_prefix(md::ERA5PressureMetadata) + var = ERA5PL_dataset_variable_names[md.name] + dataset = dataset_name(md.dataset) + start_date = start_date_str(md.dates) + end_date = end_date_str(md.dates) + bbox = md.bounding_box + + if !isnothing(bbox) + w, e = bbox_strs(bbox.longitude) + s, n = bbox_strs(bbox.latitude) + suffix = string(w, e, s, n) + else + suffix = "" + end + + if start_date == end_date + prefix = string(var, "_", dataset, "_", start_date, suffix) + else + prefix = string(var, "_", dataset, "_", start_date, "_", end_date, suffix) + end + prefix = colon2dash(prefix) + prefix = underscore_spaces(prefix) + return prefix +end + +##### +##### Pressure-level vertical coordinate +##### + +const ERA5_gravitational_acceleration = 9.80665 + +# International Standard Atmosphere height (m) for a given pressure (hPa) +function standard_atmosphere_geopotential_height(p) + g = ERA5_gravitational_acceleration + T⁰ = 288.15 # K + p⁰ = 1013.25 * hPa + Rᵈ = 287.0528 # J/(kg-K) + + return (Rᵈ * T⁰ / g) * log(p⁰ / p) +end + +# Build z-interfaces (Nz+1 values) from pressure levels. +# Levels may be in any order; output is sorted so k=1 is highest pressure (lowest altitude). +function standard_atmosphere_z_interfaces(levels) + @info "Calculating z-interfaces based on the International Standard Atmosphere... \ + For greater accuracy, download the :geopotential field and \ + set `dataset.z = mean_geopotential_z_interfaces(ϕ_metadata)`" + sorted_levels = sort(levels, rev=true) # highest pressure first → k=1 is bottom + heights = standard_atmosphere_geopotential_height.(Float64.(sorted_levels)) + Nz = length(heights) + + interfaces = Vector{Float64}(undef, Nz + 1) + + if Nz == 1 + interfaces[1] = heights[1] - 0.5 + interfaces[2] = heights[1] + 0.5 + else + interfaces[1] = heights[1] - (heights[2] - heights[1]) / 2 + for k in 2:Nz + interfaces[k] = (heights[k-1] + heights[k]) / 2 + end + interfaces[Nz+1] = heights[Nz] + (heights[Nz] - heights[Nz-1]) / 2 + end + + return interfaces +end + +# ERA5 pressure-levels (3-D) data product +function z_interfaces(metadata::ERA5PressureMetadata) + z = metadata.dataset.z + return isnothing(z) ? standard_atmosphere_z_interfaces(metadata.dataset.pressure_levels) : z +end + +##### +##### mean_geopotential_heights — data-derived static z-coordinate +##### + +""" + mean_geopotential_heights(metadata::ERA5PressureMetadata) + +Compute spatially and temporally averaged geopotential heights (m) for each +pressure level in `metadata`. + +Downloads the `:geopotential` field for every date in `metadata`, divides by g, +averages over the horizontal domain and all dates, and returns one representative +height per pressure level in bottom-to-top order (k=1 is highest pressure). +""" +function mean_geopotential_heights(metadata::ERA5PressureMetadata) + geo_metadata = Metadata(:geopotential; dataset=metadata.dataset, + dates=metadata.dates, bounding_box=metadata.bounding_box, + dir=metadata.dir) + heights = zeros(length(metadata.dataset.pressure_levels)) + # average over time + for geo_datum in geo_metadata + data = retrieve_data(geo_datum) ./ Float32(ERA5_gravitational_acceleration) # Φ → Z (m) + # average over horizontal dims + heights .+= dropdims(mean(data; dims=(1, 2)); dims=(1, 2)) + end + heights ./= length(geo_metadata) + + return sort(heights) +end + +function mean_geopotential_z_interfaces(metadata::ERA5PressureMetadata) + return stagger(mean_geopotential_heights(metadata)) +end + +function stagger(zc::AbstractVector) + # heights are ascending (k=1 = highest pressure = lowest altitude, + # consistent with retrieve_data's reverse() and Oceananigans bottom-to-top + # convention); bottom and top interfaces are extrapolated + zf = (zc[1:end-1] .+ zc[2:end]) / 2 # Nz-1 interior interfaces + pushfirst!(zf, zc[1] - (zf[1] - zc[1])) # bottom interface + push!(zf, zc[end] + (zc[end] - zf[end])) # top interface + return zf +end + +##### +##### pressure_field — synthetic pressure coordinate field +##### + +""" + pressure_field(metadata::ERA5PressureMetadatum, arch=CPU(); halo=(3,3,3)) + +Return a `CenterField` on the native grid of `metadata` filled with the pressure +value (hPa) at each vertical level. Levels are ordered bottom-to-top (k=1 is the +highest pressure level). +""" +function pressure_field(metadata::ERA5PressureMetadatum, arch=CPU(); halo=(3,3,3)) + grid = native_grid(metadata, arch; halo) + field = CenterField(grid) + reversed_levels = sort(metadata.dataset.pressure_levels, rev=true) # highest pressure → k=1 + for (k, p) in enumerate(reversed_levels) + interior(field)[:, :, k] .= Float32(p) + end + fill_halo_regions!(field) + return field +end + diff --git a/src/DataWrangling/ERA5/ERA5_single_levels.jl b/src/DataWrangling/ERA5/ERA5_single_levels.jl new file mode 100644 index 000000000..5ebbd46ed --- /dev/null +++ b/src/DataWrangling/ERA5/ERA5_single_levels.jl @@ -0,0 +1,150 @@ +struct ERA5SingleLevelsHourly <: ERA5Dataset end +struct ERA5SingleLevelsMonthly <: ERA5Dataset end + +dataset_name(::ERA5SingleLevelsHourly) = "ERA5SingleLevelsHourly" +dataset_name(::ERA5SingleLevelsMonthly) = "ERA5SingleLevelsMonthly" + +# Wave variables are on a 0.5° grid (720×361), atmospheric variables on 0.25° (1440×721) +const ERA5_wave_variables = Set([ + :eastward_stokes_drift, :northward_stokes_drift, + :significant_wave_height, :mean_wave_period, :mean_wave_direction, +]) + +##### +##### ERA5 single-level data availability +##### + +# ERA5 reanalysis data available from 1940 to present (we use a practical range here) +all_dates(::ERA5SingleLevelsHourly, var) = range(DateTime("1940-01-01"), stop=DateTime("2024-12-31"), step=Hour(1)) +all_dates(::ERA5SingleLevelsMonthly, var) = range(DateTime("1940-01-01"), stop=DateTime("2024-12-01"), step=Month(1)) + +# ERA5 single-level data is a spatially 2-D dataset +is_three_dimensional(::ERA5Metadata) = false + +function Base.size(::ERA5Dataset, variable) + if variable in ERA5_wave_variables + return (720, 361, 1) + else + return (1440, 721, 1) + end +end + +##### +##### ERA5 single-level variable name mappings +##### + +# Variable name mappings from NumericalEarth names to ERA5/CDS API variable names +ERA5_dataset_variable_names = Dict( + :temperature => "2m_temperature", + :dewpoint_temperature => "2m_dewpoint_temperature", + :eastward_velocity => "10m_u_component_of_wind", + :northward_velocity => "10m_v_component_of_wind", + :surface_pressure => "surface_pressure", + :mean_sea_level_pressure => "mean_sea_level_pressure", + :total_precipitation => "total_precipitation", + :sea_surface_temperature => "sea_surface_temperature", + :downwelling_shortwave_radiation => "surface_solar_radiation_downwards", + :downwelling_longwave_radiation => "surface_thermal_radiation_downwards", + :total_cloud_cover => "total_cloud_cover", + :evaporation => "evaporation", + :specific_humidity => "specific_humidity", + :eastward_stokes_drift => "u_component_stokes_drift", + :northward_stokes_drift => "v_component_stokes_drift", + :significant_wave_height => "significant_height_of_combined_wind_waves_and_swell", + :mean_wave_period => "mean_wave_period", + :mean_wave_direction => "mean_wave_direction", +) + +# NetCDF short variable names (what's actually in the downloaded files) +# These differ from the CDS API variable names above +ERA5_netcdf_variable_names = Dict( + :temperature => "t2m", + :dewpoint_temperature => "d2m", + :eastward_velocity => "u10", + :northward_velocity => "v10", + :surface_pressure => "sp", + :mean_sea_level_pressure => "msl", + :total_precipitation => "tp", + :sea_surface_temperature => "sst", + :downwelling_shortwave_radiation => "ssrd", + :downwelling_longwave_radiation => "strd", + :total_cloud_cover => "tcc", + :evaporation => "e", + :specific_humidity => "q", + :eastward_stokes_drift => "ust", + :northward_stokes_drift => "vst", + :significant_wave_height => "swh", + :mean_wave_period => "mwp", + :mean_wave_direction => "mwd", +) + +# Variables available for download +available_variables(::ERA5Dataset) = ERA5_dataset_variable_names +dataset_variable_name(md::ERA5Metadata) = ERA5_dataset_variable_names[md.name] +netcdf_variable_name(md::ERA5Metadata) = ERA5_netcdf_variable_names[md.name] + +default_inpainting(md::ERA5Metadata) = nothing + +""" + retrieve_data(metadata::ERA5Metadatum) + +Retrieve ERA5 data from NetCDF file according to `metadata`. +ERA5 is 2D surface data, so we return a 2D array with an added singleton z-dimension. +""" +function retrieve_data(metadata::ERA5Metadatum) + path = metadata_path(metadata) + name = netcdf_variable_name(metadata) + + ds = NCDatasets.Dataset(path) + + # ERA5 is 2D + time, we take the first time step + # Data shape is typically (lon, lat) or (lon, lat, time) + raw_data = ds[name] + ndim = ndims(raw_data) + + if ndim == 2 + data_2d = raw_data[:, :] + elseif ndim == 3 + data_2d = raw_data[:, :, 1] + else + error("Unexpected ERA5 data dimensions: $ndim") + end + + close(ds) + + # Latitude is stored from 90°N → 90°S + data_2d = reverse(data_2d, dims=2) + + # Add singleton z-dimension for 3D field compatibility + # Return as (Nx, Ny, 1) + return reshape(data_2d, size(data_2d, 1), size(data_2d, 2), 1) +end + +##### +##### Metadata filename construction +##### + +function metadata_prefix(md::ERA5Metadata) + var = ERA5_dataset_variable_names[md.name] + dataset = dataset_name(md.dataset) + start_date = start_date_str(md.dates) + end_date = end_date_str(md.dates) + bbox = md.bounding_box + + if !isnothing(bbox) + w, e = bbox_strs(bbox.longitude) + s, n = bbox_strs(bbox.latitude) + suffix = string(w, e, s, n) + else + suffix = "" + end + + if start_date == end_date + prefix = string(var, "_", dataset, "_", start_date, suffix) + else + prefix = string(var, "_", dataset, "_", start_date, "_", end_date, suffix) + end + prefix = colon2dash(prefix) + prefix = underscore_spaces(prefix) + return prefix +end From c5c2e2f4b12b856221a0c44ad08218b71091473c Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Tue, 10 Mar 2026 23:33:57 -0700 Subject: [PATCH 24/53] Use consistent ERA5 naming --- examples/ERA5_winds_and_stokes_drift.jl | 4 ++-- src/DataWrangling/DataWrangling.jl | 2 +- src/DataWrangling/ERA5/ERA5.jl | 2 +- src/DataWrangling/ERA5/ERA5_single_levels.jl | 12 ++++++------ test/test_cds_downloading.jl | 10 +++++----- 5 files changed, 15 insertions(+), 15 deletions(-) diff --git a/examples/ERA5_winds_and_stokes_drift.jl b/examples/ERA5_winds_and_stokes_drift.jl index 871359b1f..d0fe9d018 100644 --- a/examples/ERA5_winds_and_stokes_drift.jl +++ b/examples/ERA5_winds_and_stokes_drift.jl @@ -16,7 +16,7 @@ using NumericalEarth using NumericalEarth.DataWrangling: Metadatum -using NumericalEarth.DataWrangling.ERA5: ERA5SingleLevelsHourly +using NumericalEarth.DataWrangling.ERA5: ERA5HourlySingleLevel using CDSAPI using Oceananigans @@ -29,7 +29,7 @@ using Dates # while ocean wave variables (Stokes drift) live on a 0.5° grid (720×361). # We define metadata for each variable at a single date. -dataset = ERA5SingleLevelsHourly() +dataset = ERA5HourlySingleLevel() date = DateTime(2020, 1, 15, 12) # January 15, 2020 at 12:00 UTC u_stokes_meta = Metadatum(:eastward_stokes_drift; dataset, date) diff --git a/src/DataWrangling/DataWrangling.jl b/src/DataWrangling/DataWrangling.jl index 142eb47b2..cb03f6040 100644 --- a/src/DataWrangling/DataWrangling.jl +++ b/src/DataWrangling/DataWrangling.jl @@ -9,7 +9,7 @@ export WOAClimatology, WOAAnnual, WOAMonthly export metadata_time_step, metadata_epoch export LinearlyTaperedPolarMask export DatasetRestoring -export ERA5SingleLevelsHourly, ERA5SingleLevelsMonthly, ERA5HourlyPressureLevels, ERA5MonthlyPressureLevels +export ERA5HourlySingleLevel, ERA5MonthlySingleLevel, ERA5HourlyPressureLevels, ERA5MonthlyPressureLevels using Oceananigans using Downloads diff --git a/src/DataWrangling/ERA5/ERA5.jl b/src/DataWrangling/ERA5/ERA5.jl index f790a7ced..85b131fc6 100644 --- a/src/DataWrangling/ERA5/ERA5.jl +++ b/src/DataWrangling/ERA5/ERA5.jl @@ -1,7 +1,7 @@ module ERA5 # 2-D data -export ERA5SingleLevelsHourly, ERA5SingleLevelsMonthly +export ERA5HourlySingleLevel, ERA5MonthlySingleLevel # 3-D data export ERA5HourlyPressureLevels, ERA5MonthlyPressureLevels, ERA5_all_pressure_levels, pressure_field, hPa diff --git a/src/DataWrangling/ERA5/ERA5_single_levels.jl b/src/DataWrangling/ERA5/ERA5_single_levels.jl index 5ebbd46ed..67f9f785b 100644 --- a/src/DataWrangling/ERA5/ERA5_single_levels.jl +++ b/src/DataWrangling/ERA5/ERA5_single_levels.jl @@ -1,8 +1,8 @@ -struct ERA5SingleLevelsHourly <: ERA5Dataset end -struct ERA5SingleLevelsMonthly <: ERA5Dataset end +struct ERA5HourlySingleLevel <: ERA5Dataset end +struct ERA5MonthlySingleLevel <: ERA5Dataset end -dataset_name(::ERA5SingleLevelsHourly) = "ERA5SingleLevelsHourly" -dataset_name(::ERA5SingleLevelsMonthly) = "ERA5SingleLevelsMonthly" +dataset_name(::ERA5HourlySingleLevel) = "ERA5HourlySingleLevel" +dataset_name(::ERA5MonthlySingleLevel) = "ERA5MonthlySingleLevel" # Wave variables are on a 0.5° grid (720×361), atmospheric variables on 0.25° (1440×721) const ERA5_wave_variables = Set([ @@ -15,8 +15,8 @@ const ERA5_wave_variables = Set([ ##### # ERA5 reanalysis data available from 1940 to present (we use a practical range here) -all_dates(::ERA5SingleLevelsHourly, var) = range(DateTime("1940-01-01"), stop=DateTime("2024-12-31"), step=Hour(1)) -all_dates(::ERA5SingleLevelsMonthly, var) = range(DateTime("1940-01-01"), stop=DateTime("2024-12-01"), step=Month(1)) +all_dates(::ERA5HourlySingleLevel, var) = range(DateTime("1940-01-01"), stop=DateTime("2024-12-31"), step=Hour(1)) +all_dates(::ERA5MonthlySingleLevel, var) = range(DateTime("1940-01-01"), stop=DateTime("2024-12-01"), step=Month(1)) # ERA5 single-level data is a spatially 2-D dataset is_three_dimensional(::ERA5Metadata) = false diff --git a/test/test_cds_downloading.jl b/test/test_cds_downloading.jl index 304d17079..5ffad9671 100644 --- a/test/test_cds_downloading.jl +++ b/test/test_cds_downloading.jl @@ -5,7 +5,7 @@ using Dates using NCDatasets using NumericalEarth.DataWrangling.ERA5 -using NumericalEarth.DataWrangling.ERA5: ERA5SingleLevelsHourly, ERA5SingleLevelsMonthly, ERA5_dataset_variable_names +using NumericalEarth.DataWrangling.ERA5: ERA5HourlySingleLevel, ERA5MonthlySingleLevel, ERA5_dataset_variable_names using NumericalEarth.DataWrangling.ERA5: ERA5HourlyPressureLevels, ERA5MonthlyPressureLevels, ERA5_all_pressure_levels, ERA5PL_dataset_variable_names, pressure_field @@ -17,7 +17,7 @@ start_date = DateTime(2005, 2, 16, 12) @testset "ERA5 data downloading and utilities" begin @info "Testing ERA5 downloading and NetCDF file verification..." - dataset = ERA5SingleLevelsHourly() + dataset = ERA5HourlySingleLevel() # Use a small bounding box to reduce download time bounding_box = NumericalEarth.DataWrangling.BoundingBox(longitude=(0, 5), latitude=(40, 45)) @@ -88,7 +88,7 @@ start_date = DateTime(2005, 2, 16, 12) # Test metadata properties @test metadatum.name == :temperature - @test metadatum.dataset isa ERA5SingleLevelsHourly + @test metadatum.dataset isa ERA5HourlySingleLevel @test metadatum.dates == start_date @test metadatum.bounding_box == bounding_box @@ -127,8 +127,8 @@ start_date = DateTime(2005, 2, 16, 12) end @testset "ERA5 Monthly dataset" begin - monthly_dataset = ERA5SingleLevelsMonthly() - @test monthly_dataset isa ERA5SingleLevelsMonthly + monthly_dataset = ERA5MonthlySingleLevel() + @test monthly_dataset isa ERA5MonthlySingleLevel # Test that all_dates returns a valid range dates = NumericalEarth.DataWrangling.all_dates(monthly_dataset, :temperature) From 77d33f35ec3500d2af6466fbe182e85f2da9819e Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Wed, 11 Mar 2026 13:11:23 -0700 Subject: [PATCH 25/53] Automatic geopotential download to calculate mean geopotential heights --- src/DataWrangling/ERA5/ERA5.jl | 2 +- .../ERA5/ERA5_pressure_levels.jl | 67 ++++++++++++------- 2 files changed, 45 insertions(+), 24 deletions(-) diff --git a/src/DataWrangling/ERA5/ERA5.jl b/src/DataWrangling/ERA5/ERA5.jl index 85b131fc6..3bb770e00 100644 --- a/src/DataWrangling/ERA5/ERA5.jl +++ b/src/DataWrangling/ERA5/ERA5.jl @@ -14,7 +14,7 @@ using Statistics using Oceananigans.Fields: Center using Oceananigans: CenterField, interior, fill_halo_regions!, CPU -using NumericalEarth.DataWrangling: Metadata, Metadatum, metadata_path, native_grid, InverseGravity +using NumericalEarth.DataWrangling: Metadata, Metadatum, metadata_path, native_grid, InverseGravity, download_dataset using Dates using Dates: DateTime, Day, Month, Hour diff --git a/src/DataWrangling/ERA5/ERA5_pressure_levels.jl b/src/DataWrangling/ERA5/ERA5_pressure_levels.jl index c9feb2dae..c5d360761 100644 --- a/src/DataWrangling/ERA5/ERA5_pressure_levels.jl +++ b/src/DataWrangling/ERA5/ERA5_pressure_levels.jl @@ -4,21 +4,25 @@ abstract type ERA5PressureDataset <: ERA5Dataset end const ERA5PressureMetadata{D} = Metadata{<:ERA5PressureDataset, D} const ERA5PressureMetadatum = Metadatum{<:ERA5PressureDataset} -mutable struct ERA5HourlyPressureLevels <: ERA5PressureDataset - pressure_levels :: Vector{Float64} - z :: Union{Nothing, Vector{Float64}} - ERA5HourlyPressureLevels(pressure_levels, z=nothing) = new(sort(pressure_levels, rev=true), z) +struct ERA5HourlyPressureLevels <: ERA5PressureDataset + pressure_levels :: Vector{Float64} + z :: Union{Nothing, Vector{Float64}} + mean_geopotential_height :: Bool + ERA5HourlyPressureLevels(pressure_levels, z=nothing; mean_geopotential_height=true) = + new(sort(pressure_levels, rev=true), z, mean_geopotential_height) end -ERA5HourlyPressureLevels(; pressure_levels=ERA5_all_pressure_levels, z=nothing) = - ERA5HourlyPressureLevels(pressure_levels, z) - -mutable struct ERA5MonthlyPressureLevels <: ERA5PressureDataset - pressure_levels :: Vector{Float64} - z :: Union{Nothing, Vector{Float64}} - ERA5MonthlyPressureLevels(pressure_levels, z=nothing) = new(sort(pressure_levels, rev=true), z) +ERA5HourlyPressureLevels(; pressure_levels=ERA5_all_pressure_levels, z=nothing, mean_geopotential_height=true) = + ERA5HourlyPressureLevels(pressure_levels, z; mean_geopotential_height) + +struct ERA5MonthlyPressureLevels <: ERA5PressureDataset + pressure_levels :: Vector{Float64} + z :: Union{Nothing, Vector{Float64}} + mean_geopotential_height :: Bool + ERA5MonthlyPressureLevels(pressure_levels, z=nothing; mean_geopotential_height=true) = + new(sort(pressure_levels, rev=true), z, mean_geopotential_height) end -ERA5MonthlyPressureLevels(; pressure_levels=ERA5_all_pressure_levels, z=nothing) = - ERA5MonthlyPressureLevels(pressure_levels, z) +ERA5MonthlyPressureLevels(; pressure_levels=ERA5_all_pressure_levels, z=nothing, mean_geopotential_height=true) = + ERA5MonthlyPressureLevels(pressure_levels, z; mean_geopotential_height) dataset_name(::ERA5HourlyPressureLevels) = "ERA5HourlyPressureLevels" dataset_name(::ERA5MonthlyPressureLevels) = "ERA5MonthlyPressureLevels" @@ -161,8 +165,7 @@ end # Levels may be in any order; output is sorted so k=1 is highest pressure (lowest altitude). function standard_atmosphere_z_interfaces(levels) @info "Calculating z-interfaces based on the International Standard Atmosphere... \ - For greater accuracy, download the :geopotential field and \ - set `dataset.z = mean_geopotential_z_interfaces(ϕ_metadata)`" + For greater accuracy, set ERA5PressureDataset(; mean_geopotential_height=true)" sorted_levels = sort(levels, rev=true) # highest pressure first → k=1 is bottom heights = standard_atmosphere_geopotential_height.(Float64.(sorted_levels)) Nz = length(heights) @@ -185,8 +188,26 @@ end # ERA5 pressure-levels (3-D) data product function z_interfaces(metadata::ERA5PressureMetadata) - z = metadata.dataset.z - return isnothing(z) ? standard_atmosphere_z_interfaces(metadata.dataset.pressure_levels) : z + # Return cached z if already set + !isnothing(metadata.dataset.z) && return metadata.dataset.z + + # If mean_geopotential_height is enabled, try to download and compute + if metadata.dataset.mean_geopotential_height + ϕ_metadata = Metadata(:geopotential; dataset=metadata.dataset, + dates=metadata.dates, bounding_box=metadata.bounding_box, + dir=metadata.dir) + try + download_dataset(ϕ_metadata) + zf = mean_geopotential_z_interfaces(metadata) + metadata.dataset.z = zf + return zf + catch e + @warn "Failed to download geopotential data; falling back to standard atmosphere" exception=e + end + end + + # Fallback + return standard_atmosphere_z_interfaces(metadata.dataset.pressure_levels) end ##### @@ -204,17 +225,17 @@ averages over the horizontal domain and all dates, and returns one representativ height per pressure level in bottom-to-top order (k=1 is highest pressure). """ function mean_geopotential_heights(metadata::ERA5PressureMetadata) - geo_metadata = Metadata(:geopotential; dataset=metadata.dataset, - dates=metadata.dates, bounding_box=metadata.bounding_box, - dir=metadata.dir) + ϕ_metadata = Metadata(:geopotential; dataset=metadata.dataset, + dates=metadata.dates, bounding_box=metadata.bounding_box, + dir=metadata.dir) heights = zeros(length(metadata.dataset.pressure_levels)) # average over time - for geo_datum in geo_metadata - data = retrieve_data(geo_datum) ./ Float32(ERA5_gravitational_acceleration) # Φ → Z (m) + for ϕ_datum in ϕ_metadata + data = retrieve_data(ϕ_datum) ./ Float32(ERA5_gravitational_acceleration) # Φ → Z (m) # average over horizontal dims heights .+= dropdims(mean(data; dims=(1, 2)); dims=(1, 2)) end - heights ./= length(geo_metadata) + heights ./= length(ϕ_metadata) return sort(heights) end From 5aa675b2f4a95c185c6eac670a09d8a4cd62ac5e Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Wed, 11 Mar 2026 13:23:59 -0700 Subject: [PATCH 26/53] Streamline automatic download --- ext/NumericalEarthCDSAPIExt.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/ext/NumericalEarthCDSAPIExt.jl b/ext/NumericalEarthCDSAPIExt.jl index 59e9bdbe9..e46454b15 100644 --- a/ext/NumericalEarthCDSAPIExt.jl +++ b/ext/NumericalEarthCDSAPIExt.jl @@ -105,12 +105,12 @@ end ##### ERA5 pressure-level download ##### -function download_dataset(metadata::ERA5PressureMetadata; kwargs...) - paths = Array{String}(undef, length(metadata)) - for (m, metadatum) in enumerate(metadata) - paths[m] = download_dataset(metadatum; kwargs...) - end - return paths +function download_dataset(metadata::ERA5PressureMetadata; skip_existing=true, cleanup=true) + dates = metadata.dates isa AbstractVector ? metadata.dates : [metadata.dates] + download_dataset([metadata.name], metadata.dataset, dates; + bounding_box = metadata.bounding_box, + dir = metadata.dir, + skip_existing, cleanup) end function download_dataset(meta::ERA5PressureMetadatum; skip_existing=true) From 79dcb1730153b0d0b42ece7e1c6a609b88ca1bc2 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Thu, 12 Mar 2026 10:24:07 -0700 Subject: [PATCH 27/53] Refactor; add examples --- examples/ERA5_bounded_pressure_level_data.jl | 121 ++++++ examples/ERA5_bounded_surface_data.jl | 92 +++++ ext/NumericalEarthCDSAPIExt.jl | 381 ++++++++---------- .../ERA5/ERA5_pressure_levels.jl | 6 +- src/DataWrangling/ERA5/ERA5_single_levels.jl | 4 +- test/test_cds_downloading.jl | 28 +- 6 files changed, 397 insertions(+), 235 deletions(-) create mode 100644 examples/ERA5_bounded_pressure_level_data.jl create mode 100644 examples/ERA5_bounded_surface_data.jl diff --git a/examples/ERA5_bounded_pressure_level_data.jl b/examples/ERA5_bounded_pressure_level_data.jl new file mode 100644 index 000000000..792a16ce5 --- /dev/null +++ b/examples/ERA5_bounded_pressure_level_data.jl @@ -0,0 +1,121 @@ +# Download 3-D snapshots of the atmospheric state from ERA5, e.g., to be used with FieldTimeSeries +# +# ## Install dependencies +# +# ```julia +# using Pkg +# pkg"add NumericalEarth CDSAPI" +# ``` +# +# You also need CDS API credentials in `~/.cdsapirc`. +# See for setup instructions. +# +# See for +# more information about pressure-level data + +using NumericalEarth +using NumericalEarth.DataWrangling: Metadata, BoundingBox, download_dataset +using NumericalEarth.DataWrangling.ERA5 +using CDSAPI +using Dates +using Oceananigans + +using Statistics +using CairoMakie + +selected_levels = filter(≥(250hPa), ERA5_all_pressure_levels) # select all levels below 250 hPa +dataset = ERA5HourlyPressureLevels(selected_levels) + +dates = DateTime(2004, 12, 16):Hour(1):DateTime(2005, 01, 09) # van Zanten et al 2011 study period +#dates = DateTime(2004, 12, 16):Hour(1):DateTime(2004, 12, 23) # shorter time range for demo + +# Rauber et al 2007, Fig 1: focus region +bounding_box = BoundingBox(latitude=(17, 18.5), longitude=(-62.5, -61)) + +# OPTIONAL: download all variables at once (fewer CDS API requests) +variables = [:geopotential, # for calculating zlevels as the mean geopotential height + :eastward_velocity, + :northward_velocity, + :temperature, + :specific_humidity] +download_dataset(variables, dataset, dates; bounding_box) + +# ## Create time series + +T_meta = Metadata(:temperature; dataset, dates, bounding_box) +q_meta = Metadata(:specific_humidity; dataset, dates, bounding_box) +u_meta = Metadata(:eastward_velocity; dataset, dates, bounding_box) +v_meta = Metadata(:northward_velocity; dataset, dates, bounding_box) + +T_series = FieldTimeSeries(T_meta; time_indices_in_memory=2) +q_series = FieldTimeSeries(q_meta; time_indices_in_memory=2) +u_series = FieldTimeSeries(u_meta; time_indices_in_memory=2) +v_series = FieldTimeSeries(v_meta; time_indices_in_memory=2) + +# ## Vertical profiles (mean ± spread) + +# Height coordinate (m) from the grid +_, _, z = nodes(T_series[1]) +Nz = length(z) +Nt = length(T_series.times) + +# Pressure at each level (hPa), ordered bottom-to-top (k=1 = highest pressure) +p_levels = sort(selected_levels, rev=true) ./ hPa # in hPa + +# Horizontal-mean profile for each snapshot → (Nz, Nt) arrays +function horizontal_mean_profiles(series) + profiles = zeros(Nz, Nt) + for n in 1:Nt + data = interior(series[n], :, :, :) + profiles[:, n] = dropdims(mean(data, dims=(1, 2)), dims=(1, 2)) + end + return profiles +end + +T_profiles = horizontal_mean_profiles(T_series) +q_profiles = horizontal_mean_profiles(q_series) +u_profiles = horizontal_mean_profiles(u_series) +v_profiles = horizontal_mean_profiles(v_series) + +# Convert T → potential temperature: θ = T (p₀/p)^(R/cₚ) +Rₐ_over_cₚ = 0.286 +θ_profiles = T_profiles .* (1000 ./ p_levels) .^ Rₐ_over_cₚ + +# Convert specific humidity from kg/kg → g/kg +q_profiles .*= 1000 + +# ## Plot + +fig = Figure(size=(900, 500), fontsize=12) + +ax_θ = Axis(fig[1, 1], xlabel="θ [K]", ylabel="Height [m]") +ax_q = Axis(fig[1, 2], xlabel="qᵥ [g kg⁻¹]", ylabel="Height [m]") +ax_u = Axis(fig[1, 3], xlabel="u [m s⁻¹]", ylabel="Height [m]") +ax_v = Axis(fig[1, 4], xlabel="v [m s⁻¹]", ylabel="Height [m]") + +for (ax, profiles) in [(ax_θ, θ_profiles), + (ax_q, q_profiles), + (ax_u, u_profiles), + (ax_v, v_profiles)] + μ = vec(mean(profiles, dims=2)) + #lo = vec(minimum(profiles, dims=2)) + #hi = vec(maximum(profiles, dims=2)) + lo = [quantile(r, 0.25) for r in eachrow(profiles)] + hi = [quantile(r, 0.75) for r in eachrow(profiles)] + band!(ax, z, lo, hi; direction=:y, color=(:gray, 0.4)) + lines!(ax, μ, z; color=:black, linewidth=2) +end + +xlims!(ax_θ, 293, 317) +xlims!(ax_q, 0, 20.4) +xlims!(ax_u, -10, 0) +xlims!(ax_v, -10, 0) + +linkyaxes!(ax_θ, ax_q, ax_u, ax_v) +ylims!(ax_θ, 0, 4000) + +hideydecorations!(ax_q) +hideydecorations!(ax_u) +hideydecorations!(ax_v) + +save("ERA5_pressure_level_profiles.png", fig; px_per_unit=4) diff --git a/examples/ERA5_bounded_surface_data.jl b/examples/ERA5_bounded_surface_data.jl new file mode 100644 index 000000000..99aaeb8c9 --- /dev/null +++ b/examples/ERA5_bounded_surface_data.jl @@ -0,0 +1,92 @@ +# Download 3-D snapshots of the atmospheric state from ERA5, e.g., to be used with FieldTimeSeries +# +# ## Install dependencies +# +# ```julia +# using Pkg +# pkg"add NumericalEarth CDSAPI" +# ``` +# +# You also need CDS API credentials in `~/.cdsapirc`. +# See for setup instructions. +# +# See for +# more information about single-level data + +using NumericalEarth.DataWrangling: Metadata, BoundingBox, download_dataset +using NumericalEarth.DataWrangling.ERA5 +using CDSAPI +using Dates +using Oceananigans + +using Statistics: mean +using CairoMakie +#precip_cmap = :rain +precip_cmap = cgrad([:indigo, :darkblue, :blue, :deepskyblue, :cyan, + :palegreen, :green, :yellow, :orange, :red, :pink]) + + +dataset = ERA5HourlySingleLevel() + +#dates = DateTime(2004, 12, 16):Hour(1):DateTime(2005, 01, 09) # van Zanten et al 2011 study period +dates = DateTime(2004, 12, 16):Hour(1):DateTime(2004, 12, 23) # shorter time range for demo + +# Rauber et al 2007, Fig 1: precip map +bounding_box = BoundingBox(latitude=(-25, 35), longitude=(-110, 30)) + +# OPTIONAL: download all variables at once (fewer CDS API requests) +# variables = [:total_precipitation, +# :sea_surface_temperature, +# :temperature, # at 2 m +# :dewpoint_temperature, # at 2 m +# :surface_pressure] +# download_dataset(variables, dataset, dates; bounding_box) + +# ## Create time series + +precip_meta = Metadata(:total_precipitation; dataset, dates, bounding_box) +precip_series = FieldTimeSeries(precip_meta) + +# ## Plotting + +Nt = length(precip_series.times) +λ, φ, _ = nodes(precip_series[1]) + +# ERA5 total_precipitation is in metres per hour; convert to mm/day +to_mm_day = 1000 * 24 + +# ### Time-averaged precipitation map + +precip_avg = mean(interior(precip_series[n], :, :, 1) for n in 1:Nt) .* to_mm_day + +fig1 = Figure(size=(900, 400)) +ax1 = Axis(fig1[1, 1], + title = "Mean precipitation (2004-12-16 to 2005-01-08)", + xlabel = "Longitude (°)", + ylabel = "Latitude (°)") +ax1.xticks = -90:30:30 +hm = heatmap!(ax1, λ, φ, precip_avg; colormap=precip_cmap, colorrange=(0, 12)) +Colorbar(fig1[1, 2], hm, label="Precipitation (mm/day)") +save("ERA5_mean_precipitation.png", fig1) + +# ### Precipitation animation + +fig2 = Figure(size=(900, 400)) + +n = Observable(1) +precip_n = @lift interior(precip_series[$n], :, :, 1) .* to_mm_day +anim_title = @lift "Precipitation (mm/day), " * string(first(dates) + Second(round(Int, precip_series.times[$n]))) + +ax2 = Axis(fig2[1, 1], + title = anim_title, + xlabel = "Longitude (°)", + ylabel = "Latitude (°)") +ax2.xticks = -90:30:30 + +hm2 = heatmap!(ax2, λ, φ, precip_n; colormap=precip_cmap, colorrange=(0, 12)) +Colorbar(fig2[1, 2], hm2, label="Precipitation (mm/day)") + +record(fig2, "ERA5_precipitation.mp4", 1:Nt; framerate=12) do nn + n[] = nn +end + diff --git a/ext/NumericalEarthCDSAPIExt.jl b/ext/NumericalEarthCDSAPIExt.jl index e46454b15..1857a18ec 100644 --- a/ext/NumericalEarthCDSAPIExt.jl +++ b/ext/NumericalEarthCDSAPIExt.jl @@ -8,148 +8,144 @@ using Oceananigans using Oceananigans.DistributedComputations: @root using Dates -using NumericalEarth.DataWrangling.ERA5: ERA5Metadata, ERA5Metadatum, ERA5_dataset_variable_names +using NumericalEarth.DataWrangling.ERA5: ERA5Dataset, ERA5Metadata, ERA5Metadatum, + ERA5_dataset_variable_names, ERA5_netcdf_variable_names using NumericalEarth.DataWrangling.ERA5: ERA5PressureDataset, ERA5PressureMetadata, ERA5PressureMetadatum, ERA5PL_dataset_variable_names, ERA5PL_netcdf_variable_names import NumericalEarth.DataWrangling: download_dataset +##### +##### Dispatch helpers — encapsulate single-level vs pressure-level differences +##### + +_cds_product(::ERA5Dataset) = "reanalysis-era5-single-levels" +_cds_product(::ERA5PressureDataset) = "reanalysis-era5-pressure-levels" + +_cds_varnames(::ERA5Dataset) = ERA5_dataset_variable_names +_cds_varnames(::ERA5PressureDataset) = ERA5PL_dataset_variable_names + +_nc_varnames(::ERA5Dataset) = ERA5_netcdf_variable_names +_nc_varnames(::ERA5PressureDataset) = ERA5PL_netcdf_variable_names + # Coordinate / dimension variables to propagate into each split file +const ERA5_COORD_VARS = Set(["longitude", "latitude", + "time", "valid_time", + "expver", "number"]) + const ERA5PL_COORD_VARS = Set(["longitude", "latitude", "pressure_level", "level", "time", "valid_time", - "expver", # experiment version: ==1 for final data, ==5 preliminary 5-day data - "number"]) # ensemble member + "expver", "number"]) -""" - download_dataset(metadata::ERA5Metadata; kwargs...) +_coord_vars(::ERA5Dataset) = ERA5_COORD_VARS +_coord_vars(::ERA5PressureDataset) = ERA5PL_COORD_VARS -Download ERA5 data for each date in the metadata, returning paths to downloaded files. -""" -function download_dataset(metadata::ERA5Metadata; kwargs...) - paths = Array{String}(undef, length(metadata)) - for (m, metadatum) in enumerate(metadata) - paths[m] = download_dataset(metadatum; kwargs...) - end - return paths +_extra_request_keys!(request, ::ERA5Dataset) = nothing +function _extra_request_keys!(request, ds::ERA5PressureDataset) + p_hPa = [round(Int, p * 1e-2) for p in ds.pressure_levels] + request["pressure_level"] = [string(p) for p in p_hPa] end -""" - download_dataset(meta::ERA5Metadatum; skip_existing=true, kwargs...) - -Download ERA5 data for a single date/time using the CDSAPI package. - -# Keyword Arguments -- `skip_existing`: Skip download if the file already exists (default: `true`). - -# Environment Setup -Before downloading, you must: -1. Create an account at https://cds.climate.copernicus.eu/ -2. Accept the Terms of Use for the ERA5 dataset on the dataset page -3. Set up your API credentials in `~/.cdsapirc` +##### +##### Single-date download +##### -See https://cds.climate.copernicus.eu/how-to-api for details. -""" function download_dataset(meta::ERA5Metadatum; skip_existing=true) + output_path = NumericalEarth.DataWrangling.metadata_path(meta) - output_directory = meta.dir - output_filename = NumericalEarth.DataWrangling.metadata_filename(meta) - output_path = joinpath(output_directory, output_filename) - - # Skip if file already exists - if skip_existing && isfile(output_path) - return output_path - end - - # Ensure output directory exists - mkpath(output_directory) - - # Get the ERA5 variable name - variable_name = ERA5_dataset_variable_names[meta.name] + skip_existing && isfile(output_path) && return output_path + mkpath(dirname(output_path)) - # Extract date information date = meta.dates - year = string(Dates.year(date)) - month = lpad(string(Dates.month(date)), 2, '0') - day = lpad(string(Dates.day(date)), 2, '0') - hour = lpad(string(Dates.hour(date)), 2, '0') * ":00" - - # Build request parameters request = Dict( "product_type" => ["reanalysis"], - "variable" => [variable_name], - "year" => [year], - "month" => [month], - "day" => [day], - "time" => [hour], + "variable" => [_cds_varnames(meta.dataset)[meta.name]], + "year" => [string(Dates.year(date))], + "month" => [lpad(string(Dates.month(date)), 2, '0')], + "day" => [lpad(string(Dates.day(date)), 2, '0')], + "time" => [lpad(string(Dates.hour(date)), 2, '0') * ":00"], "data_format" => "netcdf", "download_format" => "unarchived", ) - # Add area constraint from bounding box + _extra_request_keys!(request, meta.dataset) area = build_era5_area(meta.bounding_box) - if !isnothing(area) - request["area"] = area - end + isnothing(area) || (request["area"] = area) - # Perform the download using CDSAPI - @root begin - CDSAPI.retrieve("reanalysis-era5-single-levels", request, output_path) - end + @root CDSAPI.retrieve(_cds_product(meta.dataset), request, output_path) return output_path end ##### -##### ERA5 pressure-level download +##### Multi-date download — batches by calendar day ##### -function download_dataset(metadata::ERA5PressureMetadata; skip_existing=true, cleanup=true) +function download_dataset(metadata::ERA5Metadata; skip_existing=true, cleanup=true) dates = metadata.dates isa AbstractVector ? metadata.dates : [metadata.dates] - download_dataset([metadata.name], metadata.dataset, dates; - bounding_box = metadata.bounding_box, - dir = metadata.dir, - skip_existing, cleanup) + grouped = Dict(d => filter(dt -> Dates.Date(dt) == d, dates) + for d in unique(Dates.Date.(dates))) + + for day in sort(collect(keys(grouped))) + _download_era5_day(metadata.name, metadata.dataset, grouped[day]; + bounding_box = metadata.bounding_box, + dir = metadata.dir, + skip_existing, cleanup) + end end -function download_dataset(meta::ERA5PressureMetadatum; skip_existing=true) - output_directory = meta.dir - output_filename = NumericalEarth.DataWrangling.metadata_filename(meta) - output_path = joinpath(output_directory, output_filename) +function _download_era5_day(name, dataset, day_dates; + bounding_box, dir, skip_existing, cleanup) - skip_existing && isfile(output_path) && return output_path - mkpath(output_directory) + MDatum = NumericalEarth.DataWrangling.Metadatum + meta_path = NumericalEarth.DataWrangling.metadata_path - variable_name = ERA5PL_dataset_variable_names[meta.name] - date = meta.dates - year = string(Dates.year(date)) - month = lpad(string(Dates.month(date)), 2, '0') - day = lpad(string(Dates.day(date)), 2, '0') - hour = lpad(string(Dates.hour(date)), 2, '0') * ":00" + all_pairs = [(dt, meta_path(MDatum(name; dataset, date=dt, bounding_box, dir))) + for dt in day_dates] + + pending = skip_existing ? filter(((_, p),) -> !isfile(p), all_pairs) : all_pairs + isempty(pending) && return nothing - p_hPa = [round(Int, p * 1e-2) for p in meta.dataset.pressure_levels] + sorted_dts = sort(unique([dt for (dt, _) in pending])) + hours_str = [lpad(string(Dates.hour(dt)), 2, '0') * ":00" for dt in sorted_dts] + dt_to_tidx = Dict(dt => i for (i, dt) in enumerate(sorted_dts)) + + dt = first(sorted_dts) + year = string(Dates.year(dt)) + month = lpad(string(Dates.month(dt)), 2, '0') + day = lpad(string(Dates.day(dt)), 2, '0') request = Dict( "product_type" => ["reanalysis"], - "variable" => [variable_name], - "pressure_level" => [string(p) for p in p_hPa], + "variable" => [_cds_varnames(dataset)[name]], "year" => [year], "month" => [month], "day" => [day], - "time" => [hour], + "time" => hours_str, "data_format" => "netcdf", "download_format" => "unarchived", ) - area = build_era5_area(meta.bounding_box) + _extra_request_keys!(request, dataset) + area = build_era5_area(bounding_box) isnothing(area) || (request["area"] = area) + mkpath(dir) + tmp_path = joinpath(dir, "_tmp_$(year)$(month)$(day).nc") + nc_varname = _nc_varnames(dataset)[name] + nc_triples = [(nc_varname, dt_to_tidx[dt], path) for (dt, path) in pending] + + time_dimnames = Set(["time", "valid_time"]) + @root begin - CDSAPI.retrieve("reanalysis-era5-pressure-levels", request, output_path) + CDSAPI.retrieve(_cds_product(dataset), request, tmp_path) + _split_era5_nc_multistep(tmp_path, nc_triples, _coord_vars(dataset), time_dimnames) + cleanup && rm(tmp_path; force=true) end - return output_path + return nothing end ##### @@ -172,13 +168,9 @@ end download_dataset(names::Vector{Symbol}, meta::ERA5PressureMetadatum; skip_existing=true) Download multiple ERA5 pressure-level variables for a single date in one CDS API request. - -The multi-variable NetCDF returned by CDS is split into individual per-variable files, each -stored at the path returned by `metadata_path(Metadatum(name; dataset=meta.dataset, ...))`. -This keeps subsequent calls to `Field` and `retrieve_data` unchanged. +The multi-variable NetCDF is split into individual per-variable files. """ function download_dataset(names::Vector{Symbol}, meta::ERA5PressureMetadatum; skip_existing=true) - # Build (symbol → output path) pairs using the standard single-variable filename scheme name_path_pairs = [(name, NumericalEarth.DataWrangling.metadata_path( NumericalEarth.DataWrangling.Metadatum(name; dataset = meta.dataset, @@ -187,7 +179,6 @@ function download_dataset(names::Vector{Symbol}, meta::ERA5PressureMetadatum; sk dir = meta.dir))) for name in names] - # Determine which output files are missing pending = if skip_existing [(n, p) for (n, p) in name_path_pairs if !isfile(p)] else @@ -196,8 +187,7 @@ function download_dataset(names::Vector{Symbol}, meta::ERA5PressureMetadatum; sk isempty(pending) && return [path for (_, path) in name_path_pairs] - # Unique CDS variable names (:geopotential and :geopotential_height share one CDS field) - cds_vars = unique([ERA5PL_dataset_variable_names[name] for (name, _) in pending]) + cds_vars = unique([_cds_varnames(meta.dataset)[name] for (name, _) in pending]) date = meta.dates year = string(Dates.year(date)) @@ -205,12 +195,9 @@ function download_dataset(names::Vector{Symbol}, meta::ERA5PressureMetadatum; sk day = lpad(string(Dates.day(date)), 2, '0') hour = lpad(string(Dates.hour(date)), 2, '0') * ":00" - p_hPa = [round(Int, p * 1e-2) for p in meta.dataset.pressure_levels] - request = Dict( "product_type" => ["reanalysis"], "variable" => cds_vars, - "pressure_level" => [string(p) for p in p_hPa], "year" => [year], "month" => [month], "day" => [day], @@ -219,17 +206,18 @@ function download_dataset(names::Vector{Symbol}, meta::ERA5PressureMetadatum; sk "download_format" => "unarchived", ) + _extra_request_keys!(request, meta.dataset) area = build_era5_area(meta.bounding_box) isnothing(area) || (request["area"] = area) mkpath(meta.dir) tmp_path = joinpath(meta.dir, "_tmp_multi_$(year)$(month)$(day)T$(hour[1:2]).nc") - nc_name_path_pairs = [(ERA5PL_netcdf_variable_names[name], path) for (name, path) in pending] + nc_name_path_pairs = [(_nc_varnames(meta.dataset)[name], path) for (name, path) in pending] @root begin - CDSAPI.retrieve("reanalysis-era5-pressure-levels", request, tmp_path) - _split_era5pl_nc(tmp_path, nc_name_path_pairs) + CDSAPI.retrieve(_cds_product(meta.dataset), request, tmp_path) + _split_era5_nc(tmp_path, nc_name_path_pairs, _coord_vars(meta.dataset)) rm(tmp_path; force=true) end @@ -237,114 +225,30 @@ function download_dataset(names::Vector{Symbol}, meta::ERA5PressureMetadatum; sk end """ - download_dataset(names::Union{Symbol, Vector{Symbol}}, - dataset::ERA5PressureDataset, - datetime; - bounding_box=nothing, - dir=default_download_directory(dataset)) - -Download one or more ERA5 pressure-level variables at a single datetime, without requiring a dummy -`Metadatum`. - -# Example -```julia -ds = ERA5HourlyPressureLevels(pressure_levels=[850, 500] .* hPa) -download_dataset([:temperature, :geopotential_height, :eastward_velocity], ds; - date=DateTime(2020, 6, 15, 0), bounding_box=bbox) -``` + download_dataset(names, dataset::ERA5Dataset, datetime; ...) + +Download one or more ERA5 variables at a single datetime. """ -function download_dataset(names::Vector{Symbol}, dataset::ERA5PressureDataset, datetime; +function download_dataset(names::Vector{Symbol}, dataset::ERA5Dataset, datetime; bounding_box = nothing, dir = NumericalEarth.DataWrangling.default_download_directory(dataset)) - meta = NumericalEarth.DataWrangling.Metadatum(first(names); dataset, datetime, bounding_box, dir) + meta = NumericalEarth.DataWrangling.Metadatum(first(names); dataset, date=datetime, bounding_box, dir) return download_dataset(names, meta) end -function download_dataset(name::Symbol, dataset::ERA5PressureDataset, datetime; +function download_dataset(name::Symbol, dataset::ERA5Dataset, datetime; bounding_box = nothing, dir = NumericalEarth.DataWrangling.default_download_directory(dataset)) return download_dataset([name], dataset, datetime; bounding_box, dir) end """ - _split_era5pl_nc(src_path, nc_name_path_pairs) - -Split a multi-variable ERA5 pressure-level NetCDF at `src_path` into individual -per-variable files. `nc_name_path_pairs` is a vector of `(nc_varname, dst_path)` tuples. + download_dataset(names, dataset::ERA5Dataset, datetimes::AbstractVector; ...) -Each output file contains all coordinate/dimension variables plus the one data variable. -Global attributes and dimension definitions are preserved. Raw data (including scale/offset -encoding) is copied verbatim so that downstream readers decode values identically. -""" -function _split_era5pl_nc(src_path, nc_name_path_pairs) - NCDatasets.Dataset(src_path, "r") do src - for (nc_varname, dst_path) in nc_name_path_pairs - NCDatasets.Dataset(dst_path, "c") do dst - # Define dimensions - unlimited = NCDatasets.unlimited(src) - for (dname, dlen) in src.dim - NCDatasets.defDim(dst, dname, dname in unlimited ? Inf : dlen) - end - - # Copy global attributes - for (k, v) in src.attrib - dst.attrib[k] = v - end - - # Copy coordinate variables and the one requested data variable - for (vname, var) in src - (vname in ERA5PL_COORD_VARS || vname == nc_varname) || continue - _ncvar_copy!(dst, var, vname) - end - end - end - end -end - -function _ncvar_copy!(dst, src_var, vname) - dims = NCDatasets.dimnames(src_var) - T = eltype(src_var.var) # raw storage type; eltype(src_var) gives CF-decoded DateTime - attribs = src_var.attrib - fill_val = haskey(attribs, "_FillValue") ? attribs["_FillValue"] : nothing - - dst_var = if isnothing(fill_val) - NCDatasets.defVar(dst, vname, T, dims) - else - NCDatasets.defVar(dst, vname, T, dims; fillvalue=fill_val) - end - - # Copy variable attributes (skip _FillValue — handled by defVar above) - for (k, v) in attribs - k == "_FillValue" && continue - dst_var.attrib[k] = v - end - - # Copy raw encoded data, bypassing scale/offset transforms - dst_var.var[:] = src_var.var[:] - - return nothing -end - -""" - download_dataset(names::Union{Symbol, Vector{Symbol}}, - dataset::ERA5PressureDataset, - datetimes::AbstractVector; - bounding_box=nothing, - dir=default_download_directory(dataset), - skip_existing=true) - -Download one or more ERA5 pressure-level variables for a vector of `datetimes`, issuing one CDS API -request per unique calendar day (which may contain multiple hours). - -# Example -```julia -ds = ERA5HourlyPressureLevels(pressure_levels=[850, 500] .* hPa) -dates = DateTime(2020, 6, 15, 0) : Hour(6) : DateTime(2020, 6, 16, 18) -download_dataset([:temperature, :geopotential_height], ds, collect(dates); bounding_box=bbox) -``` +Download one or more ERA5 variables for multiple datetimes, batching by calendar day. """ function download_dataset(names::Vector{Symbol}, - dataset::ERA5PressureDataset, + dataset::ERA5Dataset, datetimes::AbstractVector; bounding_box = nothing, dir = NumericalEarth.DataWrangling.default_download_directory(dataset), @@ -355,14 +259,14 @@ function download_dataset(names::Vector{Symbol}, for d in unique(Dates.Date.(datetimes))) for day in sort(collect(keys(grouped))) - _download_era5pl_day(names, dataset, grouped[day]; bounding_box, dir, skip_existing, cleanup) + _download_era5_multivar_day(names, dataset, grouped[day]; bounding_box, dir, skip_existing, cleanup) end return nothing end function download_dataset(name::Symbol, - dataset::ERA5PressureDataset, + dataset::ERA5Dataset, datetimes::AbstractVector; bounding_box = nothing, dir = NumericalEarth.DataWrangling.default_download_directory(dataset), @@ -371,21 +275,19 @@ function download_dataset(name::Symbol, return download_dataset([name], dataset, datetimes; bounding_box, dir, skip_existing, cleanup) end -function _download_era5pl_day(names, dataset, day_dates; - bounding_box, dir, skip_existing, cleanup) +function _download_era5_multivar_day(names, dataset, day_dates; + bounding_box, dir, skip_existing, cleanup) MDatum = NumericalEarth.DataWrangling.Metadatum meta_path = NumericalEarth.DataWrangling.metadata_path - # All (name, dt, output path) triples for this day all_triples = [(name, dt, meta_path(MDatum(name; dataset, date=dt, bounding_box, dir))) for name in names for dt in day_dates] pending = skip_existing ? filter(((_, _, p),) -> !isfile(p), all_triples) : all_triples isempty(pending) && return nothing - # Unique CDS variable names and hours needed - cds_vars = unique([ERA5PL_dataset_variable_names[name] for (name, _, _) in pending]) + cds_vars = unique([_cds_varnames(dataset)[name] for (name, _, _) in pending]) sorted_dts = sort(unique([dt for (_, dt, _) in pending])) hours_str = [lpad(string(Dates.hour(dt)), 2, '0') * ":00" for dt in sorted_dts] dt_to_tidx = Dict(dt => i for (i, dt) in enumerate(sorted_dts)) @@ -395,12 +297,9 @@ function _download_era5pl_day(names, dataset, day_dates; month = lpad(string(Dates.month(dt)), 2, '0') day = lpad(string(Dates.day(dt)), 2, '0') - p_hPa = [round(Int, p * 1e-2) for p in dataset.pressure_levels] - request = Dict( "product_type" => ["reanalysis"], "variable" => cds_vars, - "pressure_level" => [string(p) for p in p_hPa], "year" => [year], "month" => [month], "day" => [day], @@ -409,32 +308,69 @@ function _download_era5pl_day(names, dataset, day_dates; "download_format" => "unarchived", ) + _extra_request_keys!(request, dataset) area = build_era5_area(bounding_box) isnothing(area) || (request["area"] = area) mkpath(dir) tmp_path = joinpath(dir, "_tmp_multi_$(year)$(month)$(day).nc") - nc_triples = [(ERA5PL_netcdf_variable_names[name], dt_to_tidx[dt], path) + nc_triples = [(_nc_varnames(dataset)[name], dt_to_tidx[dt], path) for (name, dt, path) in pending] + time_dimnames = Set(["time", "valid_time"]) + @root begin - CDSAPI.retrieve("reanalysis-era5-pressure-levels", request, tmp_path) - _split_era5pl_nc_multistep(tmp_path, nc_triples) + CDSAPI.retrieve(_cds_product(dataset), request, tmp_path) + _split_era5_nc_multistep(tmp_path, nc_triples, _coord_vars(dataset), time_dimnames) cleanup && rm(tmp_path; force=true) end return nothing end -function _split_era5pl_nc_multistep(src_path, nc_varname_tidx_path_triples) - time_dimnames = Set(["time", "valid_time"]) +##### +##### NetCDF splitting utilities +##### + +""" + _split_era5_nc(src_path, nc_name_path_pairs, coord_vars) + +Split a multi-variable NetCDF into individual per-variable files (single time step). +""" +function _split_era5_nc(src_path, nc_name_path_pairs, coord_vars) + NCDatasets.Dataset(src_path, "r") do src + for (nc_varname, dst_path) in nc_name_path_pairs + NCDatasets.Dataset(dst_path, "c") do dst + unlimited = NCDatasets.unlimited(src) + for (dname, dlen) in src.dim + NCDatasets.defDim(dst, dname, dname in unlimited ? Inf : dlen) + end + + for (k, v) in src.attrib + dst.attrib[k] = v + end + + for (vname, var) in src + (vname in coord_vars || vname == nc_varname) || continue + _ncvar_copy!(dst, var, vname) + end + end + end + end +end +""" + _split_era5_nc_multistep(src_path, triples, coord_vars, time_dimnames) + +Split a multi-timestep NetCDF into individual per-variable, per-timestep files. +`triples` is a vector of `(nc_varname, time_index, dst_path)`. +""" +function _split_era5_nc_multistep(src_path, nc_varname_tidx_path_triples, coord_vars, time_dimnames) NCDatasets.Dataset(src_path, "r") do src unlimited = NCDatasets.unlimited(src) for (nc_varname, tidx, dst_path) in nc_varname_tidx_path_triples NCDatasets.Dataset(dst_path, "c") do dst - # Collapse the time dimension to 1; keep all others for (dname, dlen) in src.dim out_len = dname in time_dimnames ? 1 : dname in unlimited ? Inf : dlen @@ -446,7 +382,7 @@ function _split_era5pl_nc_multistep(src_path, nc_varname_tidx_path_triples) end for (vname, var) in src - (vname in ERA5PL_COORD_VARS || vname == nc_varname) || continue + (vname in coord_vars || vname == nc_varname) || continue _ncvar_copy_tslice!(dst, var, vname, tidx, time_dimnames) end end @@ -454,6 +390,25 @@ function _split_era5pl_nc_multistep(src_path, nc_varname_tidx_path_triples) end end +function _ncvar_copy!(dst, src_var, vname) + dims = NCDatasets.dimnames(src_var) + T = eltype(src_var.var) + attribs = src_var.attrib + fill_val = haskey(attribs, "_FillValue") ? attribs["_FillValue"] : nothing + + dst_var = isnothing(fill_val) ? + NCDatasets.defVar(dst, vname, T, dims) : + NCDatasets.defVar(dst, vname, T, dims; fillvalue=fill_val) + + for (k, v) in attribs + k == "_FillValue" && continue + dst_var.attrib[k] = v + end + + dst_var.var[:] = src_var.var[:] + return nothing +end + function _ncvar_copy_tslice!(dst, src_var, vname, tidx, time_dimnames) dims = NCDatasets.dimnames(src_var) T = eltype(src_var.var) @@ -469,7 +424,6 @@ function _ncvar_copy_tslice!(dst, src_var, vname, tidx, time_dimnames) dst_var.attrib[k] = v end - # Slice time dimension; keep all others intact has_time = any(d -> d in time_dimnames, dims) if has_time idx = ntuple(ndims(src_var.var)) do i @@ -492,9 +446,6 @@ build_era5_area(::Nothing) = nothing const BBOX = NumericalEarth.DataWrangling.BoundingBox function build_era5_area(bbox::BBOX) - # CDS API uses [north, west, south, east] ordering - # BoundingBox has longitude = (west, east), latitude = (south, north) - lon = bbox.longitude lat = bbox.latitude diff --git a/src/DataWrangling/ERA5/ERA5_pressure_levels.jl b/src/DataWrangling/ERA5/ERA5_pressure_levels.jl index c5d360761..9e0a43096 100644 --- a/src/DataWrangling/ERA5/ERA5_pressure_levels.jl +++ b/src/DataWrangling/ERA5/ERA5_pressure_levels.jl @@ -198,9 +198,7 @@ function z_interfaces(metadata::ERA5PressureMetadata) dir=metadata.dir) try download_dataset(ϕ_metadata) - zf = mean_geopotential_z_interfaces(metadata) - metadata.dataset.z = zf - return zf + return mean_geopotential_z_interfaces(metadata) catch e @warn "Failed to download geopotential data; falling back to standard atmosphere" exception=e end @@ -262,7 +260,7 @@ end pressure_field(metadata::ERA5PressureMetadatum, arch=CPU(); halo=(3,3,3)) Return a `CenterField` on the native grid of `metadata` filled with the pressure -value (hPa) at each vertical level. Levels are ordered bottom-to-top (k=1 is the +value (Pa) at each vertical level. Levels are ordered bottom-to-top (k=1 is the highest pressure level). """ function pressure_field(metadata::ERA5PressureMetadatum, arch=CPU(); halo=(3,3,3)) diff --git a/src/DataWrangling/ERA5/ERA5_single_levels.jl b/src/DataWrangling/ERA5/ERA5_single_levels.jl index 67f9f785b..8da81deae 100644 --- a/src/DataWrangling/ERA5/ERA5_single_levels.jl +++ b/src/DataWrangling/ERA5/ERA5_single_levels.jl @@ -23,9 +23,9 @@ is_three_dimensional(::ERA5Metadata) = false function Base.size(::ERA5Dataset, variable) if variable in ERA5_wave_variables - return (720, 361, 1) + return (720, 360, 1) else - return (1440, 721, 1) + return (1440, 720, 1) end end diff --git a/test/test_cds_downloading.jl b/test/test_cds_downloading.jl index 5ffad9671..a13496f12 100644 --- a/test/test_cds_downloading.jl +++ b/test/test_cds_downloading.jl @@ -95,7 +95,7 @@ start_date = DateTime(2005, 2, 16, 12) # Test size (should be global ERA5 size with 1 time step) Nx, Ny, Nz, Nt = size(metadatum) @test Nx == 1440 # ERA5 longitude points - @test Ny == 721 # ERA5 latitude points + @test Ny == 720 # ERA5 latitude points (poles averaged into adjacent cells) @test Nz == 1 # 2D surface data @test Nt == 1 # Single time step @@ -104,23 +104,23 @@ start_date = DateTime(2005, 2, 16, 12) end @testset "ERA5 wave variable metadata sizes" begin - # Wave variables should be on the 0.5° grid (720×361) + # Wave variables should be on the 0.5° grid (720×360) for wave_var in (:eastward_stokes_drift, :northward_stokes_drift, :significant_wave_height, :mean_wave_period, :mean_wave_direction) metadatum = Metadatum(wave_var; dataset, date=start_date) Nx, Ny, Nz, Nt = size(metadatum) @test Nx == 720 - @test Ny == 361 + @test Ny == 360 @test Nz == 1 @test Nt == 1 end - # Atmospheric variables should remain on the 0.25° grid (1440×721) + # Atmospheric variables should remain on the 0.25° grid (1440×720) for atmos_var in (:temperature, :eastward_velocity, :surface_pressure) metadatum = Metadatum(atmos_var; dataset, date=start_date) Nx, Ny, Nz, Nt = size(metadatum) @test Nx == 1440 - @test Ny == 721 + @test Ny == 720 @test Nz == 1 @test Nt == 1 end @@ -141,11 +141,11 @@ start_date = DateTime(2005, 2, 16, 12) ds_full = ERA5HourlyPressureLevels() @test ds_full isa ERA5HourlyPressureLevels @test length(ds_full.pressure_levels) == 37 - @test Base.size(ds_full, :temperature) == (1440, 721, 37) + @test Base.size(ds_full, :temperature) == (1440, 720, 37) # Subset constructor ds_sub = ERA5HourlyPressureLevels(pressure_levels=[850, 500] .* hPa) - @test Base.size(ds_sub, :temperature) == (1440, 721, 2) + @test Base.size(ds_sub, :temperature) == (1440, 720, 2) # Monthly variant ds_monthly = ERA5MonthlyPressureLevels() @@ -163,15 +163,15 @@ start_date = DateTime(2005, 2, 16, 12) end @testset "ERA5 pressure-level z_interfaces (standard atmosphere)" begin - levels_2 = [850, 500] - z = NumericalEarth.DataWrangling.ERA5._std_atm_z_interfaces(levels_2) + levels_2 = [850, 500] .* hPa + z = standard_atmosphere_z_interfaces(levels_2) @test length(z) == 3 # Nz+1 interfaces @test issorted(z) # monotonically increasing with altitude # 850 hPa ≈ 1457 m, 500 hPa ≈ 5575 m @test z[1] < 1457.0 < z[2] < 5575.0 < z[3] # Single level - z1 = NumericalEarth.DataWrangling.ERA5._std_atm_z_interfaces([500]) + z1 = standard_atmosphere_z_interfaces([500] .* hPa) @test length(z1) == 2 @test z1[1] < z1[2] end @@ -299,10 +299,10 @@ start_date = DateTime(2005, 2, 16, 12) @test Nz == 2 @allowscalar begin - # k=1 should be 850 hPa (highest pressure, lowest altitude) - @test interior(pf)[1, 1, 1] ≈ 850.0f0 - # k=2 should be 500 hPa - @test interior(pf)[1, 1, 2] ≈ 500.0f0 + # k=1 should be 850 hPa = 85000 Pa (highest pressure, lowest altitude) + @test interior(pf)[1, 1, 1] ≈ Float32(850 * hPa) + # k=2 should be 500 hPa = 50000 Pa + @test interior(pf)[1, 1, 2] ≈ Float32(500 * hPa) end end end From 577b3c4e6d5dc2ee3a2578cf3a62137327f63983 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Mon, 16 Mar 2026 22:52:45 -0600 Subject: [PATCH 28/53] Address a very very very minor point --- ext/NumericalEarthCDSAPIExt.jl | 20 +++++++++---------- .../ERA5/ERA5_pressure_levels.jl | 18 ++++++++--------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/ext/NumericalEarthCDSAPIExt.jl b/ext/NumericalEarthCDSAPIExt.jl index bee3ed9eb..629e7e31a 100644 --- a/ext/NumericalEarthCDSAPIExt.jl +++ b/ext/NumericalEarthCDSAPIExt.jl @@ -10,7 +10,7 @@ using Oceananigans.DistributedComputations: @root using Dates using NumericalEarth.DataWrangling.ERA5: ERA5Dataset, ERA5Metadata, ERA5Metadatum, ERA5_dataset_variable_names, ERA5_netcdf_variable_names -using NumericalEarth.DataWrangling.ERA5: ERA5PressureDataset, +using NumericalEarth.DataWrangling.ERA5: ERA5PressureLevelsDataset, ERA5PressureMetadata, ERA5PressureMetadatum, ERA5PL_dataset_variable_names, ERA5PL_netcdf_variable_names @@ -20,14 +20,14 @@ import NumericalEarth.DataWrangling: download_dataset ##### Dispatch helpers — encapsulate single-level vs pressure-level differences ##### -_cds_product(::ERA5Dataset) = "reanalysis-era5-single-levels" -_cds_product(::ERA5PressureDataset) = "reanalysis-era5-pressure-levels" +_cds_product(::ERA5Dataset) = "reanalysis-era5-single-levels" +_cds_product(::ERA5PressureLevelsDataset) = "reanalysis-era5-pressure-levels" -_cds_varnames(::ERA5Dataset) = ERA5_dataset_variable_names -_cds_varnames(::ERA5PressureDataset) = ERA5PL_dataset_variable_names +_cds_varnames(::ERA5Dataset) = ERA5_dataset_variable_names +_cds_varnames(::ERA5PressureLevelsDataset) = ERA5PL_dataset_variable_names -_nc_varnames(::ERA5Dataset) = ERA5_netcdf_variable_names -_nc_varnames(::ERA5PressureDataset) = ERA5PL_netcdf_variable_names +_nc_varnames(::ERA5Dataset) = ERA5_netcdf_variable_names +_nc_varnames(::ERA5PressureLevelsDataset) = ERA5PL_netcdf_variable_names # Coordinate / dimension variables to propagate into each split file const ERA5_COORD_VARS = Set(["longitude", "latitude", @@ -39,11 +39,11 @@ const ERA5PL_COORD_VARS = Set(["longitude", "latitude", "time", "valid_time", "expver", "number"]) -_coord_vars(::ERA5Dataset) = ERA5_COORD_VARS -_coord_vars(::ERA5PressureDataset) = ERA5PL_COORD_VARS +_coord_vars(::ERA5Dataset) = ERA5_COORD_VARS +_coord_vars(::ERA5PressureLevelsDataset) = ERA5PL_COORD_VARS _extra_request_keys!(request, ::ERA5Dataset) = nothing -function _extra_request_keys!(request, ds::ERA5PressureDataset) +function _extra_request_keys!(request, ds::ERA5PressureLevelsDataset) p_hPa = [round(Int, p * 1e-2) for p in ds.pressure_levels] request["pressure_level"] = [string(p) for p in p_hPa] end diff --git a/src/DataWrangling/ERA5/ERA5_pressure_levels.jl b/src/DataWrangling/ERA5/ERA5_pressure_levels.jl index 9e0a43096..ffb3b4155 100644 --- a/src/DataWrangling/ERA5/ERA5_pressure_levels.jl +++ b/src/DataWrangling/ERA5/ERA5_pressure_levels.jl @@ -1,10 +1,10 @@ -abstract type ERA5PressureDataset <: ERA5Dataset end +abstract type ERA5PressureLevelsDataset <: ERA5Dataset end # ERA5PressureMetadata is a subtype of ERA5Metadata -const ERA5PressureMetadata{D} = Metadata{<:ERA5PressureDataset, D} -const ERA5PressureMetadatum = Metadatum{<:ERA5PressureDataset} +const ERA5PressureMetadata{D} = Metadata{<:ERA5PressureLevelsDataset, D} +const ERA5PressureMetadatum = Metadatum{<:ERA5PressureLevelsDataset} -struct ERA5HourlyPressureLevels <: ERA5PressureDataset +struct ERA5HourlyPressureLevels <: ERA5PressureLevelsDataset pressure_levels :: Vector{Float64} z :: Union{Nothing, Vector{Float64}} mean_geopotential_height :: Bool @@ -14,7 +14,7 @@ end ERA5HourlyPressureLevels(; pressure_levels=ERA5_all_pressure_levels, z=nothing, mean_geopotential_height=true) = ERA5HourlyPressureLevels(pressure_levels, z; mean_geopotential_height) -struct ERA5MonthlyPressureLevels <: ERA5PressureDataset +struct ERA5MonthlyPressureLevels <: ERA5PressureLevelsDataset pressure_levels :: Vector{Float64} z :: Union{Nothing, Vector{Float64}} mean_geopotential_height :: Bool @@ -44,9 +44,9 @@ const ERA5_all_pressure_levels = [1, 2, 3, 5, 7, 10, 20, 30, 50, 70, 100, 125, 1 825, 850, 875, 900, 925, 950, 975, 1000]*hPa # ERA5 stores pressure levels bottom-to-top -reversed_vertical_axis(::ERA5PressureDataset) = false +reversed_vertical_axis(::ERA5PressureLevelsDataset) = false -Base.size(ds::ERA5PressureDataset, variable) = (1440, 720, length(ds.pressure_levels)) +Base.size(ds::ERA5PressureLevelsDataset, variable) = (1440, 720, length(ds.pressure_levels)) ##### ##### ERA5 pressure-level variable name mappings @@ -91,7 +91,7 @@ ERA5PL_netcdf_variable_names = Dict( ) # Variables available for download -available_variables(::ERA5PressureDataset) = ERA5PL_dataset_variable_names +available_variables(::ERA5PressureLevelsDataset) = ERA5PL_dataset_variable_names dataset_variable_name(md::ERA5PressureMetadata) = ERA5PL_dataset_variable_names[md.name] netcdf_variable_name(md::ERA5PressureMetadata) = ERA5PL_netcdf_variable_names[md.name] @@ -165,7 +165,7 @@ end # Levels may be in any order; output is sorted so k=1 is highest pressure (lowest altitude). function standard_atmosphere_z_interfaces(levels) @info "Calculating z-interfaces based on the International Standard Atmosphere... \ - For greater accuracy, set ERA5PressureDataset(; mean_geopotential_height=true)" + For greater accuracy, set ERA5PressureLevelsDataset(; mean_geopotential_height=true)" sorted_levels = sort(levels, rev=true) # highest pressure first → k=1 is bottom heights = standard_atmosphere_geopotential_height.(Float64.(sorted_levels)) Nz = length(heights) From f4afe090e6e5162eec2270ccefdafeace044d99d Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Mon, 16 Mar 2026 23:44:54 -0600 Subject: [PATCH 29/53] Retrieve var name from correct dict --- src/DataWrangling/ERA5/ERA5.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/DataWrangling/ERA5/ERA5.jl b/src/DataWrangling/ERA5/ERA5.jl index 9f3e76a72..57b31c2c3 100644 --- a/src/DataWrangling/ERA5/ERA5.jl +++ b/src/DataWrangling/ERA5/ERA5.jl @@ -103,7 +103,7 @@ function bbox_strs(c) end function metadata_prefix(dataset::ERA5Dataset, name, date, bounding_box) - var = ERA5_dataset_variable_names[name] + var = available_variables(dataset)[name] ds = dataset_name(dataset) start_date = start_date_str(date) end_date = end_date_str(date) From c28db4f8aa94658aad0e5cb66ea5b5a61772c10b Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Mon, 16 Mar 2026 23:50:24 -0600 Subject: [PATCH 30/53] Add additional surface variables --- src/DataWrangling/ERA5/ERA5_single_levels.jl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/DataWrangling/ERA5/ERA5_single_levels.jl b/src/DataWrangling/ERA5/ERA5_single_levels.jl index 8da81deae..6a7f8c49c 100644 --- a/src/DataWrangling/ERA5/ERA5_single_levels.jl +++ b/src/DataWrangling/ERA5/ERA5_single_levels.jl @@ -43,6 +43,8 @@ ERA5_dataset_variable_names = Dict( :mean_sea_level_pressure => "mean_sea_level_pressure", :total_precipitation => "total_precipitation", :sea_surface_temperature => "sea_surface_temperature", + :surface_sensible_heat_flux => "surface_sensible_heat_flux", + :surface_latent_heat_flux => "surface_latent_heat_flux", :downwelling_shortwave_radiation => "surface_solar_radiation_downwards", :downwelling_longwave_radiation => "surface_thermal_radiation_downwards", :total_cloud_cover => "total_cloud_cover", @@ -66,6 +68,8 @@ ERA5_netcdf_variable_names = Dict( :mean_sea_level_pressure => "msl", :total_precipitation => "tp", :sea_surface_temperature => "sst", + :surface_sensible_heat_flux => "sshf", + :surface_latent_heat_flux => "slhf", :downwelling_shortwave_radiation => "ssrd", :downwelling_longwave_radiation => "strd", :total_cloud_cover => "tcc", From 74cb622ea9f52bc5be78ead37712271e3357c8eb Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Tue, 17 Mar 2026 00:00:56 -0600 Subject: [PATCH 31/53] Improved readability --- ext/NumericalEarthCDSAPIExt.jl | 17 ++++++++++------- src/DataWrangling/ERA5/ERA5_pressure_levels.jl | 3 ++- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/ext/NumericalEarthCDSAPIExt.jl b/ext/NumericalEarthCDSAPIExt.jl index 629e7e31a..feae84725 100644 --- a/ext/NumericalEarthCDSAPIExt.jl +++ b/ext/NumericalEarthCDSAPIExt.jl @@ -173,13 +173,16 @@ Download multiple ERA5 pressure-level variables for a single date in one CDS API The multi-variable NetCDF is split into individual per-variable files. """ function download_dataset(names::Vector{Symbol}, meta::ERA5PressureMetadatum; skip_existing=true) - name_path_pairs = [(name, NumericalEarth.DataWrangling.metadata_path( - NumericalEarth.DataWrangling.Metadatum(name; - dataset = meta.dataset, - bounding_box = meta.bounding_box, - date = meta.dates, - dir = meta.dir))) - for name in names] + name_path_pairs = [] + for name in names + metadatum = NumericalEarth.DataWrangling.Metadatum(name; + dataset = meta.dataset, + bounding_box = meta.bounding_box, + date = meta.dates, + dir = meta.dir) + path = NumericalEarth.DataWrangling.metadata_path(metadatum) + push!(name_path_pairs, (name, path)) + end pending = if skip_existing [(n, p) for (n, p) in name_path_pairs if !isfile(p)] diff --git a/src/DataWrangling/ERA5/ERA5_pressure_levels.jl b/src/DataWrangling/ERA5/ERA5_pressure_levels.jl index ffb3b4155..4acec6639 100644 --- a/src/DataWrangling/ERA5/ERA5_pressure_levels.jl +++ b/src/DataWrangling/ERA5/ERA5_pressure_levels.jl @@ -231,7 +231,8 @@ function mean_geopotential_heights(metadata::ERA5PressureMetadata) for ϕ_datum in ϕ_metadata data = retrieve_data(ϕ_datum) ./ Float32(ERA5_gravitational_acceleration) # Φ → Z (m) # average over horizontal dims - heights .+= dropdims(mean(data; dims=(1, 2)); dims=(1, 2)) + data_mean = mean(data; dims=(1, 2)) + heights .+= dropdims(data_mean; dims=(1, 2)) end heights ./= length(ϕ_metadata) From ad135e31d87dc8679422b027f8f150b7c03e866d Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Tue, 17 Mar 2026 00:26:15 -0600 Subject: [PATCH 32/53] Follow Oceananigans convention --- ext/NumericalEarthCDSAPIExt.jl | 80 +++++++++++++++++----------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/ext/NumericalEarthCDSAPIExt.jl b/ext/NumericalEarthCDSAPIExt.jl index feae84725..7816be62a 100644 --- a/ext/NumericalEarthCDSAPIExt.jl +++ b/ext/NumericalEarthCDSAPIExt.jl @@ -20,14 +20,14 @@ import NumericalEarth.DataWrangling: download_dataset ##### Dispatch helpers — encapsulate single-level vs pressure-level differences ##### -_cds_product(::ERA5Dataset) = "reanalysis-era5-single-levels" -_cds_product(::ERA5PressureLevelsDataset) = "reanalysis-era5-pressure-levels" +cds_product(::ERA5Dataset) = "reanalysis-era5-single-levels" +cds_product(::ERA5PressureLevelsDataset) = "reanalysis-era5-pressure-levels" -_cds_varnames(::ERA5Dataset) = ERA5_dataset_variable_names -_cds_varnames(::ERA5PressureLevelsDataset) = ERA5PL_dataset_variable_names +cds_varnames(::ERA5Dataset) = ERA5_dataset_variable_names +cds_varnames(::ERA5PressureLevelsDataset) = ERA5PL_dataset_variable_names -_nc_varnames(::ERA5Dataset) = ERA5_netcdf_variable_names -_nc_varnames(::ERA5PressureLevelsDataset) = ERA5PL_netcdf_variable_names +nc_varnames(::ERA5Dataset) = ERA5_netcdf_variable_names +nc_varnames(::ERA5PressureLevelsDataset) = ERA5PL_netcdf_variable_names # Coordinate / dimension variables to propagate into each split file const ERA5_COORD_VARS = Set(["longitude", "latitude", @@ -39,11 +39,11 @@ const ERA5PL_COORD_VARS = Set(["longitude", "latitude", "time", "valid_time", "expver", "number"]) -_coord_vars(::ERA5Dataset) = ERA5_COORD_VARS -_coord_vars(::ERA5PressureLevelsDataset) = ERA5PL_COORD_VARS +coord_vars(::ERA5Dataset) = ERA5_COORD_VARS +coord_vars(::ERA5PressureLevelsDataset) = ERA5PL_COORD_VARS -_extra_request_keys!(request, ::ERA5Dataset) = nothing -function _extra_request_keys!(request, ds::ERA5PressureLevelsDataset) +extra_request_keys!(request, ::ERA5Dataset) = nothing +function extra_request_keys!(request, ds::ERA5PressureLevelsDataset) p_hPa = [round(Int, p * 1e-2) for p in ds.pressure_levels] request["pressure_level"] = [string(p) for p in p_hPa] end @@ -63,7 +63,7 @@ function download_dataset(meta::ERA5Metadatum; skip_existing=true) date = meta.dates request = Dict( "product_type" => ["reanalysis"], - "variable" => [_cds_varnames(meta.dataset)[meta.name]], + "variable" => [cds_varnames(meta.dataset)[meta.name]], "year" => [string(Dates.year(date))], "month" => [lpad(string(Dates.month(date)), 2, '0')], "day" => [lpad(string(Dates.day(date)), 2, '0')], @@ -72,11 +72,11 @@ function download_dataset(meta::ERA5Metadatum; skip_existing=true) "download_format" => "unarchived", ) - _extra_request_keys!(request, meta.dataset) + extra_request_keys!(request, meta.dataset) area = build_era5_area(meta.bounding_box) isnothing(area) || (request["area"] = area) - @root CDSAPI.retrieve(_cds_product(meta.dataset), request, output_path) + @root CDSAPI.retrieve(cds_product(meta.dataset), request, output_path) return output_path end @@ -91,14 +91,14 @@ function download_dataset(metadata::ERA5Metadata; skip_existing=true, cleanup=tr for d in unique(Dates.Date.(dates))) for day in sort(collect(keys(grouped))) - _download_era5_day(metadata.name, metadata.dataset, grouped[day]; + download_era5_day(metadata.name, metadata.dataset, grouped[day]; bounding_box = metadata.bounding_box, dir = metadata.dir, skip_existing, cleanup) end end -function _download_era5_day(name, dataset, day_dates; +function download_era5_day(name, dataset, day_dates; bounding_box, dir, skip_existing, cleanup) MDatum = NumericalEarth.DataWrangling.Metadatum @@ -121,7 +121,7 @@ function _download_era5_day(name, dataset, day_dates; request = Dict( "product_type" => ["reanalysis"], - "variable" => [_cds_varnames(dataset)[name]], + "variable" => [cds_varnames(dataset)[name]], "year" => [year], "month" => [month], "day" => [day], @@ -130,20 +130,20 @@ function _download_era5_day(name, dataset, day_dates; "download_format" => "unarchived", ) - _extra_request_keys!(request, dataset) + extra_request_keys!(request, dataset) area = build_era5_area(bounding_box) isnothing(area) || (request["area"] = area) mkpath(dir) tmp_path = joinpath(dir, "_tmp_$(year)$(month)$(day).nc") - nc_varname = _nc_varnames(dataset)[name] + nc_varname = nc_varnames(dataset)[name] nc_triples = [(nc_varname, dt_to_tidx[dt], path) for (dt, path) in pending] time_dimnames = Set(["time", "valid_time"]) @root begin - CDSAPI.retrieve(_cds_product(dataset), request, tmp_path) - _split_era5_nc_multistep(tmp_path, nc_triples, _coord_vars(dataset), time_dimnames) + CDSAPI.retrieve(cds_product(dataset), request, tmp_path) + split_era5_nc_multistep(tmp_path, nc_triples, coord_vars(dataset), time_dimnames) cleanup && rm(tmp_path; force=true) end @@ -192,7 +192,7 @@ function download_dataset(names::Vector{Symbol}, meta::ERA5PressureMetadatum; sk isempty(pending) && return [path for (_, path) in name_path_pairs] - cds_vars = unique([_cds_varnames(meta.dataset)[name] for (name, _) in pending]) + cds_vars = unique([cds_varnames(meta.dataset)[name] for (name, _) in pending]) date = meta.dates year = string(Dates.year(date)) @@ -211,18 +211,18 @@ function download_dataset(names::Vector{Symbol}, meta::ERA5PressureMetadatum; sk "download_format" => "unarchived", ) - _extra_request_keys!(request, meta.dataset) + extra_request_keys!(request, meta.dataset) area = build_era5_area(meta.bounding_box) isnothing(area) || (request["area"] = area) mkpath(meta.dir) tmp_path = joinpath(meta.dir, "_tmp_multi_$(year)$(month)$(day)T$(hour[1:2]).nc") - nc_name_path_pairs = [(_nc_varnames(meta.dataset)[name], path) for (name, path) in pending] + nc_name_path_pairs = [(nc_varnames(meta.dataset)[name], path) for (name, path) in pending] @root begin - CDSAPI.retrieve(_cds_product(meta.dataset), request, tmp_path) - _split_era5_nc(tmp_path, nc_name_path_pairs, _coord_vars(meta.dataset)) + CDSAPI.retrieve(cds_product(meta.dataset), request, tmp_path) + split_era5_nc(tmp_path, nc_name_path_pairs, coord_vars(meta.dataset)) rm(tmp_path; force=true) end @@ -264,7 +264,7 @@ function download_dataset(names::Vector{Symbol}, for d in unique(Dates.Date.(datetimes))) for day in sort(collect(keys(grouped))) - _download_era5_multivar_day(names, dataset, grouped[day]; bounding_box, dir, skip_existing, cleanup) + download_era5_multivar_day(names, dataset, grouped[day]; bounding_box, dir, skip_existing, cleanup) end return nothing @@ -280,7 +280,7 @@ function download_dataset(name::Symbol, return download_dataset([name], dataset, datetimes; bounding_box, dir, skip_existing, cleanup) end -function _download_era5_multivar_day(names, dataset, day_dates; +function download_era5_multivar_day(names, dataset, day_dates; bounding_box, dir, skip_existing, cleanup) MDatum = NumericalEarth.DataWrangling.Metadatum @@ -292,7 +292,7 @@ function _download_era5_multivar_day(names, dataset, day_dates; pending = skip_existing ? filter(((_, _, p),) -> !isfile(p), all_triples) : all_triples isempty(pending) && return nothing - cds_vars = unique([_cds_varnames(dataset)[name] for (name, _, _) in pending]) + cds_vars = unique([cds_varnames(dataset)[name] for (name, _, _) in pending]) sorted_dts = sort(unique([dt for (_, dt, _) in pending])) hours_str = [lpad(string(Dates.hour(dt)), 2, '0') * ":00" for dt in sorted_dts] dt_to_tidx = Dict(dt => i for (i, dt) in enumerate(sorted_dts)) @@ -313,20 +313,20 @@ function _download_era5_multivar_day(names, dataset, day_dates; "download_format" => "unarchived", ) - _extra_request_keys!(request, dataset) + extra_request_keys!(request, dataset) area = build_era5_area(bounding_box) isnothing(area) || (request["area"] = area) mkpath(dir) tmp_path = joinpath(dir, "_tmp_multi_$(year)$(month)$(day).nc") - nc_triples = [(_nc_varnames(dataset)[name], dt_to_tidx[dt], path) + nc_triples = [(nc_varnames(dataset)[name], dt_to_tidx[dt], path) for (name, dt, path) in pending] time_dimnames = Set(["time", "valid_time"]) @root begin - CDSAPI.retrieve(_cds_product(dataset), request, tmp_path) - _split_era5_nc_multistep(tmp_path, nc_triples, _coord_vars(dataset), time_dimnames) + CDSAPI.retrieve(cds_product(dataset), request, tmp_path) + split_era5_nc_multistep(tmp_path, nc_triples, coord_vars(dataset), time_dimnames) cleanup && rm(tmp_path; force=true) end @@ -338,11 +338,11 @@ end ##### """ - _split_era5_nc(src_path, nc_name_path_pairs, coord_vars) + split_era5_nc(src_path, nc_name_path_pairs, coord_vars) Split a multi-variable NetCDF into individual per-variable files (single time step). """ -function _split_era5_nc(src_path, nc_name_path_pairs, coord_vars) +function split_era5_nc(src_path, nc_name_path_pairs, coord_vars) NCDatasets.Dataset(src_path, "r") do src for (nc_varname, dst_path) in nc_name_path_pairs NCDatasets.Dataset(dst_path, "c") do dst @@ -357,7 +357,7 @@ function _split_era5_nc(src_path, nc_name_path_pairs, coord_vars) for (vname, var) in src (vname in coord_vars || vname == nc_varname) || continue - _ncvar_copy!(dst, var, vname) + ncvar_copy!(dst, var, vname) end end end @@ -365,12 +365,12 @@ function _split_era5_nc(src_path, nc_name_path_pairs, coord_vars) end """ - _split_era5_nc_multistep(src_path, triples, coord_vars, time_dimnames) + split_era5_nc_multistep(src_path, triples, coord_vars, time_dimnames) Split a multi-timestep NetCDF into individual per-variable, per-timestep files. `triples` is a vector of `(nc_varname, time_index, dst_path)`. """ -function _split_era5_nc_multistep(src_path, nc_varname_tidx_path_triples, coord_vars, time_dimnames) +function split_era5_nc_multistep(src_path, nc_varname_tidx_path_triples, coord_vars, time_dimnames) NCDatasets.Dataset(src_path, "r") do src unlimited = NCDatasets.unlimited(src) @@ -388,14 +388,14 @@ function _split_era5_nc_multistep(src_path, nc_varname_tidx_path_triples, coord_ for (vname, var) in src (vname in coord_vars || vname == nc_varname) || continue - _ncvar_copy_tslice!(dst, var, vname, tidx, time_dimnames) + ncvar_copy_tslice!(dst, var, vname, tidx, time_dimnames) end end end end end -function _ncvar_copy!(dst, src_var, vname) +function ncvar_copy!(dst, src_var, vname) dims = NCDatasets.dimnames(src_var) T = eltype(src_var.var) attribs = src_var.attrib @@ -414,7 +414,7 @@ function _ncvar_copy!(dst, src_var, vname) return nothing end -function _ncvar_copy_tslice!(dst, src_var, vname, tidx, time_dimnames) +function ncvar_copy_tslice!(dst, src_var, vname, tidx, time_dimnames) dims = NCDatasets.dimnames(src_var) T = eltype(src_var.var) attribs = src_var.attrib From b6baf6473d69744d2975d4b6e7f5e90b354725ee Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Wed, 18 Mar 2026 22:37:41 -0600 Subject: [PATCH 33/53] Aesthetics --- src/DataWrangling/ERA5/ERA5_pressure_levels.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/DataWrangling/ERA5/ERA5_pressure_levels.jl b/src/DataWrangling/ERA5/ERA5_pressure_levels.jl index 4acec6639..3159d6757 100644 --- a/src/DataWrangling/ERA5/ERA5_pressure_levels.jl +++ b/src/DataWrangling/ERA5/ERA5_pressure_levels.jl @@ -269,7 +269,7 @@ function pressure_field(metadata::ERA5PressureMetadatum, arch=CPU(); halo=(3,3,3 field = CenterField(grid) reversed_levels = sort(metadata.dataset.pressure_levels, rev=true) # highest pressure → k=1 for (k, p) in enumerate(reversed_levels) - interior(field)[:, :, k] .= Float32(p) + interior(field, :, :, k) .= Float32(p) end fill_halo_regions!(field) return field From 7cc877e7a1f62cec01f06256a323fda8a468e875 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Mon, 23 Mar 2026 12:58:05 -0600 Subject: [PATCH 34/53] Handle download of instantaneous and accumulated/mean quantities --- ext/NumericalEarthCDSAPIExt.jl | 22 ++++++++++++++- src/DataWrangling/ERA5/ERA5_single_levels.jl | 29 +++++++++++++++++--- 2 files changed, 46 insertions(+), 5 deletions(-) diff --git a/ext/NumericalEarthCDSAPIExt.jl b/ext/NumericalEarthCDSAPIExt.jl index 7816be62a..04c2a0d3e 100644 --- a/ext/NumericalEarthCDSAPIExt.jl +++ b/ext/NumericalEarthCDSAPIExt.jl @@ -9,7 +9,8 @@ using Oceananigans.DistributedComputations: @root using Dates using NumericalEarth.DataWrangling.ERA5: ERA5Dataset, ERA5Metadata, ERA5Metadatum, - ERA5_dataset_variable_names, ERA5_netcdf_variable_names + ERA5_dataset_variable_names, ERA5_netcdf_variable_names, + ERA5_single_level_accumulated_variables using NumericalEarth.DataWrangling.ERA5: ERA5PressureLevelsDataset, ERA5PressureMetadata, ERA5PressureMetadatum, ERA5PL_dataset_variable_names, ERA5PL_netcdf_variable_names @@ -280,9 +281,28 @@ function download_dataset(name::Symbol, return download_dataset([name], dataset, datetimes; bounding_box, dir, skip_existing, cleanup) end +is_accumulated(name::Symbol) = name in ERA5_single_level_accumulated_variables + function download_era5_multivar_day(names, dataset, day_dates; bounding_box, dir, skip_existing, cleanup) + # Split accumulated and instantaneous variables into separate requests + # to avoid CDS returning a zipfile with multiple NetCDF files + accum_names = filter(is_accumulated, names) + instant_names = filter(!is_accumulated, names) + + for name_group in (accum_names, instant_names) + isempty(name_group) && continue + download_era5_multivar_day_batch(name_group, dataset, day_dates; + bounding_box, dir, skip_existing, cleanup) + end + + return nothing +end + +function download_era5_multivar_day_batch(names, dataset, day_dates; + bounding_box, dir, skip_existing, cleanup) + MDatum = NumericalEarth.DataWrangling.Metadatum meta_path = NumericalEarth.DataWrangling.metadata_path diff --git a/src/DataWrangling/ERA5/ERA5_single_levels.jl b/src/DataWrangling/ERA5/ERA5_single_levels.jl index 6a7f8c49c..77bf5e36a 100644 --- a/src/DataWrangling/ERA5/ERA5_single_levels.jl +++ b/src/DataWrangling/ERA5/ERA5_single_levels.jl @@ -10,6 +10,21 @@ const ERA5_wave_variables = Set([ :significant_wave_height, :mean_wave_period, :mean_wave_direction, ]) +# Mean rate / accumulated variables (CDS "step type" = accum). +# All other single-level variables are instantaneous. +# See ECMWF ERA5 documentation, Tables 3 and 4: +# https://confluence.ecmwf.int/display/CKB/ERA5%3A+data+documentation#ERA5:datadocumentation-Meanrates/fluxesandaccumulations +const ERA5_single_level_accumulated_variables = Set([ + :total_precipitation, + :mean_surface_sensible_heat_flux, + :mean_surface_latent_heat_flux, + :mean_surface_momentum_flux_x, + :mean_surface_momentum_flux_y, + :downwelling_shortwave_radiation, + :downwelling_longwave_radiation, + :evaporation, +]) + ##### ##### ERA5 single-level data availability ##### @@ -42,9 +57,11 @@ ERA5_dataset_variable_names = Dict( :surface_pressure => "surface_pressure", :mean_sea_level_pressure => "mean_sea_level_pressure", :total_precipitation => "total_precipitation", + :mean_surface_momentum_flux_x => "mean_eastward_turbulent_surface_stress", + :mean_surface_momentum_flux_y => "mean_northward_turbulent_surface_stress", :sea_surface_temperature => "sea_surface_temperature", - :surface_sensible_heat_flux => "surface_sensible_heat_flux", - :surface_latent_heat_flux => "surface_latent_heat_flux", + :mean_surface_sensible_heat_flux => "mean_surface_sensible_heat_flux", + :mean_surface_latent_heat_flux => "mean_surface_latent_heat_flux", :downwelling_shortwave_radiation => "surface_solar_radiation_downwards", :downwelling_longwave_radiation => "surface_thermal_radiation_downwards", :total_cloud_cover => "total_cloud_cover", @@ -59,6 +76,8 @@ ERA5_dataset_variable_names = Dict( # NetCDF short variable names (what's actually in the downloaded files) # These differ from the CDS API variable names above +# The expected shortName (see https://confluence.ecmwf.int/display/CKB/ERA5%3A+data+documentation#heading-Parameterlistings) +# did not always match the netcdf variable names. In those cases, the longName and units were manually verified. ERA5_netcdf_variable_names = Dict( :temperature => "t2m", :dewpoint_temperature => "d2m", @@ -67,9 +86,11 @@ ERA5_netcdf_variable_names = Dict( :surface_pressure => "sp", :mean_sea_level_pressure => "msl", :total_precipitation => "tp", + :mean_surface_momentum_flux_x => "avg_iews", # shortName: metss + :mean_surface_momentum_flux_y => "avg_inss", # shortName: mntss :sea_surface_temperature => "sst", - :surface_sensible_heat_flux => "sshf", - :surface_latent_heat_flux => "slhf", + :mean_surface_sensible_heat_flux => "avg_ishf", # shortName: msshf + :mean_surface_latent_heat_flux => "avg_slhtf", # shortName: mslhf :downwelling_shortwave_radiation => "ssrd", :downwelling_longwave_radiation => "strd", :total_cloud_cover => "tcc", From 567101b2c91cdf33298633430bb324e1fea7899d Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Tue, 24 Mar 2026 09:05:08 -0600 Subject: [PATCH 35/53] Remove redundant duplicate date when requesting ERA5 single-level hourly data --- src/DataWrangling/ERA5/ERA5.jl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/DataWrangling/ERA5/ERA5.jl b/src/DataWrangling/ERA5/ERA5.jl index 57b31c2c3..1a2e3f910 100644 --- a/src/DataWrangling/ERA5/ERA5.jl +++ b/src/DataWrangling/ERA5/ERA5.jl @@ -116,7 +116,11 @@ function metadata_prefix(dataset::ERA5Dataset, name, date, bounding_box) suffix = "" end - prefix = string(var, "_", ds, "_", start_date, "_", end_date, suffix) + if start_date == end_date + prefix = string(var, "_", ds, "_", start_date, suffix) + else + prefix = string(var, "_", ds, "_", start_date, "_", end_date, suffix) + end prefix = colon2dash(prefix) prefix = underscore_spaces(prefix) return prefix From 48a6b3e9ccf1192b66e7a58c0358114360d7b988 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Tue, 24 Mar 2026 09:10:38 -0600 Subject: [PATCH 36/53] =?UTF-8?q?Add=20mean=20evaporation=20rate=20aka=20"?= =?UTF-8?q?time-mean=20moisture=20flux"=20[kg=20m=E2=81=BB=C2=B2=20s?= =?UTF-8?q?=E2=81=BB=C2=B9]?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/DataWrangling/ERA5/ERA5_single_levels.jl | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/DataWrangling/ERA5/ERA5_single_levels.jl b/src/DataWrangling/ERA5/ERA5_single_levels.jl index 77bf5e36a..d53bd0fa6 100644 --- a/src/DataWrangling/ERA5/ERA5_single_levels.jl +++ b/src/DataWrangling/ERA5/ERA5_single_levels.jl @@ -23,6 +23,7 @@ const ERA5_single_level_accumulated_variables = Set([ :downwelling_shortwave_radiation, :downwelling_longwave_radiation, :evaporation, + :mean_evaporation_rate, ]) ##### @@ -66,6 +67,7 @@ ERA5_dataset_variable_names = Dict( :downwelling_longwave_radiation => "surface_thermal_radiation_downwards", :total_cloud_cover => "total_cloud_cover", :evaporation => "evaporation", + :mean_evaporation_rate => "mean_evaporation_rate", :specific_humidity => "specific_humidity", :eastward_stokes_drift => "u_component_stokes_drift", :northward_stokes_drift => "v_component_stokes_drift", @@ -75,9 +77,9 @@ ERA5_dataset_variable_names = Dict( ) # NetCDF short variable names (what's actually in the downloaded files) -# These differ from the CDS API variable names above -# The expected shortName (see https://confluence.ecmwf.int/display/CKB/ERA5%3A+data+documentation#heading-Parameterlistings) -# did not always match the netcdf variable names. In those cases, the longName and units were manually verified. +# - These differ from the CDS API variable names above +# - The expected "shortName" (see https://confluence.ecmwf.int/display/CKB/ERA5%3A+data+documentation#heading-Parameterlistings) +# did not always match the netcdf variable names?! In those cases, the longName and units were manually verified. ERA5_netcdf_variable_names = Dict( :temperature => "t2m", :dewpoint_temperature => "d2m", @@ -95,6 +97,7 @@ ERA5_netcdf_variable_names = Dict( :downwelling_longwave_radiation => "strd", :total_cloud_cover => "tcc", :evaporation => "e", + :mean_evaporation_rate => "avg_ie", # shortName: mer :specific_humidity => "q", :eastward_stokes_drift => "ust", :northward_stokes_drift => "vst", From 0eb2c78fc7a3cc73c151d5518b545b73e6de605e Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Wed, 25 Mar 2026 13:45:49 -0600 Subject: [PATCH 37/53] Most robustly handle situations with multiple data types in one request including instantaneous, accumulated, and averaged --- ext/NumericalEarthCDSAPIExt.jl | 72 ++++++++++++++++++++++------------ 1 file changed, 48 insertions(+), 24 deletions(-) diff --git a/ext/NumericalEarthCDSAPIExt.jl b/ext/NumericalEarthCDSAPIExt.jl index 04c2a0d3e..98940b16c 100644 --- a/ext/NumericalEarthCDSAPIExt.jl +++ b/ext/NumericalEarthCDSAPIExt.jl @@ -9,8 +9,7 @@ using Oceananigans.DistributedComputations: @root using Dates using NumericalEarth.DataWrangling.ERA5: ERA5Dataset, ERA5Metadata, ERA5Metadatum, - ERA5_dataset_variable_names, ERA5_netcdf_variable_names, - ERA5_single_level_accumulated_variables + ERA5_dataset_variable_names, ERA5_netcdf_variable_names using NumericalEarth.DataWrangling.ERA5: ERA5PressureLevelsDataset, ERA5PressureMetadata, ERA5PressureMetadatum, ERA5PL_dataset_variable_names, ERA5PL_netcdf_variable_names @@ -49,6 +48,40 @@ function extra_request_keys!(request, ds::ERA5PressureLevelsDataset) request["pressure_level"] = [string(p) for p in p_hPa] end +##### +##### ZIP detection — CDS returns a ZIP when mixing step types (inst/accum/avg) +##### + +const ZIP_MAGIC = UInt8[0x50, 0x4b, 0x03, 0x04] + +function is_zip(path) + open(path, "r") do io + magic = read(io, 4) + return length(magic) >= 4 && magic == ZIP_MAGIC + end +end + +""" + foreach_nc(f, download_path, cleanup_dir) + +If `download_path` is a ZIP archive (as CDS returns when mixing variable step types), +extract all NetCDF files and call `f(nc_path)` on each. Otherwise call `f` directly +on `download_path`. +""" +function foreach_nc(f, download_path, cleanup_dir) + if is_zip(download_path) + tmp_dir = mktempdir(cleanup_dir) + run(`unzip -qo $download_path -d $tmp_dir`) + nc_files = filter(p -> endswith(p, ".nc"), readdir(tmp_dir; join=true)) + for nc_file in nc_files + f(nc_file) + end + rm(tmp_dir; recursive=true, force=true) + else + f(download_path) + end +end + ##### ##### Single-date download ##### @@ -144,7 +177,9 @@ function download_era5_day(name, dataset, day_dates; @root begin CDSAPI.retrieve(cds_product(dataset), request, tmp_path) - split_era5_nc_multistep(tmp_path, nc_triples, coord_vars(dataset), time_dimnames) + foreach_nc(tmp_path, dir) do nc_path + split_era5_nc_multistep(nc_path, nc_triples, coord_vars(dataset), time_dimnames) + end cleanup && rm(tmp_path; force=true) end @@ -223,7 +258,9 @@ function download_dataset(names::Vector{Symbol}, meta::ERA5PressureMetadatum; sk @root begin CDSAPI.retrieve(cds_product(meta.dataset), request, tmp_path) - split_era5_nc(tmp_path, nc_name_path_pairs, coord_vars(meta.dataset)) + foreach_nc(tmp_path, meta.dir) do nc_path + split_era5_nc(nc_path, nc_name_path_pairs, coord_vars(meta.dataset)) + end rm(tmp_path; force=true) end @@ -281,28 +318,9 @@ function download_dataset(name::Symbol, return download_dataset([name], dataset, datetimes; bounding_box, dir, skip_existing, cleanup) end -is_accumulated(name::Symbol) = name in ERA5_single_level_accumulated_variables - function download_era5_multivar_day(names, dataset, day_dates; bounding_box, dir, skip_existing, cleanup) - # Split accumulated and instantaneous variables into separate requests - # to avoid CDS returning a zipfile with multiple NetCDF files - accum_names = filter(is_accumulated, names) - instant_names = filter(!is_accumulated, names) - - for name_group in (accum_names, instant_names) - isempty(name_group) && continue - download_era5_multivar_day_batch(name_group, dataset, day_dates; - bounding_box, dir, skip_existing, cleanup) - end - - return nothing -end - -function download_era5_multivar_day_batch(names, dataset, day_dates; - bounding_box, dir, skip_existing, cleanup) - MDatum = NumericalEarth.DataWrangling.Metadatum meta_path = NumericalEarth.DataWrangling.metadata_path @@ -346,7 +364,9 @@ function download_era5_multivar_day_batch(names, dataset, day_dates; @root begin CDSAPI.retrieve(cds_product(dataset), request, tmp_path) - split_era5_nc_multistep(tmp_path, nc_triples, coord_vars(dataset), time_dimnames) + foreach_nc(tmp_path, dir) do nc_path + split_era5_nc_multistep(nc_path, nc_triples, coord_vars(dataset), time_dimnames) + end cleanup && rm(tmp_path; force=true) end @@ -364,7 +384,9 @@ Split a multi-variable NetCDF into individual per-variable files (single time st """ function split_era5_nc(src_path, nc_name_path_pairs, coord_vars) NCDatasets.Dataset(src_path, "r") do src + src_varnames = Set(keys(src)) for (nc_varname, dst_path) in nc_name_path_pairs + nc_varname in src_varnames || continue NCDatasets.Dataset(dst_path, "c") do dst unlimited = NCDatasets.unlimited(src) for (dname, dlen) in src.dim @@ -392,9 +414,11 @@ Split a multi-timestep NetCDF into individual per-variable, per-timestep files. """ function split_era5_nc_multistep(src_path, nc_varname_tidx_path_triples, coord_vars, time_dimnames) NCDatasets.Dataset(src_path, "r") do src + src_varnames = Set(keys(src)) unlimited = NCDatasets.unlimited(src) for (nc_varname, tidx, dst_path) in nc_varname_tidx_path_triples + nc_varname in src_varnames || continue NCDatasets.Dataset(dst_path, "c") do dst for (dname, dlen) in src.dim out_len = dname in time_dimnames ? 1 : From 7b7d1c7c84f4ecb7aa220627552e8c4b99303cfd Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Wed, 25 Mar 2026 16:11:19 -0600 Subject: [PATCH 38/53] Better handle geopotential height variability --- src/DataWrangling/metadata_field.jl | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/DataWrangling/metadata_field.jl b/src/DataWrangling/metadata_field.jl index 7529bb836..68c8af7f4 100644 --- a/src/DataWrangling/metadata_field.jl +++ b/src/DataWrangling/metadata_field.jl @@ -176,8 +176,13 @@ function set!(target_field::Field, metadata::Metadatum; kw...) Lzt = grid.Lz Lzm = meta_field.grid.Lz - - if Lzt > Lzm + + # Allow up to 1% vertical mismatch. For pressure-level datasets with time-varying + # geopotential heights, the per-timestep vertical extent can be slightly smaller + # than the temporal-mean extent used for the target grid (e.g. when the atmosphere + # is compressed). Oceananigans' interpolate! does not extrapolate, so target points + # just outside the source domain will use the nearest interior values. + if Lzt > Lzm * (1 + 1e-2) throw("The vertical range of the $(metadata.dataset) dataset ($(Lzm) m) is smaller than " * "the target grid ($(Lzt) m). Some vertical levels cannot be filled with data.") end From ae169b9aa62bdc9df398709da72dcad1ba50823a Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Thu, 26 Mar 2026 00:17:35 -0600 Subject: [PATCH 39/53] Allow use of pre-downloaded ERA5 data without depending on CDSAPI --- src/DataWrangling/DataWrangling.jl | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/DataWrangling/DataWrangling.jl b/src/DataWrangling/DataWrangling.jl index 528a9650f..70c88711f 100644 --- a/src/DataWrangling/DataWrangling.jl +++ b/src/DataWrangling/DataWrangling.jl @@ -177,6 +177,19 @@ Arguments https://github.com/CliMA/NumericalEarth.jl/blob/main/src/DataWrangling/ECCO/README.md """ function download_dataset end # methods specific to datasets are added within each dataset module + +# Fallback: if no download extension is loaded, check that all files already exist +function download_dataset(metadata::Metadata) + paths = metadata_path(metadata) + paths isa AbstractString && (paths = [paths]) + missing_files = filter(!isfile, paths) + if !isempty(missing_files) + n = length(missing_files) + error("No download method is available (is the backend package loaded?) " * + "and $n data file(s) are missing. First missing: $(first(missing_files))") + end + return paths +end function inpainted_metadata_path end """ From 4afc7290e8e71e7801badc1265ec56364154cd40 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Fri, 27 Mar 2026 21:01:47 -0600 Subject: [PATCH 40/53] Fix bug introduced by prev commit; add additional era5 vars --- src/DataWrangling/DataWrangling.jl | 26 +++++++++---------- .../ERA5/ERA5_pressure_levels.jl | 4 +++ 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/src/DataWrangling/DataWrangling.jl b/src/DataWrangling/DataWrangling.jl index 70c88711f..39e57321b 100644 --- a/src/DataWrangling/DataWrangling.jl +++ b/src/DataWrangling/DataWrangling.jl @@ -177,19 +177,6 @@ Arguments https://github.com/CliMA/NumericalEarth.jl/blob/main/src/DataWrangling/ECCO/README.md """ function download_dataset end # methods specific to datasets are added within each dataset module - -# Fallback: if no download extension is loaded, check that all files already exist -function download_dataset(metadata::Metadata) - paths = metadata_path(metadata) - paths isa AbstractString && (paths = [paths]) - missing_files = filter(!isfile, paths) - if !isempty(missing_files) - n = length(missing_files) - error("No download method is available (is the backend package loaded?) " * - "and $n data file(s) are missing. First missing: $(first(missing_files))") - end - return paths -end function inpainted_metadata_path end """ @@ -250,4 +237,17 @@ using .ORCA using .WOA using .JRA55 +# Fallback: if no download extension is loaded, check that all files already exist +function download_dataset(metadata::Metadata) + paths = metadata_path(metadata) + paths isa AbstractString && (paths = [paths]) + missing_files = filter(!isfile, paths) + if !isempty(missing_files) + n = length(missing_files) + error("No download method is available (is the backend package loaded?) " * + "and $n data file(s) are missing. First missing: $(first(missing_files))") + end + return paths +end + end # module diff --git a/src/DataWrangling/ERA5/ERA5_pressure_levels.jl b/src/DataWrangling/ERA5/ERA5_pressure_levels.jl index 3159d6757..84715f19c 100644 --- a/src/DataWrangling/ERA5/ERA5_pressure_levels.jl +++ b/src/DataWrangling/ERA5/ERA5_pressure_levels.jl @@ -68,6 +68,8 @@ ERA5PL_dataset_variable_names = Dict( :fraction_of_cloud_cover => "fraction_of_cloud_cover", :specific_cloud_liquid_water_content => "specific_cloud_liquid_water_content", :specific_cloud_ice_water_content => "specific_cloud_ice_water_content", + :specific_rain_water_content => "specific_rain_water_content", + :specific_snow_water_content => "specific_snow_water_content", ) # NetCDF short variable names (what's actually in the downloaded files) @@ -88,6 +90,8 @@ ERA5PL_netcdf_variable_names = Dict( :fraction_of_cloud_cover => "cc", :specific_cloud_liquid_water_content => "clwc", :specific_cloud_ice_water_content => "ciwc", + :specific_rain_water_content => "crwc", + :specific_snow_water_content => "cswc", ) # Variables available for download From 0960d35c3322facb3a2b3d4d67970b6e0fa56901 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Tue, 21 Apr 2026 16:38:39 -0600 Subject: [PATCH 41/53] Minor cleanup --- src/DataWrangling/ERA5/ERA5_pressure_levels.jl | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/DataWrangling/ERA5/ERA5_pressure_levels.jl b/src/DataWrangling/ERA5/ERA5_pressure_levels.jl index 84715f19c..1deaaa00b 100644 --- a/src/DataWrangling/ERA5/ERA5_pressure_levels.jl +++ b/src/DataWrangling/ERA5/ERA5_pressure_levels.jl @@ -38,10 +38,9 @@ all_dates(::ERA5MonthlyPressureLevels, var) = range(DateTime("1940-01-01"), stop # ERA5 pressure-level data is a spatially 3-D dataset is_three_dimensional(::ERA5PressureMetadata) = true -const hPa = 100.0 const ERA5_all_pressure_levels = [1, 2, 3, 5, 7, 10, 20, 30, 50, 70, 100, 125, 150, 175, 200, 225, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 775, 800, - 825, 850, 875, 900, 925, 950, 975, 1000]*hPa + 825, 850, 875, 900, 925, 950, 975, 1000] * 100 # hPa -> Pa # ERA5 stores pressure levels bottom-to-top reversed_vertical_axis(::ERA5PressureLevelsDataset) = false @@ -155,11 +154,11 @@ end const ERA5_gravitational_acceleration = 9.80665 -# International Standard Atmosphere height (m) for a given pressure (hPa) +# International Standard Atmosphere height (m) for a given pressure function standard_atmosphere_geopotential_height(p) g = ERA5_gravitational_acceleration T⁰ = 288.15 # K - p⁰ = 1013.25 * hPa + p⁰ = 101325 Rᵈ = 287.0528 # J/(kg-K) return (Rᵈ * T⁰ / g) * log(p⁰ / p) From 50cc57dc0bb177f846cd8936e552f3f651a8ce42 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Wed, 29 Apr 2026 12:59:43 -0600 Subject: [PATCH 42/53] Fix ERA5 Column grid test to use renamed type Upstream's column-field test used `ERA5Hourly`, which this branch renamed to `ERA5HourlySingleLevel` (reserving the simpler name for the abstract dataset family). Update the test accordingly. Co-Authored-By: Claude Opus 4.7 (1M context) --- test/test_column_field.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_column_field.jl b/test/test_column_field.jl index ea33bad6c..8a9b51695 100644 --- a/test/test_column_field.jl +++ b/test/test_column_field.jl @@ -212,7 +212,7 @@ end @testset "ERA5 Column grid" begin col = Column(200.0, 35.0) - md = Metadatum(:temperature; dataset=ERA5Hourly(), + md = Metadatum(:temperature; dataset=ERA5HourlySingleLevel(), date=DateTime(2020, 1, 1), region=col) grid = native_grid(md) From bb099f0ef4ad7a10985a6d0bf52bc6706bb4cf2a Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Wed, 29 Apr 2026 14:51:12 -0600 Subject: [PATCH 43/53] Simplify pressure_field to a reduced column Field Per @glwagner's review: a `Field{Nothing, Nothing, Center}` represents the same per-level pressure values as the previous `CenterField` but without copying across the full horizontal grid. Use `set!` for the column assignment, drop the per-k loop, and let the field's eltype follow the grid (no more hardcoded `Float32`). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/DataWrangling/ERA5/ERA5.jl | 4 ++-- src/DataWrangling/ERA5/ERA5_pressure_levels.jl | 13 ++++++------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/DataWrangling/ERA5/ERA5.jl b/src/DataWrangling/ERA5/ERA5.jl index 5af969520..610655d52 100644 --- a/src/DataWrangling/ERA5/ERA5.jl +++ b/src/DataWrangling/ERA5/ERA5.jl @@ -12,8 +12,8 @@ using Printf using Scratch using Statistics -using Oceananigans.Fields: Center -using Oceananigans: CenterField, interior, fill_halo_regions!, CPU +using Oceananigans.Fields: Center, set! +using Oceananigans: Field, fill_halo_regions!, CPU using NumericalEarth.DataWrangling: Metadata, Metadatum, metadata_path, native_grid, InverseGravity, download_dataset using Dates using Dates: DateTime, Day, Month, Hour diff --git a/src/DataWrangling/ERA5/ERA5_pressure_levels.jl b/src/DataWrangling/ERA5/ERA5_pressure_levels.jl index 68c21a79f..2cd3164e9 100644 --- a/src/DataWrangling/ERA5/ERA5_pressure_levels.jl +++ b/src/DataWrangling/ERA5/ERA5_pressure_levels.jl @@ -263,17 +263,16 @@ end """ pressure_field(metadata::ERA5PressureMetadatum, arch=CPU(); halo=(3,3,3)) -Return a `CenterField` on the native grid of `metadata` filled with the pressure -value (Pa) at each vertical level. Levels are ordered bottom-to-top (k=1 is the -highest pressure level). +Return a `Field{Nothing, Nothing, Center}` on the native grid of `metadata` +holding the pressure value (Pa) at each vertical level. Levels are ordered +bottom-to-top (k=1 is the highest pressure level). The `Nothing` horizontal +locations make this field broadcast against full 3-D fields without copying. """ function pressure_field(metadata::ERA5PressureMetadatum, arch=CPU(); halo=(3,3,3)) grid = native_grid(metadata, arch; halo) - field = CenterField(grid) + field = Field{Nothing, Nothing, Center}(grid) reversed_levels = sort(metadata.dataset.pressure_levels, rev=true) # highest pressure → k=1 - for (k, p) in enumerate(reversed_levels) - interior(field, :, :, k) .= Float32(p) - end + set!(field, reversed_levels) fill_halo_regions!(field) return field end From 575b52d943ac43c5ad408b30d0e25d289b7420d6 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Wed, 29 Apr 2026 16:01:53 -0600 Subject: [PATCH 44/53] Temporarily define hPa --- src/DataWrangling/ERA5/ERA5_pressure_levels.jl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/DataWrangling/ERA5/ERA5_pressure_levels.jl b/src/DataWrangling/ERA5/ERA5_pressure_levels.jl index 2cd3164e9..aab2567df 100644 --- a/src/DataWrangling/ERA5/ERA5_pressure_levels.jl +++ b/src/DataWrangling/ERA5/ERA5_pressure_levels.jl @@ -38,9 +38,12 @@ all_dates(::ERA5MonthlyPressureLevels, var) = range(DateTime("1940-01-01"), stop # ERA5 pressure-level data is a spatially 3-D dataset is_three_dimensional(::ERA5PressureMetadata) = true +# TODO: drop once Oceananigans.Units exports `hPa` in a tagged release +const hPa = 100 + const ERA5_all_pressure_levels = [1, 2, 3, 5, 7, 10, 20, 30, 50, 70, 100, 125, 150, 175, 200, 225, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 775, 800, - 825, 850, 875, 900, 925, 950, 975, 1000] * 100 # hPa -> Pa + 825, 850, 875, 900, 925, 950, 975, 1000]hPa # ERA5 stores pressure levels bottom-to-top reversed_vertical_axis(::ERA5PressureLevelsDataset) = false From a597c60ab8289e7ed03645a959a638e75250ec1d Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Wed, 29 Apr 2026 16:39:02 -0600 Subject: [PATCH 45/53] Cleanup CDS download test --- src/DataWrangling/ERA5/ERA5_pressure_levels.jl | 2 -- test/test_cds_downloading.jl | 12 ++++++------ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/DataWrangling/ERA5/ERA5_pressure_levels.jl b/src/DataWrangling/ERA5/ERA5_pressure_levels.jl index aab2567df..beac2ab74 100644 --- a/src/DataWrangling/ERA5/ERA5_pressure_levels.jl +++ b/src/DataWrangling/ERA5/ERA5_pressure_levels.jl @@ -170,8 +170,6 @@ end # Build z-interfaces (Nz+1 values) from pressure levels. # Levels may be in any order; output is sorted so k=1 is highest pressure (lowest altitude). function standard_atmosphere_z_interfaces(levels) - @info "Calculating z-interfaces based on the International Standard Atmosphere... \ - For greater accuracy, set ERA5PressureLevelsDataset(; mean_geopotential_height=true)" sorted_levels = sort(levels, rev=true) # highest pressure first → k=1 is bottom heights = standard_atmosphere_geopotential_height.(Float64.(sorted_levels)) Nz = length(heights) diff --git a/test/test_cds_downloading.jl b/test/test_cds_downloading.jl index 8fbf8d9d6..55b4178cb 100644 --- a/test/test_cds_downloading.jl +++ b/test/test_cds_downloading.jl @@ -147,7 +147,7 @@ start_date = DateTime(2005, 2, 16, 12) @test Base.size(ds_full, :temperature) == (1440, 720, 37) # Subset constructor - ds_sub = ERA5HourlyPressureLevels(pressure_levels=[850, 500] .* hPa) + ds_sub = ERA5HourlyPressureLevels(pressure_levels=[850, 500]hPa) @test Base.size(ds_sub, :temperature) == (1440, 720, 2) # Monthly variant @@ -166,7 +166,7 @@ start_date = DateTime(2005, 2, 16, 12) end @testset "ERA5 pressure-level z_interfaces (standard atmosphere)" begin - levels_2 = [850, 500] .* hPa + levels_2 = [850, 500]hPa z = standard_atmosphere_z_interfaces(levels_2) @test length(z) == 3 # Nz+1 interfaces @test issorted(z) # monotonically increasing with altitude @@ -174,7 +174,7 @@ start_date = DateTime(2005, 2, 16, 12) @test z[1] < 1457.0 < z[2] < 5575.0 < z[3] # Single level - z1 = standard_atmosphere_z_interfaces([500] .* hPa) + z1 = standard_atmosphere_z_interfaces([500]hPa) @test length(z1) == 2 @test z1[1] < z1[2] end @@ -247,7 +247,7 @@ start_date = DateTime(2005, 2, 16, 12) @testset "ERA5 pressure-level download and Field on CPU" begin arch = CPU() - ds_pl = ERA5HourlyPressureLevels(pressure_levels=[850, 500] .* hPa) + ds_pl = ERA5HourlyPressureLevels(pressure_levels=[850, 500]hPa) @testset "Download and 3D Field" begin meta = Metadatum(:temperature; dataset=ds_pl, region, date=start_date) @@ -307,9 +307,9 @@ start_date = DateTime(2005, 2, 16, 12) @allowscalar begin # k=1 should be 850 hPa = 85000 Pa (highest pressure, lowest altitude) - @test interior(pf)[1, 1, 1] ≈ Float32(850 * hPa) + @test interior(pf)[1, 1, 1] ≈ Float32(850hPa) # k=2 should be 500 hPa = 50000 Pa - @test interior(pf)[1, 1, 2] ≈ Float32(500 * hPa) + @test interior(pf)[1, 1, 2] ≈ Float32(500hPa) end end end From be9657cf14df3093a8e3fffe6df86c5a3c660456 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Wed, 29 Apr 2026 20:03:48 -0600 Subject: [PATCH 46/53] Share CDS test fixtures across testsets to avoid redundant downloads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Surface-level testsets each downloaded `2m_temperature` then removed it, forcing the next testset to re-download the same bytes — three CDS round-trips for one fixture. Keep the pre-clean only in the testset that's testing the download path itself; let downstream testsets reuse the file and run cleanup in the last consumer. Pressure-level "Geopotential height conversion" pre-cleaned the `geopotential_...nc` that the previous testset's `z_interfaces` side-effect leaves on disk, then re-downloaded it. Drop the pre-clean and the redundant explicit `download_dataset(meta_z)` call (Field() already downloads if needed). Net: surface round-trips drop 3 → 1, geopotential round-trips drop 2 → 1. Co-Authored-By: Claude Opus 4.7 (1M context) --- test/test_cds_downloading.jl | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/test/test_cds_downloading.jl b/test/test_cds_downloading.jl index 55b4178cb..0211f353e 100644 --- a/test/test_cds_downloading.jl +++ b/test/test_cds_downloading.jl @@ -66,8 +66,7 @@ start_date = DateTime(2005, 2, 16, 12) close(ds) - # Clean up - rm(filepath; force=true) + # Note: leave `filepath` in place; downstream surface-level testsets reuse it. end @testset "Availability of ERA5 variables" begin @@ -205,10 +204,7 @@ start_date = DateTime(2005, 2, 16, 12) @test !all(iszero, interior(ψ)) end - # Clean up - rm(filepath; force=true) - inpainted_path = NumericalEarth.DataWrangling.inpainted_metadata_path(metadatum) - isfile(inpainted_path) && rm(inpainted_path; force=true) + # Note: cleanup happens in the last surface-level testset below. end @testset "Setting a field from ERA5 metadata on $A" begin @@ -282,9 +278,9 @@ start_date = DateTime(2005, 2, 16, 12) @testset "Geopotential height conversion" begin meta_z = Metadatum(:geopotential_height; dataset=ds_pl, region, date=start_date) filepath = metadata_path(meta_z) - isfile(filepath) && rm(filepath; force=true) - download_dataset(meta_z) + # Field() downloads if needed; the file may already be on disk from + # the previous testset's z_interfaces side-effect. fz = Field(meta_z, arch) @allowscalar begin From 6e6e84c5a968381400efac2fa914fb761b05090a Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Thu, 30 Apr 2026 08:42:34 -0600 Subject: [PATCH 47/53] Support Column regions for ERA5 CDS downloads MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add `build_era5_area` methods for `Column` (Linear and Nearest) so `FieldTimeSeries(Metadata(...; region=Column))` no longer hits MethodError. Linear pads ε=0.3° (slightly more than ERA5's 0.25° native spacing) so the downloaded file contains the 2x2 stencil bilinear interpolation needs; Nearest uses ε=1e-3°. - Realign `dataset_variable_name(::ERA5*Metadata)` to return the in-file short name (e.g. "u") instead of the CDS API catalog name ("u_component_of_wind"), matching the docstring ("the name used for the variable in its raw dataset file"). The CDS API name is still accessed via the `*_dataset_variable_names` dict directly in CDSAPIExt. Drops the now-redundant `netcdf_variable_name` methods. This fixes `column_field_from_file` for ERA5 — it calls `dataset_variable_name` generically and previously got the wrong name back. - Validate cached file's vertical extent in `column_field_from_file` and `mean_geopotential_heights`. A stale cache from a previous run with different `pressure_levels` previously produced silent NaN data or a cryptic broadcast DimensionMismatch; now throws a clear actionable error. - Tighten warning text in `z_interfaces` ("Failed to derive geopotential heights" rather than "Failed to download") and attach `catch_backtrace` so the underlying cause is visible in the warning. Co-Authored-By: Claude Opus 4.7 (1M context) --- ext/NumericalEarthCDSAPIExt.jl | 18 ++++++++++++++++ .../ERA5/ERA5_pressure_levels.jl | 21 ++++++++++++++----- src/DataWrangling/ERA5/ERA5_single_levels.jl | 10 ++++++--- src/DataWrangling/metadata_field.jl | 10 +++++++++ 4 files changed, 51 insertions(+), 8 deletions(-) diff --git a/ext/NumericalEarthCDSAPIExt.jl b/ext/NumericalEarthCDSAPIExt.jl index 41dfcee22..fcbe1271c 100644 --- a/ext/NumericalEarthCDSAPIExt.jl +++ b/ext/NumericalEarthCDSAPIExt.jl @@ -509,6 +509,9 @@ end build_era5_area(::Nothing) = nothing const BBOX = NumericalEarth.DataWrangling.BoundingBox +const COL = NumericalEarth.DataWrangling.Column +const LIN = NumericalEarth.DataWrangling.Linear +const NR = NumericalEarth.DataWrangling.Nearest function build_era5_area(bbox::BBOX) lon = bbox.longitude @@ -526,4 +529,19 @@ function build_era5_area(bbox::BBOX) return [north, west, south, east] end +# Column with Nearest interpolation: tight box; CDS returns the nearest cell. +function build_era5_area(col::COL{<:Any, <:Any, <:Any, <:NR}) + lon, lat = col.longitude, col.latitude + ε = 1e-3 + return [lat + ε, lon - ε, lat - ε, lon + ε] # [N, W, S, E] +end + +# Column with Linear interpolation: pad by slightly more than ERA5's native +# 0.25° spacing so the file contains the 2x2 stencil bilinear interp needs. +function build_era5_area(col::COL{<:Any, <:Any, <:Any, <:LIN}) + lon, lat = col.longitude, col.latitude + ε = 0.3 + return [lat + ε, lon - ε, lat - ε, lon + ε] +end + end # module NumericalEarthCDSAPIExt diff --git a/src/DataWrangling/ERA5/ERA5_pressure_levels.jl b/src/DataWrangling/ERA5/ERA5_pressure_levels.jl index beac2ab74..3be6eadb1 100644 --- a/src/DataWrangling/ERA5/ERA5_pressure_levels.jl +++ b/src/DataWrangling/ERA5/ERA5_pressure_levels.jl @@ -98,8 +98,12 @@ ERA5PL_netcdf_variable_names = Dict( # Variables available for download available_variables(::ERA5PressureLevelsDataset) = ERA5PL_dataset_variable_names -dataset_variable_name(md::ERA5PressureMetadata) = ERA5PL_dataset_variable_names[md.name] -netcdf_variable_name(md::ERA5PressureMetadata) = ERA5PL_netcdf_variable_names[md.name] + +# `dataset_variable_name` returns the short name as stored in the NetCDF file +# (e.g. "u"). The CDS API catalog name (e.g. "u_component_of_wind") used in +# download requests is accessed via the `ERA5PL_dataset_variable_names` dict +# directly in `NumericalEarthCDSAPIExt`. +dataset_variable_name(md::ERA5PressureMetadata) = ERA5PL_netcdf_variable_names[md.name] conversion_units(md::ERA5PressureMetadata) = md.name == :geopotential_height ? InverseGravity() : nothing @@ -115,7 +119,7 @@ Returns a 3D array (lon, lat, level) with levels ordered bottom-to-top """ function retrieve_data(metadata::ERA5PressureMetadatum) path = metadata_path(metadata) - name = netcdf_variable_name(metadata) + name = dataset_variable_name(metadata) ds = NCDatasets.Dataset(path) data = ds[name][:, :, :, 1] # (lon, lat, pressure_level, time=1) close(ds) @@ -204,7 +208,7 @@ function z_interfaces(metadata::ERA5PressureMetadata) download_dataset(ϕ_metadata) return mean_geopotential_z_interfaces(metadata) catch e - @warn "Failed to download geopotential data; falling back to standard atmosphere" exception=e + @warn "Failed to derive geopotential heights; falling back to standard atmosphere" exception=(e, catch_backtrace()) end end @@ -230,10 +234,17 @@ function mean_geopotential_heights(metadata::ERA5PressureMetadata) ϕ_metadata = Metadata(:geopotential; dataset=metadata.dataset, dates=metadata.dates, region=metadata.region, dir=metadata.dir) - heights = zeros(length(metadata.dataset.pressure_levels)) + Nz = length(metadata.dataset.pressure_levels) + heights = zeros(Nz) # average over time for ϕ_datum in ϕ_metadata data = retrieve_data(ϕ_datum) ./ Float32(ERA5_gravitational_acceleration) # Φ → Z (m) + if size(data, 3) != Nz + error("Cached geopotential file at $(metadata_path(ϕ_datum)) has " * + "$(size(data, 3)) pressure levels, but the dataset configuration " * + "expects $Nz. This is most likely a stale cache from a previous " * + "run with different `pressure_levels`. Delete the file and re-run.") + end # average over horizontal dims data_mean = mean(data; dims=(1, 2)) heights .+= dropdims(data_mean; dims=(1, 2)) diff --git a/src/DataWrangling/ERA5/ERA5_single_levels.jl b/src/DataWrangling/ERA5/ERA5_single_levels.jl index 08d54f1a0..5c00626da 100644 --- a/src/DataWrangling/ERA5/ERA5_single_levels.jl +++ b/src/DataWrangling/ERA5/ERA5_single_levels.jl @@ -108,8 +108,12 @@ ERA5_netcdf_variable_names = Dict( # Variables available for download available_variables(::ERA5Dataset) = ERA5_dataset_variable_names -dataset_variable_name(md::ERA5Metadata) = ERA5_dataset_variable_names[md.name] -netcdf_variable_name(md::ERA5Metadata) = ERA5_netcdf_variable_names[md.name] + +# `dataset_variable_name` returns the short name as stored in the NetCDF file +# (e.g. "t2m"). The CDS API catalog name (e.g. "2m_temperature") used in +# download requests is accessed via the `ERA5_dataset_variable_names` dict +# directly in `NumericalEarthCDSAPIExt`. +dataset_variable_name(md::ERA5Metadata) = ERA5_netcdf_variable_names[md.name] default_inpainting(md::ERA5Metadata) = nothing @@ -121,7 +125,7 @@ ERA5 is 2D surface data, so we return a 2D array with an added singleton z-dimen """ function retrieve_data(metadata::ERA5Metadatum) path = metadata_path(metadata) - name = netcdf_variable_name(metadata) + name = dataset_variable_name(metadata) ds = NCDatasets.Dataset(path) diff --git a/src/DataWrangling/metadata_field.jl b/src/DataWrangling/metadata_field.jl index c541f3dbb..eb377c4dd 100644 --- a/src/DataWrangling/metadata_field.jl +++ b/src/DataWrangling/metadata_field.jl @@ -282,6 +282,16 @@ function column_field_from_file(metadata, arch; halo=(3, 3, 3), kw...) end _, _, Nz, _ = size(metadata) + + # Validate that the cached file's vertical extent matches the dataset + # configuration. A common cause of mismatch is a stale cache from a previous + # run with a different vertical configuration (e.g. ERA5 `pressure_levels`). + if is_three_dimensional(metadata) && length(data_size) >= 3 && data_size[3] != Nz + error("Cached file $(path) has $(data_size[3]) vertical levels, but the " * + "dataset configuration expects $Nz. This is most likely a stale " * + "cache from a previous run with a different vertical configuration. " * + "Delete the file and re-run.") + end z = z_interfaces(metadata) FT = eltype(metadata) From c7ad02dd775a93586f6af11f1b98c6c535c29ff1 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Thu, 30 Apr 2026 09:08:00 -0600 Subject: [PATCH 48/53] Add network-free tests for CDSAPIExt dispatch and NetCDF helpers Two new top-level testsets exercise internals of `NumericalEarthCDSAPIExt` that were previously uncovered. All tests are hermetic (no CDS API, no filesystem dependencies beyond `mktempdir`). - "ERA5 CDSAPIExt dispatch helpers and area construction": * `cds_product`, `cds_varnames`, `nc_varnames`, `coord_vars` for single-level and pressure-level datasets; * `extra_request_keys!` (no-op for single level, populates `pressure_level` for pressure-level datasets); * `build_era5_area` for `Nothing`, `BoundingBox` (both axes set, one axis missing), `Column{Linear}`, `Column{Nearest}`. - "ERA5 CDSAPIExt NetCDF copy and split helpers": * `ncvar_copy!` round-trips data, attributes, and fill values; * `ncvar_copy_tslice!` correctly handles both time-dependent and time-independent variables; * `split_era5_nc` and `split_era5_nc_multistep` produce one output file per (variable[, timestep]) request, skipping variables not present in the source. Synthetic ERA5-shaped NetCDFs are written via NCDatasets and discarded with `mktempdir`; the extension's private symbols are reached through `Base.get_extension`. Co-Authored-By: Claude Opus 4.7 (1M context) --- test/test_cds_downloading.jl | 262 ++++++++++++++++++++++++++++++++++- 1 file changed, 259 insertions(+), 3 deletions(-) diff --git a/test/test_cds_downloading.jl b/test/test_cds_downloading.jl index 0211f353e..159a238bc 100644 --- a/test/test_cds_downloading.jl +++ b/test/test_cds_downloading.jl @@ -6,11 +6,16 @@ using Dates using NCDatasets using NumericalEarth.DataWrangling.ERA5 -using NumericalEarth.DataWrangling.ERA5: ERA5HourlySingleLevel, ERA5MonthlySingleLevel, ERA5_dataset_variable_names +using NumericalEarth.DataWrangling.ERA5: ERA5HourlySingleLevel, ERA5MonthlySingleLevel, + ERA5_dataset_variable_names, ERA5_netcdf_variable_names using NumericalEarth.DataWrangling.ERA5: ERA5HourlyPressureLevels, ERA5MonthlyPressureLevels, ERA5_all_pressure_levels, ERA5PL_dataset_variable_names, - pressure_field -using NumericalEarth.DataWrangling: metadata_path, download_dataset + ERA5PL_netcdf_variable_names, pressure_field +using NumericalEarth.DataWrangling: metadata_path, download_dataset, BoundingBox, Column, Linear, Nearest + +# Internal extension module — exposes dispatch helpers and NetCDF utilities +# that are not part of the public API but worth pinning behavior for. +const CDSExt = Base.get_extension(NumericalEarth, :NumericalEarthCDSAPIExt) # Test date: Kyoto Protocol ratification date, February 16, 2005 start_date = DateTime(2005, 2, 16, 12) @@ -310,3 +315,254 @@ start_date = DateTime(2005, 2, 16, 12) end end end + +@testset "ERA5 CDSAPIExt dispatch helpers and area construction" begin + sl = ERA5HourlySingleLevel() + pl = ERA5HourlyPressureLevels(pressure_levels=[500hPa, 850hPa]) + + @testset "cds_product / cds_varnames / nc_varnames" begin + @test CDSExt.cds_product(sl) == "reanalysis-era5-single-levels" + @test CDSExt.cds_product(pl) == "reanalysis-era5-pressure-levels" + + @test CDSExt.cds_varnames(sl) === ERA5_dataset_variable_names + @test CDSExt.cds_varnames(pl) === ERA5PL_dataset_variable_names + + @test CDSExt.nc_varnames(sl) === ERA5_netcdf_variable_names + @test CDSExt.nc_varnames(pl) === ERA5PL_netcdf_variable_names + end + + @testset "coord_vars" begin + sl_coords = CDSExt.coord_vars(sl) + pl_coords = CDSExt.coord_vars(pl) + + @test sl_coords isa Set + @test "longitude" in sl_coords + @test "latitude" in sl_coords + @test "valid_time" in sl_coords + @test !("pressure_level" in sl_coords) + + @test "longitude" in pl_coords + @test "pressure_level" in pl_coords + @test "level" in pl_coords + end + + @testset "extra_request_keys!" begin + # ERA5Dataset (single level): no-op + request = Dict{String, Any}("variable" => ["2m_temperature"]) + CDSExt.extra_request_keys!(request, sl) + @test !haskey(request, "pressure_level") + + # ERA5PressureLevelsDataset: populates `pressure_level` (in hPa, as strings) + CDSExt.extra_request_keys!(request, pl) + @test haskey(request, "pressure_level") + @test Set(request["pressure_level"]) == Set(["500", "850"]) + end + + @testset "build_era5_area" begin + # Nothing → nothing + @test CDSExt.build_era5_area(nothing) === nothing + + # BoundingBox with both axes → [N, W, S, E] + bbox = BoundingBox(longitude=(-10.0, 5.0), latitude=(40.0, 50.0)) + @test CDSExt.build_era5_area(bbox) == [50.0, -10.0, 40.0, 5.0] + + # BoundingBox with one axis missing → nothing (CDS gets the global slab) + bbox_no_lat = BoundingBox(longitude=(-10.0, 5.0)) + @test CDSExt.build_era5_area(bbox_no_lat) === nothing + bbox_no_lon = BoundingBox(latitude=(40.0, 50.0)) + @test CDSExt.build_era5_area(bbox_no_lon) === nothing + + # Column with Nearest interpolation → tight ε=1e-3 box around the point + col_nr = Column(-61.5, 18.0; interpolation=Nearest()) + area_nr = CDSExt.build_era5_area(col_nr) + @test length(area_nr) == 4 + # [N, W, S, E] + @test area_nr[1] ≈ 18.0 + 1e-3 # north + @test area_nr[2] ≈ -61.5 - 1e-3 # west + @test area_nr[3] ≈ 18.0 - 1e-3 # south + @test area_nr[4] ≈ -61.5 + 1e-3 # east + + # Column with Linear interpolation → ε=0.3 padding for 2x2 stencil + col_lin = Column(-61.5, 18.0; interpolation=Linear()) + area_lin = CDSExt.build_era5_area(col_lin) + @test area_lin[1] ≈ 18.0 + 0.3 + @test area_lin[2] ≈ -61.5 - 0.3 + @test area_lin[3] ≈ 18.0 - 0.3 + @test area_lin[4] ≈ -61.5 + 0.3 + # Linear box must enclose more than one ERA5 grid cell (0.25°) + @test (area_lin[1] - area_lin[3]) > 0.25 + @test (area_lin[4] - area_lin[2]) > 0.25 + end +end + +@testset "ERA5 CDSAPIExt NetCDF copy and split helpers" begin + # Helper: write a synthetic ERA5-like NetCDF with `Nt` timesteps and two + # variables (`u`, `v`) on dims (longitude, latitude, valid_time). + function write_synthetic_era5_nc(path; Nx=2, Ny=2, Nt=3) + NCDatasets.Dataset(path, "c") do ds + NCDatasets.defDim(ds, "longitude", Nx) + NCDatasets.defDim(ds, "latitude", Ny) + NCDatasets.defDim(ds, "valid_time", Nt) + ds.attrib["title"] = "synthetic_era5_test" + + lon = NCDatasets.defVar(ds, "longitude", Float64, ("longitude",)) + lat = NCDatasets.defVar(ds, "latitude", Float64, ("latitude",)) + t = NCDatasets.defVar(ds, "valid_time", Int64, ("valid_time",)) + lon[:] = collect(range(-1.0, 1.0; length=Nx)) + lat[:] = collect(range(40.0, 41.0; length=Ny)) + t[:] = collect(1:Nt) + + # u: includes _FillValue and a custom attribute + u = NCDatasets.defVar(ds, "u", Float32, + ("longitude", "latitude", "valid_time"); + fillvalue=Float32(-9999.0)) + u.attrib["units"] = "m s**-1" + u.attrib["long_name"] = "u_component_of_wind" + for k in 1:Nt, j in 1:Ny, i in 1:Nx + u[i, j, k] = Float32(100k + 10j + i) + end + + # v: no fill value, no extra attributes + v = NCDatasets.defVar(ds, "v", Float32, + ("longitude", "latitude", "valid_time")) + for k in 1:Nt, j in 1:Ny, i in 1:Nx + v[i, j, k] = Float32(-(100k + 10j + i)) + end + end + end + + coord_vars = CDSExt.ERA5_COORD_VARS + + @testset "ncvar_copy! preserves data, attributes, fill value" begin + mktempdir() do dir + src_path = joinpath(dir, "src.nc") + dst_path = joinpath(dir, "dst.nc") + write_synthetic_era5_nc(src_path; Nx=3, Ny=2, Nt=1) + + NCDatasets.Dataset(src_path, "r") do src + NCDatasets.Dataset(dst_path, "c") do dst + for (dname, dlen) in src.dim + NCDatasets.defDim(dst, dname, dlen) + end + CDSExt.ncvar_copy!(dst, src["u"], "u") + end + end + + NCDatasets.Dataset(dst_path, "r") do dst + @test haskey(dst, "u") + @test eltype(dst["u"].var) == Float32 + @test dst["u"].attrib["units"] == "m s**-1" + @test dst["u"].attrib["long_name"] == "u_component_of_wind" + @test dst["u"].attrib["_FillValue"] == Float32(-9999.0) + + NCDatasets.Dataset(src_path, "r") do src + @test dst["u"].var[:] == src["u"].var[:] + end + end + end + end + + @testset "ncvar_copy_tslice! extracts a single timestep" begin + mktempdir() do dir + src_path = joinpath(dir, "src.nc") + dst_path = joinpath(dir, "dst.nc") + write_synthetic_era5_nc(src_path; Nx=2, Ny=2, Nt=3) + + tidx = 2 + time_dimnames = Set(["valid_time"]) + + NCDatasets.Dataset(src_path, "r") do src + NCDatasets.Dataset(dst_path, "c") do dst + for (dname, dlen) in src.dim + out_len = dname in time_dimnames ? 1 : dlen + NCDatasets.defDim(dst, dname, out_len) + end + CDSExt.ncvar_copy_tslice!(dst, src["u"], "u", tidx, time_dimnames) + # `valid_time` is a coord variable in the file — copy that too, + # using the same tslice path. Exercises the has_time branch. + CDSExt.ncvar_copy_tslice!(dst, src["valid_time"], "valid_time", tidx, time_dimnames) + # `longitude` has no time dim — exercises the !has_time branch. + CDSExt.ncvar_copy_tslice!(dst, src["longitude"], "longitude", tidx, time_dimnames) + end + end + + NCDatasets.Dataset(dst_path, "r") do dst + @test dst.dim["valid_time"] == 1 + @test size(dst["u"]) == (2, 2, 1) + @test dst["valid_time"][:] == [tidx] + + NCDatasets.Dataset(src_path, "r") do src + @test dst["u"].var[:, :, 1] == src["u"].var[:, :, tidx] + @test dst["longitude"][:] == src["longitude"][:] + end + end + end + end + + @testset "split_era5_nc produces per-variable files" begin + mktempdir() do dir + src_path = joinpath(dir, "src.nc") + write_synthetic_era5_nc(src_path; Nx=2, Ny=2, Nt=1) + + pairs = [ + ("u", joinpath(dir, "u_only.nc")), + ("v", joinpath(dir, "v_only.nc")), + ("missing_var", joinpath(dir, "should_not_exist.nc")), + ] + + CDSExt.split_era5_nc(src_path, pairs, coord_vars) + + @test !isfile(joinpath(dir, "should_not_exist.nc")) + + for (vname, dst_path) in pairs[1:2] + @test isfile(dst_path) + NCDatasets.Dataset(dst_path, "r") do dst + @test haskey(dst, vname) + other = vname == "u" ? "v" : "u" + @test !haskey(dst, other) + NCDatasets.Dataset(src_path, "r") do src + @test dst[vname].var[:] == src[vname].var[:] + end + end + end + end + end + + @testset "split_era5_nc_multistep produces per-(var,timestep) files" begin + mktempdir() do dir + src_path = joinpath(dir, "src.nc") + write_synthetic_era5_nc(src_path; Nx=2, Ny=2, Nt=3) + + triples = [ + ("u", 1, joinpath(dir, "u_t1.nc")), + ("u", 3, joinpath(dir, "u_t3.nc")), + ("v", 2, joinpath(dir, "v_t2.nc")), + # Variable not present in source — silently skipped, no file. + ("missing_var", 1, joinpath(dir, "should_not_exist.nc")), + ] + time_dimnames = Set(["valid_time"]) + + CDSExt.split_era5_nc_multistep(src_path, triples, coord_vars, time_dimnames) + + # The skipped variable produces no output. + @test !isfile(joinpath(dir, "should_not_exist.nc")) + + for (vname, tidx, dst_path) in triples[1:3] + @test isfile(dst_path) + NCDatasets.Dataset(dst_path, "r") do dst + @test haskey(dst, vname) + @test dst.dim["valid_time"] == 1 + @test haskey(dst, "longitude") + @test haskey(dst, "latitude") + # The other ERA5 variable should not have leaked in. + other = vname == "u" ? "v" : "u" + @test !haskey(dst, other) + + NCDatasets.Dataset(src_path, "r") do src + @test dst[vname].var[:, :, 1] == src[vname].var[:, :, tidx] + end + end + end + end + end +end From 5eacbc74207e437bfd57429599771fc483a3d529 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Thu, 30 Apr 2026 09:57:25 -0600 Subject: [PATCH 49/53] Cleanup --- Project.toml | 2 +- examples/ERA5_bounded_pressure_level_data.jl | 9 ++++----- examples/ERA5_winds_and_stokes_drift.jl | 1 - src/DataWrangling/metadata_field.jl | 4 ++-- 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/Project.toml b/Project.toml index e33fb9936..ab7f75323 100644 --- a/Project.toml +++ b/Project.toml @@ -58,9 +58,9 @@ Breeze = "0.4" CDSAPI = "2.2.1" CFTime = "0.1, 0.2" ClimaSeaIce = "0.4.4, 0.5" -CubedSphere = "0.3.4" CondaPkg = "0.2.33" CopernicusMarine = "0.1.1" +CubedSphere = "0.3.4" DataDeps = "0.7" Dates = "<0.0.1, 1" Distances = "0.10" diff --git a/examples/ERA5_bounded_pressure_level_data.jl b/examples/ERA5_bounded_pressure_level_data.jl index ad9b6a909..767773d6c 100644 --- a/examples/ERA5_bounded_pressure_level_data.jl +++ b/examples/ERA5_bounded_pressure_level_data.jl @@ -27,7 +27,6 @@ selected_levels = filter(≥(250hPa), ERA5_all_pressure_levels) # select all lev dataset = ERA5HourlyPressureLevels(selected_levels) dates = DateTime(2004, 12, 16):Hour(1):DateTime(2005, 01, 09) # van Zanten et al 2011 study period -#dates = DateTime(2004, 12, 16):Hour(1):DateTime(2004, 12, 23) # shorter time range for demo # Rauber et al 2007, Fig 1: focus region region = BoundingBox(latitude=(17, 18.5), longitude=(-62.5, -61)) @@ -55,19 +54,19 @@ v_series = FieldTimeSeries(v_meta; time_indices_in_memory=2) # ## Vertical profiles (mean ± spread) # Height coordinate (m) from the grid -_, _, z = nodes(T_series[1]) +z = znodes(T_series[1]) Nz = length(z) Nt = length(T_series.times) -# Pressure at each level (hPa), ordered bottom-to-top (k=1 = highest pressure) -p_levels = sort(selected_levels, rev=true) ./ hPa # in hPa +# Pressure at each level (hPa), ordered bottom-to-top (k=1 ⇒ highest pressure) +p_levels = sort(selected_levels, rev=true) ./ hPa # Pa → hPa # Horizontal-mean profile for each snapshot → (Nz, Nt) arrays function horizontal_mean_profiles(series) profiles = zeros(Nz, Nt) for n in 1:Nt data = interior(series[n], :, :, :) - profiles[:, n] = dropdims(mean(data, dims=(1, 2)), dims=(1, 2)) + profiles[:, n] = mean(data, dims=(1, 2)) end return profiles end diff --git a/examples/ERA5_winds_and_stokes_drift.jl b/examples/ERA5_winds_and_stokes_drift.jl index b2a8bcf43..2ccfedecc 100644 --- a/examples/ERA5_winds_and_stokes_drift.jl +++ b/examples/ERA5_winds_and_stokes_drift.jl @@ -46,7 +46,6 @@ grid = LatitudeLongitudeGrid(size = (1440, 720, 1), longitude = (0, 360), latitude = (-90, 90), z = (0, 1)) -@show grid u_stokes = CenterField(grid) v_stokes = CenterField(grid) diff --git a/src/DataWrangling/metadata_field.jl b/src/DataWrangling/metadata_field.jl index eb377c4dd..d5656f320 100644 --- a/src/DataWrangling/metadata_field.jl +++ b/src/DataWrangling/metadata_field.jl @@ -411,10 +411,10 @@ function set_metadata_field!(field, data, metadatum) Nx, Ny, Nz = size(metadatum) mangling = if size(data, 2) == Ny-1 - @info "Shifting field southward" + @debug "Shifting field southward" ShiftSouth() elseif size(data, 2) == Ny+1 - @info "Averaging field in north-south dir" + @debug "Averaging field in north-south dir" AverageNorthSouth() else nothing From eeb699a5769e51522a5d0bc4f6d32ec1d3df0020 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Thu, 30 Apr 2026 10:40:30 -0600 Subject: [PATCH 50/53] Add unit-level tests for ERA5 single-level helpers Cover the parts of `src/DataWrangling/ERA5/ERA5_single_levels.jl` that the integration tests don't exercise directly: hourly `all_dates` step (mirrors the existing monthly test), the API/netcdf variable-name dicts staying in sync, the `available_variables` / `dataset_variable_name` dispatch (catches the easy swap between API catalog name and netcdf short name), `default_inpainting` returning `nothing` (the wrong value here silently makes Field construction expensive), and `metadata_prefix` filename construction across the single-date / multi-date / no-region branches plus filename-safety transformations. Co-Authored-By: Claude Opus 4.7 (1M context) --- test/test_cds_downloading.jl | 61 ++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/test/test_cds_downloading.jl b/test/test_cds_downloading.jl index 159a238bc..5edee1204 100644 --- a/test/test_cds_downloading.jl +++ b/test/test_cds_downloading.jl @@ -143,6 +143,67 @@ start_date = DateTime(2005, 2, 16, 12) @test step(dates) == Month(1) end + @testset "ERA5 single-level all_dates (Hourly)" begin + hourly_dataset = ERA5HourlySingleLevel() + dates = NumericalEarth.DataWrangling.all_dates(hourly_dataset, :temperature) + @test first(dates) == DateTime("1940-01-01") + @test step(dates) == Hour(1) + end + + @testset "ERA5 single-level dispatch helpers" begin + ds = ERA5HourlySingleLevel() + md = Metadatum(:temperature; dataset=ds, region, date=start_date) + + # API-name and netcdf-name dicts cover the same variable set — + # catches forgetting to add a new variable to one of the two + @test keys(ERA5_dataset_variable_names) == keys(ERA5_netcdf_variable_names) + + # available_variables returns the API-name dict (used to build CDS requests), + # not the netcdf short-name dict — guards against the easy swap-mistake + @test NumericalEarth.DataWrangling.available_variables(ds) === ERA5_dataset_variable_names + + # dataset_variable_name returns the netcdf short name (read from file), + # not the API catalog name — same swap risk + @test NumericalEarth.DataWrangling.dataset_variable_name(md) == "t2m" + + # default_inpainting is nothing for ERA5 (vs NearestNeighborInpainting for ECCO); + # accidentally enabling it would massively slow Field construction + @test NumericalEarth.DataWrangling.default_inpainting(md) === nothing + end + + @testset "ERA5 single-level metadata_prefix" begin + ds = ERA5HourlySingleLevel() + + # Single-date metadatum, with region: prefix should not duplicate the date + md_single = Metadatum(:temperature; dataset=ds, region, date=start_date) + prefix_single = NumericalEarth.DataWrangling.ERA5.metadata_prefix(md_single) + @test occursin("2m_temperature", prefix_single) + @test occursin("ERA5HourlySingleLevel", prefix_single) + @test occursin("2005-02-16", prefix_single) + @test count("2005-02-16", prefix_single) == 1 # date appears once for single-date + @test occursin("0.0", prefix_single) # west bound + @test occursin("5.0", prefix_single) # east bound + @test occursin("40.0", prefix_single) # south bound + @test occursin("45.0", prefix_single) # north bound + # Filename safety + @test !occursin(":", prefix_single) # colons replaced by dashes + @test !occursin(" ", prefix_single) # spaces replaced by underscores + + # Single-date metadatum, no region: suffix should be empty + md_no_region = Metadatum(:temperature; dataset=ds, date=start_date) + prefix_no_region = NumericalEarth.DataWrangling.ERA5.metadata_prefix(md_no_region) + @test !occursin("0.0", prefix_no_region) + @test !occursin("nothing", prefix_no_region) + + # Multi-date metadata: prefix should include both start and end dates + end_date = start_date + Hour(2) + md_multi = Metadata(:temperature; dataset=ds, region, + dates=start_date:Hour(1):end_date) + prefix_multi = NumericalEarth.DataWrangling.ERA5.metadata_prefix(md_multi) + @test occursin("2005-02-16T12", prefix_multi) + @test occursin("2005-02-16T14", prefix_multi) + end + @testset "ERA5HourlyPressureLevels construction and metadata" begin # Default constructor uses all 37 standard levels ds_full = ERA5HourlyPressureLevels() From 445f49322e537424d1cfb9393dbdeea9aa484ce2 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Thu, 30 Apr 2026 10:55:36 -0600 Subject: [PATCH 51/53] Add tests for calendar-day grouping and skip_existing short-circuit Extract `_group_by_calendar_day(datetimes)` from the inline comprehensions in two `download_dataset` overloads so the grouping logic is testable in isolation. Test boundary cases: - 00:00 belongs to its own day, not the previous one - multi-day interleaved input - duplicate datetimes are preserved - single-element input Also add tests for the `skip_existing=true` short-circuit in three multi-file paths (multi-variable pressure-level, single-variable multi-date, multi-variable multi-date). Pre-create the expected output files in a tempdir and assert each path returns without invoking CDSAPI; if the short-circuit ever regresses the test will throw a credentials/network error and fail loudly. Co-Authored-By: Claude Opus 4.7 (1M context) --- ext/NumericalEarthCDSAPIExt.jl | 18 +++++-- test/test_cds_downloading.jl | 97 ++++++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+), 4 deletions(-) diff --git a/ext/NumericalEarthCDSAPIExt.jl b/ext/NumericalEarthCDSAPIExt.jl index fcbe1271c..9b979a491 100644 --- a/ext/NumericalEarthCDSAPIExt.jl +++ b/ext/NumericalEarthCDSAPIExt.jl @@ -137,8 +137,7 @@ end function download_dataset(metadata::ERA5Metadata; skip_existing=true, cleanup=true) dates = metadata.dates isa AbstractVector ? metadata.dates : [metadata.dates] - grouped = Dict(d => filter(dt -> Dates.Date(dt) == d, dates) - for d in unique(Dates.Date.(dates))) + grouped = _group_by_calendar_day(dates) for day in sort(collect(keys(grouped))) download_era5_day(metadata.name, metadata.dataset, grouped[day]; @@ -148,6 +147,18 @@ function download_dataset(metadata::ERA5Metadata; skip_existing=true, cleanup=tr end end +""" + _group_by_calendar_day(datetimes) + +Group an iterable of `DateTime`s by calendar day. Returns a `Dict{Date, Vector}` +where each value is the subset of `datetimes` whose calendar day equals the key. +The `00:00` instant of a day belongs to that day (not the previous one). +""" +function _group_by_calendar_day(datetimes) + return Dict(d => filter(dt -> Dates.Date(dt) == d, datetimes) + for d in unique(Dates.Date.(datetimes))) +end + function download_era5_day(name, dataset, day_dates; region, dir, skip_existing, cleanup) @@ -314,8 +325,7 @@ function download_dataset(names::Vector{Symbol}, skip_existing = true, cleanup = true) - grouped = Dict(d => filter(dt -> Dates.Date(dt) == d, datetimes) - for d in unique(Dates.Date.(datetimes))) + grouped = _group_by_calendar_day(datetimes) for day in sort(collect(keys(grouped))) download_era5_multivar_day(names, dataset, grouped[day]; region, dir, skip_existing, cleanup) diff --git a/test/test_cds_downloading.jl b/test/test_cds_downloading.jl index 5edee1204..5a9174cfc 100644 --- a/test/test_cds_downloading.jl +++ b/test/test_cds_downloading.jl @@ -456,6 +456,103 @@ end end end +@testset "ERA5 CDSAPIExt _group_by_calendar_day" begin + # Single calendar day with multiple hours + same_day = [DateTime(2005, 2, 16, 0), + DateTime(2005, 2, 16, 6), + DateTime(2005, 2, 16, 23)] + g = CDSExt._group_by_calendar_day(same_day) + @test length(g) == 1 + @test Date(2005, 2, 16) in keys(g) + @test length(g[Date(2005, 2, 16)]) == 3 + + # Boundary: 00:00 belongs to its OWN day (not the previous one) + midnight_pair = [DateTime(2005, 2, 16, 23), + DateTime(2005, 2, 17, 0)] + g = CDSExt._group_by_calendar_day(midnight_pair) + @test length(g) == 2 + @test g[Date(2005, 2, 16)] == [DateTime(2005, 2, 16, 23)] + @test g[Date(2005, 2, 17)] == [DateTime(2005, 2, 17, 0)] + + # Multiple days, interleaved order — grouping must be order-independent + mixed = [DateTime(2005, 2, 17, 6), + DateTime(2005, 2, 16, 6), + DateTime(2005, 2, 17, 12), + DateTime(2005, 2, 16, 18)] + g = CDSExt._group_by_calendar_day(mixed) + @test length(g) == 2 + @test Set(g[Date(2005, 2, 16)]) == Set([DateTime(2005, 2, 16, 6), DateTime(2005, 2, 16, 18)]) + @test Set(g[Date(2005, 2, 17)]) == Set([DateTime(2005, 2, 17, 6), DateTime(2005, 2, 17, 12)]) + + # Duplicate datetimes are preserved (CDS will dedupe; we don't) + dups = [DateTime(2005, 2, 16, 12), DateTime(2005, 2, 16, 12)] + g = CDSExt._group_by_calendar_day(dups) + @test length(g) == 1 + @test length(g[Date(2005, 2, 16)]) == 2 + + # Single-element input + g = CDSExt._group_by_calendar_day([DateTime(2005, 2, 16, 12)]) + @test length(g) == 1 + @test g[Date(2005, 2, 16)] == [DateTime(2005, 2, 16, 12)] +end + +@testset "ERA5 CDSAPIExt skip_existing short-circuit" begin + # Build a temporary directory and pre-create the expected output files so + # `download_dataset(...; skip_existing=true)` returns without contacting CDS. + # If the short-circuit ever regresses, these tests will throw a credentials + # error (or 4xx from the CDS API) and fail loudly. + region = NumericalEarth.DataWrangling.BoundingBox(longitude=(0, 5), latitude=(40, 45)) + mktempdir() do tmp + ds_pl = ERA5HourlyPressureLevels(pressure_levels=[850, 500]hPa) + date1 = DateTime(2005, 2, 16, 12) + date2 = DateTime(2005, 2, 16, 18) + names = [:temperature, :eastward_velocity] + + # Helper: pre-create the file that `download_dataset` would write + function touch_expected(name, dataset, date) + md = Metadatum(name; dataset, region, date, dir=tmp) + path = metadata_path(md) + mkpath(dirname(path)) + touch(path) + return path + end + + @testset "multi-variable pressure-level (single date)" begin + paths = [touch_expected(name, ds_pl, date1) for name in names] + meta = Metadatum(:temperature; dataset=ds_pl, region, date=date1, dir=tmp) + + result = download_dataset(names, meta; skip_existing=true) + @test result isa Vector{String} + @test length(result) == length(names) + @test Set(result) == Set(paths) + end + + @testset "single-variable multi-date (download_era5_day)" begin + # All hours of date1, date2 already on disk + ds_sl = ERA5HourlySingleLevel() + for dt in (date1, date2) + touch_expected(:temperature, ds_sl, dt) + end + + # Returns nothing without raising — the early-return guard fires + @test CDSExt.download_era5_day(:temperature, ds_sl, [date1, date2]; + region, dir=tmp, + skip_existing=true, cleanup=true) === nothing + end + + @testset "multi-variable multi-date (download_era5_multivar_day)" begin + ds_sl = ERA5HourlySingleLevel() + for name in names, dt in (date1, date2) + touch_expected(name, ds_sl, dt) + end + + @test CDSExt.download_era5_multivar_day(names, ds_sl, [date1, date2]; + region, dir=tmp, + skip_existing=true, cleanup=true) === nothing + end + end +end + @testset "ERA5 CDSAPIExt NetCDF copy and split helpers" begin # Helper: write a synthetic ERA5-like NetCDF with `Nt` timesteps and two # variables (`u`, `v`) on dims (longitude, latitude, valid_time). From 30b95603a32be877f7876dc23032b432014f2a72 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Thu, 30 Apr 2026 11:45:07 -0600 Subject: [PATCH 52/53] Add unit tests for pure helpers in metadata_field, ERA5_pressure_levels, and CDSAPI ext Cover several pure functions and dispatch overloads that the integration tests either skip or don't exercise directly. Each test is no-network and self-contained. `metadata_field.jl`: - `restrict` (BoundingBox grid construction): identity, half-domain, small bbox, off-origin bbox, and the `Nothing` pass-through dispatch. - `restrict_location` for all three region kinds (`BoundingBox` / `Nothing` / `Column`), confirming the Column path reduces horizontal locations to `Nothing`. `ERA5_pressure_levels.jl`: - Constructors sort levels descending: pass ASCENDING input (`[500, 850]hPa`) so the test fails if `sort(...; rev=true)` regresses to a no-op or different order. Covered for both Hourly and Monthly variants. - `stagger`: pure function that converts ascending centers to Nz+1 staggered interfaces. Covers two-element evenly-spaced, three-element evenly-spaced, and three-element irregular cases (verifies extrapolation formula at top/bot and midpoint formula in the interior). `NumericalEarthCDSAPIExt.jl`: - `is_zip`: ZIP magic header detected, non-magic bytes rejected, short (<4 byte) files rejected. - `foreach_nc`: non-zip path calls `f` exactly once with the input path; zip path extracts and visits each `.nc`, ignoring non-`.nc` entries. Co-Authored-By: Claude Opus 4.7 (1M context) --- test/test_cds_downloading.jl | 106 +++++++++++++++++++++++++++++++++++ test/test_column_field.jl | 47 ++++++++++++++++ 2 files changed, 153 insertions(+) diff --git a/test/test_cds_downloading.jl b/test/test_cds_downloading.jl index 5a9174cfc..e1bd366f1 100644 --- a/test/test_cds_downloading.jl +++ b/test/test_cds_downloading.jl @@ -244,6 +244,55 @@ start_date = DateTime(2005, 2, 16, 12) @test z1[1] < z1[2] end + @testset "ERA5 pressure-level constructors sort levels descending" begin + # Pass ASCENDING input so the test fails if the inner constructor's + # `sort(...; rev=true)` regresses to a no-op or different order. + ds_h = ERA5HourlyPressureLevels([500, 850]hPa) + @test ds_h.pressure_levels == [850 * hPa, 500 * hPa] # stored highest-pressure-first + @test ds_h.z === nothing + @test ds_h.mean_geopotential_height == true # default kwarg + + ds_m = ERA5MonthlyPressureLevels([500, 850]hPa) + @test ds_m.pressure_levels == [850 * hPa, 500 * hPa] + @test ds_m.z === nothing + @test ds_m.mean_geopotential_height == true + + # Already-descending input is preserved (sort is a no-op here) + ds_h2 = ERA5HourlyPressureLevels([850, 500]hPa) + @test ds_h2.pressure_levels == [850 * hPa, 500 * hPa] + + # Three-level shuffled input + ds_h3 = ERA5HourlyPressureLevels([500, 850, 700]hPa) + @test ds_h3.pressure_levels == [850 * hPa, 700 * hPa, 500 * hPa] + end + + @testset "ERA5 pressure-level stagger" begin + stagger = NumericalEarth.DataWrangling.ERA5.stagger + + # Two-element input (evenly spaced): bottom and top faces are + # extrapolated symmetrically; result is Nz+1 monotonic. + zf = stagger([0.0, 1.0]) + @test length(zf) == 3 + @test issorted(zf) + @test zf ≈ [-0.5, 0.5, 1.5] + + # Three-element evenly-spaced: every interior interface is the + # midpoint of the adjacent centers; bottom/top are extrapolated. + zf = stagger([1.0, 3.0, 5.0]) + @test length(zf) == 4 + @test zf ≈ [0.0, 2.0, 4.0, 6.0] + + # Three-element irregularly-spaced: interior midpoints honor the + # actual spacing (not assumed-uniform). + zf = stagger([1.0, 3.0, 7.0]) + @test length(zf) == 4 + @test zf[2] ≈ 2.0 # midpoint(1, 3) + @test zf[3] ≈ 5.0 # midpoint(3, 7) + # Boundaries extrapolated at half the adjacent interior spacing + @test zf[1] ≈ 1.0 - (zf[2] - 1.0) + @test zf[4] ≈ 7.0 + (7.0 - zf[3]) + end + for arch in test_architectures A = typeof(arch) @@ -724,3 +773,60 @@ end end end end + +@testset "ERA5 CDSAPIExt is_zip and foreach_nc" begin + @testset "is_zip" begin + mktempdir() do tmp + # File starting with the ZIP magic header + zip_path = joinpath(tmp, "fake.zip") + open(zip_path, "w") do io + write(io, UInt8[0x50, 0x4b, 0x03, 0x04, 0x00, 0x00]) + end + @test CDSExt.is_zip(zip_path) == true + + # File with arbitrary non-magic bytes (NetCDF-3 starts with "CDF\x01") + nc_path = joinpath(tmp, "fake.nc") + open(nc_path, "w") do io + write(io, UInt8[0x43, 0x44, 0x46, 0x01]) + end + @test CDSExt.is_zip(nc_path) == false + + # Short file (<4 bytes) — length check guards against false positives + short_path = joinpath(tmp, "short.bin") + open(short_path, "w") do io + write(io, UInt8[0x50, 0x4b]) # only 2 of the 4 magic bytes + end + @test CDSExt.is_zip(short_path) == false + end + end + + @testset "foreach_nc — non-zip path calls f exactly once" begin + mktempdir() do tmp + nc_path = joinpath(tmp, "data.nc") + touch(nc_path) + + received = String[] + CDSExt.foreach_nc(p -> push!(received, p), nc_path, tmp) + + @test received == [nc_path] + end + end + + @testset "foreach_nc — zip path extracts and visits each .nc" begin + mktempdir() do tmp + # Build a ZIP fixture containing two .nc files (and a non-.nc file + # that should be ignored). + nc1 = joinpath(tmp, "a.nc"); touch(nc1) + nc2 = joinpath(tmp, "b.nc"); touch(nc2) + other = joinpath(tmp, "readme.txt"); touch(other) + + zip_path = joinpath(tmp, "bundle.zip") + run(`zip -j -q $zip_path $nc1 $nc2 $other`) + + received = String[] + CDSExt.foreach_nc(p -> push!(received, basename(p)), zip_path, tmp) + + @test sort(received) == ["a.nc", "b.nc"] # readme.txt filtered out + end + end +end diff --git a/test/test_column_field.jl b/test/test_column_field.jl index 8a9b51695..bb9004e15 100644 --- a/test/test_column_field.jl +++ b/test/test_column_field.jl @@ -231,3 +231,50 @@ end @test eltype(grid) == Float32 end end + +@testset "restrict (BoundingBox grid construction helper)" begin + restrict = NumericalEarth.DataWrangling.restrict + + # Identity case: bbox covers the full domain. Grid pads by Δ/2 on each side + # so that face midpoints land on data centers. The padded extent is N+1 + # cells of width Δ exactly, but Float64 rounding can push the ceil one cell + # past that — so allow [N+1, N+2]. + grid_interfaces, rN = restrict((0.0, 360.0), (0.0, 360.0), 1440) + @test grid_interfaces[1] ≈ -0.125 + @test grid_interfaces[2] ≈ 360.125 + @test 1441 <= rN <= 1442 + + # Half-domain bbox: rN should be just over half of N. + _, rN = restrict((0.0, 180.0), (0.0, 360.0), 1440) + @test 720 < rN <= 722 # ceil(0.5 * 1440 + small) = 721 + + # Small bbox (5° wide on a 1440-cell grid): rN should be ceil(20 + small) = 21. + grid_interfaces, rN = restrict((0.0, 5.0), (0.0, 360.0), 1440) + @test grid_interfaces[1] ≈ -0.125 + @test grid_interfaces[2] ≈ 5.125 + @test rN == 21 + + # Off-origin bbox preserves width: 5° wide on a 720-cell, 180°-tall grid → + # rΔ = 5° + Δ = 5.25°, rN = ceil((5.25/180) * 720) = 21. + _, rN_off = restrict((40.0, 45.0), (-90.0, 90.0), 720) + @test rN_off == 21 + + # Pass-through for `nothing` (the no-restriction case). + @test restrict(nothing, (0.0, 360.0), 1440) == ((0.0, 360.0), 1440) +end + +@testset "restrict_location dispatch" begin + bbox = BoundingBox(longitude=(0, 5), latitude=(40, 45)) + col = Column(2.5, 42.5) + + # BoundingBox: locations passed through unchanged + @test restrict_location((Center, Center, Center), bbox) == (Center, Center, Center) + @test restrict_location((Face, Face, Center), bbox) == (Face, Face, Center) + + # Nothing: same — no restriction + @test restrict_location((Center, Center, Center), nothing) == (Center, Center, Center) + + # Column: horizontal locations reduce to Nothing, vertical preserved + @test restrict_location((Center, Center, Center), col) == (Nothing, Nothing, Center) + @test restrict_location((Face, Face, Center), col) == (Nothing, Nothing, Center) +end From bdc98d79331cf3c600397ae1a20d28f53ab4d781 Mon Sep 17 00:00:00 2001 From: Eliot Quon Date: Thu, 30 Apr 2026 14:54:47 -0600 Subject: [PATCH 53/53] Apply suggestion from @glwagner Co-authored-by: Gregory L. Wagner --- src/DataWrangling/DataWrangling.jl | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/DataWrangling/DataWrangling.jl b/src/DataWrangling/DataWrangling.jl index 0cb60d861..3741c6e8a 100644 --- a/src/DataWrangling/DataWrangling.jl +++ b/src/DataWrangling/DataWrangling.jl @@ -244,15 +244,7 @@ using .OSPapa # Fallback: if no download extension is loaded, check that all files already exist function download_dataset(metadata::Metadata) - paths = metadata_path(metadata) - paths isa AbstractString && (paths = [paths]) - missing_files = filter(!isfile, paths) - if !isempty(missing_files) - n = length(missing_files) - error("No download method is available (is the backend package loaded?) " * - "and $n data file(s) are missing. First missing: $(first(missing_files))") - end - return paths + error("No download method for $metadata is available (is the backend package loaded?)") end end # module