1 change: 1 addition & 0 deletions fme/ace/testing/__init__.py
@@ -4,6 +4,7 @@
FV3GFSData,
MonthlyReferenceData,
StatsData,
get_nd_dataset,
save_nd_netcdf,
save_scalar_netcdf,
)
23 changes: 17 additions & 6 deletions scripts/data_process/get_stats.py
@@ -88,20 +88,30 @@ def get_stats(
debug: bool,
):
# Import dask-related things here to enable testing in environments without dask.
try:
import dask
import distributed

client = distributed.Client(n_workers=16)
except ImportError as e:
        # Warn and fall back to running without a dask client.
logging.warning(f"Could not import dask ({e}), chunking is disabled.")
client = None
dask = None

initial_time = time.time()

xr.set_options(keep_attrs=True, display_max_rows=100)
logging.info(f"Reading data from {input_zarr}")

# Open data with roughly 128 MiB chunks via dask's automatic chunking. This
# is useful when opening sharded zarr stores with an inner chunk size of 1,
# which is otherwise inefficient for the type of computation done here.
if dask is not None:
with dask.config.set({"array.chunk-size": "128MiB"}):
ds = xr.open_zarr(input_zarr, chunks={"time": "auto"})
else:
ds = xr.open_zarr(input_zarr)

ds = ds.drop_vars(DROP_VARIABLES, errors="ignore")
ds = ds.sel(time=slice(config.start_date, config.end_date))
@@ -186,7 +196,8 @@ def get_stats(
total_time = time.time() - initial_time
logging.info(f"Total time for computing stats: {total_time:0.2f} seconds.")

if client is not None:
client.close()
client = None


12 changes: 12 additions & 0 deletions scripts/time_coarsen/c96-shield-stats.yaml
@@ -0,0 +1,12 @@
runs:
ic_0001: ""
ic_0002: ""
data_output_directory: /climate-default/2026-01-28-vertically-resolved-c96-1deg-daily-shield-amip-ensemble-dataset
stats:
output_directory: /climate-default/2026-01-28-vertically-resolved-c96-1deg-daily-shield-amip-ensemble-dataset-stats
beaker_dataset: 2026-01-28-vertically-resolved-c96-1deg-daily-shield-amip-ensemble-dataset-stats
start_date: "1940-01-01"
end_date: "2021-12-31"
data_type: FV3GFS
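# Runs listed below are excluded from the stats computation; ic_0002 is
# presumably held out for evaluation.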
exclude_runs:
- "ic_0002"
93 changes: 93 additions & 0 deletions scripts/time_coarsen/c96-shield.yaml
@@ -0,0 +1,93 @@
paths:
- input: /climate-default/2026-01-28-vertically-resolved-c96-1deg-shield-amip-ensemble-dataset/ic_0001.zarr
output: /climate-default/2026-01-28-vertically-resolved-c96-1deg-daily-shield-amip-ensemble-dataset/ic_0001.zarr
- input: /climate-default/2026-01-28-vertically-resolved-c96-1deg-shield-amip-ensemble-dataset/ic_0002.zarr
output: /climate-default/2026-01-28-vertically-resolved-c96-1deg-daily-shield-amip-ensemble-dataset/ic_0002.zarr
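# Combine every 4 consecutive timesteps; assuming 6-hourly input, this yields
# the daily dataset named above.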
coarsen_factor: 4
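# Instantaneous fields: coarsening keeps every coarsen_factor-th snapshot.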
snapshot_names:
- PRESsfc
- Q2m
- RH200
- RH500
- RH850
- TMP200
- TMP2m
- TMP500
- TMP850
- UGRD1000
- UGRD10m
- UGRD200
- UGRD500
- UGRD850
- VGRD1000
- VGRD10m
- VGRD200
- VGRD500
- VGRD850
- air_temperature_0
- air_temperature_1
- air_temperature_2
- air_temperature_3
- air_temperature_4
- air_temperature_5
- air_temperature_6
- air_temperature_7
- eastward_wind_0
- eastward_wind_1
- eastward_wind_2
- eastward_wind_3
- eastward_wind_4
- eastward_wind_5
- eastward_wind_6
- eastward_wind_7
- global_mean_co2
- h1000
- h50
- h500
- h850
- land_fraction
- northward_wind_0
- northward_wind_1
- northward_wind_2
- northward_wind_3
- northward_wind_4
- northward_wind_5
- northward_wind_6
- northward_wind_7
- ocean_fraction
- sea_ice_fraction
- snow_cover_fraction
- soil_moisture_0
- soil_moisture_1
- soil_moisture_2
- soil_moisture_3
- specific_total_water_0
- specific_total_water_1
- specific_total_water_2
- specific_total_water_3
- specific_total_water_4
- specific_total_water_5
- specific_total_water_6
- specific_total_water_7
- surface_temperature
- total_water_path
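# Flux and tendency fields: coarsening averages over each window of
# coarsen_factor steps.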
window_names:
- DLWRFsfc
- DSWRFsfc
- DSWRFtoa
- LHTFLsfc
- PRATEsfc
- SHTFLsfc
- ULWRFsfc
- ULWRFtoa
- USWRFsfc
- USWRFtoa
- tendency_of_total_water_path
- tendency_of_total_water_path_due_to_advection
- total_frozen_precipitation_rate
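# Time-invariant fields, matched by name prefix: copied through unchanged.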
constant_prefixes:
- ak_
- bk_
- HGTsfc
- grid_xt
- grid_yt
45 changes: 45 additions & 0 deletions scripts/time_coarsen/run.sh
@@ -0,0 +1,45 @@
#!/bin/bash

set -e

SCRIPT_PATH=$(git rev-parse --show-prefix) # relative to the root of the repository
REPO_ROOT=$(git rev-parse --show-toplevel)

cd "$REPO_ROOT"

run_coarsen() {
local config_filename="$1"
local job_name="$2"
local CONFIG_PATH="$SCRIPT_PATH/$config_filename"

# Extract additional args from config header
local extra_args=()
while IFS= read -r line; do
[[ "$line" =~ ^#\ arg:\ (.*) ]] && extra_args+=(${BASH_REMATCH[1]})
done < "$CONFIG_PATH"
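    # For example, a (hypothetical) header line `# arg: --timeout 8h` in the
    # YAML config would append `--timeout 8h` to the gantry invocation below.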

gantry run \
--name "$job_name" \
        --description 'Run time coarsening' \
        --beaker-image "$(cat "$REPO_ROOT/latest_deps_only_image.txt")" \
--workspace ai2/climate-titan \
--priority urgent \
--not-preemptible \
--cluster ai2/jupiter \
--cluster ai2/ceres \
--env GOOGLE_APPLICATION_CREDENTIALS=/tmp/google_application_credentials.json \
--dataset-secret google-credentials:/tmp/google_application_credentials.json \
--gpus 0 \
--shared-memory 400GiB \
--weka climate-default:/climate-default \
--budget ai2/climate \
--system-python \
--install "pip install --no-deps ." \
"${extra_args[@]}" \
        -- python "$SCRIPT_PATH/time_coarsen.py" "$CONFIG_PATH"
}

base_name="time-coarsen"

run_coarsen "c96-shield.yaml" "$base_name-c96-shield"
46 changes: 46 additions & 0 deletions scripts/time_coarsen/run_stats.sh
@@ -0,0 +1,46 @@
#!/bin/bash

set -e

SCRIPT_PATH=$(git rev-parse --show-prefix) # relative to the root of the repository
REPO_ROOT=$(git rev-parse --show-toplevel)

cd "$REPO_ROOT"

run_stats() {
local config_filename="$1"
local job_name="$2"
local CONFIG_PATH="$SCRIPT_PATH/$config_filename"

# Extract additional args from config header
local extra_args=()
while IFS= read -r line; do
[[ "$line" =~ ^#\ arg:\ (.*) ]] && extra_args+=(${BASH_REMATCH[1]})
done < "$CONFIG_PATH"
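    # As in run.sh, `# arg: ...` header lines in the config are split on
    # whitespace and forwarded to the gantry invocation below.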

gantry run \
--name "$job_name" \
--description 'Run ACE stats computation' \
        --beaker-image "$(cat "$REPO_ROOT/latest_deps_only_image.txt")" \
--workspace ai2/climate-titan \
--priority urgent \
--not-preemptible \
--cluster ai2/jupiter \
--cluster ai2/ceres \
--env GOOGLE_APPLICATION_CREDENTIALS=/tmp/google_application_credentials.json \
--dataset-secret google-credentials:/tmp/google_application_credentials.json \
--gpus 0 \
--shared-memory 400GiB \
--weka climate-default:/climate-default \
--budget ai2/climate \
--system-python \
--install "pip install --no-deps ." \
"${extra_args[@]}" \
        -- python "$SCRIPT_PATH/../data_process/get_stats.py" "$CONFIG_PATH" 0
}

base_name="time-coarsen-stats"

run_stats "c96-shield-stats.yaml" "$base_name-c96-shield"

72 changes: 72 additions & 0 deletions scripts/time_coarsen/test_time_coarsen.py
@@ -0,0 +1,72 @@
import numpy as np
import xarray as xr
from time_coarsen import Config, PathPair, main

from fme.ace.testing import DimSize, DimSizes, get_nd_dataset


def test_time_coarsen() -> None:
n_input_times = 5
nz_interface = 3
# Create a small but non-trivial dataset (coords + attrs)
ds = get_nd_dataset(
dim_sizes=DimSizes(
n_time=n_input_times,
horizontal=[
DimSize(name="lat", size=4),
DimSize(name="lon", size=8),
],
nz_interface=nz_interface,
),
variable_names=["temp", "temp_tendency", "flag"],
timestep_days=1.0,
include_vertical_coordinate=True,
)
ds["temp"].attrs["units"] = "K"
ds["temp_tendency"].attrs["units"] = "K/day"
constant_names = []
for name in ds.data_vars:
if name.startswith("ak_") or name.startswith("bk_"):
constant_names.append(name)
assert len(constant_names) == 2 * nz_interface # sanity check
input_path = "memory://test-time-coarsen/dataset.zarr"
output_path = "memory://test-time-coarsen/dataset_coarsened.zarr"

    # Write to an in-memory filesystem (the memory:// protocol is handled by fsspec)
ds.to_zarr(input_path)
ds = xr.open_zarr(input_path) # for comparison later, use fresh read

config = Config(
paths=[PathPair(input=input_path, output=output_path)],
coarsen_factor=2,
snapshot_names=["temp"],
window_names=["temp_tendency"],
)
main(config)

# Read back the coarsened dataset
ds_coarsened = xr.open_zarr(output_path)
    # Note that each timestep carries the tendencies which led to the current
    # snapshot alongside the snapshot itself, so the first snapshot of the
    # coarsened dataset is not the first snapshot of the input dataset.
    assert ds_coarsened.sizes["time"] == n_input_times // config.coarsen_factor
expected_snapshot_slice = slice(
config.coarsen_factor - 1, None, config.coarsen_factor
)
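    # With coarsen_factor=2 and 5 input times, this slice selects input
    # indices 1 and 3, i.e. the snapshot closing each complete window.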
np.testing.assert_array_equal(
ds_coarsened["time"].values,
ds["time"].isel(time=expected_snapshot_slice).values,
)
np.testing.assert_array_equal(
ds_coarsened["temp"].values,
ds["temp"].isel(time=expected_snapshot_slice).values,
)
np.testing.assert_array_equal(
ds_coarsened["temp_tendency"].isel(time=0).values,
ds["temp_tendency"]
.isel(time=slice(0, config.coarsen_factor))
.mean("time")
.values,
)
for name in constant_names:
np.testing.assert_array_equal(ds_coarsened[name].values, ds[name].values)