From 4dec08a57a7b26054ba51a2927732e12433f9dcc Mon Sep 17 00:00:00 2001 From: Jeremy McGibbon Date: Tue, 10 Feb 2026 19:00:54 +0000 Subject: [PATCH 01/20] add CUDATimer and NullTimer --- fme/core/benchmark/test_timer.py | 36 +++++++ fme/core/benchmark/timer.py | 171 +++++++++++++++++++++++++++++++ 2 files changed, 207 insertions(+) create mode 100644 fme/core/benchmark/test_timer.py create mode 100644 fme/core/benchmark/timer.py diff --git a/fme/core/benchmark/test_timer.py b/fme/core/benchmark/test_timer.py new file mode 100644 index 000000000..dd9cd5624 --- /dev/null +++ b/fme/core/benchmark/test_timer.py @@ -0,0 +1,36 @@ +from unittest.mock import patch + +import pytest +import torch + +from fme.core.benchmark.timer import CUDATimer + + +@pytest.mark.parametrize("is_available", [True, False]) +def test_new_if_available(is_available: bool): + from fme.core.benchmark.timer import CUDATimer, NullTimer + + with patch("torch.cuda.is_available", return_value=is_available): + timer = CUDATimer.new_if_available() + if is_available: + assert isinstance(timer, CUDATimer) + else: + assert isinstance(timer, NullTimer) + + +@pytest.mark.skipif( + not torch.cuda.is_available(), + reason="CUDA is not available, skipping CUDATimer tests.", +) +def test_timer_with_child(): + timer = CUDATimer() + with timer: + # get cuda to wait + torch.cuda._sleep(100_000) + with timer.child("child"): + torch.cuda._sleep(100_000) + result = timer.result + assert "child" in result.children + # parent time should include the child time, so it should be + # at least 2x the child time (since we sleep for the same amount of time in both) + assert result.avg_time >= 2.0 * result.children["child"].avg_time diff --git a/fme/core/benchmark/timer.py b/fme/core/benchmark/timer.py new file mode 100644 index 000000000..230fe4e72 --- /dev/null +++ b/fme/core/benchmark/timer.py @@ -0,0 +1,171 @@ +import collections +import contextlib +import dataclasses +from typing import Literal, Protocol, Self + +import torch + + +@dataclasses.dataclass +class TimerResult: + total_runs: int + avg_time: float + children: dict[str, "TimerResult"] + + def assert_close(self, other: "TimerResult", rtol=0.02, children_rtol=0.02) -> None: + if self.total_runs != other.total_runs: + raise AssertionError( + f"total_runs differ: {self.total_runs} vs {other.total_runs}" + ) + if not torch.isclose( + torch.tensor(self.avg_time), torch.tensor(other.avg_time), rtol=rtol + ): + raise AssertionError( + f"avg_time differ: {self.avg_time} vs " + f"{other.avg_time} given rtol={rtol}" + ) + if self.children.keys() != other.children.keys(): + raise AssertionError( + f"children keys differ: {self.children.keys()} vs " + f"{other.children.keys()}" + ) + for key in self.children.keys(): + try: + self.children[key].assert_close( + other.children[key], rtol=children_rtol, children_rtol=children_rtol + ) + except AssertionError as e: + raise AssertionError(f"child '{key}' differ: {e}") from e + + +class Timer(Protocol): + def child(self, name: str) -> Self: ... + def __enter__(self) -> Self: ... + def __exit__(self, exc_type, exc_val, exc_tb) -> Literal[False]: ... + + +class NullTimer: + def context(self, name: str) -> contextlib.nullcontext: + return contextlib.nullcontext() + + def child(self, name: str) -> "Self": + return self + + def __enter__(self) -> "Self": + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> Literal[False]: + return False + + def report(self) -> TimerResult: + return TimerResult(total_runs=0, avg_time=0.0, children={}) + + +_: Timer = NullTimer() +del _ + + +class EventPair: + def __init__(self): + self.start = torch.cuda.Event(enable_timing=True) + self.end = torch.cuda.Event(enable_timing=True) + self._stream = None + self._start_recorded = False + self._end_recorded = False + + def record_start(self): + if self._start_recorded: + raise RuntimeError( + "record_start has already been called on this EventPair." + ) + self._stream = torch.cuda.current_stream() + self.start.record(self._stream) + self._start_recorded = True + + def record_end(self): + if not self._start_recorded: + raise RuntimeError("record_start must be called before record_end") + if self._end_recorded: + raise RuntimeError("record_end has already been called on this EventPair.") + if self._stream is None: + raise RuntimeError("record_start must be called before record_end") + self.end.record(self._stream) + self._end_recorded = True + + def elapsed_time_ms(self) -> float: + if not self._start_recorded or not self._end_recorded: + raise RuntimeError( + "Both record_start and record_end must be called " + "before elapsed_time_ms can be called." + ) + return self.start.elapsed_time(self.end) + + +class CUDATimer: + def __init__(self): + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is not available, cannot use CUDATimer.") + self._children: collections.defaultdict[str, CUDATimer] = ( + collections.defaultdict(CUDATimer) + ) + self._event_pairs: list[EventPair] = [] + self._entered = False + self._result: TimerResult | None = None + + @classmethod + def new_if_available(cls) -> "CUDATimer | NullTimer": + if torch.cuda.is_available(): + return cls() + else: + return NullTimer() + + def __enter__(self): + if self._entered: + raise RuntimeError("CUDATimer is already entered.") + self._entered = True + self._event_pairs.append(EventPair()) + self._event_pairs[-1].record_start() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if not self._event_pairs: + raise RuntimeError("CUDATimer context was not properly entered.") + self._event_pairs[-1].record_end() + self._entered = False + return False + + def child(self, name: str) -> "CUDATimer": + if not self._entered: + raise RuntimeError( + "CUDATimer child cannot be used before entering the timer." + ) + return self._children[name] + + @property + def _avg_time(self) -> float: + if len(self._event_pairs) == 0: + raise RuntimeError( + "CUDATimer report cannot be generated before entering the timer." + ) + total_time = sum( + event_pair.elapsed_time_ms() for event_pair in self._event_pairs + ) + return total_time / len(self._event_pairs) + + def _child_reports(self) -> dict[str, TimerResult]: + return {name: child.result for name, child in self._children.items()} + + @property + def result(self) -> TimerResult: + if self._result is None: + torch.cuda.synchronize() + self._result = TimerResult( + total_runs=len(self._event_pairs), + avg_time=self._avg_time, + children=self._child_reports(), + ) + return self._result + + +__: type[Timer] = CUDATimer +del __ From 3460bea8cf28d1b6e002a74976ad773f63973798 Mon Sep 17 00:00:00 2001 From: Jeremy McGibbon Date: Tue, 10 Feb 2026 19:22:51 +0000 Subject: [PATCH 02/20] test assert_close --- fme/core/benchmark/test_timer.py | 80 +++++++++++++++++++++++++++++++- 1 file changed, 79 insertions(+), 1 deletion(-) diff --git a/fme/core/benchmark/test_timer.py b/fme/core/benchmark/test_timer.py index dd9cd5624..c8f97e6b8 100644 --- a/fme/core/benchmark/test_timer.py +++ b/fme/core/benchmark/test_timer.py @@ -1,9 +1,10 @@ +from typing import Literal from unittest.mock import patch import pytest import torch -from fme.core.benchmark.timer import CUDATimer +from fme.core.benchmark.timer import CUDATimer, TimerResult @pytest.mark.parametrize("is_available", [True, False]) @@ -34,3 +35,80 @@ def test_timer_with_child(): # parent time should include the child time, so it should be # at least 2x the child time (since we sleep for the same amount of time in both) assert result.avg_time >= 2.0 * result.children["child"].avg_time + + +def _create_parent_result(avg_time: float) -> TimerResult: + return TimerResult(total_runs=2, avg_time=avg_time, children={}) + + +def _create_child_result(avg_time: float) -> TimerResult: + return TimerResult( + total_runs=2, + avg_time=1.0, + children={"child": TimerResult(total_runs=2, avg_time=avg_time, children={})}, + ) + + +@pytest.mark.parametrize( + "v1, v2, rtol, expect_raise", + [ + (100, 101, 0.02, False), # within 2% + (100, 103, 0.02, True), # outside 2% + (100, 102, 0.02, False), # exactly 2% is considered inside + (10000, 10201, 0.02, True), # more than 2% is considered outside + (100, 102, 0.03, False), # exactly 2% is within 3% + ], +) +@pytest.mark.parametrize("kind", ["parent", "child"]) +def test_assert_close( + v1: int, v2: int, rtol: float, kind: Literal["parent", "child"], expect_raise: bool +): + if kind == "child": + result1 = _create_child_result(avg_time=v1) + result2 = _create_child_result(avg_time=v2) + else: + result1 = _create_parent_result(avg_time=v1) + result2 = _create_parent_result(avg_time=v2) + if expect_raise: + with pytest.raises(AssertionError): + result2.assert_close(result1, rtol=rtol) + else: + result2.assert_close(result1, rtol=rtol) + + +def test_assert_close_different_total_runs(): + # different total runs should raise regardless of rtol + result1 = TimerResult(total_runs=100, avg_time=100.0, children={}) + result2 = TimerResult(total_runs=101, avg_time=100.0, children={}) + with pytest.raises(AssertionError): + result2.assert_close(result1, rtol=0.5) + + +def test_assert_close_children_rtol(): + # test that children_rtol is used for child comparisons + result1 = TimerResult( + total_runs=2, + avg_time=100.0, + children={"child": TimerResult(total_runs=2, avg_time=100.0, children={})}, + ) + result2 = TimerResult( + total_runs=2, + avg_time=110.0, + children={"child": TimerResult(total_runs=2, avg_time=103.0, children={})}, + ) + result2.assert_close(result1, rtol=0.2, children_rtol=0.05) + + +def test_assert_close_children_rtol_raises(): + # test that children_rtol is used for child comparisons + result1 = TimerResult( + total_runs=2, + avg_time=100.0, + children={"child": TimerResult(total_runs=2, avg_time=100.0, children={})}, + ) + result2 = TimerResult( + total_runs=2, + avg_time=110.0, + children={"child": TimerResult(total_runs=2, avg_time=103.0, children={})}, + ) + result2.assert_close(result1, rtol=0.5, children_rtol=0.2) From 6c1c29c60dbd5341233a6e9528b698a9faf83586 Mon Sep 17 00:00:00 2001 From: Jeremy McGibbon Date: Tue, 10 Feb 2026 19:25:49 +0000 Subject: [PATCH 03/20] copy-paste sht_fix.py --- fme/core/models/conditional_sfno/sht.py | 223 ++++++++++++++++++++++++ 1 file changed, 223 insertions(+) create mode 100644 fme/core/models/conditional_sfno/sht.py diff --git a/fme/core/models/conditional_sfno/sht.py b/fme/core/models/conditional_sfno/sht.py new file mode 100644 index 000000000..b4127c5ec --- /dev/null +++ b/fme/core/models/conditional_sfno/sht.py @@ -0,0 +1,223 @@ +# flake8: noqa +# fmt: off +# isort: skip_file + +""" +This file contains a fix that we needed to get the SFNO to work on multiple +unroll steps in multiprocessing (e.g. multi-GPU mode.) We forked this code from +the torch harmonics sht.py file [*]. + +[*] https://github.com/NVIDIA/torch-harmonics/blob/17eefa53468d1a885d72087918eba905fa53e10a/torch_harmonics/sht.py +""" + + +# coding=utf-8 + +# SPDX-FileCopyrightText: Copyright (c) 2022 The torch-harmonics Authors. All rights reserved. +# SPDX-License-Identifier: BSD-3-Clause +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +import torch +import torch.nn as nn +import torch.fft + +from torch_harmonics.quadrature import legendre_gauss_weights, lobatto_weights, clenshaw_curtiss_weights +from torch_harmonics.legendre import _precompute_legpoly +import torch_harmonics + +from fme.core.device import get_device + +class RealSHT(nn.Module): + """ + Defines a module for computing the forward (real-valued) SHT. + Precomputes Legendre Gauss nodes, weights and associated Legendre polynomials on these nodes. + The SHT is applied to the last two dimensions of the input + + [1] Schaeffer, N. Efficient spherical harmonic transforms aimed at pseudospectral numerical simulations, G3: Geochemistry, Geophysics, Geosystems. + [2] Wang, B., Wang, L., Xie, Z.; Accurate calculation of spherical and vector spherical harmonic expansions via spectral element grids; Adv Comput Math. + """ + + def __init__(self, nlat, nlon, lmax=None, mmax=None, grid="lobatto", norm="ortho", csphase=True): + """ + Initializes the SHT Layer, precomputing the necessary quadrature weights + + Parameters: + nlat: input grid resolution in the latitudinal direction + nlon: input grid resolution in the longitudinal direction + grid: grid in the latitude direction (for now only tensor product grids are supported) + """ + + super().__init__() + + self.nlat = nlat + self.nlon = nlon + self.grid = grid + self.norm = norm + self.csphase = csphase + + # TODO: include assertions regarding the dimensions + + # compute quadrature points + if self.grid == "legendre-gauss": + cost, w = legendre_gauss_weights(nlat, -1, 1) + self.lmax = lmax or self.nlat + elif self.grid == "lobatto": + cost, w = lobatto_weights(nlat, -1, 1) + self.lmax = lmax or self.nlat-1 + elif self.grid == "equiangular": + cost, w = clenshaw_curtiss_weights(nlat, -1, 1) + # cost, w = fejer2_weights(nlat, -1, 1) + self.lmax = lmax or self.nlat + elif self.grid == "healpix": + raise(NotImplementedError("'healpix' grid not supported by InverseRealVectorSHT")) + else: + raise(ValueError("Unknown quadrature mode")) + + # apply cosine transform and flip them + tq = torch.flip(torch.arccos(cost), dims=(0,)) + + # determine the dimensions + self.mmax = mmax or self.nlon // 2 + 1 + + # combine quadrature weights with the legendre weights + pct = torch.as_tensor(_precompute_legpoly(self.mmax, self.lmax, tq, norm=self.norm, csphase=self.csphase)) + weights = torch.einsum('mlk,k->mlk', pct, w) + + # remember quadrature weights + self.weights = weights.float().to(get_device()) + + def extra_repr(self): + """ + Pretty print module + """ + return f'nlat={self.nlat}, nlon={self.nlon},\n lmax={self.lmax}, mmax={self.mmax},\n grid={self.grid}, csphase={self.csphase}' + + def forward(self, x: torch.Tensor): + + assert(x.shape[-2] == self.nlat) + assert(x.shape[-1] == self.nlon) + with torch.autocast("cuda", enabled=False): + # rfft and view_as_complex don't support BF16, see https://github.com/pytorch/pytorch/issues/117844 + x = x.float() + + # apply real fft in the longitudinal direction + x = 2.0 * torch.pi * torch.fft.rfft(x, dim=-1, norm="forward") + + # do the Legendre-Gauss quadrature + x = torch.view_as_real(x) + + # distributed contraction: fork + out_shape = list(x.size()) + out_shape[-3] = self.lmax + out_shape[-2] = self.mmax + xout = torch.zeros(out_shape, dtype=x.dtype, device=x.device) + + # contraction + weights = self.weights.to(x.device).to(x.dtype) + xout[..., 0] = torch.einsum('...km,mlk->...lm', x[..., :self.mmax, 0], weights) + xout[..., 1] = torch.einsum('...km,mlk->...lm', x[..., :self.mmax, 1], weights) + x = torch.view_as_complex(xout) + + return x + +class InverseRealSHT(nn.Module): + """ + Defines a module for computing the inverse (real-valued) SHT. + Precomputes Legendre Gauss nodes, weights and associated Legendre polynomials on these nodes. + nlat, nlon: Output dimensions + lmax, mmax: Input dimensions (spherical coefficients). For convenience, these are inferred from the output dimensions + + [1] Schaeffer, N. Efficient spherical harmonic transforms aimed at pseudospectral numerical simulations, G3: Geochemistry, Geophysics, Geosystems. + [2] Wang, B., Wang, L., Xie, Z.; Accurate calculation of spherical and vector spherical harmonic expansions via spectral element grids; Adv Comput Math. + """ + + def __init__(self, nlat, nlon, lmax=None, mmax=None, grid="lobatto", norm="ortho", csphase=True): + + super().__init__() + + self.nlat = nlat + self.nlon = nlon + self.grid = grid + self.norm = norm + self.csphase = csphase + + # compute quadrature points + if self.grid == "legendre-gauss": + cost, _ = legendre_gauss_weights(nlat, -1, 1) + self.lmax = lmax or self.nlat + elif self.grid == "lobatto": + cost, _ = lobatto_weights(nlat, -1, 1) + self.lmax = lmax or self.nlat-1 + elif self.grid == "equiangular": + cost, _ = clenshaw_curtiss_weights(nlat, -1, 1) + self.lmax = lmax or self.nlat + elif self.grid == "healpix": + raise(NotImplementedError("'healpix' grid not supported by RealVectorSHT")) + else: + raise(ValueError("Unknown quadrature mode")) + + # apply cosine transform and flip them + t = torch.flip(torch.arccos(cost), dims=(0,)) + + # determine the dimensions + self.mmax = mmax or self.nlon // 2 + 1 + + pct = torch.as_tensor(_precompute_legpoly(self.mmax, self.lmax, t, norm=self.norm, inverse=True, csphase=self.csphase)) + + # register buffer + self.pct = pct.float().to(get_device()) + + def extra_repr(self): + """ + Pretty print module + """ + return f'nlat={self.nlat}, nlon={self.nlon},\n lmax={self.lmax}, mmax={self.mmax},\n grid={self.grid}, csphase={self.csphase}' + + def forward(self, x: torch.Tensor): + + assert(x.shape[-2] == self.lmax) + assert(x.shape[-1] == self.mmax) + + with torch.autocast("cuda", enabled=False): + # irfft and view_as_complex don't support BF16, see https://github.com/pytorch/pytorch/issues/117844 + # Evaluate associated Legendre functions on the output nodes + x = torch.view_as_real(x).float() + + pct = self.pct.to(x.device).to(x.dtype) + rl = torch.einsum('...lm, mlk->...km', x[..., 0], pct ) + im = torch.einsum('...lm, mlk->...km', x[..., 1], pct ) + xs = torch.stack((rl, im), -1) + + # apply the inverse (real) FFT + x = torch.view_as_complex(xs) + x = torch.fft.irfft(x, n=self.nlon, dim=-1, norm="forward") + + return x + +torch_harmonics.RealSHT = RealSHT +torch_harmonics.InverseRealSHT = InverseRealSHT From 332540a6058fa0d6b206554ba02c3ed5694c60b5 Mon Sep 17 00:00:00 2001 From: Jeremy McGibbon Date: Tue, 10 Feb 2026 19:27:48 +0000 Subject: [PATCH 04/20] update conditional SFNO to pass timers for profiling --- fme/core/models/conditional_sfno/layers.py | 102 ++++++++++-------- .../makani/spectral_convolution.py | 9 +- .../models/conditional_sfno/s2convolutions.py | 54 ++++++---- fme/core/models/conditional_sfno/sfnonet.py | 74 +++++++------ fme/core/models/conditional_sfno/sht.py | 66 ++++++------ 5 files changed, 172 insertions(+), 133 deletions(-) diff --git a/fme/core/models/conditional_sfno/layers.py b/fme/core/models/conditional_sfno/layers.py index 47648d781..5f6dcbe8e 100644 --- a/fme/core/models/conditional_sfno/layers.py +++ b/fme/core/models/conditional_sfno/layers.py @@ -24,6 +24,7 @@ import torch.nn.functional as F from torch.utils.checkpoint import checkpoint +from fme.core.benchmark.timer import Timer, NullTimer from fme.core.models.conditional_sfno.lora import LoRAConv2d from .activations import ComplexReLU @@ -223,7 +224,12 @@ def reset_parameters(self): torch.nn.init.constant_(self.W_bias_pos.weight, 0.0) # no bias on 2d layers as it is already handled in the non-2d layers - def forward(self, x: torch.Tensor, context: Context) -> torch.Tensor: + def forward( + self, + x: torch.Tensor, + context: Context, + timer: Timer = NullTimer(), + ) -> torch.Tensor: """ Conditional Layer Normalization @@ -242,52 +248,58 @@ def forward(self, x: torch.Tensor, context: Context) -> torch.Tensor: self.W_scale_labels is not None or self.W_bias_labels is not None ): raise ValueError("labels must be provided") - if self.W_scale is not None: - if context.embedding_scalar is None: - raise ValueError("embedding_scalar must be provided") - scale: torch.Tensor = ( - self.W_scale(context.embedding_scalar).unsqueeze(-1).unsqueeze(-1) - ) - else: - scale = torch.ones( - list(x.shape[:-2]) + [1, 1], device=x.device, dtype=x.dtype - ) + with timer.child("compute_scaling_and_bias"): + if self.W_scale is not None: + if context.embedding_scalar is None: + raise ValueError("embedding_scalar must be provided") + scale: torch.Tensor = ( + self.W_scale(context.embedding_scalar).unsqueeze(-1).unsqueeze(-1) + ) + else: + scale = torch.ones( + list(x.shape[:-2]) + [1, 1], device=x.device, dtype=x.dtype + ) - if self.W_scale_2d is not None: - if context.noise is None: - raise ValueError("embedding_2d must be provided") - scale = scale + self.W_scale_2d(context.noise) - if self.W_bias is not None: - if context.embedding_scalar is None: - raise ValueError("embedding_scalar must be provided") - bias: torch.Tensor = ( - self.W_bias(context.embedding_scalar).unsqueeze(-1).unsqueeze(-1) - ) - else: - bias = torch.zeros( - list(x.shape[:-2]) + [1, 1], device=x.device, dtype=x.dtype - ) + if self.W_scale_2d is not None: + if context.noise is None: + raise ValueError("embedding_2d must be provided") + scale = scale + self.W_scale_2d(context.noise) + if self.W_bias is not None: + if context.embedding_scalar is None: + raise ValueError("embedding_scalar must be provided") + bias: torch.Tensor = ( + self.W_bias(context.embedding_scalar).unsqueeze(-1).unsqueeze(-1) + ) + else: + bias = torch.zeros( + list(x.shape[:-2]) + [1, 1], device=x.device, dtype=x.dtype + ) - if self.W_scale_labels is not None: - scale = scale + self.W_scale_labels(context.labels).unsqueeze(-1).unsqueeze( - -1 - ) - if self.W_bias_labels is not None: - bias = bias + self.W_bias_labels(context.labels).unsqueeze(-1).unsqueeze(-1) - if self.W_bias_2d is not None: - if context.noise is None: - raise ValueError("embedding_2d must be provided") - bias = bias + self.W_bias_2d(context.noise) - if self.W_scale_pos is not None: - if context.embedding_pos is None: - raise ValueError("embedding_pos must be provided") - scale = scale + self.W_scale_pos(context.embedding_pos) - if self.W_bias_pos is not None: - if context.embedding_pos is None: - raise ValueError("embedding_pos must be provided") - bias = bias + self.W_bias_pos(context.embedding_pos) - x_norm: torch.Tensor = self.norm(x) - return x_norm * scale + bias + if self.W_scale_labels is not None: + scale = scale + self.W_scale_labels(context.labels).unsqueeze( + -1 + ).unsqueeze(-1) + if self.W_bias_labels is not None: + bias = bias + self.W_bias_labels(context.labels).unsqueeze( + -1 + ).unsqueeze(-1) + if self.W_bias_2d is not None: + if context.noise is None: + raise ValueError("embedding_2d must be provided") + bias = bias + self.W_bias_2d(context.noise) + if self.W_scale_pos is not None: + if context.embedding_pos is None: + raise ValueError("embedding_pos must be provided") + scale = scale + self.W_scale_pos(context.embedding_pos) + if self.W_bias_pos is not None: + if context.embedding_pos is None: + raise ValueError("embedding_pos must be provided") + bias = bias + self.W_bias_pos(context.embedding_pos) + with timer.child("normalize"): + x_norm: torch.Tensor = self.norm(x) + with timer.child("apply_scaling_and_bias"): + return_value = x_norm * scale + bias + return return_value @torch.jit.script diff --git a/fme/core/models/conditional_sfno/makani/spectral_convolution.py b/fme/core/models/conditional_sfno/makani/spectral_convolution.py index f38894c83..e99a7f5e1 100644 --- a/fme/core/models/conditional_sfno/makani/spectral_convolution.py +++ b/fme/core/models/conditional_sfno/makani/spectral_convolution.py @@ -19,6 +19,8 @@ import torch.nn as nn from torch import amp +from fme.core.benchmark.timer import NullTimer, Timer + # import convenience functions for factorized tensors from .factorizations import get_contract_fun @@ -124,7 +126,7 @@ def __init__( if bias: self.bias = nn.Parameter(torch.zeros(1, self.out_channels, 1, 1)) - def forward(self, x): + def forward(self, x, timer: Timer = NullTimer()): dtype = x.dtype residual = x x = x.float() @@ -138,7 +140,10 @@ def forward(self, x): B, C, H, W = x.shape x = x.reshape(B, self.num_groups, C // self.num_groups, H, W) xp = self._contract( - x, self.weight, separable=self.separable, operator_type=self.operator_type + x, + self.weight, + separable=self.separable, + operator_type=self.operator_type, ) x = xp.reshape(B, self.out_channels, H, W).contiguous() diff --git a/fme/core/models/conditional_sfno/s2convolutions.py b/fme/core/models/conditional_sfno/s2convolutions.py index 93299256a..b138dd442 100644 --- a/fme/core/models/conditional_sfno/s2convolutions.py +++ b/fme/core/models/conditional_sfno/s2convolutions.py @@ -22,6 +22,8 @@ import torch_harmonics as th import torch_harmonics.distributed as thd +from fme.core.benchmark.timer import NullTimer, Timer + # import convenience functions for factorized tensors from .activations import ComplexReLU @@ -223,45 +225,51 @@ def __init__( self.bias = nn.Parameter(torch.zeros(1, out_channels, 1, 1)) self.out_channels = out_channels - def forward(self, x): # pragma: no cover + def forward(self, x, timer: Timer = NullTimer()): # pragma: no cover dtype = x.dtype residual = x x = x.float() with torch.amp.autocast("cuda", enabled=False): - x = self.forward_transform(x.float()) + with timer.child("forward_transform"): + x = self.forward_transform(x.float()) if self._round_trip_residual: - x = x.contiguous() - residual = self.inverse_transform(x) - residual = residual.to(dtype) + with timer.child("round_trip_residual"): + x = x.contiguous() + residual = self.inverse_transform(x) + residual = residual.to(dtype) B, C, H, W = x.shape assert C % self.num_groups == 0 x = x.reshape(B, self.num_groups, C // self.num_groups, H, W) if self.lora_A is not None and self.lora_B is not None: - lora_update = _contract_lora( - self.lora_A, - self.lora_B, - x[..., : self.modes_lat_local, : self.modes_lon_local], - ) + with timer.child("lora_update"): + lora_update = _contract_lora( + self.lora_A, + self.lora_B, + x[..., : self.modes_lat_local, : self.modes_lon_local], + ) else: lora_update = 0.0 - xp = torch.zeros_like(x) - xp[..., : self.modes_lat_local, : self.modes_lon_local] = _contract_dhconv( - x[..., : self.modes_lat_local, : self.modes_lon_local], - self.weight, - ) - xp = xp + self.lora_scaling * lora_update - xp = xp.reshape(B, self.out_channels, H, W) - x = xp.contiguous() + with timer.child("dhconv"): + xp = torch.zeros_like(x) + xp[..., : self.modes_lat_local, : self.modes_lon_local] = _contract_dhconv( + x[..., : self.modes_lat_local, : self.modes_lon_local], + self.weight, + ) + xp = xp + self.lora_scaling * lora_update + xp = xp.reshape(B, self.out_channels, H, W) + x = xp.contiguous() with torch.amp.autocast("cuda", enabled=False): - x = self.inverse_transform(x) + with timer.child("inverse_transform"): + x = self.inverse_transform(x) if hasattr(self, "bias"): - x = x + self.bias + with timer.child("add_bias"): + x = x + self.bias x = x.type(dtype) @@ -320,7 +328,7 @@ def __init__( scale * torch.randn(1, out_channels, *self.output_dims) ) - def forward(self, x): # pragma: no cover + def forward(self, x, timer: Timer = NullTimer()): # pragma: no cover dtype = x.dtype x = x.float() B, C, H, W = x.shape @@ -503,7 +511,7 @@ def forward_mlp(self, x): # pragma: no cover return x - def forward(self, x): # pragma: no cover + def forward(self, x, timer: Timer = NullTimer()): # pragma: no cover dtype = x.dtype residual = x x = x.to(torch.float32) @@ -626,7 +634,7 @@ def forward_mlp(self, x): # pragma: no cover return x - def forward(self, x): # pragma: no cover + def forward(self, x, timer: Timer = NullTimer()): # pragma: no cover dtype = x.dtype x = x.to(torch.float32) diff --git a/fme/core/models/conditional_sfno/sfnonet.py b/fme/core/models/conditional_sfno/sfnonet.py index 61d35ca27..29eb986f0 100644 --- a/fme/core/models/conditional_sfno/sfnonet.py +++ b/fme/core/models/conditional_sfno/sfnonet.py @@ -24,6 +24,8 @@ import torch_harmonics as th from torch.utils.checkpoint import checkpoint +from fme.core.benchmark.timer import Timer, NullTimer + from .initialization import trunc_normal_ # wrap fft, to unify interface to spectral transforms @@ -62,7 +64,7 @@ def __init__(self, *args, **kwargs): super().__init__() self.conv = th.DiscreteContinuousConvS2(*args, **kwargs) - def forward(self, x): + def forward(self, x, timer: Timer = NullTimer()): return self.conv(x), x @@ -153,8 +155,8 @@ def __init__( else: raise (NotImplementedError) - def forward(self, x): - return self.filter(x) + def forward(self, x, timer: Timer = NullTimer()): + return self.filter(x, timer=timer) class FourierNeuralOperatorBlock(nn.Module): @@ -295,44 +297,54 @@ def __init__( lora_alpha=lora_alpha, ) - def forward(self, x, context_embedding): - x_norm = torch.zeros_like(x) - x_norm[..., : self.input_shape_loc[0], : self.input_shape_loc[1]] = self.norm0( - x[..., : self.input_shape_loc[0], : self.input_shape_loc[1]], - context_embedding, - ) - x, residual = self.filter(x_norm) - + def forward(self, x, context_embedding, timer: Timer = NullTimer()): + with timer.child("norm0") as norm0_timer: + x_norm = torch.zeros_like(x) + x_norm[..., : self.input_shape_loc[0], : self.input_shape_loc[1]] = ( + self.norm0( + x[..., : self.input_shape_loc[0], : self.input_shape_loc[1]], + context_embedding, + timer=norm0_timer, + ) + ) + with timer.child("filter") as filter_timer: + x, residual = self.filter(x_norm, timer=filter_timer) if hasattr(self, "inner_skip"): - if self.concat_skip: - x = torch.cat((x, self.inner_skip(residual)), dim=1) - x = self.inner_skip_conv(x) - else: - x = x + self.inner_skip(residual) + with timer.child("inner_skip"): + if self.concat_skip: + x = torch.cat((x, self.inner_skip(residual)), dim=1) + x = self.inner_skip_conv(x) + else: + x = x + self.inner_skip(residual) if hasattr(self, "act_layer"): - x = self.act_layer(x) - - x_norm = torch.zeros_like(x) - x_norm[..., : self.output_shape_loc[0], : self.output_shape_loc[1]] = ( - self.norm1( - x[..., : self.output_shape_loc[0], : self.output_shape_loc[1]], - context_embedding, + with timer.child("activation"): + x = self.act_layer(x) + + with timer.child("norm1") as norm1_timer: + x_norm = torch.zeros_like(x) + x_norm[..., : self.output_shape_loc[0], : self.output_shape_loc[1]] = ( + self.norm1( + x[..., : self.output_shape_loc[0], : self.output_shape_loc[1]], + context_embedding, + timer=norm1_timer, + ) ) - ) - x = x_norm + x = x_norm if hasattr(self, "mlp"): - x = self.mlp(x) + with timer.child("mlp"): + x = self.mlp(x) x = self.drop_path(x) if hasattr(self, "outer_skip"): - if self.concat_skip: - x = torch.cat((x, self.outer_skip(residual)), dim=1) - x = self.outer_skip_conv(x) - else: - x = x + self.outer_skip(residual) + with timer.child("outer_skip"): + if self.concat_skip: + x = torch.cat((x, self.outer_skip(residual)), dim=1) + x = self.outer_skip_conv(x) + else: + x = x + self.outer_skip(residual) return x diff --git a/fme/core/models/conditional_sfno/sht.py b/fme/core/models/conditional_sfno/sht.py index b4127c5ec..dd9f8fc02 100644 --- a/fme/core/models/conditional_sfno/sht.py +++ b/fme/core/models/conditional_sfno/sht.py @@ -48,9 +48,10 @@ from torch_harmonics.quadrature import legendre_gauss_weights, lobatto_weights, clenshaw_curtiss_weights from torch_harmonics.legendre import _precompute_legpoly -import torch_harmonics from fme.core.device import get_device +from fme.core.benchmark.timer import Timer, NullTimer + class RealSHT(nn.Module): """ @@ -117,31 +118,33 @@ def extra_repr(self): """ return f'nlat={self.nlat}, nlon={self.nlon},\n lmax={self.lmax}, mmax={self.mmax},\n grid={self.grid}, csphase={self.csphase}' - def forward(self, x: torch.Tensor): + def forward(self, x: torch.Tensor, timer: Timer = NullTimer()): assert(x.shape[-2] == self.nlat) assert(x.shape[-1] == self.nlon) with torch.autocast("cuda", enabled=False): - # rfft and view_as_complex don't support BF16, see https://github.com/pytorch/pytorch/issues/117844 - x = x.float() + with timer.child("rfft"): + # rfft and view_as_complex don't support BF16, see https://github.com/pytorch/pytorch/issues/117844 + x = x.float() - # apply real fft in the longitudinal direction - x = 2.0 * torch.pi * torch.fft.rfft(x, dim=-1, norm="forward") + # apply real fft in the longitudinal direction + x = 2.0 * torch.pi * torch.fft.rfft(x, dim=-1, norm="forward") - # do the Legendre-Gauss quadrature - x = torch.view_as_real(x) + with timer.child("contraction"): + # do the Legendre-Gauss quadrature + x = torch.view_as_real(x) - # distributed contraction: fork - out_shape = list(x.size()) - out_shape[-3] = self.lmax - out_shape[-2] = self.mmax - xout = torch.zeros(out_shape, dtype=x.dtype, device=x.device) + # distributed contraction: fork + out_shape = list(x.size()) + out_shape[-3] = self.lmax + out_shape[-2] = self.mmax + xout = torch.zeros(out_shape, dtype=x.dtype, device=x.device) - # contraction - weights = self.weights.to(x.device).to(x.dtype) - xout[..., 0] = torch.einsum('...km,mlk->...lm', x[..., :self.mmax, 0], weights) - xout[..., 1] = torch.einsum('...km,mlk->...lm', x[..., :self.mmax, 1], weights) - x = torch.view_as_complex(xout) + # contraction + weights = self.weights.to(x.device).to(x.dtype) + xout[..., 0] = torch.einsum('...km,mlk->...lm', x[..., :self.mmax, 0], weights) + xout[..., 1] = torch.einsum('...km,mlk->...lm', x[..., :self.mmax, 1], weights) + x = torch.view_as_complex(xout) return x @@ -198,26 +201,25 @@ def extra_repr(self): """ return f'nlat={self.nlat}, nlon={self.nlon},\n lmax={self.lmax}, mmax={self.mmax},\n grid={self.grid}, csphase={self.csphase}' - def forward(self, x: torch.Tensor): + def forward(self, x: torch.Tensor, timer: Timer = NullTimer()): assert(x.shape[-2] == self.lmax) assert(x.shape[-1] == self.mmax) with torch.autocast("cuda", enabled=False): - # irfft and view_as_complex don't support BF16, see https://github.com/pytorch/pytorch/issues/117844 - # Evaluate associated Legendre functions on the output nodes - x = torch.view_as_real(x).float() + with timer.child("contraction"): + # irfft and view_as_complex don't support BF16, see https://github.com/pytorch/pytorch/issues/117844 + # Evaluate associated Legendre functions on the output nodes + x = torch.view_as_real(x).float() - pct = self.pct.to(x.device).to(x.dtype) - rl = torch.einsum('...lm, mlk->...km', x[..., 0], pct ) - im = torch.einsum('...lm, mlk->...km', x[..., 1], pct ) - xs = torch.stack((rl, im), -1) + pct = self.pct.to(x.device).to(x.dtype) + rl = torch.einsum('...lm, mlk->...km', x[..., 0], pct ) + im = torch.einsum('...lm, mlk->...km', x[..., 1], pct ) + xs = torch.stack((rl, im), -1) - # apply the inverse (real) FFT - x = torch.view_as_complex(xs) - x = torch.fft.irfft(x, n=self.nlon, dim=-1, norm="forward") + # apply the inverse (real) FFT + x = torch.view_as_complex(xs) + with timer.child("irfft"): + x = torch.fft.irfft(x, n=self.nlon, dim=-1, norm="forward") return x - -torch_harmonics.RealSHT = RealSHT -torch_harmonics.InverseRealSHT = InverseRealSHT From 50369f0ba0a18c6b0784d11e6bd5704a849d4983 Mon Sep 17 00:00:00 2001 From: Jeremy McGibbon Date: Tue, 10 Feb 2026 21:00:28 +0000 Subject: [PATCH 05/20] add benchmarks with gpu regression testing --- conftest.py | 13 + fme/core/__init__.py | 3 + fme/core/benchmark/.gitignore | 1 + fme/core/benchmark/__init__.py | 0 fme/core/benchmark/benchmark.py | 305 ++++++++++++++++++ fme/core/benchmark/memory.py | 86 +++++ fme/core/benchmark/run.py | 107 ++++++ fme/core/benchmark/test_benchmark.py | 54 ++++ fme/core/benchmark/test_memory.py | 29 ++ .../testdata/csfno_block-regression.pt | Bin 0 -> 12178 bytes .../csfno_block_8_groups-regression.pt | Bin 0 -> 12241 bytes fme/core/models/__init__.py | 3 + fme/core/models/conditional_sfno/__init__.py | 3 + fme/core/models/conditional_sfno/benchmark.py | 126 ++++++++ .../models/conditional_sfno/test_sfnonet.py | 62 ++++ 15 files changed, 792 insertions(+) create mode 100644 fme/core/benchmark/.gitignore create mode 100644 fme/core/benchmark/__init__.py create mode 100644 fme/core/benchmark/benchmark.py create mode 100644 fme/core/benchmark/memory.py create mode 100644 fme/core/benchmark/run.py create mode 100644 fme/core/benchmark/test_benchmark.py create mode 100644 fme/core/benchmark/test_memory.py create mode 100644 fme/core/benchmark/testdata/csfno_block-regression.pt create mode 100644 fme/core/benchmark/testdata/csfno_block_8_groups-regression.pt create mode 100644 fme/core/models/__init__.py create mode 100644 fme/core/models/conditional_sfno/benchmark.py diff --git a/conftest.py b/conftest.py index 84c62901a..930e9fc68 100644 --- a/conftest.py +++ b/conftest.py @@ -1,3 +1,7 @@ +import os + +os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" # required for determinism + import gc import signal from unittest import mock @@ -5,6 +9,15 @@ import pytest import torch +from fme.core.rand import set_seed + + +@pytest.fixture(autouse=True, scope="session") +def deterministic_pytorch(): + torch.use_deterministic_algorithms(True) + torch.backends.cudnn.benchmark = False + set_seed(0) + def pytest_addoption(parser): parser.addoption( diff --git a/fme/core/__init__.py b/fme/core/__init__.py index 67658ccfa..a2650da09 100644 --- a/fme/core/__init__.py +++ b/fme/core/__init__.py @@ -1,3 +1,4 @@ +from . import models as _ # to trigger registrations from .atmosphere_data import AtmosphereData from .device import get_device, using_gpu from .gridded_ops import GriddedOperations @@ -14,6 +15,8 @@ from .rand import set_seed from .registry import Registry +del _ + __all__ = [ "spherical_area_weights", "weighted_mean", diff --git a/fme/core/benchmark/.gitignore b/fme/core/benchmark/.gitignore new file mode 100644 index 000000000..1a06816d8 --- /dev/null +++ b/fme/core/benchmark/.gitignore @@ -0,0 +1 @@ +results diff --git a/fme/core/benchmark/__init__.py b/fme/core/benchmark/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/fme/core/benchmark/benchmark.py b/fme/core/benchmark/benchmark.py new file mode 100644 index 000000000..c227264bf --- /dev/null +++ b/fme/core/benchmark/benchmark.py @@ -0,0 +1,305 @@ +import abc +import dataclasses +import pathlib +from collections.abc import Callable +from typing import Self, TypeVar + +import dacite +import matplotlib.pyplot as plt +import torch + +from fme.core.benchmark.memory import MemoryResult, benchmark_memory +from fme.core.benchmark.timer import CUDATimer, NullTimer, Timer, TimerResult +from fme.core.typing_ import TensorDict + + +@dataclasses.dataclass +class BenchmarkResult: + memory: MemoryResult + timer: TimerResult + + def __repr__(self) -> str: + return f"BenchmarkResult(memory={self.memory}, timer={self.timer})" + + def asdict(self) -> dict: + return dataclasses.asdict(self) + + @classmethod + def from_dict(cls, d: dict) -> "BenchmarkResult": + return dacite.from_dict(cls, d, config=dacite.Config(strict=True)) + + def assert_close( + self, other: "BenchmarkResult", rtol=0.02, children_rtol=0.02 + ) -> None: + try: + self.timer.assert_close(other.timer, rtol=rtol, children_rtol=children_rtol) + except AssertionError as e: + raise AssertionError(f"Timer results differ: {e}") from e + try: + self.memory.assert_close(other.memory, rtol=rtol) + except AssertionError as e: + raise AssertionError(f"Memory results differ: {e}") from e + + def to_png( + self, path: str | pathlib.Path, label: str, child: str | None = None + ) -> None: + # note this function was generated with AI + def avg_time(t: TimerResult) -> float: + return float(t.avg_time) + + def self_time(t: TimerResult) -> float: + t_avg = avg_time(t) + c_avg = sum(avg_time(c) for c in t.children.values()) + return max(t_avg - c_avg, 0.0) + + def fmt_time(ms: float) -> str: + if ms >= 1000.0: + return f"{ms/1000.0:.2f}s" + if ms >= 10.0: + return f"{ms:.1f}ms" + return f"{ms:.2f}ms" + + def label_ok(name: str, ms: float, frac_of_root: float) -> bool: + if not name: + return False + return frac_of_root >= 0.05 + + def sorted_children(t: TimerResult) -> list[tuple[str, TimerResult]]: + return sorted( + t.children.items(), key=lambda kv: avg_time(kv[1]), reverse=True + ) + + def blend_with_white( + rgb: tuple[float, float, float], amount: float + ) -> tuple[float, float, float]: + # amount in [0,1]: 0 -> original, 1 -> white + return ( + rgb[0] + (1.0 - rgb[0]) * amount, + rgb[1] + (1.0 - rgb[1]) * amount, + rgb[2] + (1.0 - rgb[2]) * amount, + ) + + root = self.timer + if child is not None: + for part in child.split("."): + if part not in root.children: + raise ValueError(f"Child '{child}' not found in timer results.") + root = root.children[part] + root_avg = avg_time(root) + + max_alloc_mb = self.memory.max_alloc / (1024.0 * 1024.0) + + fig = plt.figure(figsize=(8, 6), constrained_layout=True) + if root_avg <= 0.0: + fig.suptitle( + f"Benchmark for {label}\ntotal=0.00s, max_alloc={max_alloc_mb:.1f} MB", + fontsize=14, + ) + ax0 = fig.add_subplot(1, 1, 1) + ax0.text(0.5, 0.5, "No timing data", ha="center", va="center") + ax0.axis("off") + fig.savefig(path, dpi=200) + plt.close(fig) + return + + fig.suptitle( + f"Benchmark for {label}\ntotal={fmt_time(root_avg)}, " + f"max_alloc={max_alloc_mb:.1f} MB", + fontsize=14, + ) + + ax = fig.add_subplot(1, 1, 1) + ax.set_xlim(0, 2) + ax.set_ylim(0, root_avg) + ax.set_xticks([0.5, 1.5]) + ax.set_xticklabels(["Level 1", "Level 2"]) + ax.set_ylabel("Avg time") + ax.set_yticks([]) + ax.spines["top"].set_visible(False) + ax.spines["right"].set_visible(False) + + gray = (0.85, 0.85, 0.85, 1.0) + cmap = plt.get_cmap("tab20") + + lvl1 = sorted_children(root) + lvl1_names = [n for n, _ in lvl1] + lvl1_index = {n: i for i, n in enumerate(lvl1_names)} + + # Level 1 stack (root children + root self in gray, unlabeled) + lvl1_segments: list[tuple[str, float, tuple[float, float, float, float]]] = [] + for n1, t1 in lvl1: + base = cmap(lvl1_index[n1] % cmap.N) + lvl1_segments.append((n1, avg_time(t1), base)) + r_self = self_time(root) + if r_self > 0.0: + lvl1_segments.append(("", r_self, gray)) + + def draw_stack( + x_center: float, + segments: list[tuple[str, float, tuple[float, float, float, float]]], + ) -> None: + width = 0.86 + y = 0.0 + for name, sec, color in segments: + if sec <= 0.0: + continue + ax.bar( + x_center, + sec, + bottom=y, + width=width, + align="center", + color=color, + edgecolor="white", + linewidth=1.0, + ) + frac = sec / root_avg + if label_ok(name, sec, frac): + ax.text( + x_center, + y + sec / 2.0, + f"{name}\n{fmt_time(sec)}", + ha="center", + va="center", + fontsize=9, + rotation=0, # keep horizontal to avoid cross-column overlap + clip_on=True, + ) + y += sec + if y < root_avg: + ax.bar( + x_center, + root_avg - y, + bottom=y, + width=width, + align="center", + color=gray, + edgecolor="white", + linewidth=1.0, + ) + + draw_stack(0.5, lvl1_segments) + + # Level 2 stack: + # For each level-1 slice, stack its children + # (colored as parent hue variants) + self in gray. + lvl2_segments: list[tuple[str, float, tuple[float, float, float, float]]] = [] + for n1, t1 in lvl1: + parent_rgba = cmap(lvl1_index[n1] % cmap.N) + parent_rgb = (parent_rgba[0], parent_rgba[1], parent_rgba[2]) + + children = sorted_children(t1) + k = len(children) + for i, (n2, t2) in enumerate(children): + # Same “type” of color as parent: lighten progressively per child. + # First child is closest to parent; later children are lighter. + lighten = 0.10 + (0.55 * (i / max(k - 1, 1))) + rgb = blend_with_white(parent_rgb, lighten) + lvl2_segments.append((n2, avg_time(t2), (rgb[0], rgb[1], rgb[2], 1.0))) + + s1 = self_time(t1) + if s1 > 0.0: + lvl2_segments.append(("", s1, gray)) + + draw_stack(1.5, lvl2_segments) + + fig.tight_layout(rect=(0.02, 0.02, 0.98, 0.98)) + fig.savefig(path, dpi=200, bbox_inches="tight") + plt.close(fig) + + +T = TypeVar("T") + + +class BenchmarkABC(abc.ABC): + @classmethod + def new_from_fn( + cls, + fn: Callable[[Timer], TensorDict], + ) -> "BenchmarkABC": + class FnBenchmark(BenchmarkABC): + @classmethod + def new(cls) -> "FnBenchmark": + return FnBenchmark() + + def run_instance(self, timer: Timer) -> TensorDict: + return fn(timer) + + return FnBenchmark() + + @classmethod + @abc.abstractmethod + def new(cls: type[Self]) -> Self: + """ + Initialize any state needed for the benchmark. + This will be called once before the benchmark is run. + """ + pass + + @classmethod + def new_for_regression(cls: type[Self]) -> Self | None: + """ + Initialize any state needed for regression testing. + This will be called once before regression tests are run. + + If regression testing is not needed, this can return None, + and regression testing will not be run. + + This exists as a separate method from new so that it can + use small data sizes more conducive to storing regression targets in git. + """ + return None + + @classmethod + def run_benchmark(cls, iters=10, warmup=1) -> BenchmarkResult: + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is not available, cannot run benchmark.") + null_timer = NullTimer() + benchmark = cls.new() + for _ in range(warmup): + benchmark.run_instance(null_timer) + timer = CUDATimer() + with benchmark_memory() as bm: + for _ in range(iters): + with timer: + benchmark.run_instance(timer) + return BenchmarkResult( + timer=timer.result, + memory=bm.result, + ) + + @classmethod + def run_regression(cls) -> TensorDict | None: + benchmark = cls.new_for_regression() + if benchmark is None: + return None + null_timer = NullTimer() + return benchmark.run_instance(null_timer) + + @abc.abstractmethod + def run_instance(self: Self, timer: Timer) -> TensorDict: + """ + Run the benchmark. This will be called multiple times, + and should return a TensorDict of results. + + This must not mutate any state on self, since the same instance may be + used across multiple iterations. + """ + pass + + +_BENCHMARKS: dict[str, type[BenchmarkABC]] = {} + + +def register_benchmark(name: str) -> Callable[[type[BenchmarkABC]], type[BenchmarkABC]]: + def _register(fn: type[BenchmarkABC]) -> type[BenchmarkABC]: + if name in _BENCHMARKS: + raise ValueError(f"Benchmark with name '{name}' is already registered.") + _BENCHMARKS[name] = fn + return fn + + return _register + + +def get_benchmarks() -> dict[str, type[BenchmarkABC]]: + return _BENCHMARKS.copy() diff --git a/fme/core/benchmark/memory.py b/fme/core/benchmark/memory.py new file mode 100644 index 000000000..d104b0035 --- /dev/null +++ b/fme/core/benchmark/memory.py @@ -0,0 +1,86 @@ +import dataclasses +from typing import Literal + +import torch + +_benchmark_memory_started = False + + +@dataclasses.dataclass +class MemoryResult: + max_alloc: int + max_reserved: int + + def assert_close(self, other: "MemoryResult", rtol=0.02) -> None: + if not torch.isclose( + torch.tensor(self.max_alloc, dtype=torch.float64), + torch.tensor(other.max_alloc, dtype=torch.float64), + rtol=rtol, + ): + raise AssertionError( + f"max_alloc differs: {self.max_alloc} vs " + f"{other.max_alloc} given rtol={rtol}" + ) + if not torch.isclose( + torch.tensor(self.max_reserved, dtype=torch.float64), + torch.tensor(other.max_reserved, dtype=torch.float64), + rtol=rtol, + ): + raise AssertionError( + f"max_reserved differs: {self.max_reserved} vs " + f"{other.max_reserved} given rtol={rtol}" + ) + + +class MemoryBenchmark: + def __init__(self): + self._started = False + self._ended = False + + def __enter__(self) -> "MemoryBenchmark": + global _benchmark_memory_started + if _benchmark_memory_started: + raise RuntimeError( + "benchmark_memory cannot be nested due to its use of globals" + ) + _benchmark_memory_started = True + if self._started: + raise RuntimeError( + "MemoryBenchmark cannot be nested due to its use of globals" + ) + if self._ended: + raise RuntimeError("MemoryBenchmark cannot be reused after it has ended.") + self._started = True + torch.cuda.synchronize() + torch.cuda.reset_peak_memory_stats() + self._max_alloc = 0 + self._max_reserved = 0 + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> Literal[False]: + torch.cuda.synchronize() + global _benchmark_memory_started + _benchmark_memory_started = False + self._started = False + self._ended = True + self._max_alloc = torch.cuda.max_memory_allocated() + self._max_reserved = torch.cuda.max_memory_reserved() + return False # Don't suppress exceptions + + @property + def result(self) -> MemoryResult: + if self._started: + raise RuntimeError( + "MemoryBenchmark is still running. " + "Please exit the context before getting results." + ) + if not self._ended: + raise RuntimeError( + "MemoryBenchmark has not been run yet. " + "Please enter and exit the context before getting results." + ) + return MemoryResult(max_alloc=self._max_alloc, max_reserved=self._max_reserved) + + +def benchmark_memory() -> MemoryBenchmark: + return MemoryBenchmark() diff --git a/fme/core/benchmark/run.py b/fme/core/benchmark/run.py new file mode 100644 index 000000000..b5a2988cb --- /dev/null +++ b/fme/core/benchmark/run.py @@ -0,0 +1,107 @@ +import argparse +import os +import pathlib +import subprocess + +import torch + +from fme.core.benchmark.benchmark import get_benchmarks + +RESULTS_PATH = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "results" + +_GIT_COMMIT: str | None = None + + +def get_git_commit() -> str: + global _GIT_COMMIT + if _GIT_COMMIT is None: + args = ["git", "rev-parse", "--short", "HEAD"] + _GIT_COMMIT = ( + subprocess.check_output(args, stderr=subprocess.DEVNULL).decode().strip() + ) + return _GIT_COMMIT + + +def get_device_name() -> str: + if torch.cuda.is_available(): + return torch.cuda.get_device_properties(0).name + else: + return "CPU" + + +def main(names: list[str] | None, iters: int, child: str | None = None) -> None: + RESULTS_PATH.mkdir(exist_ok=True) + device_name = get_device_name() + + print(f"Running benchmarks on device: {device_name}") + benchmarks = get_benchmarks() + if names is not None: + if any(name not in benchmarks for name in names): + print("Some specified benchmarks not found. Available benchmarks:") + for name in benchmarks: + print(f" - {name}") + return + benchmarks_to_run = {name: benchmarks[name] for name in names} + else: + benchmarks_to_run = benchmarks + + def get_label(name): + return f"{name} on {device_name} at commit {get_git_commit()}" + + def get_filename(name) -> pathlib.Path: + safe_name = name.replace("/", "_").replace(".", "_").lower() + safe_device_name = device_name.replace(" ", "_").replace("/", "_").lower() + return RESULTS_PATH / f"{safe_name}_{safe_device_name}_{get_git_commit()}.png" + + for name, cls in benchmarks_to_run.items(): + print(f"Running benchmark: {name}") + result = cls.run_benchmark(iters=iters) + result.to_png(get_filename(name), label=get_label(name)) + if child is not None: + child_name = f"{name}.{child}" + child_label = get_label(child_name) + print(f" Generating report for child timer: {child_label}") + result.to_png(get_filename(child_name), label=child_label, child=child) + print(f" Result: {result}") + + +def get_benchmark_label(name): + device_name = get_device_name() + return f"{name} on {device_name} at commit {get_git_commit()}" + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run registered benchmarks.") + parser.add_argument( + "benchmark", + type=str, + nargs="?", + default=None, + help=( + "Name of the benchmark to run. If not provided, " + "all benchmarks will be run." + ), + ) + parser.add_argument( + "--child", + type=str, + default=None, + help=( + "If provided, the child timer to generate a report for. " + "This should be a dot-separated path to a child timer, " + "e.g. 'forward' or 'forward.linear'." + ), + ) + parser.add_argument( + "--iters", + type=int, + default=10, + help="Number of iterations to run each benchmark for.", + ) + args = parser.parse_args() + + main( + names=[args.benchmark] if args.benchmark else None, + iters=args.iters, + child=args.child, + ) diff --git a/fme/core/benchmark/test_benchmark.py b/fme/core/benchmark/test_benchmark.py new file mode 100644 index 000000000..c7c95629e --- /dev/null +++ b/fme/core/benchmark/test_benchmark.py @@ -0,0 +1,54 @@ +import os + +import pytest +import torch + +import fme # to trigger registration of benchmarks +from fme.core.benchmark.benchmark import BenchmarkABC, get_benchmarks +from fme.core.rand import set_seed +from fme.core.testing.regression import validate_tensor_dict + +del fme + +DIR = os.path.abspath(os.path.dirname(__file__)) + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available") +def test_run_benchmark(): + def benchmark_fn(timer): + torch.cuda._sleep(100_000_000) + + benchmark = BenchmarkABC.new_from_fn(benchmark_fn) + + first_result = benchmark.run_benchmark(iters=15, warmup=1) + assert first_result.timer.total_runs == 15 + second_result = benchmark.run_benchmark(iters=20, warmup=1) + assert second_result.timer.total_runs == 20 + torch.testing.assert_close( + first_result.timer.avg_time, second_result.timer.avg_time, rtol=0.2, atol=0 + ) + + +def test_benchmarks_are_not_empty(): + assert ( + len(get_benchmarks()) > 0 + ), "No benchmarks were registered, but at least one was expected." + + +BENCHMARKS = get_benchmarks() + + +@pytest.mark.parametrize("benchmark_name", BENCHMARKS.keys()) +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available") +def test_regression(benchmark_name: str): + set_seed(0) + benchmark_cls = BENCHMARKS[benchmark_name] + regression_result = benchmark_cls.run_regression() + if regression_result is None: + pytest.skip("Benchmark does not have regression targets.") + # If run_regression returns something, we expect it to be a TensorDict of results + assert isinstance(regression_result, dict) + validate_tensor_dict( + regression_result, + os.path.join(DIR, "testdata", f"{benchmark_name}-regression.pt"), + ) diff --git a/fme/core/benchmark/test_memory.py b/fme/core/benchmark/test_memory.py new file mode 100644 index 000000000..fc986712b --- /dev/null +++ b/fme/core/benchmark/test_memory.py @@ -0,0 +1,29 @@ +import pytest +import torch + +from fme.core.benchmark.memory import benchmark_memory + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available") +def test_cannot_nest_benchmark(): + with benchmark_memory(): + with pytest.raises(RuntimeError, match="benchmark_memory cannot be nested"): + with benchmark_memory(): + pass + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available") +def test_cannot_get_result_before_end(): + with benchmark_memory() as bm: + with pytest.raises(RuntimeError, match="MemoryBenchmark is still running"): + bm.result + + +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available") +def test_larger_array_uses_larger_memory(): + with benchmark_memory() as bm1: + _ = torch.randn(100, 100, device="cuda") + with benchmark_memory() as bm2: + _ = torch.randn(200, 200, device="cuda") + + assert bm2.result.max_alloc > bm1.result.max_alloc diff --git a/fme/core/benchmark/testdata/csfno_block-regression.pt b/fme/core/benchmark/testdata/csfno_block-regression.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ae9270b3f9f25352474bd2ff95f804ab6dc3611 GIT binary patch literal 12178 zcmbt)2{=|?yZ&oPGL$A%rY0FOl_~amUd^H;Q*)9jA=8^Q$($)uLdcYm6v+=2n%!m zOOxj;<=Fc;?e?(U;pSoQs^;zJ?Ct2|z6-%`K_tDh) z&7|_vv*6!1`@8>{i84p&k4*kOl(u$mF3!8%9e49qESd8^BBQJQn*evO*E{~vi~sI_ zCZNF4{1bs4`*@BXzoWfk$^8G(b^jyl++27(H^<)`LdmOXzw_vK|1$?!j>exj{LX3( z%@s>j|2G6CSr|~E_sW!c8cf4Waxlv(se1PrZR+U)SRwe4CJel$hLtXOd)pc!DdmqM z)rvK5$P+4@x&qA~PDcf=jrioTEe0(SM;ry-uqP}7dHPgOBss~qr zd678#>H1!DXiLGQxE|an9gGvEXw|%)Bw8~rFAnp4BvH59nYz`M<1W*aIOk_5ats>{ z4$YH8o8_@E>3JolSC`_gu#?E2mWcNlKCqNpfzQtk(pfQAsWDHurYka!NVE4LAZNc} z&zUB46ZA)?sCo<-8KmC?7NYTy0`~U$z0`YG3${PeW}ROUP>`;nJl~x-CvPm8eJQ{# zZ;xZZNhun#&k2nKV$kfi5lV*qg;|fc;j$xUBrkapt~e2i-&bgW|Fs4zKUqMH9%>t2 zJDZ3`ZN?}iCxzo;RdH#DM9tFP%Q#POI?6q9#%IlOXpkyY<33juHrNPKtIo6N9I^^! zt{mO4nvCX?#EB?3gNEzPp@Rnr=7SPmQamcc-p z8ys2H1lMhX;B@>Ix@xsK$UV3Q_Zr7@7jIeznjN2*K;5s`dOpmb1!G+N=dW>%_VGJCu;~cns?VE1}Fe z00d9&fc@Jv2)AH49CA2LPAK^kVS#mU)V&c}$4my-*ty){rYHzVbi~vFIS5)-Np3p1 zlWAR9P^){L=;ntJX7>O|TRQ@c!mB`W=~}4w42QnpV8Rhj)YlSDox%mxMwq)l0UA!paD^|& zgADyi#2#E^xU)iV&(kk7lFTCNI&-miWgydeO%@cJTR>y_XE4e10L{=$Lz@#5x!I>! zcyKBQT4n7){i6pM+uJgnR4`BqE`dPj`(StX5h=Cwg_@WxFnRZPNL}C#C-)<%;Lm|= zd_J%wEQJLG=h&|tquU=i327UxdECJ749jFO>uUO0TOsDyiw_2h{T zr42PNVQBt+Xo?*qdmsHG=`$aYCi9cfKgJ2V2Zh1UG=ptvUj{0#x5JJ}5?q_G*<6_3 z07dd~U^P3CQA!J8Jv3gDHU&j=dAN`SZQ#Mm#jY@B5rx`^Bdl|y7z9Kmfp=>Q_pa+-Ixa_z`-Oy+>584+an<#S*TDNL&uXinl+OZYP zg$#Mn(_Tj|KHd)QN9@5$ke{0qa1<1doj06)rxE(L%X1}iDD>pa1Vse!+;$%Niwt1p zNF9`L<*3vKl(V3c%kkgBG+Z!b#aDusS=Dnd^seu2l#tG#X*b*+R&BC_((M*Mp!fKbISk z1PR|nxZ9NEfK&3UYH;giSn9&Z?RNbH>6<$!bJq`w+yDa92ADyyF!D&d6{3YqQ21FZ zb9~@7SuE`ejR(eXpZhK2irt;htt_hor&AIzpKmP)-=BvYtd2rqV;aoD8;~*i3h+;f zg5g#U*KDm5bm(fs!&_-k^_B-rSQKmgF^LSM$3yldUwA*dMk`1;ZGTcvb3@DE+^;}b zaH18w+P=e(&od%pUII3gt)TRh6gT1LYVe%?22#%ygV2?WusNm#5}sWFh3C_`^RB)G zdx?Nc%>dG#&|Uq?HW~vykAsSvI>1#bfk^9oQuX00 z6YKbi%;$zdjj$bgsI!6`j6BDjR+2`)Rz4a~u&>&Fu{OkIU1#px*-qvfydy$at>Ly; zCdA$5hj9U$fcNb*O;~h?WPEOeiJRguC~rRM=fvR8&?*wp&%*kVb>x@*PB<7OLqGRO zfI{aD_86ZIS!ApQJABJ%^V|Fv0}d=JbP;cDbH$Y#!Xqo>FW@-Q$zl3EW7 zK;fAvXxqCL3cRD?efJd5^LauR%zsBaYcwz*$b+|ITgFrpc40XS?Yj$U?$ zfKE&Yb?+Ex@lPNIhJD0t$q0DrM!@PGX)tn?2TrAeVYJ#sB5YYta!q!lK2aU-vXc}U z6hmzrt->Hx7$Kj?7N3>W$zvhrmJ~An?0Y)qNFkA^yhArkv}Ijak0I=NS$x>N7sl^X zAS)HU$OUU}z@A?s^p$NF^DVy>&vs-$!rd(L zrLu-veXfM$_yp)PjDq;F5Q)9`VbQbY=r?W$JmAeF^WPjH0h1l{JGb1#^kf42z6XNC zK@W1uvm9MZ$JY4ADADWL&&lkj*)+5A74(mr4sYb7!94W_nYW9D(kWerO8tM4=hACg zcCr^$Jex&}jY8qENdw51SK{RSg=l20jqiUXfqLelDTw*8kflNhZ-&B{Men(ChB9r?)BAC>A9opjx`v(uZdsA z%fO2U6FievK)R)$5}nGw;K`~j*volj7&|W+&zD`poc!xl^iwV2>k0uM^EmoJC5bf5 zaR+hkXYze|1Nt77!RKFsfg_$=rDGXPtPWjfHYw%ehAbCGZg>j*ED3|@P6iNqCZ1*u zvA8OqL3Lg#K$<^+3{T3*-FI2TZkSg6wU27Xuf^o@cskAdGwGTXM2%$p@X6QJ5b-J! zg*isd$yebpv|16js%vAr$3jE@5^c&+&8*(;sz7`fBm+Bg2$p&4li`)#ba6#AQB%tV zv3^fnI3p3~NVl`c6wZPF(-^QJ+GIou$m%19VeuE`O143 z-V=p|Bdt`XLj)v-O|VyDgl$g+TNc z_rlw~TS4WS3`tsNMQ-$^(3lnpSabL_c|KJdrk}4O?~l)+zTLZ-@~}Q)-m?z|Ug?2k z+#xU>Is@LPN7xt>0uP!)>5a@xI_an&gpCm={F#;XQnqpRb9)CG+kX+4To;DoF9|TI zj{&$AidH#OK*vf)fNNfkQ-3YL!BAhzS4*REQhCG~&d}UN$uwVPn8wv4lUo`Mgm-Wq9JuC4=6tfH z3e!5+49!nu0|!`EYBMR?FNh^}GO)V9mA#g4iO)|)(dUvE;nVXda(g6^aJ9bCL(XSt zT3|3FT|5pq*3Ce_Nn2>(_9R;P$%c+t)Y9tBN8tF~ePlW_2R7RE(ZHHw?D4b0gGX)= zj#55uYtX~>fajHopt0+ICe#r%P&IaNg zAx*>YGwhCjMaYX>L!%_e(%kRzc+2KBT@&0zU*0%D7kF9Gs;k#1uSy)Zo{NBnXkoN2 zE+qvKuc|AYmrz07dPe2?D>fr&7917srvAEqv~e##YD!n*gH$!L?qM`8YZgIYH$9wp zP8Fli2s790G?;>ClH^B37}?G~s?v|UPiz`fF_-g$=2i>g!=}YlqDup}$=QIs;&RB% ziKGJa3bE)w58er1jiycsXqb_S@hy8$#dQuEI2&N&$q4qU}v+|;fw_QOg#j#35Y{s9(c%O zBTk&FgZ#^GQbDdIb1JY1=UWJ2v}7q1nOKt;mAAxSb2V7$@v*NC86o%nR_tkcL%yo* z#KIXR*q5!rL|eWm4~iz^`HjjL)whxE^c92qx8zV$@dJrt=RwU@dq}ojNE1$EV}iqV z`f+;$bzgb_W8bKfpS*YUS!^Ln!aLeg9#6*j*P`&HL=5!Yji+?xLqxPKb-K6+{7oj2 zh%#>w5Z8c=7Y*dbul#C-o~N|&*lU_|Qkjgsl}4U=&%~`~BFVuMQyCrO6=*jQhcS=m zfL_;n;FpD$E*A4o<(wviCxd4bR3u}TW_RlX9BU(TdsdKJ6V6H(_!Ii3*ZLL3uF+O>F)F3Ev) z*QG)1eJ0(va~6=7lfZZOAPHY%!9>@lFdKMBsuxHeXG5M(fai`+iTECOv|AI0N-r1U zmgG6OFv^bpe6yL!(Bl=;mdzVUd?+%jGgBq zPf8iKZQlqoZ)SjuqB@9O4q@}dAJY4BwM1!>EIIF4$5?*j5M%eT?D&2$bhD`aulSCxchZa#|8e?|*MU( zpFg1c>liehw*g^V3VpjIhfTY&2*oZCSQsB==pxZXEjIZfo;XbR9rA^A}f=yg!B0+jn!P-MJkD$oCimgC!!XR;Ee`2(tYGni06MVtFM3aV zz_6#>0s~aaXhlN>?sGZK&UmyM_Tpmi9vIve7`ba#-Re*!_GFCF_ zGPLE#)5xZ|IK*|v1L3x$B6O5N3D?~%rUnfHg1WDDs0#fL+pY#ha1g=2|PU~iI{_Vp! zsXqcF23hc!B1cRsE;2dSreWAO4mPg2ft|7jxW%EAKHvC~ELm-bJKjVQW5w~@s(mJ) zS$dJgjf;e+xu58lbEEYR?<>^ltTLdR0NTxO!Z}AaVSk%2#`zYZcbq<$>xn|MTOQHIzMV>UNwPyU7L*QhV76%LvX6roWMlvF)If zy~owO>$^__r0WdY-CtH`ZOX>(h-S)K3&ZXQf@r)+3r8h*(q+FApRKi~Rmy8ovzbet zS*k-9qkw}ZYq7AIVxNj9uBkdol{bnIw|U!OW`QUa9}T5p1_fk%*LR|~BZ_J27NX^c zQ*oT*nNdw_F-~90qpyc%wD z(M*N&IVk(M4N`Isla(_gu=9i{do5}Uwl9CfTKFBKGabIrb)pSa=;Tsbcjpi-tt}>B zzH~wEp?fhw3SY6c+=mUPpia(ZFp7Ff}faNKYl*JR0es;}lm6;<6~{8?}4+n5PFKp5Iln_VUYp{ES~w0oJZAg<7x+yem0LBSBWKlFQ&rGwg=$k$3l~04XCd* z1+PANXf62)%+Wpwu=-jRAQn}9*en8K#p>Yd!W2lgE2ZtbU$V~B49!h%u(}rV+-oZw ziFR@Zgp97uA7MwJCngR3Di9W*R|I3Lr|d`HOyD?OV)h%EfX(=`)K5bUHn49%G4u#z z)LsFVr=2v;DwEA$KB{e=&ty=XfXb;#fXl<=u~z^L>3Sh=sTJ6s`9P$lJ~0Z1R*|L! z<7ijzWq3Of%1$W_B&k0}zjH){VH%Yf)c|Keo`f)L8Ttj%rN5xjUIYyEb70niZsJ#S zf^jerCA>8iIOWw#sQj7;yVE|yta&_0(BA}iOI|`iUO*Mb8^Cn~R|seh12ZEN?%j?X zK+dVdjC+|xxaTvN&90}DZWu!lQvyZ)*P-{?K^Uta0?STj0Sa)q4F-$J&krGFX|o*e zk<-DDT~DCe{X*%fpnR-tXB}1F5F;eL^ z?X`M<>6HRCHOVc=NaQk`JXEl+q#w6UoLb|R^$14_2sUKS!V<9tGw8v{FY_RKvKN_78 zHH481tdiSksUhVkig#VZ1wR~cWX@PR=3Wr`3?8Qs1kTevxs7;CK%mA@=K)$9@Yg7r zSz(xz8kP+lV;kQ(;qbR=GXBR0w#PyfQV*EpVYTPzer$4$3uM#tFH}*`Vj8w=(4yxu z_ENtzbKHJm664RCNR1*Iu>Nip;kQvjxdXbmt|%KzmDi9jkNvR4F#vr`uhCu=KI~2n zAqO`VW57=ywHexoI+fPg8gvlN<>Q#2)wh|_5D^?NI0iei+OfdpD6T$}Pn~&1Bs$I; zcg^8p^xn5j$P7so60yb=-b={GkVGcV&xLvOsgK<6EQh?liEwcHB|3JrZhib#67JWC zLO;7wT%%({&)htTL7WiW&7Xqp9}nTSg9~x3Y%@K$V=f-93&5Sy_Nent8qyVQSrU>Qzqf_ z%t&lARiVTDrdaz#kG&S)NQWQ0laSA&`UFo0w)sdyz=d@n@-Y;qi#L-;n|V;LmP_vP z8sXM^KG4=v!bmy;LWTpt;Lt~MDES};85SaE#yt>9dI;%(8dxT~o~H0p;Mr#R(RtHB z{qSZGNIg3;bKY%Z)e~}d${Sf5Cg-Mvsuxmg7Z)GNRw@E|B&$OWTR2?#uE4V1GSj9gNxCdCVYdngBc z9zTG=&`IzXRhZ}RS{U`vbFkD*nQP-al_t&&8Qq`dl7gOZgtb3S`!cFYfk4>k`%4(M z%k&b>&LfzV@_;T-en5<$pJaD!FQE;$=A*n?I0#q!gN<+&EbF0QqF4vkFYLgny9Vx` zP~|Qu`T;hI9B!2aA4F`pNrZ~Nl6tS@=(8^ll;Ss2hg2)3XWsywwONpCQ2)&MO%H?2 z<>K(AwVh5Ejf5hPRQCDxHfY>i0TS{%*uJe%;F|0XMQZ%qk5=(a)<-+Ae(#S}O77%K zngGb{yiYf@rj6E}0Vqm+fSiFL+GM52THKAL@$oir`-c>_aN7+c7}uPuz}1O^_xb2Q@+X)d~o!)q>u& z_o-2R20DmOgm%l%l&9@Qt=c9-PVNvGDMeOCEwRV;Y70maEv99kfUbxbhL0Wh=-t1z zfEjk|AgcRR;`1x-89CK&7@s7BeOKOr zh%7F?lqH2GSL2)l5PEX1X%k5dHYis{i>mCT{Z?}V>t_4w)6adcV z*<`#>3q;;}NS40i=Wf|`13Fgs(xfa;YQM~y$ebe7_vL9KYdIB8&$1$P_bei~{}(wF zt2ydj*-E8mOOWmP1+dBRDHCCD0+SAZp+@XcMkT$NG&H{^r4k<*y-a)Zja4CT6IyBg z_C++xZ!9WaETO#aX0~Ep2l1>{tG;i=A;m!(NaKSi#QOanNb;42fh$2ovO$q1drFh- z>!e^*dL)BFZ>Z1=A2unxn1+c2lEHD2jK}Bgu(om)iRY`s6c3gbR-Gd$^a@R!Hy&iK z$)MKv9at=X9M3*+1pX)~xO9C2m2>PR>-Oy-Mh9-OZzVa{=48nvO#Du63yGpf*<27Q zKSh65jmO?gTA*iVMaoYaQ_f8(CbztS0(3$xB! z6gFCNFeK(Q>5qve{0epCQs`UKt-?>nT+||kX>H_P@>mGm=QpYtC6J1_A!K}uBJ9#f zCU%=Bd^%i8)3+pIm}D&7Y5u^lc{mg{B@Q#OMsCFXYB|+#Re(Ek(@2+)I-J_e5B$Ol zX!N^U(zxU^6Db`{G(%sKRL_~vw9g#(EEB}iH-oH{z68vf97Mj&k_CBj6Jqs!0UHpQ zPl7CKq3`q>s7~Ji_3?r*M`P`1O~C<%rfS0RKp8mSa~lJP#*!~TzO&cDr$T^b&E-7| zfx5TNBu7XVFW=cjtq;ag-zkEy{IMo1GqFMoxK7?@{vb&&j}i+-RZ#U5f#1%@{DS;t z0ZJUs@BU}!qZ~*3PtM2xG@xq!Z>*1hvtTY1`OV;dhO_m5@BdWe%>EMt^}m@feH^`A zMjfd8?S9*DZCxBzERp~32x;l)Ezs9mv`9xwOJ7G@S9_u60$t68`kLB{^!0Q`4?4Pw z^fdMK|7MfDx%kVz(GZQ=V1LJn?>FK9bUy#1nOFYz5C7Eu8z1z)g8ic(SMi^KR*qV9 z{{Z$cZs32#{6~we@;_nT|9@cq7hmtc!v3RcR^gvuEk@(>2de++*8Nwsf6PSLe?n{h zGuq#MxBm+Dk0~YhPf$&&eeLS|=wNJvP?l*kZ5GApD)2u%`^Cdrf}Ns{l`AJQaA zLXsvSl{9OXUfp~Dcm3}DzxVy!_3pLyI%}P?_p_h#IiI!9-p_t)EP44j96>?O|0;?c zea_l&-=JW3ufX87>)mzT*M$W~hJ-H&^H~?>6CUm#9JJ8eGs06fWPPBSu{Ou$|Et7} z+Y-ug78n~nI5Hw6G9r{`ZA5U`T0d3y$O!+yaAEf_AFoLNKyUX5pP=yIF!zlb!lC07 z6`jYA^#~tp>AB7)l=tslgv|njJtJ2B{ibjzpEJ)`8?{h=XWp^v+K|Xlf#q_-5uxL4 zEjfyoJeFdXLYCqYp@NEIcP;ZR#VvUvLWNS-t_=*O|GJvt{K%d_nl1Sf--+A=A{$~#I z9QnWG@H?v)s+k$j`7a0rMfsBWv$B{!(*qs%FF_btg35EO@nZfcS-+#5RHrAA7qNG_ zyS^VlkERMtr=fUc$_UNk(Vzv(E$P(nRoHfPJ3f6|k6Px>i75V{#*g_>E@LJRyXjJ! zqdfHJu_#bp-~?;J32d@GOVYCH=*<rj$TQ8JkOf24<*6%r!Jas?@)_7kvNVU1~TFYXz}J&a!Bei+a0@` zOpR+Kb0_S>G+&Ar+v71`y^vjh^C@Y3Qiv{v$8qkIrSMXBkD{Fl{zn>_%wci4R27K?c(K zQmLPWD5Lf=n0DLlr{nmhX6;3O_%;*$J*7qhOJkd17iWg-Zvygo^3ll$VWygK9h97cg*tFfAbjW=_ z)3szLtdTFp&&S*7#u>_ZqX~(*NFO!UP$mH<4nh44ZycCA2WmtBN>Z=l&To6LeE&4k zp?{7R>hscCq++qXI$R6-7hY#8M7o(qPCa{^_ZEDdJPpm;opBCF0G+OX zBi{o}k?)!()_RPCkb9dU&UPoFlP!dqWl`;yt4XZ4Afx{71|4kFBE6O7^vl3( zHeG?GUzS2%_yiVOSCP&{Aw#J%r$B=HpeE(a%KJwXaZ@3z zS}I7b%=@6&&9@c2rI|M(0dckCenZ)>UR#&hr{QWAazwvtSrd{{SsfS7sO zLv1eu*1jQ_VLT4S=E@jOT%8Gq8Ixi3^I9U!>cHsDjpVt$HoQ;@Ce!YFz~FQ@lESA2 z-4UONf+G(M9EpN!>liZGLv)up0czHP==5Wx$9Xy%6MGe2G}r)4^Tk$+4DfE; z07k1-3^ocX;g>vT*t$u`Fkq=aIBqY5A)zlMDRKc+wq^hV<;CuJ_}-=C&DvcUc>tjpMx@Y8L&E24Tnv=X+>%yy%ocQrA;2VUB89~40+*_ zoR939*$1hsK_>Asy+?1p%qRK$Sqwx?!FhSwT&opcSUEZo#3ff?!{zu4b~SQ6P2hfWdEf%L_25!n(KLC*$r7p4Sq%&4zGcCsk1TX zoe2D#+D3y+ydbDx4TviMm6>spymHOv-slaXoLgF`X*v;?9{fcFyTl=`%oi0;>7ia& zF&&zbgwmHLVL_)Gv=L#{k>O>X1rL$W$4?NxmbqAcV}#yxXr$(PnsD9j6dhduiJ8-| z0Ht~5L12FbfY{L(aF5u|7&vcFZEL;<^lxn=3L1VQn z@z#x#bg=gV)^3x;T|UP!Idw0}^@L(wxfbTSaj`7uG}b3SL-uSuH4m}Dhs^joMPHdZ z#p&4?8&JglP*6kxID;X%CD>ZaTNh`~lDVem@aHrutlvvfY)S_{yS^0_74xfM?4X9}Sgrdhm`k~hdQ;e=){jdF0<69yI$Q9$9bA_nwvl?&jt)skacHo;Uj@Wh5 z5nCQ6VUO=jraiL+Ww|_ca`JcZQfel3Hf_SpqNjM%V*>6u-i3n(?f7KwR4^6%qwai} zAF?q`xHl^u<2Fg@d(2VCH(#%#(MTG4b#`NQpDlXWO{;ro{e_Kq_YiIL_o25&Je81J zj(jHvacUM96;stnQA!+|>?+4U#4534{y7X>%c%=^H5S+Q2bd%_sqR$rbILXRNaH2< zp)c^(eJD7KQLPKmYN;)XD4fOvt%G=_F#(}#CVtjV#J+XQFsc0_MztiMMI#sAjd5xZ zC`ZoGb^31(RuB`eCweF3p<=-UM$s_=#lkbOtMDGiT3(?Nl9@E!vksYqov3Yoi_V*w zh`B6M*}4MkkQ+oJBTIugg~i+}S8_13s+9g{n+dYB$6`o!U)3A8EyhX6Eqx z>2+4dSy4~=l$P4q3MK;(@k!fTWQQwPLh#fc>Na7qmFT(^L8{2C&;$cqs?vJ~pGw%7ic z-UiY4_ESHhbY?5=A&GMbnAE`w^x?v@B+KCjiSjuC(_c)52}v{H@U9UOI5`>mABVx6 zOGjv1yDLeH%z@I|M)c@}{a7fn2lU$mq1;Q47@s|N5)Ydygd<(FehCsvq1I z%YvpHCCFVLLVQDAh*N$XaD?P*6Bkx6Nu~42FE?cnoxPS0_Ic2zqVe?LGC4TAnbMNCTa!oj4v9gQZaTD3aPCd! z=?g9qU$c^ao6=A9=c&_%JDO0?#dcR^Fy%^2=a)u_^sleT%tJPzLwj9047l#c;@WGJIX$Pbxo3Lz8kQeVd68 ztr!i5+*XkrF$5d@_tTasqa=Mw1!;b!iS0K{X}_2mI&yg6U8fUw|GG*d6EqcM9;o4c z!zVzV2tkf*vw?$Z5NVg6Oz)=vHR|6(xoc+>k_CgT;fd+wdf!rN;Io8w#lB|qYj?pXpA@__ zSPY7J9Efa4Mw_J)cp#~Pq^<4-pRH@D+qAhB} z5qRD7q4VPhvT*YX%<_+eILFtlig+rW^ihy@P0C;;9bMR%yt@%l-QJ zutDp%-BdXJIvl*Vjd8e}15uu{@P*7=ux($6d{54@Irk|M*a9S~+zSdJl-Bg!A{h=7 zn7JaOBHMh5o~+jNJYH)N$Euss2pvlIW!!UOP65R;>8%Q(n&Kk6S3;< z1DYfN)F#mmM?Npa@<9H&pjYQnBlrS3OZwuRzI$|!?E*Lw{+R9+KZPPvqIjj)73QX9 zF&erupi$@17DltyWI7VXGPGvMT%DYyueL8`Q&RHgAZM_fwSZso9wp% zmi@eravLfb#aGG5vDMel{3wg^y$aaFdz+C^d`sm-1<}grCaw3&qld2Qpx~E2j9Xu* zpSRix=S@3Fqsp{t_9y@+ew3Ys8M3F*wksF6gl(a}fK&JNRRU#2x6zNS+wtL`7Q9K0p-Yc1#KT{Y&>p8= zs$X0|cit%?Yj^EMwWkV9>%DiVpwvUOjgxR|(q-1XaT%FbkcuyOXHzEIh&@_6ALY3G zbyGXHqv4!%Tqe{>IO*GPqeBV3c4{*Ahgo9mQVu)f<%dCsLLoTv6bV^b%oMyYL%ZEZ z$Y0V+!v6S3SKs-7LEU-u2agM#5mrW&Hg-}G@yqN&t~gHDxlc}b+@^v{2C0!v9Lv)h zPdU>=5wBL_W@WxQuX6{{yGb2JRuFE?ZxLei{TR1f*)Ovp%0Fb z2VvS!dN+=|KW0oe&b&(W4^>dHG%&DSH4C!xOi5`8gX&2uXsP+wfAPBtdHM4&97uT0 zB$ua>hLt}^<%lqRuPG)`=Np;mJa;M;t;3l8?4mNSRIoVNjSl~kfOUcMfiqrKKNE_H zej}2T0%9m~Tbljc{E_WGaF?2_>Y}iH8nk>ZWLzg@7#Mwx;qnEutda=_=H9NLwmcSq zp}8b8{V=_t8-;^i-X!nsEO^$cj7@oaiB8~3Jndjb7FY0trQKps?h_(1^Yl^71lZ4E zd8B^kPexmOkS1SsCwH`MXv&6i#>3o`C@#$;FHWB)oco1T`-=#PwRXXh(pikkh%E!rqbmtVMv zY{Ff_7q3GW%iCh-m?&B8=7T+BYwgI=a=NU~6uS=P(KBuFv?;w2S=~ou^+-9j7HT5m zLgBb$fg0w%-;Pa_4x;ixS#Z;>phlZxxgM@=^wG~*n8G(1EBAh(k~@?^Kl27^oGrs0 zE+6QXUP6WH7Ng7Z3wSu-G46ahisdENRBB^AE!C68lyY4tdm>Ez`U5dtej8)tc@)Kd zxw3uds<6uQ4sj{lhial_*!z}&nhHBo77kRQ`XK&NOTrIx#x#!l`{aYR5(ZXJp+;F% zXp|pqkg|Fq&3$}P3nB_?uzvdqM(YnzRC2T!n?Lqo49`j|d48AXsYWx}*Y_ZYf4YIql^!bh z^d$DyT2kB0N_>(3l_^;HfN1>mqgKu%RJyDVb2n&Xs>f_JStnk%SmFh`c>_t}t4_gjD|Pn{qv|19HCxel6=he@tM4U>A&4K-C&S)R+` zRP34$8+l)oa+1I4b?X+<2P!M5cIGtVvFaf!(~*v?cNalVs4RA$mBM6|K`PoP48zxj zz&NoSSE_HI_QlhHD0|X`gO^F^NH1;9yi0Osj?%%`YRn}0NmOn8bGlck7*F42@xyyv zuz3BH_3{y|J5+Ou=}^*w)v0mhlSd>xl={G?@z+rA_+wPSrj;JckARDUR;0<*s5ZVkF}R`xexJ9~s?WjLcz74xu)CN{bLWAj#tc-g6T?5I$kW`Nm)P^t zhFB)%Oma5v#r;~EG(Jll&E|OG@Dc@_%;{&oSIs3ih&0NHeP<(+)oIVP>s0$>1d8og z2d;cu$+Uf3{JKGobTwXQ)Tb!Zo*$>F@%kaQt&L?L*4s0!7kxK*$FUvL{YEBO#yF?F@JB8z+q+T0#ydDRw2dqGC z>unPHb3BZmwL%s91orSM87#Z&PUe;w673pYxSQ=sbs|`rTm6`3Bu|2}YpTS`Y8Trv z&Vz<@AEr;n*3CVa^TEh79eS3X29Lv&S&_DAVlpxh=k08S!MQ`^fYJ)!REyxMAuH6* zp9K;n*&tz5Mf<&^XkN2FGE4T6;9C#9zJY5;sfJvrqPP?7ub&7 z0iZIr*G{fh0+A>eqHuKxl<&IJ?g#y(MX-x1h5>7z)J>lX?*r5JN#Kz%k20(@ITh^g8su+FjuZ!JeSo74|0YMP-${svoWAc$R>Vl*x^ zPd`sBni28kP(4*e97;DrQSl3;>W%}}zY~PZs=YK>zyvy^Kd>iO-h)oVM&kEY1je7a zOuhYdY1|1T%-C_T7)9Bc^N#e}X>0{LwM zjCj8svpMr7$-77BXtWkn#Xk!!59r{$ccmoj`%(6+sw;GF^CHb=zo>=&d2;r`D5>}; z&TyG?9|wi)ezX$NzvSCKoJ#Ux1M35|Z+ zMk39o!{8)cIy?3Wx#+G5=F5VRO|l~WYG&|iu!rb;^}s42W#WB%Arabc0w*T*vZ-qA zgf*<77gt{*TB+yA5BqOS)UgJfg;#R-P(fy||4h6Ws|Zt#$AP;F2TboZ zlE=gfU+ud}=0+=$#QHbXh(BL{weAg4ty@Es1TT=<#(Z{4x(tdg*-bVE*pR}Qcl4;0 zICM>&%%*5fFi_rkn$q`Y7%LS)=m5$Q-8<$QIcl})DjGHo0_e>Dld&US*AzUP_3Dfbvxy%B7vy^PHdyz$K1 z^LUss!Ems{9S!5_!fqZwkUU3)AqBhYr`8?V^A#&%Kj1u%>)3Up7JJRc)tS}iVd>FO zZ2TrqD;I3UqP~~3^U4K$dT~2OT+cz@SYLGYU4^!F_K?>c!%pp4f=#L`P$u>p?KPFg z`lap6Kx_>*&AE?Gv52qD$}w?D8=83TM_=s@j0+iHVvqd50QcJ{r=5b;*1}kHW)N- z95rWLK)xj*`0>6p?rJE&bBAnk#uCJNaqj3C^^4dQZpM6n5p+sAjE*b6;V{j{yuw0k zy`_n58@YHKwj-x$N^M=C1d7#{V9NO8_$GTNw&g@))yrxWm{EeL@`=_QXu^oWRcM&K z7zgJ6f!#-rQ`^2#+IpIo+04AZ^QdAaIrWk4LFWYe^}wEJvnrvXFleZ z9l^Y2c_u92D4Khfqqux69#%^pJEPA+{>p*n8;cfYNNB8AJvsEMgg8^j0oL>1Pka|9!4p7U#i$FutBvX5MRxQ@jQ z?R<6mp{01NWdc5apbvZ@-_SxZ26cDxqsO}WAThKAEl%qrJ>Z6$d-X8Z;t$O2yh7ip zsgR6ZXVl`ofdw0~QPx4Q&hE$^+PNck=S&eOmz^GNE!3RIJ?gyNZ(CYM*Sa4(QlO>3e1hiGtqZ&r`P}vh9~1rD-mR?&QWv6)tJ%2Qzxpw7GRR_Kz0@zh`>%-LfIJIdebzy>~e@*UhIZ)vnN7mzUVueFoXx>3DFY z1*JNY$otEQDEeYG_ME9jIqL(+WD1i^b|v2Dy@hilX4i3pQZZ)cZj8~;rayl@rKLw@ zVWVL;wpu$d0k5Wj@?`@w!m|_xicnPW91d!=qhFK`DjYY(Li=LeoBI+a;`&i^(>i(~ ztpJVXm9S`(4+D)dP(08KhXkdtZm%xRvkyS0W4G`O-x563QipF(x8VK0T6}ta8){cc zVMN$Rl#98BB^plj%OwuVPYc7o@ufIT;wZxD1DNG9Ks+bs(+9S*pg)(gCJ`;9_vZ}! zzQ~-8q|1W4zcg&Qbr$+X&qAZ?dobS91?sX1(D*={wHrMGo~If>HsS?oGH)TDb`FvC zq6ttv-itm?QzwA~XJPQXGJMt;M+?rMWEEE*BvO0ELz84Ksi+(gDKggz~$p*yprw8P$=4IcxCgY28GuW2Hf%ZQJXwoGf!`&;7k#B?hsXccCWPA9* zRi8Yf;(DHa?YbBWb3~x+s11$apKRE3={#BG8wW!v1!TpHQ3|9A#AgSBQB76t`6lL{tbh6#LiDIm{p4}+19CXTx|lbUuMg&0{|VszR8G>gTc zEaC*aclA6_3;x6{YFdGNW`$GNs01cpv;<5p7qE*>Z_uKW1ju)@p>hdvVBsZ*dr1h~VAmTC|dii0q3J6LDlOGC3o_ow`?)= zuWx_~mAy=^@?G*HVO;#6#F}XDm{3q9H9m40xVaQmw$vL|@MrwC-infYkToRO2wczv(2raOE-F z6C_6#qZs^lKE6Jp^&n1>!}(qRv-44bGygB0kN-5FD*PAL$G=%H7YYAnaOdbchrjlJ zYICIil7Z^q%$MOlVg6$d)ELj-K3sQyZ!_a*|ACOYmX3z5x}Khvy1K5GrncrHH4SaG zMY?KQ>S{WRv~|@rG<4L}H1rnzX89uDJ#s8WoZr>PlIJ(!e>$K4Z07y_{9ns|>x2Gx z)IS?@7yb+6(lM{@U!ne+E%@KT|7^O|_!say{|)&6a{B%|{+|uCO8;E0}&*`H5FQ9fRe+~M7Ph=ZQ z0fDj3e=D-Q3;v1v+s^*G{LgjM-;$HV`TCE>WBva2DEJ>)^mnH>|8KAJSfBBn-|he2 G_WuCs_UPvT literal 0 HcmV?d00001 diff --git a/fme/core/models/__init__.py b/fme/core/models/__init__.py new file mode 100644 index 000000000..4ce828d37 --- /dev/null +++ b/fme/core/models/__init__.py @@ -0,0 +1,3 @@ +from . import conditional_sfno as _ # to trigger registrations + +del _ diff --git a/fme/core/models/conditional_sfno/__init__.py b/fme/core/models/conditional_sfno/__init__.py index e69de29bb..d37a22639 100644 --- a/fme/core/models/conditional_sfno/__init__.py +++ b/fme/core/models/conditional_sfno/__init__.py @@ -0,0 +1,3 @@ +from . import benchmark as _ # to trigger registrations + +del _ diff --git a/fme/core/models/conditional_sfno/benchmark.py b/fme/core/models/conditional_sfno/benchmark.py new file mode 100644 index 000000000..62b4ef992 --- /dev/null +++ b/fme/core/models/conditional_sfno/benchmark.py @@ -0,0 +1,126 @@ +from typing import Self + +import torch + +from fme.core.benchmark.benchmark import BenchmarkABC, register_benchmark +from fme.core.benchmark.timer import Timer +from fme.core.device import get_device +from fme.core.models.conditional_sfno.layers import Context, ContextConfig +from fme.core.models.conditional_sfno.sfnonet import FourierNeuralOperatorBlock +from fme.core.models.conditional_sfno.sht import InverseRealSHT, RealSHT +from fme.core.typing_ import TensorDict + + +def get_block_benchmark(filter_num_groups: int) -> type[BenchmarkABC]: + class BlockBenchmark(BenchmarkABC): + def __init__( + self, block: FourierNeuralOperatorBlock, x: torch.Tensor, context: Context + ): + self.block = block + self.x = x + self.context = context + + def run_instance(self, timer: Timer) -> TensorDict: + result = self.block(self.x, self.context, timer=timer) + return {"output": result.detach()} + + @classmethod + def new(cls) -> Self: + B = 2 + C = 512 + H = 180 + L = 360 + G = filter_num_groups + conditional_embed_dim_noise = 64 + conditional_embed_dim_labels = 3 + conditional_embed_dim_pos = 32 + return cls._new_with_params( + B=B, + C=C, + H=H, + L=L, + G=G, + conditional_embed_dim_noise=conditional_embed_dim_noise, + conditional_embed_dim_labels=conditional_embed_dim_labels, + conditional_embed_dim_pos=conditional_embed_dim_pos, + ) + + @classmethod + def _new_with_params( + cls, + B: int, + C: int, + H: int, + L: int, + G: int, + conditional_embed_dim_noise: int, + conditional_embed_dim_labels: int, + conditional_embed_dim_pos: int, + ) -> Self: + G = filter_num_groups + device = get_device() + conditional_embed_dim_scalar = 0 + embedding_scalar = None + context_embedding_noise = torch.randn( + B, conditional_embed_dim_noise, H, L + ).to(device) + context_embedding_labels = torch.randn(B, conditional_embed_dim_labels).to( + device + ) + context_embedding_pos = torch.randn(B, conditional_embed_dim_pos, H, L).to( + device + ) + context = Context( + embedding_scalar=embedding_scalar, + embedding_pos=context_embedding_pos, + noise=context_embedding_noise, + labels=context_embedding_labels, + ) + x = torch.randn(B, C, H, L, device=get_device()) + forward = RealSHT(nlat=H, nlon=L) + inverse = InverseRealSHT(nlat=H, nlon=L) + context_config = ContextConfig( + embed_dim_scalar=conditional_embed_dim_scalar, + embed_dim_noise=conditional_embed_dim_noise, + embed_dim_labels=conditional_embed_dim_labels, + embed_dim_pos=conditional_embed_dim_pos, + ) + block = FourierNeuralOperatorBlock( + forward_transform=forward, + inverse_transform=inverse, + img_shape=(H, L), + embed_dim=C, + filter_type="linear", + operator_type="dhconv", + use_mlp=True, + context_config=context_config, + filter_num_groups=G, + ).to(device) + return cls(block=block, x=x, context=context) + + @classmethod + def new_for_regression(cls): + B = 1 + C = 16 + H = 9 + L = 18 + G = 2 + conditional_embed_dim_noise = 4 + conditional_embed_dim_labels = 3 + conditional_embed_dim_pos = 2 + return cls._new_with_params( + B=B, + C=C, + H=H, + L=L, + G=G, + conditional_embed_dim_noise=conditional_embed_dim_noise, + conditional_embed_dim_labels=conditional_embed_dim_labels, + conditional_embed_dim_pos=conditional_embed_dim_pos, + ) + + return BlockBenchmark + + +register_benchmark("csfno_block")(get_block_benchmark(filter_num_groups=1)) +register_benchmark("csfno_block_8_groups")(get_block_benchmark(filter_num_groups=8)) diff --git a/fme/core/models/conditional_sfno/test_sfnonet.py b/fme/core/models/conditional_sfno/test_sfnonet.py index 3230d7c87..eb01658f4 100644 --- a/fme/core/models/conditional_sfno/test_sfnonet.py +++ b/fme/core/models/conditional_sfno/test_sfnonet.py @@ -1,3 +1,4 @@ +import dataclasses import os from types import SimpleNamespace @@ -6,6 +7,7 @@ from torch import nn from fme.core.device import get_device +from fme.core.models.conditional_sfno.benchmark import get_block_benchmark from fme.core.testing.regression import validate_tensor from .layers import Context, ContextConfig @@ -221,3 +223,63 @@ def forward(self, x): assert not torch.isnan(output).any() else: assert torch.isnan(output).any() + + +@dataclasses.dataclass +class BenchmarkResult: + ms_total: float + ms_per: float + max_alloc: int + max_reserved: int + y_shape: tuple + y_dtype: torch.dtype + + +def benchmark(fn, iters=10, warmup=1) -> BenchmarkResult: + for _ in range(warmup): + fn() + torch.cuda.synchronize() + + torch.cuda.reset_peak_memory_stats() + starter = torch.cuda.Event(enable_timing=True) + ender = torch.cuda.Event(enable_timing=True) + + starter.record() + for _ in range(iters): + y = fn() + ender.record() + torch.cuda.synchronize() + + ms = starter.elapsed_time(ender) + return BenchmarkResult( + ms_total=ms, + ms_per=ms / iters, + max_alloc=torch.cuda.max_memory_allocated(), + max_reserved=torch.cuda.max_memory_reserved(), + y_shape=tuple(y.shape), + y_dtype=y.dtype, + ) + + +@pytest.mark.skipif( + get_device().type != "cuda", + reason=( + "This test is only relevant for CUDA since " + "it's testing speed of SFNO blocks on GPU." + ), +) # noqa: E501 +def test_block_speed(): + ungrouped = get_block_benchmark(filter_num_groups=1).run_benchmark( + iters=5, warmup=1 + ) + grouped = get_block_benchmark(filter_num_groups=8).run_benchmark(iters=5, warmup=1) + assert grouped.timer.avg_time < ungrouped.timer.avg_time, ( + "Expected grouped DHConv to be faster than ungrouped, but got " + f"{grouped.timer.avg_time:.6f} ms for grouped and " + f"{ungrouped.timer.avg_time:.6f} ms for ungrouped." + ) + assert grouped.memory.max_alloc < ungrouped.memory.max_alloc, ( + "Expected grouped DHConv to use less memory than ungrouped, but got " + f"{grouped.memory.max_alloc / 1e6:.2f} MB for grouped and " + f"{ungrouped.memory.max_alloc / 1e6:.2f} MB for ungrouped." + ) From a92561d3b76a74f95ab9c47c7b65bce96f2fb238 Mon Sep 17 00:00:00 2001 From: Jeremy McGibbon Date: Tue, 10 Feb 2026 21:02:34 +0000 Subject: [PATCH 06/20] remove unused typevar --- fme/core/benchmark/benchmark.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fme/core/benchmark/benchmark.py b/fme/core/benchmark/benchmark.py index c227264bf..843c229fe 100644 --- a/fme/core/benchmark/benchmark.py +++ b/fme/core/benchmark/benchmark.py @@ -2,7 +2,7 @@ import dataclasses import pathlib from collections.abc import Callable -from typing import Self, TypeVar +from typing import Self import dacite import matplotlib.pyplot as plt @@ -208,9 +208,6 @@ def draw_stack( plt.close(fig) -T = TypeVar("T") - - class BenchmarkABC(abc.ABC): @classmethod def new_from_fn( From e53bff58f2045773e0ff748912887121675d5df4 Mon Sep 17 00:00:00 2001 From: Jeremy McGibbon Date: Tue, 10 Feb 2026 21:05:43 +0000 Subject: [PATCH 07/20] use optional argument for optional argument --- fme/core/benchmark/run.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fme/core/benchmark/run.py b/fme/core/benchmark/run.py index b5a2988cb..23c428343 100644 --- a/fme/core/benchmark/run.py +++ b/fme/core/benchmark/run.py @@ -73,9 +73,8 @@ def get_benchmark_label(name): if __name__ == "__main__": parser = argparse.ArgumentParser(description="Run registered benchmarks.") parser.add_argument( - "benchmark", + "--name", type=str, - nargs="?", default=None, help=( "Name of the benchmark to run. If not provided, " @@ -101,7 +100,7 @@ def get_benchmark_label(name): args = parser.parse_args() main( - names=[args.benchmark] if args.benchmark else None, + names=[args.name] if args.name else None, iters=args.iters, child=args.child, ) From aeda7ee8203d7f20dd735b10a241fe36f20739f6 Mon Sep 17 00:00:00 2001 From: Jeremy McGibbon Date: Tue, 10 Feb 2026 21:07:10 +0000 Subject: [PATCH 08/20] codify single name --- fme/core/benchmark/run.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/fme/core/benchmark/run.py b/fme/core/benchmark/run.py index 23c428343..d540a3aef 100644 --- a/fme/core/benchmark/run.py +++ b/fme/core/benchmark/run.py @@ -29,19 +29,19 @@ def get_device_name() -> str: return "CPU" -def main(names: list[str] | None, iters: int, child: str | None = None) -> None: +def main(name: str | None, iters: int, child: str | None = None) -> None: RESULTS_PATH.mkdir(exist_ok=True) device_name = get_device_name() print(f"Running benchmarks on device: {device_name}") benchmarks = get_benchmarks() - if names is not None: - if any(name not in benchmarks for name in names): - print("Some specified benchmarks not found. Available benchmarks:") - for name in benchmarks: - print(f" - {name}") + if name is not None: + if name not in benchmarks: + print(f"Specified benchmark {name} not found. Available benchmarks:") + for benchmark_name in benchmarks: + print(f" - {benchmark_name}") return - benchmarks_to_run = {name: benchmarks[name] for name in names} + benchmarks_to_run = {name: benchmarks[name]} else: benchmarks_to_run = benchmarks @@ -100,7 +100,7 @@ def get_benchmark_label(name): args = parser.parse_args() main( - names=[args.name] if args.name else None, + name=args.name, iters=args.iters, child=args.child, ) From d17f3f15901300bcb4ecc24b052b261e700f74f3 Mon Sep 17 00:00:00 2001 From: Jeremy McGibbon Date: Tue, 10 Feb 2026 21:21:23 +0000 Subject: [PATCH 09/20] incorporate review comments --- fme/core/benchmark/test_timer.py | 30 +++++++++++++++--------------- fme/core/benchmark/timer.py | 17 ++++------------- 2 files changed, 19 insertions(+), 28 deletions(-) diff --git a/fme/core/benchmark/test_timer.py b/fme/core/benchmark/test_timer.py index c8f97e6b8..47c527d56 100644 --- a/fme/core/benchmark/test_timer.py +++ b/fme/core/benchmark/test_timer.py @@ -38,14 +38,14 @@ def test_timer_with_child(): def _create_parent_result(avg_time: float) -> TimerResult: - return TimerResult(total_runs=2, avg_time=avg_time, children={}) + return TimerResult(count=2, avg_time=avg_time, children={}) def _create_child_result(avg_time: float) -> TimerResult: return TimerResult( - total_runs=2, + count=2, avg_time=1.0, - children={"child": TimerResult(total_runs=2, avg_time=avg_time, children={})}, + children={"child": TimerResult(count=2, avg_time=avg_time, children={})}, ) @@ -76,10 +76,10 @@ def test_assert_close( result2.assert_close(result1, rtol=rtol) -def test_assert_close_different_total_runs(): - # different total runs should raise regardless of rtol - result1 = TimerResult(total_runs=100, avg_time=100.0, children={}) - result2 = TimerResult(total_runs=101, avg_time=100.0, children={}) +def test_assert_close_different_count(): + # different count should raise regardless of rtol + result1 = TimerResult(count=100, avg_time=100.0, children={}) + result2 = TimerResult(count=101, avg_time=100.0, children={}) with pytest.raises(AssertionError): result2.assert_close(result1, rtol=0.5) @@ -87,14 +87,14 @@ def test_assert_close_different_total_runs(): def test_assert_close_children_rtol(): # test that children_rtol is used for child comparisons result1 = TimerResult( - total_runs=2, + count=2, avg_time=100.0, - children={"child": TimerResult(total_runs=2, avg_time=100.0, children={})}, + children={"child": TimerResult(count=2, avg_time=100.0, children={})}, ) result2 = TimerResult( - total_runs=2, + count=2, avg_time=110.0, - children={"child": TimerResult(total_runs=2, avg_time=103.0, children={})}, + children={"child": TimerResult(count=2, avg_time=103.0, children={})}, ) result2.assert_close(result1, rtol=0.2, children_rtol=0.05) @@ -102,13 +102,13 @@ def test_assert_close_children_rtol(): def test_assert_close_children_rtol_raises(): # test that children_rtol is used for child comparisons result1 = TimerResult( - total_runs=2, + count=2, avg_time=100.0, - children={"child": TimerResult(total_runs=2, avg_time=100.0, children={})}, + children={"child": TimerResult(count=2, avg_time=100.0, children={})}, ) result2 = TimerResult( - total_runs=2, + count=2, avg_time=110.0, - children={"child": TimerResult(total_runs=2, avg_time=103.0, children={})}, + children={"child": TimerResult(count=2, avg_time=103.0, children={})}, ) result2.assert_close(result1, rtol=0.5, children_rtol=0.2) diff --git a/fme/core/benchmark/timer.py b/fme/core/benchmark/timer.py index 230fe4e72..943d034ff 100644 --- a/fme/core/benchmark/timer.py +++ b/fme/core/benchmark/timer.py @@ -1,5 +1,4 @@ import collections -import contextlib import dataclasses from typing import Literal, Protocol, Self @@ -8,15 +7,13 @@ @dataclasses.dataclass class TimerResult: - total_runs: int + count: int avg_time: float children: dict[str, "TimerResult"] def assert_close(self, other: "TimerResult", rtol=0.02, children_rtol=0.02) -> None: - if self.total_runs != other.total_runs: - raise AssertionError( - f"total_runs differ: {self.total_runs} vs {other.total_runs}" - ) + if self.count != other.count: + raise AssertionError(f"count differ: {self.count} vs {other.count}") if not torch.isclose( torch.tensor(self.avg_time), torch.tensor(other.avg_time), rtol=rtol ): @@ -45,9 +42,6 @@ def __exit__(self, exc_type, exc_val, exc_tb) -> Literal[False]: ... class NullTimer: - def context(self, name: str) -> contextlib.nullcontext: - return contextlib.nullcontext() - def child(self, name: str) -> "Self": return self @@ -57,9 +51,6 @@ def __enter__(self) -> "Self": def __exit__(self, exc_type, exc_val, exc_tb) -> Literal[False]: return False - def report(self) -> TimerResult: - return TimerResult(total_runs=0, avg_time=0.0, children={}) - _: Timer = NullTimer() del _ @@ -160,7 +151,7 @@ def result(self) -> TimerResult: if self._result is None: torch.cuda.synchronize() self._result = TimerResult( - total_runs=len(self._event_pairs), + count=len(self._event_pairs), avg_time=self._avg_time, children=self._child_reports(), ) From 43a00c3692d5790340e8a03f966d5a2c5fdf1f7a Mon Sep 17 00:00:00 2001 From: Jeremy McGibbon Date: Tue, 10 Feb 2026 21:23:24 +0000 Subject: [PATCH 10/20] fix test --- fme/core/benchmark/test_timer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fme/core/benchmark/test_timer.py b/fme/core/benchmark/test_timer.py index 47c527d56..809894b66 100644 --- a/fme/core/benchmark/test_timer.py +++ b/fme/core/benchmark/test_timer.py @@ -111,4 +111,5 @@ def test_assert_close_children_rtol_raises(): avg_time=110.0, children={"child": TimerResult(count=2, avg_time=103.0, children={})}, ) - result2.assert_close(result1, rtol=0.5, children_rtol=0.2) + with pytest.raises(AssertionError): + result2.assert_close(result1, rtol=0.05, children_rtol=0.2) From c613b739cc38f08a075d9aa9ccac66192dd74cbb Mon Sep 17 00:00:00 2001 From: Jeremy McGibbon Date: Tue, 10 Feb 2026 22:29:12 +0000 Subject: [PATCH 11/20] delete dead code --- .../models/conditional_sfno/test_sfnonet.py | 37 ------------------- 1 file changed, 37 deletions(-) diff --git a/fme/core/models/conditional_sfno/test_sfnonet.py b/fme/core/models/conditional_sfno/test_sfnonet.py index eb01658f4..54cb735e1 100644 --- a/fme/core/models/conditional_sfno/test_sfnonet.py +++ b/fme/core/models/conditional_sfno/test_sfnonet.py @@ -1,4 +1,3 @@ -import dataclasses import os from types import SimpleNamespace @@ -225,42 +224,6 @@ def forward(self, x): assert torch.isnan(output).any() -@dataclasses.dataclass -class BenchmarkResult: - ms_total: float - ms_per: float - max_alloc: int - max_reserved: int - y_shape: tuple - y_dtype: torch.dtype - - -def benchmark(fn, iters=10, warmup=1) -> BenchmarkResult: - for _ in range(warmup): - fn() - torch.cuda.synchronize() - - torch.cuda.reset_peak_memory_stats() - starter = torch.cuda.Event(enable_timing=True) - ender = torch.cuda.Event(enable_timing=True) - - starter.record() - for _ in range(iters): - y = fn() - ender.record() - torch.cuda.synchronize() - - ms = starter.elapsed_time(ender) - return BenchmarkResult( - ms_total=ms, - ms_per=ms / iters, - max_alloc=torch.cuda.max_memory_allocated(), - max_reserved=torch.cuda.max_memory_reserved(), - y_shape=tuple(y.shape), - y_dtype=y.dtype, - ) - - @pytest.mark.skipif( get_device().type != "cuda", reason=( From 36f9cd840443a92d3f42c2a7bcc90823bda9ce74 Mon Sep 17 00:00:00 2001 From: Jeremy McGibbon Date: Tue, 10 Feb 2026 22:34:01 +0000 Subject: [PATCH 12/20] add force_cpu --- fme/core/device.py | 24 ++++++++++++++++++++++++ fme/core/test_device.py | 13 +++++++++++++ 2 files changed, 37 insertions(+) diff --git a/fme/core/device.py b/fme/core/device.py index dc70a33cc..044ab41bc 100644 --- a/fme/core/device.py +++ b/fme/core/device.py @@ -1,9 +1,31 @@ +import contextlib import os +from collections.abc import Generator import torch from .typing_ import TensorDict, TensorMapping +_FORCE_CPU: bool = False + + +@contextlib.contextmanager +def force_cpu(force: bool = True) -> Generator[None, None, None]: + """Force the use of CPU even if a GPU is available. This is useful for + testing and debugging. + + Args: + force: If True, force the use of CPU. If False, allow the use of GPU if + available. + """ + global _FORCE_CPU + previous = _FORCE_CPU + try: + _FORCE_CPU = force + yield + finally: + _FORCE_CPU = previous + def using_gpu() -> bool: return get_device().type == "cuda" @@ -20,6 +42,8 @@ def get_device() -> torch.device: """If CUDA is available, return a CUDA device. Otherwise, return a CPU device unless FME_USE_MPS is set, in which case return an MPS device if available. """ + if _FORCE_CPU: + return torch.device("cpu") if torch.cuda.is_available(): return torch.device("cuda", torch.cuda.current_device()) else: diff --git a/fme/core/test_device.py b/fme/core/test_device.py index 80037c99c..5a0f9d13f 100644 --- a/fme/core/test_device.py +++ b/fme/core/test_device.py @@ -1,7 +1,20 @@ +import pytest import torch import fme +from fme.core.device import force_cpu, get_device def test_device_is_defined(): assert isinstance(fme.get_device(), torch.device) + + +def test_force_cpu(): + device_before = get_device() + if device_before.type == "cpu": + pytest.skip("Device is already CPU, cannot test force_cpu.") + with force_cpu(): + device = get_device() + assert device.type == "cpu" + device_after = get_device() + assert device_after.type == device_before.type From 17554a82c65ee609118c5d6e198b47c194770186 Mon Sep 17 00:00:00 2001 From: Jeremy McGibbon Date: Tue, 10 Feb 2026 22:35:16 +0000 Subject: [PATCH 13/20] use cpu for regression test --- fme/core/benchmark/test_benchmark.py | 30 ++++++++++-------- .../testdata/csfno_block-regression.pt | Bin 12178 -> 12178 bytes .../csfno_block_8_groups-regression.pt | Bin 12241 -> 12241 bytes 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/fme/core/benchmark/test_benchmark.py b/fme/core/benchmark/test_benchmark.py index c7c95629e..e1de577de 100644 --- a/fme/core/benchmark/test_benchmark.py +++ b/fme/core/benchmark/test_benchmark.py @@ -5,6 +5,7 @@ import fme # to trigger registration of benchmarks from fme.core.benchmark.benchmark import BenchmarkABC, get_benchmarks +from fme.core.device import force_cpu from fme.core.rand import set_seed from fme.core.testing.regression import validate_tensor_dict @@ -21,9 +22,9 @@ def benchmark_fn(timer): benchmark = BenchmarkABC.new_from_fn(benchmark_fn) first_result = benchmark.run_benchmark(iters=15, warmup=1) - assert first_result.timer.total_runs == 15 + assert first_result.timer.count == 15 second_result = benchmark.run_benchmark(iters=20, warmup=1) - assert second_result.timer.total_runs == 20 + assert second_result.timer.count == 20 torch.testing.assert_close( first_result.timer.avg_time, second_result.timer.avg_time, rtol=0.2, atol=0 ) @@ -39,16 +40,17 @@ def test_benchmarks_are_not_empty(): @pytest.mark.parametrize("benchmark_name", BENCHMARKS.keys()) -@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available") def test_regression(benchmark_name: str): - set_seed(0) - benchmark_cls = BENCHMARKS[benchmark_name] - regression_result = benchmark_cls.run_regression() - if regression_result is None: - pytest.skip("Benchmark does not have regression targets.") - # If run_regression returns something, we expect it to be a TensorDict of results - assert isinstance(regression_result, dict) - validate_tensor_dict( - regression_result, - os.path.join(DIR, "testdata", f"{benchmark_name}-regression.pt"), - ) + with force_cpu(): + set_seed(0) + benchmark_cls = BENCHMARKS[benchmark_name] + regression_result = benchmark_cls.run_regression() + if regression_result is None: + pytest.skip("Benchmark does not have regression targets.") + # If run_regression returns something, + # we expect it to be a TensorDict of results + assert isinstance(regression_result, dict) + validate_tensor_dict( + regression_result, + os.path.join(DIR, "testdata", f"{benchmark_name}-regression.pt"), + ) diff --git a/fme/core/benchmark/testdata/csfno_block-regression.pt b/fme/core/benchmark/testdata/csfno_block-regression.pt index 3ae9270b3f9f25352474bd2ff95f804ab6dc3611..85a5aa0157d3a6cc80813eced819f7808b5f22f8 100644 GIT binary patch literal 12178 zcmbt)30O~Ww|0{>X;3MlGN(zCir;>gB$YB0p(urlNGeGpDy1Y;iX;gk6wRo;R+Az9 zg))UCB%#QZkes~lIp=%d_k6x{&hxgG39#e{?;B!vFU z&=8s~v|_!t-#Yi@zUx-3(F^oi8R)fsz0W$oiJr@Xmg)Pi@wK*`Dzy0jZ-k441qeBd z_g{V8h9Lh9K>@-mg4PADSf%g2A;`yfy_9>P*YXWMzMk$uUViJ>1-b_tNd<^#X*mz< zU$MU5a@k6+0MWlMk+Sw(w=8J>-w#R!h&c=Q+ZY54a2D-q4{YUMO*$=iC4hoRmxnjjSUtg~kLH)P6UTSWjr&pktrEV6VX6Znd^F z{NH=i@HdkS@0+&&zS-aN&rBu=Y5kGOzq`_XnXk`EzqMX|LDrUI{zqg?O@0%Yx-sCj zNdLuu&p#8;6f*b|f#sWmyw?5pwzZ}1|LD5^5p}*kK|#J=zd1y+7ejx?(eL?Z4kLsn z{)xlysGewGZK?CWAuzqB92;MJ!UdDxqtt`Nto&^TP2E$Bn^neA?=KZ9nbD5-j);-! zqhu74A987*xHWcdwx(>Z48uPhL8;yu)D#tmPMJz%)Gbd=epgU;UNAqsJsg9J*1*FT z73{(4a28%To!pidQMPjdl~#yB$terc3=Y7@E#FZ#y&2U+r;*GcSL|Ia3tF|gv_?^k zxL@&nVn_sfDvd>lqQS75Ur(Fv9i$y~kLY|YVp8ukxY?#gBB>E%^3aoPzsQjO*;|zP zrV2YgaJ-d-107Sfp~;8xu|QRk1>aR+wMk)g?DRqE@#p!klMXS(rE)ayY6v|~P@$B~ zd#UW$K*~Nfjd3Z8sCv?!971N7?J$m@Hopfnt2mU3qT4XN!<@D47B5#ZKSHZ+hESW& zGB#!MB5E>s;m?^aVYTzxaZa`exqQfAtA5U4A#f9=S`ujHjs(*3<~hwAI~b{Q5}gMH zv+!0?!Sgr)YA@VMLicI~o8+S**I1d(tXf4T2YzrH9~6>yn-WENnNgh5TpH&n4l*uP z^z`FYviM7qM3j-Vaw|y8KA!R`C(@I;5;BePyWn)uE}E| zd#$*(a4}}PRSf++bFd@)JZ6PgA-6h{Qz|;bwL1#9k~x~>tG$>K^J9st{Do2#k!U?9 z9S3i@L(bD}`R^xmNZoTD4b@zNpIf?c$s}FUbQPzQTf?zua08kc%2L&|E#&O*8lOa8 zA=9iYl#qRbZkvqY_GCJc)5HtZI6;QK_T-ZDETCzM$<%v&7-E_#$;uC--SKN^M#*9l z%X^69jUJ(YpOZjq$$U~<5J6n)T539}OxNANqHJ+Gb-U%@o16)xx9AgU|2j{WTEpl| z>v8A_;eoilFwiKDK1-Sn_v?qa29_Cb#Y^cE8nTc6Y|2&w?0= z>(e9|p`BdIz{i3N?<>4&!zF=B*;sokC%;0w9=3=_~O@3T+7AvrQi(RU-$X3P{lctTJ z(oepW;2ud!*(7N5dc}6loKDS=v!J)}3Hu`JM-o?hxMPc)Snak~Y^3HI(w{aR9X83c zDC0Z0)UTMb2Kr!dkQfB}5sEnL(Y*9;tfKu>Y4Rd9@ONIyO&_O2XN=$B?EMDNtX04z z35#J9rQq`+(q!8ZNWK$C(%@m1l=t;DH@4$2v)#KF$0ci1j^cA%(r61}4f1ebCWgJg z>Q35@dekm<0i|cIqi^yn=vwJG`t{v|wuaYmt2Cao+u1J#jnhsET6dc=RW~zst4fa) z{i;!6@c|6ku1Dh*nXo{!UNql16Z--l;~6z)s=lT{sSy__t|*T5*>TFWiljL62zoQA zj;W13PD7l6sDjhwkJ&H7Qg?6iJmAb79(9S0C26vHt4=7IOS3j_V_OWxVZhP*tXo5b z5>f`D?Rj;`t8}A@!=gwwqKGCrmVo}WVA!gB5}xRUv&QgmT)Kf1jC74h&AazXYd4(X z^`7g}_re31pz%y_#dj9A4#~hEmul|vh(#z!yug}B5fmIhvAh?9X~-Hq@_x4+ixygu zbDb3V7EWa!ugb8+lbTTd^Fpc;wuG+LVvuCu%_{5{(zpHTG}v|#Me5pPdXpp6eM+a+ zL5}byWewAvj z7i;q#H$4=kwQ*XQsQHt{{nDjn`6;9|dtce|)z_iu=@^<_Adjh^^traxxwswjMJ*qtCbdjIhawp?4vslDreC)O5o77>p8_aP%G-RdNF-QgEo^L0C&xwnL7 zsd`f0_e*qq$wBtjP@K+2AHhYvZ&*&R2v`_@nWAn`i4v*WpxSVN)$t`PI=PM+-zZ^& z68FIHm#(lNy`HU#Pz8TwL(mtDgwUt}@Kip^o%)%@mX5cD*u*9rSMZ8$o|go!K}*?a zX-SH!yNHEPlOgB!CKxLN)0o%yZ920~A1x_dtAiUJkw{0jR~Tf|>`dT=?~r zDQ|lh(T-O|IA6k*w9c3?W6m2MPmF;nbKF3z?j@$yMA8aJOEMWN!Up*WAasr))Ll-3 zzRp!(uQ&jv-%nvhHsK&IDZ%U%?SUm4k!IN~eD*<=wsxyQU&m=!oe>S;i+ak;QYMr6 z`14#|??!lc-Wn$7$-t%1KDObk2MnoN4mF0poczfy!Oi9f$Z?Hev%Pn-aW`Kx%TETZ zQO^&q8qa}@C&yXSOIsFt>=%1=YzS=;k_KCu^NZ!w#bIT z{$7B{>haJB8z4L+5nEQg7vy#9<1|anpi6%{do-;WgDiF;u9^=Pu6caJ`gm4#CY`lj zKY(ZVwXk+i1R^6k58qt~ za4~)x^zHr5UWge(!i#L~$>7=03KL6xUL|+$`#ekVH`}cX(cla78Q^z|@e1 zZw?f6r;5Rk@DZSC;K5Gr%ZGOPcqs8X2@#W$p#6~|wiTRbsdu|r-or2)e(MG<*DS}o z?$OM7vmw_M)&D;&5XJU8@wi=5j9_;iZn4>pdBa!p7YhXl*>d2&EFQ)A8Z0u}iSod4 zku$F20D}y+T2UNl)$e8gKgSEiBQl|P+iJM_i6fKArgTW6gtd&H51BEV5TYP}r&Hu{ zu!0tA*AL@m0y_9?m#N&SFVc`M^@1x18j2F?I@lO;nR9(T5%M|=srRKbeF;mXpPvu2 zWOq0AXtXlO%URH5IKsLHP8Ymy$h{L`#=_7 z`q_m-Knq?isUWMkP`WsF6r53#0e4*m>f2;TAl!k|Cg-7|S~|9T7I0Vf?O~RqFO0Jt z4lVT|g7NFGFnH#J%_&~w@bEEsq!$SGbPgc%uQ9A^n>3B}D`rXS%%Qh38pSr|ldSAS z*04Js^~zo{gQP2*MQS!B$j_xdz5=^n1Tfq8OUUcd5B9C4fSvEkW=V`;PLw=!DDh4$v~NNZ_9K6d#`7P8Ew|1RggXfUggLtB;ge@(m5vd8(S7J~x04 zsi~3}4(HB&9}527U(2+nT5y^(>?rx14JC`0;zd9z+F%R%Q4bW zzoU~W9kAnfNUR0VpYJ%Im6f=~IRQ&H8Ij?*cv>{!3z%m|9vtXmX*Z-;XoE34 zdNvS@;%4^e2KIu!%*~Lib`Px=?qQ8%1$?rO21_j|rnWa1Xk5AnNyHn_&5j=O-aP?3 z7R!~-@4SH1vchSaXAIT$*^}zt?|g|v302C;m&bJLQup#UdKYzwUhla>7lZ2QvZzWRh*}?Qzk2~eGh?ML3 z*pYc+0Y5G>8dHO^Xz=L}I#ZHPKTP*hqI@_yf61lxsCr6>0nV+pKPMYHndC2rP)U0( z9b34WeEce@vFtNV4KAm`NEOoPuVu6c+^45nV&xNz&*Ga4m1Lo*#A}AmpvDSU3QVb` zle;WPvDg5Np5;Pl@HJ*#@dBeHq6lU8($|lb`25V}ej5b6(6i)UbAlERyU%i6#+Bc7 z&86qrmGrsbDlI+M!m=+q(3|nKWSC-4ao>~4wD2Wut!SbGu}{cu?j+eYC+OO>AWC*n zC8ZK^Y!REmOPppbA-9;9?iMDA2oIL7l1N$`ACSZAQ)G1N3*G+FhZP@o@q#_6crS1% zsZa%d+&`Oc&Wt8gu_5%a>JzOxQbeUj?`h$YCYoVEPz z74x~nmQlFb={(zD#XvpHn&KUFXeJu~O%o16r`l1vwmB@MNE)?Ie6WFA3xOCH-h3JaY;r&T%!SPYd!c4J{lx{OPuy_26}ca?@+(nWCLL8Rj^K_L zBJja*7%jfkfKC@mP>_BN6?Q+xg?Cpn=aBIrsVoe2m(H=3*RDcZegcyl)5)T5$bnw< zL>y@^$Lf7wF4#y##>z2b&b#cu$4Nt9N{`f zpTm=<+?dOHXLP@;N!{KW82hT5=Sn}Ixo{vQrCs3Lr|433-&rhKc7o>R_Mp&~B)VgJ z5~~ldrd>@N(O=USMt;bG0T&aY@mdOR(U^!LH|}zKAR5{nOkwLcJx~>k2mc*TEZ^6F z)LmUkvGWAe`80wGL#;qoG73_9npsrgT6nA<1R*^sf=ab4R&p=_gQ9Gh(i>}N&fS5P z*$a^?ThAuUjlzgEnIwB(gw33x3AZW+l32x1e2_bVT8@X4sLymZ;Oay6eXJWfuMxsj zo7JFmE1o&zrefG+M^Y3Y$#x6vf^W+Nut!~k30c>0q3VxV%lV10Om_|66KX^QGP=-O z(9Jg{AEC}-1+vI)=9#4>h}M~5&&PJm`e4npR>!hXg)*9R%oN^tonk+u&M|F)XPN3> z#+=-bZdP{49|GT{L0V2QjL%QP9SZ5Z_s15#dgo6D@tZ(AZZLg_9sIbF+<_LV-D*m9LC1o zzQN~vOosgjOql!Y;c&V&fC<&l02%!{rtDmfo$gWCIKi2_@}rhH>i87biCydAdW zc(J>W{8@4SWag`;z~tu6WxdnmS@y72oa@ax7Rl1!2-Pkh*)3v8K7^1|N6-n zmN8}poli=J%&bYgcfx2AKNLmIcbAa#P$kMX5rq#9xp?s&a;5?Da9&81Y9qIiV8V17 zCH(@Io!C!HJLb|Gi#hQ0{tyZuKVXX8+*)eeo(k4a)ZwI!2{{KBa_tWsIrWTqln)G{ zmT9}l#LJ65Xz0*87e@%VdV=Y0Hl*p%u_U+cB>gqz2r1uqN?MN|!T_OmrW$4d@`@3h zPFgq_y%mE?(;L{aRa0qy*KFE+Rum)EHqzOrjhJovntx@IMGNw3D0$;G{-ekxEcvDZ zt**hS`BaaOy{p6r1=k`Md=edpcVheHxm<8%e~pI&;OtDPsZ-`JgV9Tp*v05Upkx=# zZ|IVseIFh0oxdVD7-Zt`LLGSXK!WDz+Oa1ZX%OF%0Yi=`z#dP8g|7!pt<4C9=22si zJ6_Cc`AaZk&~ot4EQU*+3D9`E4J!SHLct;r=-ONXO{=y++0sNXebC4*$BRMP?i7&O zbQcy%?Sj-5&CpO;1;Zy=fI-t8{!Nw@oY5Z&=O>4<+s_E)>n>pI_ZYAW-VVjXn^~9x zvLDwprizJf2Q)PXRrMA8U@uc}nlK9NhkRttmZzEaxFP*Lqg`O(kqYfY1Hj~SI-GhK z&w|rWGWA&=(4lk)^gfQB+IuS+mdn*MZLP}?9kZQns?KD-H66_H%p9mrbb_X5pJ4Ct zHIQU3L$djY;2e{jI;8wO3)Mb@J!ecHkkzwH#VU~On*>c=^?a;%1K*=H7qkvLv6d^( z%SMgf4jx(+VD+d5YEmbGP(m`)&6b;*95xqnX6$7qLRoP5ME|?!iUzw_pA01^B3?177hmN?})&(z8fLu}wX z;7^1?&R^Y7qu;<+gm|-~T_tRXn;!I>-482x6;_58;UYVmL8s53AU{wJh$^0hazfo0*Ch zleP3-y5isg*3TT-W;HPweW{Yox~|G<3yy-8jM~(;4DqQ2Z))J+ophE~w}lz)yu%vr zNb!>s53pBzF0o%n^x^&seF(f60zLh4(J&|gq9TJ)NW~D8gc86V_pta#J68VX3X=^s zfQzl=*mO{bcFGQ*#B(kv-1Zo2V#|1ui++@TeF|lWd_wci`xxdm75zijbMoW1la;I( zy;T`X8Cw%+@lt!r4DhC&P2)*Q?k$QXeB}=m*x|K4QF=AfkYYXUDN!IzLBlWeM&=w@ zXegBF|5O3>uR0*zyqBpI&7s`VqcrW%b=q=Dm%hx8q#Lp&^u|d8)g~q3>5b8BhGZYx zBsGGTKMtXkWk~-kUg;chlD(ZCoGyG9Q}ZHzWj?UedDNjVgv@AKTkL2&ZXwV z@=U^W3w_JFNx1GZhU*Lz9E$B?31*3?Q$B=(u9%XH(NX^OU=h-69Y8%7mT@}6N7BWg z*XhG%Gn{+)HlsCpq_(peVvt znA)O5%41vCQ%|ri2|3zfU$G6!3frS)l?0tCI?OncH=N3%%UHef1|7RGnRYg2(1Vl% zWFXp~o68-daEtX=eYzJ7PNoUu8B+(< z=Jw(SsYv%ydVk$GH>-~wh%99@j5@JNX7!x&nbYXV-Dcq(A_7x`BAouq9!;KzGSA6_@q3F4zB^ced&^r{ z!VnQ~H@l7+{ol49hBkD5iyH0sE@0Ar_O$TqF;1_ufje|~8mBf&ftqcvb3Y5UF(p); z!hRX!Y=;r_Z0#c)b-EHuGnDCMwFdLBP7;(P3}ceI_T=xZ!RC}I(aP#X+-jZ1KiGeV zJ?Tj!9VJPaxjP>WGf_v z*O6&xjym)vFpJHN6fixP8`!O){lpJu=w- zk5ydGN)hy*ai0~tX0pN(VYr(S%k`>UVn6r!LY3kLEYO(=ns}K_7C*w9X1AcR^*(&~ z;}&{sv8F9IQn-Y<=Crm(f?_mB(tu5eaecNZIjvDc@wFO+jcWx)v*oGGLYuZ-5`xQ` zJ8L|f=zP2`1RKFnT37^iyXI`ZA?m{6q!p9bZipe^e`1a`smDu9$@fv~l!;dP=1T>NYcP0}-9oysX_Ii3t~g}^^90lsT%!t?ixFXA{^aEPrDHHHl-ZS3k$F{q0a0=Rvid9ipJ z)0->MbQ8h}r=M}br(+>^6@tw2>Cgt>Fh@9x6_qchw!hXhT@8uyil<#vKlBbvv}uC1 zSyfni<1FfC1wn_b0}bBSi_Kx*S!WkO!i7}O*4hQpDchl>jZw~{PyOGcvn1G*gh^aD z3?DfX%))G`e9NTV)99h$N|<#wsqA{g z6u2XQmrNp-aSLw(exH~}r$+Chu!S{e0u;!?APW5Ci>-wvj9XvJ*O72epuV=xabY>VR-Fk|lix!cc^If(xJ`Ap=Y=fJF zjfnTPrUJQz;C@vR$gPd3+jir)3568##EJamkAvpNy|8c6HSl{PV7*(F!5~eXE-HoVBwxo~yws(Vq6k2x`*_QQgW~0VTwIntNq2wXPpmk~ z1U_F_-?4jTGxhh;x!7>JKU9Ogcya|BH^ktV>IlwhsS&%sZWfl+R#7@VDK+Zfm)=#H z!MoOEvehZenbY0(EFwdsJTq|^wJPj}i|>3`@34GSQK+UqS-JA$?r7fkf*F}Uo(i^W zrQrIU)71EKI23gnupq}|Di7Vkg0_ZnAuVC-`;>U9IW(IZU-b94<}1UE&~Lot76n?h zyBnLd7r?L-6-qlL40|g^g4@nEJhw*%g|sFJ%yrM9ldLUFvDkn*si)BP-5q9-F9u7# zNTQ%}8EB?lW-p>D$oRxd+Ss)KPQCBIy4PZGeXb3S5I;@Bwepya{unyapVzA=>!Re? zWmLWLI(kntWeb@s|9G_rjs0N&cBMC&ivMA}B>ID0J0Js*LsH#;b?s`5{fAhG+aMN zji;+HcmF|>|6s=nE!u}ugkvyq*EwQu-;n;{44QLh6q$s*A&c;8irWy1r9Wr8iGXnfx+sN)?mC`!3B`k-Um|7$lS} zSMa1j=`qxpafLIjKgHT#^zjBUm2|q$k1UG*!ojz*XpFBbT5Py4uo|w~c zEbb;fk4+<)d->S7X9UeDJWeuRC#lq{m?{n(q}xmPqU7OnI#YUvs!B(Z$U$BDe1TDz z$s;rxx(dxj&C$LmA18F*BiHj!`n74kG}gI;{Hry%O2Y+IB~wRN{bo~(7m|&%GL7;& zL|0=Qsk{9&SuBt$AF@TQyrh4>-8N_}Gd;c$tp*&TgkM2?c(gHNkqxx|upDtyAF_8Z z1F_jFmK+~-;cTa4D1Ie?-oKqhV?)ZY#%u|;G@f8L6-`#qMD{vRLgl$Sy>Sp9i5FNkV>jFk)id;O=L(C9xg(EpqG za=llePrm{+blGqDt-Ft>wWY>?N62uJnUT5Sm%;w_lkjiC|EYZbM=`I)?-Tzt{u>?izk>av9(UY7 z0on9RbpHVMFKXa_#r#K!?Sy~Atp9&t{uf>Ezry~bYF6`~VCVJw=MPl>Q?2{2X#W_A zBmN2P{-4qQuDktLsDBJ8^?!o8t@B4>{~pNyiuI2VUhki<=IQ;E2-4))^W{cHX< z)J6Xf*l#`h@9`hM=6{PWA)%gsH|}5ew^Ifa|8DnpsrG>1V&(od1BHIu|K0ZA0B`Zt AAOHXW literal 12178 zcmbt)2{=|?yZ&oPGL$A%rY0FOl_~amUd^H;Q*)9jA=8^Q$($)uLdcYm6v+=2n%!m zOOxj;<=Fc;?e?(U;pSoQs^;zJ?Ct2|z6-%`K_tDh) z&7|_vv*6!1`@8>{i84p&k4*kOl(u$mF3!8%9e49qESd8^BBQJQn*evO*E{~vi~sI_ zCZNF4{1bs4`*@BXzoWfk$^8G(b^jyl++27(H^<)`LdmOXzw_vK|1$?!j>exj{LX3( z%@s>j|2G6CSr|~E_sW!c8cf4Waxlv(se1PrZR+U)SRwe4CJel$hLtXOd)pc!DdmqM z)rvK5$P+4@x&qA~PDcf=jrioTEe0(SM;ry-uqP}7dHPgOBss~qr zd678#>H1!DXiLGQxE|an9gGvEXw|%)Bw8~rFAnp4BvH59nYz`M<1W*aIOk_5ats>{ z4$YH8o8_@E>3JolSC`_gu#?E2mWcNlKCqNpfzQtk(pfQAsWDHurYka!NVE4LAZNc} z&zUB46ZA)?sCo<-8KmC?7NYTy0`~U$z0`YG3${PeW}ROUP>`;nJl~x-CvPm8eJQ{# zZ;xZZNhun#&k2nKV$kfi5lV*qg;|fc;j$xUBrkapt~e2i-&bgW|Fs4zKUqMH9%>t2 zJDZ3`ZN?}iCxzo;RdH#DM9tFP%Q#POI?6q9#%IlOXpkyY<33juHrNPKtIo6N9I^^! zt{mO4nvCX?#EB?3gNEzPp@Rnr=7SPmQamcc-p z8ys2H1lMhX;B@>Ix@xsK$UV3Q_Zr7@7jIeznjN2*K;5s`dOpmb1!G+N=dW>%_VGJCu;~cns?VE1}Fe z00d9&fc@Jv2)AH49CA2LPAK^kVS#mU)V&c}$4my-*ty){rYHzVbi~vFIS5)-Np3p1 zlWAR9P^){L=;ntJX7>O|TRQ@c!mB`W=~}4w42QnpV8Rhj)YlSDox%mxMwq)l0UA!paD^|& zgADyi#2#E^xU)iV&(kk7lFTCNI&-miWgydeO%@cJTR>y_XE4e10L{=$Lz@#5x!I>! zcyKBQT4n7){i6pM+uJgnR4`BqE`dPj`(StX5h=Cwg_@WxFnRZPNL}C#C-)<%;Lm|= zd_J%wEQJLG=h&|tquU=i327UxdECJ749jFO>uUO0TOsDyiw_2h{T zr42PNVQBt+Xo?*qdmsHG=`$aYCi9cfKgJ2V2Zh1UG=ptvUj{0#x5JJ}5?q_G*<6_3 z07dd~U^P3CQA!J8Jv3gDHU&j=dAN`SZQ#Mm#jY@B5rx`^Bdl|y7z9Kmfp=>Q_pa+-Ixa_z`-Oy+>584+an<#S*TDNL&uXinl+OZYP zg$#Mn(_Tj|KHd)QN9@5$ke{0qa1<1doj06)rxE(L%X1}iDD>pa1Vse!+;$%Niwt1p zNF9`L<*3vKl(V3c%kkgBG+Z!b#aDusS=Dnd^seu2l#tG#X*b*+R&BC_((M*Mp!fKbISk z1PR|nxZ9NEfK&3UYH;giSn9&Z?RNbH>6<$!bJq`w+yDa92ADyyF!D&d6{3YqQ21FZ zb9~@7SuE`ejR(eXpZhK2irt;htt_hor&AIzpKmP)-=BvYtd2rqV;aoD8;~*i3h+;f zg5g#U*KDm5bm(fs!&_-k^_B-rSQKmgF^LSM$3yldUwA*dMk`1;ZGTcvb3@DE+^;}b zaH18w+P=e(&od%pUII3gt)TRh6gT1LYVe%?22#%ygV2?WusNm#5}sWFh3C_`^RB)G zdx?Nc%>dG#&|Uq?HW~vykAsSvI>1#bfk^9oQuX00 z6YKbi%;$zdjj$bgsI!6`j6BDjR+2`)Rz4a~u&>&Fu{OkIU1#px*-qvfydy$at>Ly; zCdA$5hj9U$fcNb*O;~h?WPEOeiJRguC~rRM=fvR8&?*wp&%*kVb>x@*PB<7OLqGRO zfI{aD_86ZIS!ApQJABJ%^V|Fv0}d=JbP;cDbH$Y#!Xqo>FW@-Q$zl3EW7 zK;fAvXxqCL3cRD?efJd5^LauR%zsBaYcwz*$b+|ITgFrpc40XS?Yj$U?$ zfKE&Yb?+Ex@lPNIhJD0t$q0DrM!@PGX)tn?2TrAeVYJ#sB5YYta!q!lK2aU-vXc}U z6hmzrt->Hx7$Kj?7N3>W$zvhrmJ~An?0Y)qNFkA^yhArkv}Ijak0I=NS$x>N7sl^X zAS)HU$OUU}z@A?s^p$NF^DVy>&vs-$!rd(L zrLu-veXfM$_yp)PjDq;F5Q)9`VbQbY=r?W$JmAeF^WPjH0h1l{JGb1#^kf42z6XNC zK@W1uvm9MZ$JY4ADADWL&&lkj*)+5A74(mr4sYb7!94W_nYW9D(kWerO8tM4=hACg zcCr^$Jex&}jY8qENdw51SK{RSg=l20jqiUXfqLelDTw*8kflNhZ-&B{Men(ChB9r?)BAC>A9opjx`v(uZdsA z%fO2U6FievK)R)$5}nGw;K`~j*volj7&|W+&zD`poc!xl^iwV2>k0uM^EmoJC5bf5 zaR+hkXYze|1Nt77!RKFsfg_$=rDGXPtPWjfHYw%ehAbCGZg>j*ED3|@P6iNqCZ1*u zvA8OqL3Lg#K$<^+3{T3*-FI2TZkSg6wU27Xuf^o@cskAdGwGTXM2%$p@X6QJ5b-J! zg*isd$yebpv|16js%vAr$3jE@5^c&+&8*(;sz7`fBm+Bg2$p&4li`)#ba6#AQB%tV zv3^fnI3p3~NVl`c6wZPF(-^QJ+GIou$m%19VeuE`O143 z-V=p|Bdt`XLj)v-O|VyDgl$g+TNc z_rlw~TS4WS3`tsNMQ-$^(3lnpSabL_c|KJdrk}4O?~l)+zTLZ-@~}Q)-m?z|Ug?2k z+#xU>Is@LPN7xt>0uP!)>5a@xI_an&gpCm={F#;XQnqpRb9)CG+kX+4To;DoF9|TI zj{&$AidH#OK*vf)fNNfkQ-3YL!BAhzS4*REQhCG~&d}UN$uwVPn8wv4lUo`Mgm-Wq9JuC4=6tfH z3e!5+49!nu0|!`EYBMR?FNh^}GO)V9mA#g4iO)|)(dUvE;nVXda(g6^aJ9bCL(XSt zT3|3FT|5pq*3Ce_Nn2>(_9R;P$%c+t)Y9tBN8tF~ePlW_2R7RE(ZHHw?D4b0gGX)= zj#55uYtX~>fajHopt0+ICe#r%P&IaNg zAx*>YGwhCjMaYX>L!%_e(%kRzc+2KBT@&0zU*0%D7kF9Gs;k#1uSy)Zo{NBnXkoN2 zE+qvKuc|AYmrz07dPe2?D>fr&7917srvAEqv~e##YD!n*gH$!L?qM`8YZgIYH$9wp zP8Fli2s790G?;>ClH^B37}?G~s?v|UPiz`fF_-g$=2i>g!=}YlqDup}$=QIs;&RB% ziKGJa3bE)w58er1jiycsXqb_S@hy8$#dQuEI2&N&$q4qU}v+|;fw_QOg#j#35Y{s9(c%O zBTk&FgZ#^GQbDdIb1JY1=UWJ2v}7q1nOKt;mAAxSb2V7$@v*NC86o%nR_tkcL%yo* z#KIXR*q5!rL|eWm4~iz^`HjjL)whxE^c92qx8zV$@dJrt=RwU@dq}ojNE1$EV}iqV z`f+;$bzgb_W8bKfpS*YUS!^Ln!aLeg9#6*j*P`&HL=5!Yji+?xLqxPKb-K6+{7oj2 zh%#>w5Z8c=7Y*dbul#C-o~N|&*lU_|Qkjgsl}4U=&%~`~BFVuMQyCrO6=*jQhcS=m zfL_;n;FpD$E*A4o<(wviCxd4bR3u}TW_RlX9BU(TdsdKJ6V6H(_!Ii3*ZLL3uF+O>F)F3Ev) z*QG)1eJ0(va~6=7lfZZOAPHY%!9>@lFdKMBsuxHeXG5M(fai`+iTECOv|AI0N-r1U zmgG6OFv^bpe6yL!(Bl=;mdzVUd?+%jGgBq zPf8iKZQlqoZ)SjuqB@9O4q@}dAJY4BwM1!>EIIF4$5?*j5M%eT?D&2$bhD`aulSCxchZa#|8e?|*MU( zpFg1c>liehw*g^V3VpjIhfTY&2*oZCSQsB==pxZXEjIZfo;XbR9rA^A}f=yg!B0+jn!P-MJkD$oCimgC!!XR;Ee`2(tYGni06MVtFM3aV zz_6#>0s~aaXhlN>?sGZK&UmyM_Tpmi9vIve7`ba#-Re*!_GFCF_ zGPLE#)5xZ|IK*|v1L3x$B6O5N3D?~%rUnfHg1WDDs0#fL+pY#ha1g=2|PU~iI{_Vp! zsXqcF23hc!B1cRsE;2dSreWAO4mPg2ft|7jxW%EAKHvC~ELm-bJKjVQW5w~@s(mJ) zS$dJgjf;e+xu58lbEEYR?<>^ltTLdR0NTxO!Z}AaVSk%2#`zYZcbq<$>xn|MTOQHIzMV>UNwPyU7L*QhV76%LvX6roWMlvF)If zy~owO>$^__r0WdY-CtH`ZOX>(h-S)K3&ZXQf@r)+3r8h*(q+FApRKi~Rmy8ovzbet zS*k-9qkw}ZYq7AIVxNj9uBkdol{bnIw|U!OW`QUa9}T5p1_fk%*LR|~BZ_J27NX^c zQ*oT*nNdw_F-~90qpyc%wD z(M*N&IVk(M4N`Isla(_gu=9i{do5}Uwl9CfTKFBKGabIrb)pSa=;Tsbcjpi-tt}>B zzH~wEp?fhw3SY6c+=mUPpia(ZFp7Ff}faNKYl*JR0es;}lm6;<6~{8?}4+n5PFKp5Iln_VUYp{ES~w0oJZAg<7x+yem0LBSBWKlFQ&rGwg=$k$3l~04XCd* z1+PANXf62)%+Wpwu=-jRAQn}9*en8K#p>Yd!W2lgE2ZtbU$V~B49!h%u(}rV+-oZw ziFR@Zgp97uA7MwJCngR3Di9W*R|I3Lr|d`HOyD?OV)h%EfX(=`)K5bUHn49%G4u#z z)LsFVr=2v;DwEA$KB{e=&ty=XfXb;#fXl<=u~z^L>3Sh=sTJ6s`9P$lJ~0Z1R*|L! z<7ijzWq3Of%1$W_B&k0}zjH){VH%Yf)c|Keo`f)L8Ttj%rN5xjUIYyEb70niZsJ#S zf^jerCA>8iIOWw#sQj7;yVE|yta&_0(BA}iOI|`iUO*Mb8^Cn~R|seh12ZEN?%j?X zK+dVdjC+|xxaTvN&90}DZWu!lQvyZ)*P-{?K^Uta0?STj0Sa)q4F-$J&krGFX|o*e zk<-DDT~DCe{X*%fpnR-tXB}1F5F;eL^ z?X`M<>6HRCHOVc=NaQk`JXEl+q#w6UoLb|R^$14_2sUKS!V<9tGw8v{FY_RKvKN_78 zHH481tdiSksUhVkig#VZ1wR~cWX@PR=3Wr`3?8Qs1kTevxs7;CK%mA@=K)$9@Yg7r zSz(xz8kP+lV;kQ(;qbR=GXBR0w#PyfQV*EpVYTPzer$4$3uM#tFH}*`Vj8w=(4yxu z_ENtzbKHJm664RCNR1*Iu>Nip;kQvjxdXbmt|%KzmDi9jkNvR4F#vr`uhCu=KI~2n zAqO`VW57=ywHexoI+fPg8gvlN<>Q#2)wh|_5D^?NI0iei+OfdpD6T$}Pn~&1Bs$I; zcg^8p^xn5j$P7so60yb=-b={GkVGcV&xLvOsgK<6EQh?liEwcHB|3JrZhib#67JWC zLO;7wT%%({&)htTL7WiW&7Xqp9}nTSg9~x3Y%@K$V=f-93&5Sy_Nent8qyVQSrU>Qzqf_ z%t&lARiVTDrdaz#kG&S)NQWQ0laSA&`UFo0w)sdyz=d@n@-Y;qi#L-;n|V;LmP_vP z8sXM^KG4=v!bmy;LWTpt;Lt~MDES};85SaE#yt>9dI;%(8dxT~o~H0p;Mr#R(RtHB z{qSZGNIg3;bKY%Z)e~}d${Sf5Cg-Mvsuxmg7Z)GNRw@E|B&$OWTR2?#uE4V1GSj9gNxCdCVYdngBc z9zTG=&`IzXRhZ}RS{U`vbFkD*nQP-al_t&&8Qq`dl7gOZgtb3S`!cFYfk4>k`%4(M z%k&b>&LfzV@_;T-en5<$pJaD!FQE;$=A*n?I0#q!gN<+&EbF0QqF4vkFYLgny9Vx` zP~|Qu`T;hI9B!2aA4F`pNrZ~Nl6tS@=(8^ll;Ss2hg2)3XWsywwONpCQ2)&MO%H?2 z<>K(AwVh5Ejf5hPRQCDxHfY>i0TS{%*uJe%;F|0XMQZ%qk5=(a)<-+Ae(#S}O77%K zngGb{yiYf@rj6E}0Vqm+fSiFL+GM52THKAL@$oir`-c>_aN7+c7}uPuz}1O^_xb2Q@+X)d~o!)q>u& z_o-2R20DmOgm%l%l&9@Qt=c9-PVNvGDMeOCEwRV;Y70maEv99kfUbxbhL0Wh=-t1z zfEjk|AgcRR;`1x-89CK&7@s7BeOKOr zh%7F?lqH2GSL2)l5PEX1X%k5dHYis{i>mCT{Z?}V>t_4w)6adcV z*<`#>3q;;}NS40i=Wf|`13Fgs(xfa;YQM~y$ebe7_vL9KYdIB8&$1$P_bei~{}(wF zt2ydj*-E8mOOWmP1+dBRDHCCD0+SAZp+@XcMkT$NG&H{^r4k<*y-a)Zja4CT6IyBg z_C++xZ!9WaETO#aX0~Ep2l1>{tG;i=A;m!(NaKSi#QOanNb;42fh$2ovO$q1drFh- z>!e^*dL)BFZ>Z1=A2unxn1+c2lEHD2jK}Bgu(om)iRY`s6c3gbR-Gd$^a@R!Hy&iK z$)MKv9at=X9M3*+1pX)~xO9C2m2>PR>-Oy-Mh9-OZzVa{=48nvO#Du63yGpf*<27Q zKSh65jmO?gTA*iVMaoYaQ_f8(CbztS0(3$xB! z6gFCNFeK(Q>5qve{0epCQs`UKt-?>nT+||kX>H_P@>mGm=QpYtC6J1_A!K}uBJ9#f zCU%=Bd^%i8)3+pIm}D&7Y5u^lc{mg{B@Q#OMsCFXYB|+#Re(Ek(@2+)I-J_e5B$Ol zX!N^U(zxU^6Db`{G(%sKRL_~vw9g#(EEB}iH-oH{z68vf97Mj&k_CBj6Jqs!0UHpQ zPl7CKq3`q>s7~Ji_3?r*M`P`1O~C<%rfS0RKp8mSa~lJP#*!~TzO&cDr$T^b&E-7| zfx5TNBu7XVFW=cjtq;ag-zkEy{IMo1GqFMoxK7?@{vb&&j}i+-RZ#U5f#1%@{DS;t z0ZJUs@BU}!qZ~*3PtM2xG@xq!Z>*1hvtTY1`OV;dhO_m5@BdWe%>EMt^}m@feH^`A zMjfd8?S9*DZCxBzERp~32x;l)Ezs9mv`9xwOJ7G@S9_u60$t68`kLB{^!0Q`4?4Pw z^fdMK|7MfDx%kVz(GZQ=V1LJn?>FK9bUy#1nOFYz5C7Eu8z1z)g8ic(SMi^KR*qV9 z{{Z$cZs32#{6~we@;_nT|9@cq7hmtc!v3RcR^gvuEk@(>2de++*8Nwsf6PSLe?n{h zGuq#MxBm+Dk0~YhPf$&&e$BH<5>kqiTXIQ7qI4I^CEZn$q^P74C8ScM_L@&5 zM3Rt%l%ibU5OPW1Sgw-*Wtl-TR(>F?{i-ZMmSx?oAipmpnm)~}nh z(KAFcP;#nspj1epv~!?Lu%45>n0S)e)A1Jt(7&hCe>9^$NB1wysORM$wAnqxEp&a* zZ?oE1>-^W&H2lrv;vY|!Y!IyKqI(cC^hC!Q4#>OvMO$EG~VGPLuYN z8?=qpyDdTc?rE@_9E>5mqm_!v<&ZNC{_*@*8o@fP8< zD(yJ1S9PH4Yd^Jms?H33Pe5mX8GJS>h05F7;jZyQ&=Fw_9dGhTmh~=7uGL00A2sO7 z&d0RJ4q#Aut7g#m9FYu|f_gsksCg=iyq(v|&Q$QlqEXke1v3%%y`vdiA@oX11D2x) z)!gX=HOH{5%GcH*R$jFqvkj78K005fX%w@ej4Pk8BQB-ewMhre$ zvC|~WXm%6}KTb`>_R<(KMphNOTI11FZWbhO&p~$~H!KM)#9zXjVAJXuRO45Hd7{GtdmRpWQ z@rE3f=NLg-?t7}jNvD2ZM@dy!7gGy~Tm|E|bYR3Bh)`dT&cfH|Ah(GGG>^pR>-W&; zP5HP)=NDZzXC>OyJL2I>jkHAWy+mQmUS3RBFzaAx=NnRN9fO9R zKs|y32+qGr*R{osqeD$vOKlcG!mGTKZZ z_v{uzcW|T|ye&bcXBTuztz!DLieW}hCd?=fr>@V&6O-{aL?kwq6i!cs?LCBkRF7sJ zjtGRKy8&i&oWhqUzmm{X32^xKdfdFjk?bD$j?02tX|s+cX7J46>iZvrf8K=FXPcu& zunqS@jV8H+TRPKs^rkxu>!M&Ys{ToLtx*oR+<$LRyK;HeTM3F$(S0T0?P z)swds6Y-L&3UJo=khJ)4R>wP!T-ldJ!_vheYKH+VxSUUBtCSPV;Z&MD9Eqg|mC)g2 z5XnK5uuhy!Ov50Na35V+ZNosC`z z?6AQdvS0KCNj5r6cU)kg)yyBZ%$5h&%Y4bMRbiSGn#9E@99U(2_%jwnBNXV~ohVqw` z99eFKLs5%BY10v+KJf<$IQEFxFVF`KjX`E#?^cp2>qq8~I?28y1EgX8Rb>6|&>sCD zCgFH5eR)qDX2hvNo!v56pj-q7+uVrJVi`E%ri?1Lev;b40XlefI_QQP;fYCBIBtdm z+!A2C+m=K|1ka(PHf+T(xic8{coV%avyw^$I-*)w9ZlQSfM>Gz6ITN<{>O^`|SZHvdQT5 z(w*rWc>w#z?jwxtJGyh0J34sr$#w2aI)C~+Y$@R3{ibjd^J+g%x=@Cx`J<_;Vk_p$ z)YG@^ZD{>khw85xjatKRX==|RD(-)Vt#?nP9($b7Pt5}7L~q6X&!16x(LG$dr>myy z#UU#5)eg-q-0{NYauyu0WS&=G`ISf5GI13K4!@wxs}F2{ zOCIVKRG^Jz9KMeXz(a{rX!8lEbmb--Sa6wb+NOz1pTyz=ljAty!A>lyKZ&{xaj0AO znY}KShJkAjV?k>U?UagP2X>`m@#{l)qkI-VY+Qr8)NY`P|3}RA3uSi{ufV~fAsV_= z3?tlsVf-&s+|(0=$sbQ+(9%e}VE!FNcMK6!T|<9Z)gmjAgs~SBk*&UnxO4?R@t%e~ z_aiZV@oKu&o{P#^)9~bzXsq_i!!jubf9}i1&a6MUmV!O=T?RAiHCT>x67dt(w+!H_W?_$880k4{>O zR&TE3#tqK+l2e97Z;mr61696!MvT zeA+PLAvn)8&DAG`@pPXoZxVKv0r zl);SF8}O@OCC@xWg1B0T0j|D_j-~x%MN=aBR~eq+bwhk?GzL$f>%p;p>9B5f z5j2n80XCOjVjHKiM!6*(wk_%;yB-Pgw?ZYx^lm237HR0`;l@UFCWEM52CDkY@ZwkA z#}}uQF!fmoM4s1$@Z}Sr>)lcqBW=Q~yk*NX|D+DPCVIlt(i*y8(nah}uRsYmeMngM zm|J(w7!McR=DzYV0n@BD`e1_tiWnausW({|ym*J{uUd?OdPUeT*#K)K{b7=6#VCO2yE0X^TBAmU{EB+jK+)T%k7aH`;qq(rza1S+WsUqyI zOc=VRNB0ku&~5LAFd$<&L>Tl5pI51D151EUeLdwM$2sO-rU%h>> z@L0^pF0)I~+j*^t&iyLU;q3#*D zcw8keY?_N#q~D`r=2yJBvjokmOhHI*3~=IJu&Gv$uz6(?ru5q3nf4R-GEwlY@!_JA za|CN~>Hv*CDo=6gPrB_#JlPTZ19MdSu&T6&dK~N~AwT!i?zjb*Gi*;49fnc6$`Kw0 zt*KYJ& zs19?UrGxO=E%a-PDini1N}WC=$afV9PgoRxeI>}-VU4Tacp#@rik)-Im#nlui5{{Q z7_mRdlHOGYn`a4dza|AOlTMKQ;DcnH>pn9?7U9K*J4KzpjAaMWj0TF{+F z&#_vtL_QfSX6+`~w~Nu`%45b`+JcOJ?t}d$<4|(`ckCPWo*Dd>!x&ViF&bZlnb&7m zqQ|!nG-t#JzKa9m;XzYo>&*?ATV05moKDRB;~P_DehpI$2GLphD)Buzn;2W~qbhBy zNzYgst9qcRZ#<`R9q&r zt4-l#O*B1zCmc2!6WpGW%_L1*0@Y)y>0ZfQkaKG~KKiACGA>E*E31ILSJ{mQ>wC%R zIg=qXZF0xMn{uS^oa??J<;}P$mP+qZnL2As2chDp@mVKI|12W$&_QS zed(>>|D;syICH8$8t>ce!!@(=Si^83n(KItxlyve#4?w#{Je2YTv2d?2Q>@tx=UVeneE%=nBQ_Gude>9t$P)5& zK@zN*8Ud>wJ;dXye_#Tqml$nrW!{|Qz^Q>uRG3wZjp@!Hq8LM`w@AWSRc6BTkM|>gssVS6S=%uEyZRFbJa5Bl^0aZH{3KnIG_qodH9xd`7( zafSNIJP0>i3^$9G(W+K4ewDW+Mw?7TMVx>lMImT+!<26nCxe_SY{-#1Y zHcOg6>Nb#2^9T|)s}PMsf^g30MEvmj4CPFl4rP=2NWY~SOxOBKpXqf|*_b{O-}r%y z`ZkO5g%i+8IGvk#gA4DIy&-6#D|VknQekFXBYw_< zCH(HAhnW+&Wc_U`Sdc0OdanL(^WJpws`ol^o*xHI`WyKRgXH07zcIXdIgC}0SF+P~ zUdQ3|^we|9Rmt^jjRq+$&RYTkmPu8Jk1zYx_ahmUlRRlQ@54q&dI8 zLm0a(hUmU4zi4-SJ#r6-^BWBgz_s+9_$x+%&Uo^U9?qLWPb~4oM=^Ur^n4caqHh>O z0}o`YjDamRg8RWXv~IQ*Y;W^IotS7y%aw!ttxdG6zKcjLGCkQuJ#fIJ8_~2Ro3C8aD$j-ugz|j~c`I8gU}k8IDsXufw|S zqhZiFl(cOTV|xZ-!8TSJlEmIK1}p8D^ydf3)-XL($*F>=g8GI@iZ|vMUZ%z!Gof{l zA%u@CCUfXW@E&snISuZXQR=GL;U){)D|BHdFrc>i8Oh$RPKBsIT=!L z$yE;0e~H3dQ!y~@_(W8qC(;*>oXLywU~tWu3^^NXXs@mGS?pNEBzwPdp*Eq zPAHrCF@iQ8;llmB6G80rMUrtw8VYxK!Mj5bNy~asOez|pT07#PZ+RSJH`NSh#(k$# zMykL$U32tWwwEIG5m{wPD6#P*WvAq!d5JTt(rFHynfX<1!AjI-WG6MMJ55yV&w+N6 zI1V4nBNvxT!oX`y(i}VsIxmf8n`NU3=hx91=Y#ucbI_HV($X6wphyn0Ha;XuBd3Gq z)FRLki)VbN?W0}4glP4E9fqA-LOUaK8PjKh=+wwkYMd-BhZq$rfgFl>dA!_?W92%PlxelgWT|0HYTBs@B=Pf4l zw;K|Lf#Wn;MVy!U(+hi>_u;E;XRt&3Am*<;2_oMzprt595Npb?!8s8W^rW!V9Jzud z1x~IifKlV#vg3AX^P;mSpoUB_lYVwNDi3R-j8!K2anz8ehpCe#6)yBxb`Hd<8KAWH zL-u=>ERRuCqF@Rm6+7@*a0KrEBbPi0F~{rXX4rV{5*7Be0q&id_>Mlpw%c)-{je3q zR!5-qj~>=?&pPb?qYZPsAL1O}9A=5{55_0#0CX<0VO$<9qh13)$tvB)P(NiUEY;^h z^o&GyTFD8z?DK0_S$z=KN{_@#FH0~_L+wOPMzU_Ibt3(w5u1tjM2qW zm*enc=|LEpID)^{sD^vGRt`)HKj6*bEi|hx9@my@u=Lgw3bPJlgpV{@dB0!`XdJj2 zjl=J=_R-LQ2K-XokF|e9;~C*`r24^Osy25IUL87yDi`aR^fex+(G56x!FasYEdkO` z#c6wU6p4?_#}~biX_`+YiB!8swyB>)9VQwdNt?l7f(D5&KURI%NAUf9eGLabW`nDK zH)_B;?k6EHP^sRm5Utme}Ft)k|(tS@E zSX*Go?C6%@&r$LwC(?$f#qDt9`AdR&Ig2XYsZ{1DK`W(GSTLdzlmq5rlh#?Vb$>*2 z-O{kU%Lgk~-NkJ;!&v!Aa+uLQ2^u2KV6^;rqPX-Z;n<(Cbg&qYV*(H0!exqBzUwfR zZm`Ey%fDik+7(!+Tu6L#L&zRw24=>VQJ+#>cqDS-s&RPv9eTn?+I+E_)L^$7b zz-^TkxQdp8_xy_xZ(zslcz6dEj#T9Z3zdQ9#2*av=_TnewPSv9%0cJU9v1FsfX-!U zUPN{zNptNJlNueG6`EdIK0j^PvDWH1H>Q6 zfbBsVaD1ErngvXO7n>yEg}yYTnF;a%JWOmI;+U59BC65+5WK&9qPamFZj$j9crp4G zsY)qYYzGD9BH|-ujXQACG{na}Iag?HkblV+zkeJe<{5o6c(~zX~;+tHe0a z2%4w92UCqM(r8)<+tk0XYa2e-JW+hb?3|{`t7s?z%ezBV=X(@RGS7fr6MR6XOc-28 z8-SNM2j`lpLYi?uQAlxv*g`>Ea9_;CubEB17d<5MXOBRBLmBmVmFFpLzYR}T43OIQ zcgfn-j$DV0_dxEl4OlJAVpr~60s8TcB+4foB+samKEo@FylotjJYNs_|yGS-*v7*Nef920oljvLfzVuaT^S zMzCbjQraW?g{hPs$7_s{hjUvFfy%}gz$xvvu+?cN)9mMy$T5B-Zg&Z}HD(fyTfG1} ztX*Ki`b`4=ybLpO1dqKM3vej`oP;`IOW+*1`pg^_J(A$VRsC@_ii_e7Ux5xyez+2w}@yB7Gdw( z9k60n9yxzlAM(3H@VH7I%6h%yeufTalI;vkdy48ADafv=nnzmTbf)pXkKBhxlZxu6t^>y@8FQejOlBn-B8`4v+V9fLBsO>O|ToFBn z>tY%p?c!b>bA1H=U~woDvqqUeVK%Trf+Y`6;q^jm zkgH0j_Zo%h!WvmjZn}*sCXT$gRw>@W@IN4NRyDl+wGnqVBj{>K^FOoWP-aLHrzIZ6 zij(rTO~p!YFT5tLLBzSN0SPFMgEKI5ztX^UQpy2jh}pDAo+a)9s1%3m(Cr7 z7VAS0I8hze9LuE_)-`aCE6m5tYu8J(a-jj|m~=5GS85AtB{5V?-RxD z0rFP1lFTaB1frdQ9hNnapuQIt2j0LisZ26hb%?v;OA{lu`#lCV-3GG<%KTNA3dr%G z4p3B_&5pY&NfxZ~#!NRg;@EQq+a_4hkDnAEgJwZo-DgOi^o6cn8cCP(H2He04>m@h zMV;yEV9vuj|tiJYS2m6j?ba(A-eF%-gJR4#_C4`HJfH{&I4C^ONi z8|F`zL#^qXfLCt~1%YieUXV|J<_cnOYa|3dSq7zcH)x-&B3!*d$%*28dfi_hPw*6lqcwUX&jgjz4>*tanwwD?g^I; zRn8-MsY~dnhG@nx^byMFNn?P9ApCE;ga$&XxI8cdHILTP=*iQm+!qNNVLuhVq$D6Y zKLRJYC!pM(P|PD^@PU;M;)Ms){NP6JfrDP)LUxjqZtUMaN!=g{v-W$Mco};&ACm=l z+o|~?YkU@_OVZs+F?WX@eh3#r?s98*RI!J>y)OcP*xO;yANs`POdRA+oR4bO?X1(6 z^W^#zhD^C}hI~{y%dC}3AotW|@VeLpxVNRA*4;^ER~%}hN1WQ=-j3f;gZ=yR*WI$^f2P6B7aZ_9;eV}(5B{Ei{Q{!7=?DU;J zJ&}XQc#^Q7K8s#%u|SkL#a+r*XF?zqYA%1Lqw7^^tHo+MSMcNLBB)*6-(Nwa({ri0 z(KN1_rWO=9u+$;4nM{xs0;f^WYi>^ZNERmA<1TGs3|pv*3NsGFpzK{zVWR<+CACET zOdpDv`ax0OGL{K^jSg=Wp#IK1?C@GlX1twEUY---AN)kI7*(0fmdEV8Q>m~#dnFk% zy@HSJKB4TbL~iEhCRntx7Td$sa6#H@YSz97M!H&(v8VrFS85kvZVFDDfE>35uBudjE#63h^d+;AiSuWaKq|w ztJXOXx0D9ek1?PauMV|=QV{Ergh?01K(yy8{LY<?*dwoz6b{Q?PqiruE5o^ zdx)Cy{Tk;BTG%#sE<$N6Ccoy>>c=wB|LrlJwT;FmS0zlHo`Y?kKA1NziblmBLScax zLwTMIJ)kTXx`<+k5bi?-1#qD$M(JPlkW=D<92|NrFyL1V$@V)AiPc=u+l` z*WGUr3yTk=S1Wn0PW6BA3%rsdHSu+_+cVFlB-R>Yi4aA7-4Q--R9l&gRkk6DF6cODGOQ5D$ z0(%OLQO((u-ibO#6>HPj6y@vOP(dv!z}z1~MeQLhCXRfnIYI;8pQj!l3{iQ%Ajg~) zLxsP2!tQWEKSwGF)>PPGU*$6;+Di^*coU*f)#9N&E(T?p_>7}%csb=(_&y{62th->!xke!{o+gKep)P zPGYM*0&2%f;zwyi90)2SWBa3M?oKHzzq*$;N0f4}Pq3q*Q}+|;rfC?(-A6N~2>SL< z^~f*1TB<#$i|Pknux&~iBu`HocuCo`+};5k8smYxP=)H;@ki%TY0zI020GJ@l3zlF zB=YAbVDBHqfU_nfbw)ACtw^Vxg$AtRzI9|_u{>5i6U6K5i$MF1F3yZSLJE@9(KGKQ zm2Zt8N@`2dKw>m&bm$AUj1Yq_8>@)Jm;`cll_BXJR>snyD5|>NjD`%KCxi(^^?*IJ zIXsYb-jt|O&pb$9#rvU}ASW61PL-UT6-)Z54O`l|ob8)R-A_LyD5f_)!==n=YTEt}Z z`asFr9y3tdT1Go2?*>ipHdb-BCxm6LhMzGSwD5H;M6CKztvd~nS)&c&e`G<@`Tf-D z?FG`m_5zVz8%=h2ia=(@O!%#QJdjxUM}!WC^Lzeh<)b!d`d=y^|0zJ#{x77Df0JMu zOa5kXH)q4rzt(@6a>o881HHeAFN49wCW3gc0-yZq@!;Ai|^d9+J P>nzwK#`(Se?`!`Lz!bhf literal 12241 zcmbVy2{c#V{`OafkR+)jq<&>eLS|=wNJvP?l*kZ5GApD)2u%`^Cdrf}Ns{l`AJQaA zLXsvSl{9OXUfp~Dcm3}DzxVy!_3pLyI%}P?_p_h#IiI!9-p_t)EP44j96>?O|0;?c zea_l&-=JW3ufX87>)mzT*M$W~hJ-H&^H~?>6CUm#9JJ8eGs06fWPPBSu{Ou$|Et7} z+Y-ug78n~nI5Hw6G9r{`ZA5U`T0d3y$O!+yaAEf_AFoLNKyUX5pP=yIF!zlb!lC07 z6`jYA^#~tp>AB7)l=tslgv|njJtJ2B{ibjzpEJ)`8?{h=XWp^v+K|Xlf#q_-5uxL4 zEjfyoJeFdXLYCqYp@NEIcP;ZR#VvUvLWNS-t_=*O|GJvt{K%d_nl1Sf--+A=A{$~#I z9QnWG@H?v)s+k$j`7a0rMfsBWv$B{!(*qs%FF_btg35EO@nZfcS-+#5RHrAA7qNG_ zyS^VlkERMtr=fUc$_UNk(Vzv(E$P(nRoHfPJ3f6|k6Px>i75V{#*g_>E@LJRyXjJ! zqdfHJu_#bp-~?;J32d@GOVYCH=*<rj$TQ8JkOf24<*6%r!Jas?@)_7kvNVU1~TFYXz}J&a!Bei+a0@` zOpR+Kb0_S>G+&Ar+v71`y^vjh^C@Y3Qiv{v$8qkIrSMXBkD{Fl{zn>_%wci4R27K?c(K zQmLPWD5Lf=n0DLlr{nmhX6;3O_%;*$J*7qhOJkd17iWg-Zvygo^3ll$VWygK9h97cg*tFfAbjW=_ z)3szLtdTFp&&S*7#u>_ZqX~(*NFO!UP$mH<4nh44ZycCA2WmtBN>Z=l&To6LeE&4k zp?{7R>hscCq++qXI$R6-7hY#8M7o(qPCa{^_ZEDdJPpm;opBCF0G+OX zBi{o}k?)!()_RPCkb9dU&UPoFlP!dqWl`;yt4XZ4Afx{71|4kFBE6O7^vl3( zHeG?GUzS2%_yiVOSCP&{Aw#J%r$B=HpeE(a%KJwXaZ@3z zS}I7b%=@6&&9@c2rI|M(0dckCenZ)>UR#&hr{QWAazwvtSrd{{SsfS7sO zLv1eu*1jQ_VLT4S=E@jOT%8Gq8Ixi3^I9U!>cHsDjpVt$HoQ;@Ce!YFz~FQ@lESA2 z-4UONf+G(M9EpN!>liZGLv)up0czHP==5Wx$9Xy%6MGe2G}r)4^Tk$+4DfE; z07k1-3^ocX;g>vT*t$u`Fkq=aIBqY5A)zlMDRKc+wq^hV<;CuJ_}-=C&DvcUc>tjpMx@Y8L&E24Tnv=X+>%yy%ocQrA;2VUB89~40+*_ zoR939*$1hsK_>Asy+?1p%qRK$Sqwx?!FhSwT&opcSUEZo#3ff?!{zu4b~SQ6P2hfWdEf%L_25!n(KLC*$r7p4Sq%&4zGcCsk1TX zoe2D#+D3y+ydbDx4TviMm6>spymHOv-slaXoLgF`X*v;?9{fcFyTl=`%oi0;>7ia& zF&&zbgwmHLVL_)Gv=L#{k>O>X1rL$W$4?NxmbqAcV}#yxXr$(PnsD9j6dhduiJ8-| z0Ht~5L12FbfY{L(aF5u|7&vcFZEL;<^lxn=3L1VQn z@z#x#bg=gV)^3x;T|UP!Idw0}^@L(wxfbTSaj`7uG}b3SL-uSuH4m}Dhs^joMPHdZ z#p&4?8&JglP*6kxID;X%CD>ZaTNh`~lDVem@aHrutlvvfY)S_{yS^0_74xfM?4X9}Sgrdhm`k~hdQ;e=){jdF0<69yI$Q9$9bA_nwvl?&jt)skacHo;Uj@Wh5 z5nCQ6VUO=jraiL+Ww|_ca`JcZQfel3Hf_SpqNjM%V*>6u-i3n(?f7KwR4^6%qwai} zAF?q`xHl^u<2Fg@d(2VCH(#%#(MTG4b#`NQpDlXWO{;ro{e_Kq_YiIL_o25&Je81J zj(jHvacUM96;stnQA!+|>?+4U#4534{y7X>%c%=^H5S+Q2bd%_sqR$rbILXRNaH2< zp)c^(eJD7KQLPKmYN;)XD4fOvt%G=_F#(}#CVtjV#J+XQFsc0_MztiMMI#sAjd5xZ zC`ZoGb^31(RuB`eCweF3p<=-UM$s_=#lkbOtMDGiT3(?Nl9@E!vksYqov3Yoi_V*w zh`B6M*}4MkkQ+oJBTIugg~i+}S8_13s+9g{n+dYB$6`o!U)3A8EyhX6Eqx z>2+4dSy4~=l$P4q3MK;(@k!fTWQQwPLh#fc>Na7qmFT(^L8{2C&;$cqs?vJ~pGw%7ic z-UiY4_ESHhbY?5=A&GMbnAE`w^x?v@B+KCjiSjuC(_c)52}v{H@U9UOI5`>mABVx6 zOGjv1yDLeH%z@I|M)c@}{a7fn2lU$mq1;Q47@s|N5)Ydygd<(FehCsvq1I z%YvpHCCFVLLVQDAh*N$XaD?P*6Bkx6Nu~42FE?cnoxPS0_Ic2zqVe?LGC4TAnbMNCTa!oj4v9gQZaTD3aPCd! z=?g9qU$c^ao6=A9=c&_%JDO0?#dcR^Fy%^2=a)u_^sleT%tJPzLwj9047l#c;@WGJIX$Pbxo3Lz8kQeVd68 ztr!i5+*XkrF$5d@_tTasqa=Mw1!;b!iS0K{X}_2mI&yg6U8fUw|GG*d6EqcM9;o4c z!zVzV2tkf*vw?$Z5NVg6Oz)=vHR|6(xoc+>k_CgT;fd+wdf!rN;Io8w#lB|qYj?pXpA@__ zSPY7J9Efa4Mw_J)cp#~Pq^<4-pRH@D+qAhB} z5qRD7q4VPhvT*YX%<_+eILFtlig+rW^ihy@P0C;;9bMR%yt@%l-QJ zutDp%-BdXJIvl*Vjd8e}15uu{@P*7=ux($6d{54@Irk|M*a9S~+zSdJl-Bg!A{h=7 zn7JaOBHMh5o~+jNJYH)N$Euss2pvlIW!!UOP65R;>8%Q(n&Kk6S3;< z1DYfN)F#mmM?Npa@<9H&pjYQnBlrS3OZwuRzI$|!?E*Lw{+R9+KZPPvqIjj)73QX9 zF&erupi$@17DltyWI7VXGPGvMT%DYyueL8`Q&RHgAZM_fwSZso9wp% zmi@eravLfb#aGG5vDMel{3wg^y$aaFdz+C^d`sm-1<}grCaw3&qld2Qpx~E2j9Xu* zpSRix=S@3Fqsp{t_9y@+ew3Ys8M3F*wksF6gl(a}fK&JNRRU#2x6zNS+wtL`7Q9K0p-Yc1#KT{Y&>p8= zs$X0|cit%?Yj^EMwWkV9>%DiVpwvUOjgxR|(q-1XaT%FbkcuyOXHzEIh&@_6ALY3G zbyGXHqv4!%Tqe{>IO*GPqeBV3c4{*Ahgo9mQVu)f<%dCsLLoTv6bV^b%oMyYL%ZEZ z$Y0V+!v6S3SKs-7LEU-u2agM#5mrW&Hg-}G@yqN&t~gHDxlc}b+@^v{2C0!v9Lv)h zPdU>=5wBL_W@WxQuX6{{yGb2JRuFE?ZxLei{TR1f*)Ovp%0Fb z2VvS!dN+=|KW0oe&b&(W4^>dHG%&DSH4C!xOi5`8gX&2uXsP+wfAPBtdHM4&97uT0 zB$ua>hLt}^<%lqRuPG)`=Np;mJa;M;t;3l8?4mNSRIoVNjSl~kfOUcMfiqrKKNE_H zej}2T0%9m~Tbljc{E_WGaF?2_>Y}iH8nk>ZWLzg@7#Mwx;qnEutda=_=H9NLwmcSq zp}8b8{V=_t8-;^i-X!nsEO^$cj7@oaiB8~3Jndjb7FY0trQKps?h_(1^Yl^71lZ4E zd8B^kPexmOkS1SsCwH`MXv&6i#>3o`C@#$;FHWB)oco1T`-=#PwRXXh(pikkh%E!rqbmtVMv zY{Ff_7q3GW%iCh-m?&B8=7T+BYwgI=a=NU~6uS=P(KBuFv?;w2S=~ou^+-9j7HT5m zLgBb$fg0w%-;Pa_4x;ixS#Z;>phlZxxgM@=^wG~*n8G(1EBAh(k~@?^Kl27^oGrs0 zE+6QXUP6WH7Ng7Z3wSu-G46ahisdENRBB^AE!C68lyY4tdm>Ez`U5dtej8)tc@)Kd zxw3uds<6uQ4sj{lhial_*!z}&nhHBo77kRQ`XK&NOTrIx#x#!l`{aYR5(ZXJp+;F% zXp|pqkg|Fq&3$}P3nB_?uzvdqM(YnzRC2T!n?Lqo49`j|d48AXsYWx}*Y_ZYf4YIql^!bh z^d$DyT2kB0N_>(3l_^;HfN1>mqgKu%RJyDVb2n&Xs>f_JStnk%SmFh`c>_t}t4_gjD|Pn{qv|19HCxel6=he@tM4U>A&4K-C&S)R+` zRP34$8+l)oa+1I4b?X+<2P!M5cIGtVvFaf!(~*v?cNalVs4RA$mBM6|K`PoP48zxj zz&NoSSE_HI_QlhHD0|X`gO^F^NH1;9yi0Osj?%%`YRn}0NmOn8bGlck7*F42@xyyv zuz3BH_3{y|J5+Ou=}^*w)v0mhlSd>xl={G?@z+rA_+wPSrj;JckARDUR;0<*s5ZVkF}R`xexJ9~s?WjLcz74xu)CN{bLWAj#tc-g6T?5I$kW`Nm)P^t zhFB)%Oma5v#r;~EG(Jll&E|OG@Dc@_%;{&oSIs3ih&0NHeP<(+)oIVP>s0$>1d8og z2d;cu$+Uf3{JKGobTwXQ)Tb!Zo*$>F@%kaQt&L?L*4s0!7kxK*$FUvL{YEBO#yF?F@JB8z+q+T0#ydDRw2dqGC z>unPHb3BZmwL%s91orSM87#Z&PUe;w673pYxSQ=sbs|`rTm6`3Bu|2}YpTS`Y8Trv z&Vz<@AEr;n*3CVa^TEh79eS3X29Lv&S&_DAVlpxh=k08S!MQ`^fYJ)!REyxMAuH6* zp9K;n*&tz5Mf<&^XkN2FGE4T6;9C#9zJY5;sfJvrqPP?7ub&7 z0iZIr*G{fh0+A>eqHuKxl<&IJ?g#y(MX-x1h5>7z)J>lX?*r5JN#Kz%k20(@ITh^g8su+FjuZ!JeSo74|0YMP-${svoWAc$R>Vl*x^ zPd`sBni28kP(4*e97;DrQSl3;>W%}}zY~PZs=YK>zyvy^Kd>iO-h)oVM&kEY1je7a zOuhYdY1|1T%-C_T7)9Bc^N#e}X>0{LwM zjCj8svpMr7$-77BXtWkn#Xk!!59r{$ccmoj`%(6+sw;GF^CHb=zo>=&d2;r`D5>}; z&TyG?9|wi)ezX$NzvSCKoJ#Ux1M35|Z+ zMk39o!{8)cIy?3Wx#+G5=F5VRO|l~WYG&|iu!rb;^}s42W#WB%Arabc0w*T*vZ-qA zgf*<77gt{*TB+yA5BqOS)UgJfg;#R-P(fy||4h6Ws|Zt#$AP;F2TboZ zlE=gfU+ud}=0+=$#QHbXh(BL{weAg4ty@Es1TT=<#(Z{4x(tdg*-bVE*pR}Qcl4;0 zICM>&%%*5fFi_rkn$q`Y7%LS)=m5$Q-8<$QIcl})DjGHo0_e>Dld&US*AzUP_3Dfbvxy%B7vy^PHdyz$K1 z^LUss!Ems{9S!5_!fqZwkUU3)AqBhYr`8?V^A#&%Kj1u%>)3Up7JJRc)tS}iVd>FO zZ2TrqD;I3UqP~~3^U4K$dT~2OT+cz@SYLGYU4^!F_K?>c!%pp4f=#L`P$u>p?KPFg z`lap6Kx_>*&AE?Gv52qD$}w?D8=83TM_=s@j0+iHVvqd50QcJ{r=5b;*1}kHW)N- z95rWLK)xj*`0>6p?rJE&bBAnk#uCJNaqj3C^^4dQZpM6n5p+sAjE*b6;V{j{yuw0k zy`_n58@YHKwj-x$N^M=C1d7#{V9NO8_$GTNw&g@))yrxWm{EeL@`=_QXu^oWRcM&K z7zgJ6f!#-rQ`^2#+IpIo+04AZ^QdAaIrWk4LFWYe^}wEJvnrvXFleZ z9l^Y2c_u92D4Khfqqux69#%^pJEPA+{>p*n8;cfYNNB8AJvsEMgg8^j0oL>1Pka|9!4p7U#i$FutBvX5MRxQ@jQ z?R<6mp{01NWdc5apbvZ@-_SxZ26cDxqsO}WAThKAEl%qrJ>Z6$d-X8Z;t$O2yh7ip zsgR6ZXVl`ofdw0~QPx4Q&hE$^+PNck=S&eOmz^GNE!3RIJ?gyNZ(CYM*Sa4(QlO>3e1hiGtqZ&r`P}vh9~1rD-mR?&QWv6)tJ%2Qzxpw7GRR_Kz0@zh`>%-LfIJIdebzy>~e@*UhIZ)vnN7mzUVueFoXx>3DFY z1*JNY$otEQDEeYG_ME9jIqL(+WD1i^b|v2Dy@hilX4i3pQZZ)cZj8~;rayl@rKLw@ zVWVL;wpu$d0k5Wj@?`@w!m|_xicnPW91d!=qhFK`DjYY(Li=LeoBI+a;`&i^(>i(~ ztpJVXm9S`(4+D)dP(08KhXkdtZm%xRvkyS0W4G`O-x563QipF(x8VK0T6}ta8){cc zVMN$Rl#98BB^plj%OwuVPYc7o@ufIT;wZxD1DNG9Ks+bs(+9S*pg)(gCJ`;9_vZ}! zzQ~-8q|1W4zcg&Qbr$+X&qAZ?dobS91?sX1(D*={wHrMGo~If>HsS?oGH)TDb`FvC zq6ttv-itm?QzwA~XJPQXGJMt;M+?rMWEEE*BvO0ELz84Ksi+(gDKggz~$p*yprw8P$=4IcxCgY28GuW2Hf%ZQJXwoGf!`&;7k#B?hsXccCWPA9* zRi8Yf;(DHa?YbBWb3~x+s11$apKRE3={#BG8wW!v1!TpHQ3|9A#AgSBQB76t`6lL{tbh6#LiDIm{p4}+19CXTx|lbUuMg&0{|VszR8G>gTc zEaC*aclA6_3;x6{YFdGNW`$GNs01cpv;<5p7qE*>Z_uKW1ju)@p>hdvVBsZ*dr1h~VAmTC|dii0q3J6LDlOGC3o_ow`?)= zuWx_~mAy=^@?G*HVO;#6#F}XDm{3q9H9m40xVaQmw$vL|@MrwC-infYkToRO2wczv(2raOE-F z6C_6#qZs^lKE6Jp^&n1>!}(qRv-44bGygB0kN-5FD*PAL$G=%H7YYAnaOdbchrjlJ zYICIil7Z^q%$MOlVg6$d)ELj-K3sQyZ!_a*|ACOYmX3z5x}Khvy1K5GrncrHH4SaG zMY?KQ>S{WRv~|@rG<4L}H1rnzX89uDJ#s8WoZr>PlIJ(!e>$K4Z07y_{9ns|>x2Gx z)IS?@7yb+6(lM{@U!ne+E%@KT|7^O|_!say{|)&6a{B%|{+|uCO8;E0}&*`H5FQ9fRe+~M7Ph=ZQ z0fDj3e=D-Q3;v1v+s^*G{LgjM-;$HV`TCE>WBva2DEJ>)^mnH>|8KAJSfBBn-|he2 G_WuCs_UPvT From 2354ef837d42555c17b50f1bfaf3a74d7a620cd0 Mon Sep 17 00:00:00 2001 From: Jeremy McGibbon Date: Wed, 11 Feb 2026 15:00:41 +0000 Subject: [PATCH 14/20] use logging.info, save json as well --- fme/core/benchmark/run.py | 53 ++++++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/fme/core/benchmark/run.py b/fme/core/benchmark/run.py index d540a3aef..d786d917f 100644 --- a/fme/core/benchmark/run.py +++ b/fme/core/benchmark/run.py @@ -1,7 +1,11 @@ import argparse +import dataclasses +import json +import logging import os import pathlib import subprocess +import sys import torch @@ -29,18 +33,19 @@ def get_device_name() -> str: return "CPU" -def main(name: str | None, iters: int, child: str | None = None) -> None: +def main(name: str | None, iters: int, child: str | None = None) -> int: RESULTS_PATH.mkdir(exist_ok=True) device_name = get_device_name() - print(f"Running benchmarks on device: {device_name}") + logging.info(f"Running benchmarks on device: {device_name}") benchmarks = get_benchmarks() if name is not None: if name not in benchmarks: - print(f"Specified benchmark {name} not found. Available benchmarks:") - for benchmark_name in benchmarks: - print(f" - {benchmark_name}") - return + logging.error( + f"Specified benchmark {name} not found. " + f"Available benchmarks: {', '.join(benchmarks.keys())}" + ) + return 1 benchmarks_to_run = {name: benchmarks[name]} else: benchmarks_to_run = benchmarks @@ -48,21 +53,30 @@ def main(name: str | None, iters: int, child: str | None = None) -> None: def get_label(name): return f"{name} on {device_name} at commit {get_git_commit()}" - def get_filename(name) -> pathlib.Path: + def get_filename(name, extension) -> pathlib.Path: safe_name = name.replace("/", "_").replace(".", "_").lower() safe_device_name = device_name.replace(" ", "_").replace("/", "_").lower() - return RESULTS_PATH / f"{safe_name}_{safe_device_name}_{get_git_commit()}.png" + return ( + RESULTS_PATH + / f"{safe_name}_{safe_device_name}_{get_git_commit()}.{extension}" + ) for name, cls in benchmarks_to_run.items(): - print(f"Running benchmark: {name}") + logging.info(f"Running benchmark: {name}") result = cls.run_benchmark(iters=iters) - result.to_png(get_filename(name), label=get_label(name)) + result.to_png(get_filename(name, "png"), label=get_label(name)) if child is not None: child_name = f"{name}.{child}" child_label = get_label(child_name) - print(f" Generating report for child timer: {child_label}") - result.to_png(get_filename(child_name), label=child_label, child=child) - print(f" Result: {result}") + logging.info(f"Generating report for child timer: {child_label}") + result.to_png( + get_filename(child_name, "png"), label=child_label, child=child + ) + result_data = json.dumps(dataclasses.asdict(result), indent=2) + with open(get_filename(name, "json"), "w") as f: + f.write(result_data) + logging.info(f"Result: {result_data}") + return 0 def get_benchmark_label(name): @@ -71,6 +85,9 @@ def get_benchmark_label(name): if __name__ == "__main__": + logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" + ) parser = argparse.ArgumentParser(description="Run registered benchmarks.") parser.add_argument( "--name", @@ -99,8 +116,10 @@ def get_benchmark_label(name): ) args = parser.parse_args() - main( - name=args.name, - iters=args.iters, - child=args.child, + sys.exit( + main( + name=args.name, + iters=args.iters, + child=args.child, + ) ) From 1c7560741467ffc0792328f8305959f1665637f0 Mon Sep 17 00:00:00 2001 From: Jeremy McGibbon Date: Wed, 11 Feb 2026 15:00:54 +0000 Subject: [PATCH 15/20] revert changes to conftest --- conftest.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/conftest.py b/conftest.py index 930e9fc68..84c62901a 100644 --- a/conftest.py +++ b/conftest.py @@ -1,7 +1,3 @@ -import os - -os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8" # required for determinism - import gc import signal from unittest import mock @@ -9,15 +5,6 @@ import pytest import torch -from fme.core.rand import set_seed - - -@pytest.fixture(autouse=True, scope="session") -def deterministic_pytorch(): - torch.use_deterministic_algorithms(True) - torch.backends.cudnn.benchmark = False - set_seed(0) - def pytest_addoption(parser): parser.addoption( From cec49135fab1df29dc482c2d76b3a33a3c0d2ee1 Mon Sep 17 00:00:00 2001 From: Jeremy McGibbon Date: Wed, 11 Feb 2026 15:08:36 +0000 Subject: [PATCH 16/20] maintain insertion (runtime) order --- fme/core/benchmark/benchmark.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fme/core/benchmark/benchmark.py b/fme/core/benchmark/benchmark.py index 843c229fe..47457f8d5 100644 --- a/fme/core/benchmark/benchmark.py +++ b/fme/core/benchmark/benchmark.py @@ -64,10 +64,8 @@ def label_ok(name: str, ms: float, frac_of_root: float) -> bool: return False return frac_of_root >= 0.05 - def sorted_children(t: TimerResult) -> list[tuple[str, TimerResult]]: - return sorted( - t.children.items(), key=lambda kv: avg_time(kv[1]), reverse=True - ) + def ordered_children(t: TimerResult) -> list[tuple[str, TimerResult]]: + return list(t.children.items()) # maintain dict order (insertion order) def blend_with_white( rgb: tuple[float, float, float], amount: float @@ -121,7 +119,7 @@ def blend_with_white( gray = (0.85, 0.85, 0.85, 1.0) cmap = plt.get_cmap("tab20") - lvl1 = sorted_children(root) + lvl1 = ordered_children(root) lvl1_names = [n for n, _ in lvl1] lvl1_index = {n: i for i, n in enumerate(lvl1_names)} @@ -188,7 +186,7 @@ def draw_stack( parent_rgba = cmap(lvl1_index[n1] % cmap.N) parent_rgb = (parent_rgba[0], parent_rgba[1], parent_rgba[2]) - children = sorted_children(t1) + children = ordered_children(t1) k = len(children) for i, (n2, t2) in enumerate(children): # Same “type” of color as parent: lighten progressively per child. From fb647d428df1ceb43048afc26970329ec7caf84d Mon Sep 17 00:00:00 2001 From: Jeremy McGibbon Date: Wed, 11 Feb 2026 15:10:55 +0000 Subject: [PATCH 17/20] simpler imports --- fme/core/__init__.py | 1 + fme/core/models/__init__.py | 4 +--- fme/core/models/conditional_sfno/__init__.py | 4 +--- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/fme/core/__init__.py b/fme/core/__init__.py index a2650da09..5c2f47b01 100644 --- a/fme/core/__init__.py +++ b/fme/core/__init__.py @@ -22,6 +22,7 @@ "weighted_mean", "weighted_mean_bias", "weighted_nanmean", + "weighted_sum", "root_mean_squared_error", "get_device", "using_gpu", diff --git a/fme/core/models/__init__.py b/fme/core/models/__init__.py index 4ce828d37..ae3c6d041 100644 --- a/fme/core/models/__init__.py +++ b/fme/core/models/__init__.py @@ -1,3 +1 @@ -from . import conditional_sfno as _ # to trigger registrations - -del _ +from . import conditional_sfno diff --git a/fme/core/models/conditional_sfno/__init__.py b/fme/core/models/conditional_sfno/__init__.py index d37a22639..6e628e84f 100644 --- a/fme/core/models/conditional_sfno/__init__.py +++ b/fme/core/models/conditional_sfno/__init__.py @@ -1,3 +1 @@ -from . import benchmark as _ # to trigger registrations - -del _ +from . import benchmark From f0d6f9b94f5d7c4180639b0a1490f5e0173a2005 Mon Sep 17 00:00:00 2001 From: Jeremy McGibbon Date: Wed, 11 Feb 2026 15:14:10 +0000 Subject: [PATCH 18/20] add dirty label --- fme/core/benchmark/run.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/fme/core/benchmark/run.py b/fme/core/benchmark/run.py index d786d917f..8663aff26 100644 --- a/fme/core/benchmark/run.py +++ b/fme/core/benchmark/run.py @@ -19,10 +19,30 @@ def get_git_commit() -> str: global _GIT_COMMIT if _GIT_COMMIT is None: - args = ["git", "rev-parse", "--short", "HEAD"] - _GIT_COMMIT = ( - subprocess.check_output(args, stderr=subprocess.DEVNULL).decode().strip() + commit = ( + subprocess.check_output( + ["git", "rev-parse", "--short", "HEAD"], + stderr=subprocess.DEVNULL, + ) + .decode() + .strip() + ) + + # Non-empty output means repo is dirty + dirty = ( + subprocess.check_output( + ["git", "status", "--porcelain"], + stderr=subprocess.DEVNULL, + ) + .decode() + .strip() ) + + if dirty: + commit = f"{commit}-dirty" + + _GIT_COMMIT = commit + return _GIT_COMMIT From 4f407ba286110c1306f961566b749bd990fad988 Mon Sep 17 00:00:00 2001 From: Jeremy McGibbon Date: Thu, 12 Feb 2026 20:11:27 +0000 Subject: [PATCH 19/20] add logging, arg for out dir, basic test --- fme/core/benchmark/run.py | 43 ++++++++++++++++++++++++---------- fme/core/benchmark/test_run.py | 17 ++++++++++++++ 2 files changed, 48 insertions(+), 12 deletions(-) create mode 100644 fme/core/benchmark/test_run.py diff --git a/fme/core/benchmark/run.py b/fme/core/benchmark/run.py index 8663aff26..f77486a6e 100644 --- a/fme/core/benchmark/run.py +++ b/fme/core/benchmark/run.py @@ -53,8 +53,10 @@ def get_device_name() -> str: return "CPU" -def main(name: str | None, iters: int, child: str | None = None) -> int: - RESULTS_PATH.mkdir(exist_ok=True) +def main( + name: str | None, iters: int, output_dir: pathlib.Path, child: str | None = None +) -> int: + output_dir.mkdir(exist_ok=True) device_name = get_device_name() logging.info(f"Running benchmarks on device: {device_name}") @@ -77,25 +79,28 @@ def get_filename(name, extension) -> pathlib.Path: safe_name = name.replace("/", "_").replace(".", "_").lower() safe_device_name = device_name.replace(" ", "_").replace("/", "_").lower() return ( - RESULTS_PATH + output_dir / f"{safe_name}_{safe_device_name}_{get_git_commit()}.{extension}" ) for name, cls in benchmarks_to_run.items(): logging.info(f"Running benchmark: {name}") result = cls.run_benchmark(iters=iters) - result.to_png(get_filename(name, "png"), label=get_label(name)) - if child is not None: - child_name = f"{name}.{child}" - child_label = get_label(child_name) - logging.info(f"Generating report for child timer: {child_label}") - result.to_png( - get_filename(child_name, "png"), label=child_label, child=child - ) + png_filename = get_filename(name, "png") + logging.info(f"Saving result image to {png_filename}") + result.to_png(png_filename, label=get_label(name)) result_data = json.dumps(dataclasses.asdict(result), indent=2) + logging.info(f"Result: {result_data}") with open(get_filename(name, "json"), "w") as f: + logging.info(f"Saving result json to {f.name}") f.write(result_data) - logging.info(f"Result: {result_data}") + if child is not None: + child_name = f"{name}.{child}" + child_label = get_label(child_name) + logging.info(f"Generating benchmark result for child timer: {child_label}") + png_filename = get_filename(child_name, "png") + logging.info(f"Saving child result image to {png_filename}") + result.to_png(png_filename, label=child_label, child=child) return 0 @@ -134,12 +139,26 @@ def get_benchmark_label(name): default=10, help="Number of iterations to run each benchmark for.", ) + parser.add_argument( + "--output-dir", + type=str, + default=None, + help=( + "Directory to save benchmark results in. If not provided, " + "results will be saved in a 'results' directory next to this script." + ), + ) args = parser.parse_args() + if args.output_dir is not None: + output_dir = pathlib.Path(args.output_dir) + else: + output_dir = RESULTS_PATH sys.exit( main( name=args.name, iters=args.iters, child=args.child, + output_dir=output_dir, ) ) diff --git a/fme/core/benchmark/test_run.py b/fme/core/benchmark/test_run.py new file mode 100644 index 000000000..87d69d572 --- /dev/null +++ b/fme/core/benchmark/test_run.py @@ -0,0 +1,17 @@ +import pathlib +import tempfile + +from fme.core.benchmark.run import main + + +def test_run(): + # Just test that the main function runs without error on a simple benchmark + # We don't care about the output here, just that it completes successfully + with tempfile.TemporaryDirectory() as tmpdir: + output_dir = pathlib.Path(tmpdir) + main( + name="csfno_block", # just one for speed + iters=1, + output_dir=output_dir, + child=None, + ) From bb51f354900ca4dc6cbadfc6ae6eeb9789afe44e Mon Sep 17 00:00:00 2001 From: Jeremy McGibbon Date: Thu, 12 Feb 2026 21:11:44 +0000 Subject: [PATCH 20/20] skip test on non-gpu --- fme/core/benchmark/test_run.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fme/core/benchmark/test_run.py b/fme/core/benchmark/test_run.py index 87d69d572..62e4907d9 100644 --- a/fme/core/benchmark/test_run.py +++ b/fme/core/benchmark/test_run.py @@ -1,9 +1,13 @@ import pathlib import tempfile +import pytest +import torch + from fme.core.benchmark.run import main +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available") def test_run(): # Just test that the main function runs without error on a simple benchmark # We don't care about the output here, just that it completes successfully