From 92c65359994095cffbdff3e6336a8634f13ac8df Mon Sep 17 00:00:00 2001 From: Roman Schaffert Date: Mon, 23 Mar 2026 03:43:42 +0000 Subject: [PATCH] New functionality & documentation improvement in `optim_test_tools` - Added `numba_nvtx` module: Use of NVTX ranges in Numba-compiled code - `TensorDumper` - Added `PICKLE` dump type - Added ranges (for disambiguation of tensor names in certain scenarios) - Improved documentation Signed-off-by: Roman Schaffert --- .gitmodules | 4 + docs/Makefile | 10 +- docs/spelling_wordlist.txt | 3 + .../batching_helpers/batched_indexing_ops.py | 4 +- .../inc/GopDecoderUtils.hpp | 69 ++- .../optim_test_tools/numba_nvtx/__init__.py | 21 + .../optim_test_tools/numba_nvtx/nvtx.py | 141 ++++++ .../accvlab/optim_test_tools/tensor_dumper.py | 478 ++++++++++++++---- packages/optim_test_tools/docs/api.rst | 7 + packages/optim_test_tools/docs/examples.rst | 8 +- .../docs/examples/numba_nvtx.rst | 42 ++ .../examples/tensor_dumper_comparison.rst | 16 +- .../docs/examples/tensor_dumper_dumping.rst | 85 +++- packages/optim_test_tools/docs/intro.rst | 23 +- .../examples/numba_nvtx_example.py | 41 ++ .../tensor_dumper_comparison_example.py | 10 +- .../examples/tensor_dumper_dumping_example.py | 59 ++- .../optim_test_tools/ext_impl/CMakeLists.txt | 62 +++ .../optim_test_tools/ext_impl/external/NVTX | 1 + .../ext_impl/src/nvtx_numba.cpp | 83 +++ packages/optim_test_tools/pyproject.toml | 10 +- packages/optim_test_tools/setup.py | 38 +- .../optim_test_tools/tests/test_numba_nvtx.py | 45 ++ 23 files changed, 1042 insertions(+), 218 deletions(-) create mode 100644 packages/optim_test_tools/accvlab/optim_test_tools/numba_nvtx/__init__.py create mode 100644 packages/optim_test_tools/accvlab/optim_test_tools/numba_nvtx/nvtx.py create mode 100644 packages/optim_test_tools/docs/examples/numba_nvtx.rst create mode 100644 packages/optim_test_tools/examples/numba_nvtx_example.py create mode 100644 packages/optim_test_tools/ext_impl/CMakeLists.txt create mode 
160000 packages/optim_test_tools/ext_impl/external/NVTX create mode 100644 packages/optim_test_tools/ext_impl/src/nvtx_numba.cpp create mode 100644 packages/optim_test_tools/tests/test_numba_nvtx.py diff --git a/.gitmodules b/.gitmodules index cef3b22..4741c12 100644 --- a/.gitmodules +++ b/.gitmodules @@ -2,6 +2,10 @@ path = packages/on_demand_video_decoder/ext_impl/external/NVTX url = https://github.com/NVIDIA/NVTX.git +[submodule "packages/optim_test_tools/ext_impl/external/NVTX"] + path = packages/optim_test_tools/ext_impl/external/NVTX + url = https://github.com/NVIDIA/NVTX.git + [submodule "packages/on_demand_video_decoder/ext_impl/external/dlpack"] path = packages/on_demand_video_decoder/ext_impl/external/dlpack url = https://github.com/dmlc/dlpack.git diff --git a/docs/Makefile b/docs/Makefile index 8fefdb0..51667be 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -8,6 +8,13 @@ SPHINXBUILD ?= sphinx-build SOURCEDIR = . BUILDDIR = _build +MAKEFILE_DIR := $(patsubst %/,%,$(abspath $(dir $(lastword $(MAKEFILE_LIST))))) +CURRENT_DIR := $(patsubst %/,%,$(abspath $(CURDIR))) + +ifneq ($(CURRENT_DIR),$(MAKEFILE_DIR)) +$(error Please run make from $(MAKEFILE_DIR). Example: 'cd $(MAKEFILE_DIR) && make clean html' or 'make -C $(MAKEFILE_DIR) clean html') +endif + # Put it first so that "make" without argument is like "make help". 
help: @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) @@ -29,9 +36,10 @@ sync-readme: html: sync-readme generate @$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -# Clean build directory and generated files +# Clean build directory and generated files (full removal to avoid stale sidebar/toctree) clean: @$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + rm -rf $(BUILDDIR)/ rm -rf api/generated/ rm -rf ../packages/*/docs/generated/ diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt index f2ccb33..f360a3c 100644 --- a/docs/spelling_wordlist.txt +++ b/docs/spelling_wordlist.txt @@ -185,3 +185,6 @@ literalinclude blockquote distributable posix +JIT +prepend +prepended diff --git a/packages/batching_helpers/accvlab/batching_helpers/batched_indexing_ops.py b/packages/batching_helpers/accvlab/batching_helpers/batched_indexing_ops.py index 01a3571..c5ad7dd 100644 --- a/packages/batching_helpers/accvlab/batching_helpers/batched_indexing_ops.py +++ b/packages/batching_helpers/accvlab/batching_helpers/batched_indexing_ops.py @@ -111,7 +111,7 @@ def backward(ctx: Any, grad: Union[torch.Tensor, None]): if grad is None: return None, None, None, None, None else: - (output_indices, output_nums_indices) = ctx.saved_tensors + output_indices, output_nums_indices = ctx.saved_tensors grad = grad.contiguous() grad_input = batched_indexing_access_cuda.forward(grad, output_indices, output_nums_indices, 0.0) return grad_input, None, None, None, None @@ -154,7 +154,7 @@ def backward(ctx: Any, grad: Union[torch.Tensor, None]): if grad is None: return None, None, None, None else: - (output_indices, output_nums_indices) = ctx.saved_tensors + output_indices, output_nums_indices = ctx.saved_tensors grad = grad.contiguous() grad_for_to_insert = batched_indexing_access_cuda.forward( grad, output_indices, output_nums_indices, 0.0 diff --git 
a/packages/on_demand_video_decoder/ext_impl/src/PyNvOnDemandDecoder/inc/GopDecoderUtils.hpp b/packages/on_demand_video_decoder/ext_impl/src/PyNvOnDemandDecoder/inc/GopDecoderUtils.hpp index b811c29..8437f2a 100644 --- a/packages/on_demand_video_decoder/ext_impl/src/PyNvOnDemandDecoder/inc/GopDecoderUtils.hpp +++ b/packages/on_demand_video_decoder/ext_impl/src/PyNvOnDemandDecoder/inc/GopDecoderUtils.hpp @@ -254,18 +254,18 @@ static void SavePacketBufferToFile(const uint8_t* packet_buffer, int nVideoBytes * Reference: ITU-T H.264 Table 7-1 */ enum H264NalUnitType { - H264_NAL_SLICE = 1, // Coded slice of a non-IDR picture - H264_NAL_DPA = 2, // Coded slice data partition A - H264_NAL_DPB = 3, // Coded slice data partition B - H264_NAL_DPC = 4, // Coded slice data partition C - H264_NAL_IDR_SLICE = 5, // Coded slice of an IDR picture - H264_NAL_SEI = 6, // Supplemental enhancement information - H264_NAL_SPS = 7, // Sequence parameter set - H264_NAL_PPS = 8, // Picture parameter set - H264_NAL_AUD = 9, // Access unit delimiter - H264_NAL_END_SEQUENCE = 10, // End of sequence - H264_NAL_END_STREAM = 11, // End of stream - H264_NAL_FILLER_DATA = 12, // Filler data + H264_NAL_SLICE = 1, // Coded slice of a non-IDR picture + H264_NAL_DPA = 2, // Coded slice data partition A + H264_NAL_DPB = 3, // Coded slice data partition B + H264_NAL_DPC = 4, // Coded slice data partition C + H264_NAL_IDR_SLICE = 5, // Coded slice of an IDR picture + H264_NAL_SEI = 6, // Supplemental enhancement information + H264_NAL_SPS = 7, // Sequence parameter set + H264_NAL_PPS = 8, // Picture parameter set + H264_NAL_AUD = 9, // Access unit delimiter + H264_NAL_END_SEQUENCE = 10, // End of sequence + H264_NAL_END_STREAM = 11, // End of stream + H264_NAL_FILLER_DATA = 12, // Filler data }; /** @@ -273,15 +273,15 @@ enum H264NalUnitType { * Reference: ITU-T H.265 Table 7-1 */ enum HevcNalUnitType { - HEVC_NAL_IDR_W_RADL = 19, // IDR picture with RADL pictures - HEVC_NAL_IDR_N_LP = 20, // IDR picture 
without leading pictures - HEVC_NAL_CRA_NUT = 21, // Clean random access picture - HEVC_NAL_VPS = 32, // Video parameter set - HEVC_NAL_SPS = 33, // Sequence parameter set - HEVC_NAL_PPS = 34, // Picture parameter set - HEVC_NAL_AUD = 35, // Access unit delimiter - HEVC_NAL_PREFIX_SEI = 39, // Prefix SEI message - HEVC_NAL_SUFFIX_SEI = 40, // Suffix SEI message + HEVC_NAL_IDR_W_RADL = 19, // IDR picture with RADL pictures + HEVC_NAL_IDR_N_LP = 20, // IDR picture without leading pictures + HEVC_NAL_CRA_NUT = 21, // Clean random access picture + HEVC_NAL_VPS = 32, // Video parameter set + HEVC_NAL_SPS = 33, // Sequence parameter set + HEVC_NAL_PPS = 34, // Picture parameter set + HEVC_NAL_AUD = 35, // Access unit delimiter + HEVC_NAL_PREFIX_SEI = 39, // Prefix SEI message + HEVC_NAL_SUFFIX_SEI = 40, // Suffix SEI message }; /** @@ -290,15 +290,15 @@ enum HevcNalUnitType { * Reference: AV1 Bitstream & Decoding Process Specification */ enum AV1ObuType { - OBU_SEQUENCE_HEADER = 1, // Sequence header, appears at key frames - OBU_TEMPORAL_DELIMITER = 2, // Temporal delimiter - OBU_FRAME_HEADER = 3, // Frame header - OBU_TILE_GROUP = 4, // Tile group - OBU_METADATA = 5, // Metadata - OBU_FRAME = 6, // Frame (combined frame header and tile group) - OBU_REDUNDANT_FRAME_HEADER = 7, // Redundant frame header - OBU_TILE_LIST = 8, // Tile list - OBU_PADDING = 15, // Padding + OBU_SEQUENCE_HEADER = 1, // Sequence header, appears at key frames + OBU_TEMPORAL_DELIMITER = 2, // Temporal delimiter + OBU_FRAME_HEADER = 3, // Frame header + OBU_TILE_GROUP = 4, // Tile group + OBU_METADATA = 5, // Metadata + OBU_FRAME = 6, // Frame (combined frame header and tile group) + OBU_REDUNDANT_FRAME_HEADER = 7, // Redundant frame header + OBU_TILE_LIST = 8, // Tile list + OBU_PADDING = 15, // Padding }; /** @@ -318,9 +318,8 @@ inline bool iskeyFrame(AVCodecID codec_id, const uint8_t* pVideo, int demux_flag uint8_t b = pVideo[2] == 1 ? 
pVideo[3] : pVideo[4]; int nal_unit_type = b >> 1; // Check for VPS, SPS, PPS, or SEI NAL units which indicate key frame start - if (nal_unit_type == HEVC_NAL_VPS || nal_unit_type == HEVC_NAL_SPS || - nal_unit_type == HEVC_NAL_PPS || nal_unit_type == HEVC_NAL_PREFIX_SEI || - nal_unit_type == HEVC_NAL_SUFFIX_SEI) { + if (nal_unit_type == HEVC_NAL_VPS || nal_unit_type == HEVC_NAL_SPS || nal_unit_type == HEVC_NAL_PPS || + nal_unit_type == HEVC_NAL_PREFIX_SEI || nal_unit_type == HEVC_NAL_SUFFIX_SEI) { bPS = true; } } else if (codec_id == AV_CODEC_ID_H264) { @@ -328,8 +327,8 @@ inline bool iskeyFrame(AVCodecID codec_id, const uint8_t* pVideo, int demux_flag int nal_ref_idc = b >> 5; int nal_unit_type = b & 0x1f; // Check for SEI, SPS, PPS, or AUD NAL units which indicate key frame start - if (nal_unit_type == H264_NAL_SEI || nal_unit_type == H264_NAL_SPS || - nal_unit_type == H264_NAL_PPS || nal_unit_type == H264_NAL_AUD) { + if (nal_unit_type == H264_NAL_SEI || nal_unit_type == H264_NAL_SPS || nal_unit_type == H264_NAL_PPS || + nal_unit_type == H264_NAL_AUD) { bPS = true; } } else if (codec_id == AV_CODEC_ID_AV1) { diff --git a/packages/optim_test_tools/accvlab/optim_test_tools/numba_nvtx/__init__.py b/packages/optim_test_tools/accvlab/optim_test_tools/numba_nvtx/__init__.py new file mode 100644 index 0000000..d2680ca --- /dev/null +++ b/packages/optim_test_tools/accvlab/optim_test_tools/numba_nvtx/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from .nvtx import register_string, range_push, range_pop + +__all__ = [ + "register_string", + "range_push", + "range_pop", +] diff --git a/packages/optim_test_tools/accvlab/optim_test_tools/numba_nvtx/nvtx.py b/packages/optim_test_tools/accvlab/optim_test_tools/numba_nvtx/nvtx.py new file mode 100644 index 0000000..b14960b --- /dev/null +++ b/packages/optim_test_tools/accvlab/optim_test_tools/numba_nvtx/nvtx.py @@ -0,0 +1,141 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import ctypes + +from . import _nvtx_numba_ext as _ext # type: ignore[attr-defined] + + +_SYMBOLS_READY = False + + +def _try_register_numba_symbols() -> bool: + """ + Register the extension's C symbols with llvmlite so that Numba ``@njit`` + functions can call them. Returns ``False`` if Numba/llvmlite are not + installed (they are optional dependencies). 
+ """ + try: + import llvmlite.binding as llvm + except ImportError: + return False + + lib = ctypes.CDLL(_ext.__file__) + push = lib.accvlab_nvtx_range_push + pop = lib.accvlab_nvtx_range_pop + llvm.add_symbol("accvlab_nvtx_range_push", ctypes.cast(push, ctypes.c_void_p).value) + llvm.add_symbol("accvlab_nvtx_range_pop", ctypes.cast(pop, ctypes.c_void_p).value) + return True + + +_SYMBOLS_READY = _try_register_numba_symbols() + + +def register_string(name: str) -> int: + """ + Register a string with NVTX once and return an integer handle. + + Returns 0 if profiler is not attached (the handle is still safe to pass to + :func:`range_push`, which treats 0 as a no-op). + """ + return int(_ext.register_string(name)) + + +def range_push(handle: int) -> None: + """ + Push an NVTX range using a previously-registered handle. + + This function can be called from within Numba ``@njit`` functions. + """ + _ext.range_push(int(handle)) + + +def range_pop() -> None: + """ + Pop an NVTX range. + + This function can be called from within Numba ``@njit`` functions. 
+ """ + _ext.range_pop() + + +# ---------------------- Numba lowering (CPU @njit) ---------------------- + +try: + from llvmlite import ir + from numba.core import cgutils, types + from numba.core.errors import TypingError + from numba.extending import intrinsic, overload +except ImportError: + pass +else: + + @intrinsic + def _range_push_intrin(typingctx, handle): + sig = types.void(handle) + + def codegen(context, builder, signature, args): + i64 = ir.IntType(64) + fnty = ir.FunctionType(ir.VoidType(), [i64]) + fn = cgutils.get_or_insert_function(builder.module, fnty, "accvlab_nvtx_range_push") + arg0 = args[0] + if arg0.type != i64: + arg0 = builder.sext(arg0, i64) if arg0.type.width < 64 else builder.trunc(arg0, i64) + builder.call(fn, [arg0]) + return context.get_dummy_value() + + return sig, codegen + + @intrinsic + def _range_pop_intrin(typingctx): + sig = types.void() + + def codegen(context, builder, signature, args): + fnty = ir.FunctionType(ir.VoidType(), []) + fn = cgutils.get_or_insert_function(builder.module, fnty, "accvlab_nvtx_range_pop") + builder.call(fn, []) + return context.get_dummy_value() + + return sig, codegen + + @overload(range_push, inline="always") + def _ov_range_push(handle): + if isinstance(handle, types.Integer): + + if not _SYMBOLS_READY: + raise TypingError( + "NVTX C symbols were not registered with llvmlite. " + "This is unexpected — the extension is present but symbol binding failed at import time." + ) + + def impl(handle): + _range_push_intrin(handle) + + return impl + return None + + @overload(range_pop, inline="always") + def _ov_range_pop(): + if not _SYMBOLS_READY: + raise TypingError( + "NVTX C symbols were not registered with llvmlite. " + "This is unexpected — the extension is present but symbol binding failed at import time." 
+ ) + + def impl(): + _range_pop_intrin() + + return impl diff --git a/packages/optim_test_tools/accvlab/optim_test_tools/tensor_dumper.py b/packages/optim_test_tools/accvlab/optim_test_tools/tensor_dumper.py index 6f3ddc6..7a66489 100644 --- a/packages/optim_test_tools/accvlab/optim_test_tools/tensor_dumper.py +++ b/packages/optim_test_tools/accvlab/optim_test_tools/tensor_dumper.py @@ -18,8 +18,10 @@ import json import numbers import math +import importlib.util import warnings import copy +import pickle import torch import numpy as np @@ -114,23 +116,25 @@ class Type(Enum): The format type determines how tensor data is serialized when dumped. Note: - For binary types (``BINARY``, ``IMAGE_RGB``, ``IMAGE_BGR``, ``IMAGE_I``), entries are added to the - main JSON file indicating the filenames of the stored data. Also, files containing meta-data are - created and stored in the same directory. For ``BINARY``, the meta-data is the shape and dtype of - the tensor. For ``IMAGE_*``, the meta-data is the original range of the image data (min and max - value) and the image format (RGB, BGR, Intensity). + For binary types except ``PICKLE`` (i.e. ``BINARY``, ``IMAGE_RGB``, ``IMAGE_BGR``, ``IMAGE_I``), + entries are added to the main JSON file indicating the filenames of the stored data. Also, files + containing meta-data are created and stored in the same directory. For ``BINARY``, the meta-data + is the shape and dtype of the tensor. For ``IMAGE_*``, the meta-data is the original range of the + image data (min and max value) and the image format (RGB, BGR, Intensity). For ``PICKLE``, no + meta-data is written as the pickled object is self-contained. Note: - For ``BINARY`` and ``IMAGE_*`` formats, entries are added to the main JSON file indicating the - filenames of the stored data. The filenames for these cases are: + For ``BINARY``, ``IMAGE_*`` and ``PICKLE`` formats, entries are added to the main JSON file + indicating the filenames of the stored data. 
The filenames for these cases are: - blob/image data: ``[].`` - - meta-data: ``[]..meta.json`` + - meta-data (if applicable): + ``[]..meta.json`` Note: - For images containing multiple channels, the color channel is the last dimension. If this is not - the case, permutation of the axes needs to be applied to move the color channel to the last - dimension. The permutation can be applied using the ``permute_axes`` parameter, e.g. of + For images containing multiple channels, the color channel is assumed to be the last dimension. If + this is not the case, permutation of the axes needs to be applied to move the color channel to the + last dimension. The permutation can be applied using the ``permute_axes`` parameter, e.g. of :meth:`add_tensor_data`. If a tensor contains more than the necessary number of dimensions (3 for color images, @@ -154,6 +158,8 @@ class Type(Enum): #: Tensor data converted to PNG image format (grayscale). #: Single channel; no explicit channel dimension. IMAGE_I = 4 + #: Tensor data saved as pickle files. + PICKLE = 5 @classmethod def is_image(cls, dump_type: 'TensorDumper.Type') -> bool: @@ -165,10 +171,12 @@ def __init__( eps_numerical_data: float = 1e-6, num_errors_per_tensor_to_show: int = 1, allow_missing_data_in_current: bool = False, + allow_missing_data_in_previous: bool = False, ): self.eps_numerical_data = eps_numerical_data self.num_errors_per_tensor_to_show = num_errors_per_tensor_to_show self.allow_missing_data_in_current = allow_missing_data_in_current + self.allow_missing_data_in_previous = allow_missing_data_in_previous class _TensorWithFormat: def __init__(self, tensor: Any, dump_type: 'TensorDumper.Type', permute_axes: OptionalSequence): @@ -200,15 +208,21 @@ def default(self, obj): def __init__(self, *args, **kwargs): ''' Args: - dump_dir: The directory to dump the data to. If provided, the dumper will be enabled automatically. - If not provided, the dumper will be disabled and can be enabled later by calling :meth:`enable`. 
+ dump_dir: The directory to dump the data to. If provided, the dumper will be enabled + automatically. If not provided, the dumper will be disabled and can be enabled later by + calling :meth:`enable`. ''' if not hasattr(self, '_initialized'): - try: - import accvlab.batching_helpers - except ImportError: + # Only import accvlab.batching_helpers if it is available. If it is not, show a warning about + # dumping of RaggedBatch data being not supported. + accvlab_spec = importlib.util.find_spec("accvlab") + batching_helpers_spec = None + if accvlab_spec is not None and accvlab_spec.submodule_search_locations is not None: + batching_helpers_spec = importlib.util.find_spec("accvlab.batching_helpers") + if batching_helpers_spec is None: warnings.warn( - "`accvlab.batching_helpers` is not available. Dumping of `RaggedBatch` data is not supported." + "`accvlab.batching_helpers` is not available. Dumping of `RaggedBatch` data is not " + "supported." ) self._initialized = True self._enabled = False @@ -228,6 +242,7 @@ def enable(self, dump_dir: str): if self._enabled: raise RuntimeError("`TensorDumper` is already enabled. Can only be enabled once.") self._dump_dir = dump_dir + self._range_stack = [] self._dump_count = 0 self._tensor_struct = {} self._grad_struct = {} @@ -235,8 +250,16 @@ def enable(self, dump_dir: str): self._enabled = True self._after_dump_count_actions = {} self._custom_converters = {np.ndarray: lambda x: torch.from_numpy(x)} + # If not None, this overrides *all* dump type settings (per-call dump_type and dump_type_override). 
+ self._dump_type_for_all = None + + self._dump_is_compare = False + self._dump_is_compare_params = None # Set the methods + self.set_dump_is_compare = self._set_dump_is_compare_enabled + self.push_range = self._push_range_enabled + self.pop_range = self._pop_range_enabled + self.add_tensor_data = self._add_tensor_data_enabled + self.add_grad_data = self._add_grad_data_enabled + self.set_dump_type_for_all = self._set_dump_type_for_all_enabled @@ -244,11 +267,73 @@ self.compare_to_dumped_data = self._compare_to_dumped_data_enabled self.set_gradients = self._set_gradients_enabled self.reset_dump_count = self._reset_dump_count_enabled + self.set_dump_count = self._set_dump_count_enabled self.perform_after_dump_count = self._perform_after_dump_count_enabled self.register_custom_converter = self._register_custom_converter_enabled self.enable_ragged_batch_dumping = self._enable_ragged_batch_dumping_enabled self.run_if_enabled = self._run_if_enabled_enabled + + def push_range(self, range_name: Union[str, Callable[[], str]]): + '''Push a range to the range stack. + + Multiple ranges can be pushed and popped in a nested manner. The ranges will be prepended to the dump + path (see e.g. the ``path`` argument in :meth:`add_tensor_data`) in the order in which they were + pushed. + + The ranges can be used to conveniently disambiguate the names of data entries where the same + name is used in multiple contexts (e.g. multiple iterations of a loop, function called from + multiple places, etc.). + + Important: + To ensure that the formatting is performed only if the tensor dumper is enabled, + the range name should not be formatted when passing the argument. Instead, a callable which + returns the formatted name can be passed; it is invoked inside this method only when needed. + + Args: + range_name: The name of the range to push, or a callable returning the name. A callable is + only evaluated inside this method, so any formatting work is skipped entirely while the + tensor dumper is disabled.
+ ''' + pass + + def pop_range(self): + '''Pop the last range from the range stack. + + See Also: + :meth:`push_range` for details on ranges. + ''' + pass + + def set_dump_is_compare( + self, + eps_numerical_data: float = 1e-6, + num_errors_per_tensor_to_show: int = 1, + allow_missing_data_in_current: bool = False, + allow_missing_data_in_previous: bool = False, + as_warning: bool = False, + ): + '''Automatically replace calls to :meth:`dump` with calls to :meth:`compare_to_dumped_data`. + + Note: + The parameters defined in this method will be forwarded to :meth:`compare_to_dumped_data`. + Note that ``compare_if_empty`` is not passed here. Instead, the value passed to :meth:`dump` is + used. + + See Also: + Please see the documentation of :meth:`compare_to_dumped_data` for more details. + + Args: + eps_numerical_data: The numerical tolerance for the comparison of numerical data. + num_errors_per_tensor_to_show: The number of most significant errors to show per tensor. + allow_missing_data_in_current: If ``True``, the comparison will not raise an error if the current data is missing + some keys which are present in the reference data. + allow_missing_data_in_previous: If ``True``, the comparison will not raise an error if the reference data is missing + some keys which are present in the current data. + as_warning: If ``True``, no error is raised in case of a mismatch and instead, a warning is printed. + If ``False``, an error is raised.
+ ''' + pass + @property def is_enabled(self) -> bool: '''Whether the TensorDumper is enabled''' @@ -257,7 +342,7 @@ def is_enabled(self) -> bool: def add_tensor_data( self, path: str, - data: TensorDataStructure, + data: Union[TensorDataStructure, Callable[[], TensorDataStructure]], dump_type: 'TensorDumper.Type', dump_type_override: OptionalTypeDict = None, permute_axes: OptionalSequence = None, @@ -299,7 +384,7 @@ def add_tensor_data( def add_grad_data( self, path: str, - data: TensorDataStructure, + data: Union[TensorDataStructure, Callable[[], TensorDataStructure]], dump_type: 'TensorDumper.Type', dump_type_override: OptionalTypeDict = None, permute_grad_axes: OptionalSequence = None, @@ -364,8 +449,18 @@ def set_dump_type_for_all( # Empty method to minimize overhead if not enabled. Will be replaced when enabling. pass - def dump(self): - '''Dump the data to the dump directory.''' + def dump(self, dump_if_empty: bool = True): + '''Dump the data to the dump directory. + + Note: + Setting ``dump_if_empty`` to ``False`` is useful to not count iterations where no data is dumped + as a separate iteration. + + Args: + dump_if_empty: If ``True``, the data will be dumped even if it is empty. + If ``False``, the data will not be dumped if it is empty, and the dump count will not be + incremented. + ''' # Empty method to minimize overhead if not enabled. Will be replaced when enabling. pass @@ -374,7 +469,9 @@ def compare_to_dumped_data( eps_numerical_data: float = 1e-6, num_errors_per_tensor_to_show: int = 1, allow_missing_data_in_current: bool = False, + allow_missing_data_in_previous: bool = False, as_warning: bool = False, + compare_if_empty: bool = True, ): '''Compare the data to previously dumped data. @@ -389,19 +486,30 @@ def compare_to_dumped_data( modifying multiple places in the code is to call :meth:`set_dump_type_for_all` when generating the reference data. 
+ Important: + The `compare_if_empty` parameter needs to be consistent with the `dump_if_empty` parameter of + :meth:`dump` of the calls to :meth:`dump` which are used to dump the reference data to compare + to. + Note: - The comparison can be set to allow missing data in the current data by setting ``allow_missing_data_in_current`` to ``True``. - This is e.g. useful if the current data is based on an implementation in progress, so that some of the data is not yet available. - In this case, the comparison will not raise an error if the current data is missing some data which is present in the reference data. - Instead, a warning will be printed. + The comparison can be set to allow missing keys in the current and/or reference data by setting + ``allow_missing_data_in_current`` and/or ``allow_missing_data_in_previous`` to ``True``. + This is e.g. useful if the current data is based on an implementation in progress, so that some + of the data is not yet available, or if the current run produces additional data which is not + needed for the comparison (and not present in the reference). Args: eps_numerical_data: The numerical tolerance for the comparison of numerical data. num_errors_per_tensor_to_show: The number of most significant errors to show per tensor. - allow_missing_data_in_current: If ``True``, the comparison will not raise an error if the current data is missing - some data which is present in the reference data. - as_warning: If ``True``, no error is raised in case of a mismatch and instead, a warning is printed. - If ``False``, an error is raised. + allow_missing_data_in_current: If ``True``, the comparison will not raise an error if the current + data is missing some keys which are present in the reference data. + allow_missing_data_in_previous: If ``True``, the comparison will not raise an error if the + reference data is missing some keys which are present in the current data. 
+ as_warning: If ``True``, no error is raised in case of a mismatch and instead, a warning is + printed. If ``False``, an error is raised. + compare_if_empty: If ``True``, the comparison will be performed even if the current data is empty. + If ``False``, the comparison will not be performed if the current data is empty, and the dump + count will not be incremented. ''' # Empty method to minimize overhead if not enabled. Will be replaced when enabling. pass @@ -424,14 +532,32 @@ def set_gradients(self, function_values: Union[torch.Tensor, List[torch.Tensor]] def reset_dump_count(self): '''Reset the dump count. - Important: - Resetting the dump count means that: + This method can be used to reset the dump count to 0. + This is useful for debugging (e.g. when comparing to previously dumped data) to start from the first + dump. - - In case of dumping: the next dump will overwrite a previous dump (starting from the first dump). - - In case of comparing to previously dumped data: the next comparison will start from the first dump. + See Also: + + This method is equivalent to calling :meth:`set_dump_count` with a value of 0. Please see + the documentation of :meth:`set_dump_count` for more details. - This method is useful for debugging e.g. to rerun the same code multiple times to check for - determinism, while always comparing to the same dumped data. + ''' + # Empty method to minimize overhead if not enabled. Will be replaced when enabling. + pass + + def set_dump_count(self, count: int): + '''Set the dump count. + + This method can be used to set the dump count to a specific value. + This is useful for debugging (e.g. when comparing to previously dumped data) to jump to a specific + iteration. + + Note: + If any actions are registered to be performed after a given number of dumps, they will be + triggered if the count corresponds to the number of dumps set. + + Args: + count: The dump count to set. ''' # Empty method to minimize overhead if not enabled. 
Will be replaced when enabling. pass @@ -445,11 +571,11 @@ def perform_after_dump_count(self, count: int, action: Callable[[], None]): been dumped (by passing the :func:`exit`-function as the action). Important: - If :meth:`reset_dump_count` is called, the dump count is reset to 0, - and the action will be performed after the ``count``-th dump after the reset. - - Note that this also means that the action can be performed multiple times if - the dump count is reset after the action has been performed. + The action is performed after the dump count reaches the given ``count`` value. + If :meth:`set_dump_count` is called, the dump count is adjusted to a given value, + and this also influences when the action is performed. For example, if :meth:`set_dump_count` is + called with a value of 3 and an action is registered to be performed after 5 dumps, the action + will be performed after another 2 dumps. Important: This method can be called multiple times with the same count. @@ -547,6 +673,9 @@ def _SET_DOCSTRINGS_OF_ENABLED_METHOD_VARIANTS(cls): once the TensorDumper is enabled, and the original (disabled) methods are replaced by the enabled variants. 
''' + cls._set_dump_is_compare_enabled.__doc__ = cls.set_dump_is_compare.__doc__ + cls._push_range_enabled.__doc__ = cls.push_range.__doc__ + cls._pop_range_enabled.__doc__ = cls.pop_range.__doc__ cls._add_tensor_data_enabled.__doc__ = cls.add_tensor_data.__doc__ cls._add_grad_data_enabled.__doc__ = cls.add_grad_data.__doc__ cls._set_dump_type_for_all_enabled.__doc__ = cls.set_dump_type_for_all.__doc__ @@ -554,6 +683,7 @@ def _SET_DOCSTRINGS_OF_ENABLED_METHOD_VARIANTS(cls): cls._compare_to_dumped_data_enabled.__doc__ = cls.compare_to_dumped_data.__doc__ cls._set_gradients_enabled.__doc__ = cls.set_gradients.__doc__ cls._reset_dump_count_enabled.__doc__ = cls.reset_dump_count.__doc__ + cls._set_dump_count_enabled.__doc__ = cls.set_dump_count.__doc__ cls._perform_after_dump_count_enabled.__doc__ = cls.perform_after_dump_count.__doc__ cls._register_custom_converter_enabled.__doc__ = cls.register_custom_converter.__doc__ cls._enable_ragged_batch_dumping_enabled.__doc__ = cls.enable_ragged_batch_dumping.__doc__ @@ -561,10 +691,50 @@ def _SET_DOCSTRINGS_OF_ENABLED_METHOD_VARIANTS(cls): # ===== Enabled Variants of the Methods ===== + def _set_dump_is_compare_enabled( + self, + eps_numerical_data: float = 1e-6, + num_errors_per_tensor_to_show: int = 1, + allow_missing_data_in_current: bool = False, + allow_missing_data_in_previous: bool = False, + as_warning: bool = False, + ): + '''TEMPORARY DOCSTRING + This is the enabled variant of the corresponding method (same name without leading `_` and training `_enabled`). + This docstring will be replaced with the docstring of the corresponding method when an instance is requested + for the first time. 
+ ''' + self._dump_is_compare = True + self._dump_is_compare_params = { + "eps_numerical_data": eps_numerical_data, + "num_errors_per_tensor_to_show": num_errors_per_tensor_to_show, + "allow_missing_data_in_current": allow_missing_data_in_current, + "allow_missing_data_in_previous": allow_missing_data_in_previous, + "as_warning": as_warning, + } + + def _push_range_enabled(self, range_name: Union[str, Callable[[], str]]): + '''TEMPORARY DOCSTRING + This is the enabled variant of the corresponding method (same name without leading `_` and training `_enabled`). + This docstring will be replaced with the docstring of the corresponding method when an instance is requested + for the first time. + ''' + if callable(range_name): + range_name = range_name() + self._range_stack.append(range_name) + + def _pop_range_enabled(self): + '''TEMPORARY DOCSTRING + This is the enabled variant of the corresponding method (same name without leading `_` and training `_enabled`). + This docstring will be replaced with the docstring of the corresponding method when an instance is requested + for the first time. + ''' + self._range_stack.pop() + def _add_tensor_data_enabled( self, path: str, - data: TensorDataStructure, + data: Union[TensorDataStructure, Callable[[], TensorDataStructure]], dump_type: 'TensorDumper.Type', dump_type_override: OptionalTypeDict = None, permute_axes: OptionalSequence = None, @@ -576,10 +746,18 @@ def _add_tensor_data_enabled( This docstring will be replaced with the docstring of the corresponding method when an instance is requested for the first time. ''' + if len(self._range_stack) > 0: + path = ".".join(self._range_stack) + "." + path + if callable(data): + data = data() if exclude is not None: data = TensorDumper._exclude_elements(data, exclude) if len(self._custom_converters) > 0: data = TensorDumper._get_with_custom_converters_applied(data, self._custom_converters) + # If the global dump type is set, it overrides everything else. 
+ if self._dump_type_for_all is not None: + dump_type = self._dump_type_for_all + dump_type_override = None data_with_format = TensorDumper._format_data_elements( data, dump_type, dump_type_override, permute_axes, permute_axes_override ) @@ -588,7 +766,7 @@ def _add_tensor_data_enabled( def _add_grad_data_enabled( self, path: str, - data: TensorDataStructure, + data: Union[TensorDataStructure, Callable[[], TensorDataStructure]], dump_type: 'TensorDumper.Type', dump_type_override: OptionalTypeDict = None, permute_grad_axes: OptionalSequence = None, @@ -600,10 +778,18 @@ def _add_grad_data_enabled( This docstring will be replaced with the docstring of the corresponding method when an instance is requested for the first time. ''' + if len(self._range_stack) > 0: + path = ".".join(self._range_stack) + "." + path + if callable(data): + data = data() if exclude is not None: data = TensorDumper._exclude_elements(data, exclude) if len(self._custom_converters) > 0: data = TensorDumper._get_with_custom_converters_applied(data, self._custom_converters) + # If the global dump type is set, it overrides everything else. + if self._dump_type_for_all is not None: + dump_type = self._dump_type_for_all + dump_type_override = None for_grads_with_format = TensorDumper._format_data_elements( data, dump_type, dump_type_override, permute_grad_axes, permute_grad_axes_override ) @@ -618,7 +804,10 @@ def _set_dump_type_for_all_enabled( This docstring will be replaced with the docstring of the corresponding method when an instance is requested for the first time. ''' + # Store the global dump type for the future + self._dump_type_for_all = dump_type + # Apply the dump type to the already set data. 
def set_dump_type(data: TensorDumper._TensorWithFormat) -> TensorDumper._TensorWithFormat: data.dump_type = dump_type return data @@ -632,12 +821,26 @@ def set_dump_type(data: TensorDumper._TensorWithFormat) -> TensorDumper._TensorW self._grad_struct, TensorDumper._TensorWithFormat, set_dump_type ) - def _dump_enabled(self): + def _dump_enabled(self, dump_if_empty: bool = True): '''TEMPORARY DOCSTRING This is the enabled variant of the corresponding method (same name without leading `_` and training `_enabled`). This docstring will be replaced with the docstring of the corresponding method when an instance is requested for the first time. ''' + + if self._dump_is_compare: + return self.compare_to_dumped_data( + eps_numerical_data=self._dump_is_compare_params["eps_numerical_data"], + num_errors_per_tensor_to_show=self._dump_is_compare_params["num_errors_per_tensor_to_show"], + allow_missing_data_in_current=self._dump_is_compare_params["allow_missing_data_in_current"], + as_warning=self._dump_is_compare_params["as_warning"], + compare_if_empty=dump_if_empty, + ) + + # If dumping is disabled for empty dumps, return early (and don't increment the dump count). + if not dump_if_empty and len(self._tensor_struct) == 0 and len(self._grad_struct) == 0: + return + self._dump_struct(self._tensor_struct, "tensors") if len(self._grad_struct) > 0: if not self._grad_computed: @@ -657,53 +860,66 @@ def _compare_to_dumped_data_enabled( eps_numerical_data: float = 1e-6, num_errors_per_tensor_to_show: int = 1, allow_missing_data_in_current: bool = False, + allow_missing_data_in_previous: bool = False, as_warning: bool = False, + compare_if_empty: bool = True, ): '''TEMPORARY DOCSTRING This is the enabled variant of the corresponding method (same name without leading `_` and training `_enabled`). This docstring will be replaced with the docstring of the corresponding method when an instance is requested for the first time. 
''' - # Create config from parameters - config = TensorDumper._ComparisonConfig( - eps_numerical_data=eps_numerical_data, - num_errors_per_tensor_to_show=num_errors_per_tensor_to_show, - allow_missing_data_in_current=allow_missing_data_in_current, - ) + # If comparison is disabled for empty data and the current data is empty, return early (and don't + # increment the dump count). + if not compare_if_empty and len(self._tensor_struct) == 0 and len(self._grad_struct) == 0: + return + try: + # Create config from parameters + config = TensorDumper._ComparisonConfig( + eps_numerical_data=eps_numerical_data, + num_errors_per_tensor_to_show=num_errors_per_tensor_to_show, + allow_missing_data_in_current=allow_missing_data_in_current, + allow_missing_data_in_previous=allow_missing_data_in_previous, + ) - is_tensor_data_consistent = self._compare_to_dumped_data( - self._tensor_struct, - "tensors", - config, - as_warning, - ) - has_grad_data = len(self._grad_struct) > 0 - if has_grad_data: - if not self._grad_computed: - raise ValueError( - "Gradients were not computed. Call `set_gradients` before comparing to previously dumped data." - ) - is_grad_data_consistent = self._compare_to_dumped_data( - self._grad_struct, - "grads", + is_tensor_data_consistent = self._compare_to_dumped_data( + self._tensor_struct, + "tensors", config, as_warning, ) - else: - is_grad_data_consistent = True - if is_tensor_data_consistent: - print( - f"`TensorDumper:` Tensor data is consistent with previously dumped data for dump {self._dump_count}." - ) - if has_grad_data and is_grad_data_consistent: - print( - f"`TensorDumper:` Grad data is consistent with previously dumped data for dump {self._dump_count}." - ) - - self._tensor_struct = {} - self._grad_struct = {} - self._grad_computed = False - self._dump_count += 1 + has_grad_data = len(self._grad_struct) > 0 + if has_grad_data: + if not self._grad_computed: + raise ValueError( + "Gradients were not computed. 
Call `set_gradients` before comparing to previously dumped data." + ) + is_grad_data_consistent = self._compare_to_dumped_data( + self._grad_struct, + "grads", + config, + as_warning, + ) + else: + is_grad_data_consistent = True + if is_tensor_data_consistent: + print( + f"`TensorDumper:` Tensor data is consistent with previously dumped data for dump {self._dump_count}." + ) + if has_grad_data and is_grad_data_consistent: + print( + f"`TensorDumper:` Grad data is consistent with previously dumped data for dump {self._dump_count}." + ) + finally: + # Keep compare and dump interchangeable: + # - always clear buffered data, even if comparison raises + # - always advance dump_count to stay in sync with a `dump()`-based run loop + self._tensor_struct = {} + self._grad_struct = {} + self._grad_computed = False + self._dump_count += 1 + if self._dump_count in self._after_dump_count_actions: + self._after_dump_count_actions[self._dump_count]() def _set_gradients_enabled(self, function_values: Union[torch.Tensor, List[torch.Tensor]]): '''TEMPORARY DOCSTRING @@ -720,7 +936,17 @@ def _reset_dump_count_enabled(self): This docstring will be replaced with the docstring of the corresponding method when an instance is requested for the first time. ''' - self._dump_count = 0 + self.set_dump_count(0) + + def _set_dump_count_enabled(self, count: int): + '''TEMPORARY DOCSTRING + This is the enabled variant of the corresponding method (same name without leading `_` and training `_enabled`). + This docstring will be replaced with the docstring of the corresponding method when an instance is requested + for the first time. 
+ ''' + self._dump_count = count + if self._dump_count in self._after_dump_count_actions: + self._after_dump_count_actions[self._dump_count]() def _perform_after_dump_count_enabled(self, count: int, action: Callable): '''TEMPORARY DOCSTRING @@ -908,7 +1134,8 @@ def replace_element_not_requiring_grad( return struct_with_tensors def _get_dump_dir(self) -> str: - return f"{self._dump_dir}/{self._dump_count}" + res = f"{self._dump_dir}/{self._dump_count}" + return res def _get_json_filename(self, type_of_struct: str) -> str: return f"{type_of_struct}.json" @@ -934,6 +1161,8 @@ def _dump_struct(self, struct_to_dump: Union[Sequence, Dict], type_of_struct: st TensorDumper._dump_image( f"{dump_dir}/[{json_file_name}]{file_name}", file_data["data"], dump_type ) + elif dump_type == TensorDumper.Type.PICKLE: + TensorDumper._dump_pickle(f"{dump_dir}/[{json_file_name}]{file_name}", file_data["data"]) else: raise ValueError(f"Unsupported file type: {file_name}") @@ -979,12 +1208,14 @@ def get_child_path(curr_path: str, key: str, is_self_tensor: bool, is_parent_ten is_self_tensor = non_tensor_struct is None for key in dumped_data.keys(): if not key in json_struct_to_compare: - res.append( - ComparisonError( - f" Missing key '{key}' at path: {get_path_to_show(curr_path)} in dumped reference", - math.inf, + if not config.allow_missing_data_in_current: + res.append( + ComparisonError( + f" Missing key '{key}' at path: {get_path_to_show(curr_path)} in current " + f"data but present in reference", + math.inf, + ) ) - ) continue is_child_tensor = is_self_tensor or not key in non_tensor_struct non_tensor_struct_child = non_tensor_struct[key] if not is_child_tensor else None @@ -1000,12 +1231,13 @@ def get_child_path(curr_path: str, key: str, is_self_tensor: bool, is_parent_ten r = order_errors_by_weight(r) r = r[: config.num_errors_per_tensor_to_show] if len(r) > 0 else [] res.extend(r) - if not config.allow_missing_data_in_current: + if not config.allow_missing_data_in_previous: for 
key in json_struct_to_compare.keys(): if not key in dumped_data: res.append( ComparisonError( - f" Extra key '{key}' at path: {get_path_to_show(curr_path)} in dumped reference", + f" Extra key '{key}' at path: {get_path_to_show(curr_path)} in current " + f"data but not present in reference", math.inf, ) ) @@ -1016,7 +1248,8 @@ def get_child_path(curr_path: str, key: str, is_self_tensor: bool, is_parent_ten if len(dumped_data) != len(json_struct_to_compare): res.append( ComparisonError( - f" Length mismatch at path: {get_path_to_show(curr_path)}\n Dumped data: {dumped_data}\n Struct to compare: {json_struct_to_compare}", + f" Length mismatch at path: {get_path_to_show(curr_path)}\n" + f" Dumped data: {dumped_data}\n Struct to compare: {json_struct_to_compare}", math.inf, ) ) @@ -1042,7 +1275,9 @@ def get_child_path(curr_path: str, key: str, is_self_tensor: bool, is_parent_ten difference = abs(json_struct_to_compare - dumped_data) return [ ComparisonError( - f" Numerical mismatch at path: {get_path_to_show(curr_path)}\n Dumped data: {dumped_data}\n Struct to compare: {json_struct_to_compare}\n Difference (current - dumped): {json_struct_to_compare - dumped_data}", + f" Numerical mismatch at path: {get_path_to_show(curr_path)}\n" + f" Dumped data: {dumped_data}\n Struct to compare: {json_struct_to_compare}\n" + f" Difference (current - dumped): {json_struct_to_compare - dumped_data}", difference, ) ] @@ -1052,7 +1287,8 @@ def get_child_path(curr_path: str, key: str, is_self_tensor: bool, is_parent_ten if dumped_data != json_struct_to_compare: return [ ComparisonError( - f" Mismatch at path: {get_path_to_show(curr_path)}\n Dumped data: {dumped_data}\n Struct to compare: {json_struct_to_compare}", + f" Mismatch at path: {get_path_to_show(curr_path)}\n Dumped data: {dumped_data}\n" + f" Struct to compare: {json_struct_to_compare}", 0.0, ) ] @@ -1076,7 +1312,9 @@ def _compare_to_dumped_data( first_file_split_at_extension = first_file.rsplit(".", 1) 
first_file_no_extension, extension = first_file_split_at_extension raise ValueError( - f"Cannot compare to dumped data with binary or image format.\nFound image or binary format at: {first_file_no_extension}\nwith format: {extension}\nPlease use the JSON format when dumping the data for comparison." + f"Cannot compare to dumped data with binary or image format.\nFound image or binary format " + f"at: {first_file_no_extension}\nwith format: {extension}\nPlease use the JSON format when " + f"dumping the data for comparison." ) json_file_name = self._get_json_filename(type_of_struct) @@ -1086,7 +1324,9 @@ def _compare_to_dumped_data( dumped_data = json.load(f) except FileNotFoundError: raise FileNotFoundError( - f"No previously dumped data found for [{type_of_struct}] data for dump {self._dump_count}\nunder file path: {json_file_path}.\nDump the data first before comparing to previously dumped data." + f"No previously dumped data found for [{type_of_struct}] data for dump {self._dump_count}\n" + f"under file path: {json_file_path}.\nDump the data first before comparing to previously " + f"dumped data." 
) res_errors = self._walk_and_compare( @@ -1100,11 +1340,15 @@ def _compare_to_dumped_data( if len(res_errors) > 0: error_message = "\n".join([error.message for error in res_errors]) error_message = ( - f"NOTE: The following errors were found for the dumped [{type_of_struct}] data for dump {self._dump_count}.\n" - f" Up to {config.num_errors_per_tensor_to_show} most significant errors are shown per tensor.\n" - + error_message + f"NOTE: The following errors were found for the dumped [{type_of_struct}] data for dump " + f"{self._dump_count}.\n" + f" Up to {config.num_errors_per_tensor_to_show} most significant errors are shown per " + f"tensor.\n" + error_message + ) + error_message = ( + f"Comparison of data with previously dumped data failed for [{type_of_struct}] data for dump " + f"{self._dump_count}.\n{error_message}" ) - error_message = f"Comparison of data with previously dumped data failed for [{type_of_struct}] data for dump {self._dump_count}.\n{error_message}" if as_warning: warnings.warn(error_message) else: @@ -1137,7 +1381,8 @@ def _dump_image( raise ImportError( "OpenCV (cv2) is not installed, but is required for dumping images via TensorDumper.\n" "Please install the ACCV-Lab packages with optional dependencies enabled. " - "For details, see the Installation Guide (section on installation with optional dependencies)." + "For details, see the Installation Guide (section on installation with optional " + "dependencies)." ) from exc def ensure_image_range_ang_get_orig_range(image: torch.Tensor) -> tuple[torch.Tensor, List[float]]: @@ -1151,10 +1396,14 @@ def ensure_image_range_ang_get_orig_range(image: torch.Tensor) -> tuple[torch.Te assert (dump_type == TensorDumper.Type.IMAGE_I and file_data.ndim == 2) or ( dump_type != TensorDumper.Type.IMAGE_I and file_data.ndim == 3 - ), f"Number of image dimensions does not match the dump type for file:\n{file_name}.\nImage data has {file_data.ndim} dimensions; dump type is {dump_type}." 
- assert ( - dump_type == TensorDumper.Type.IMAGE_I or file_data.shape[-1] == 3 - ), f"Color image must have 3 channels, but image to be dumped to:\n{file_name}\nhas {file_data.shape[-1]} channels." + ), ( + f"Number of image dimensions does not match the dump type for file:\n{file_name}.\nImage data " + f"has {file_data.ndim} dimensions; dump type is {dump_type}." + ) + assert dump_type == TensorDumper.Type.IMAGE_I or file_data.shape[-1] == 3, ( + f"Color image must have 3 channels, but image to be dumped to:\n{file_name}\nhas " + f"{file_data.shape[-1]} channels." + ) file_data, orig_range = ensure_image_range_ang_get_orig_range(file_data) file_data = file_data.detach().cpu().contiguous().numpy().astype(np.uint8) @@ -1182,6 +1431,11 @@ def ensure_image_range_ang_get_orig_range(image: torch.Tensor) -> tuple[torch.Te file_meta_info, open(f"{file_name}.meta.json", "w"), cls=TensorDumper._CustomEncoder, indent=2 ) + @staticmethod + def _dump_pickle(file_name: str, file_data: Any): + with open(file_name, "wb") as f: + pickle.dump(file_data, f) + @staticmethod def _ensure_dir_exists(dir_name: str): os.makedirs(dir_name, exist_ok=True) @@ -1266,7 +1520,8 @@ def change_permute_axes( data, torch.Tensor, lambda x: TensorDumper._TensorWithFormat(x, dump_type, permute_axes) ) return data - # If dump type overrides are provided, we need to traverse the data and apply the overrides to the tensors + # If dump type overrides are provided, we need to traverse the data and apply the overrides to the + # tensors elif dump_type_override is not None: data = TensorDumper._traverse_remember_waypoints_and_apply( data, @@ -1330,6 +1585,9 @@ def get_item_path(item_key: Union[str, int]) -> str: elif TensorDumper.Type.is_image(data.dump_type): res_data = f"{path}.png" res_files = {res_data: {"data": tensor, "dump_type": data.dump_type}} + elif data.dump_type == TensorDumper.Type.PICKLE: + res_data = f"{path}.pkl" + res_files = {res_data: {"data": tensor, "dump_type": data.dump_type}} else: 
raise ValueError(f"Unsupported dump type: {data.dump_type}") return res_data, res_files @@ -1414,16 +1672,18 @@ def _insert_at_path( # If we are inserting a dictionary, we can insert into non-empty or empty dicts if isinstance(value, Dict): for key in value.keys(): - assert ( - key not in curr_data - ), f"Path `{path}` has an existing element with key `{key}`. Cannot insert element from `value` with the same key." + assert key not in curr_data, ( + f"Path `{path}` has an existing element with key `{key}`. Cannot insert element from " + f"`value` with the same key." + ) curr_data[key] = value[key] # If we are inserting a tensor or a sequence, we can only insert into an empty dict elif isinstance(value, (TensorDumper._TensorWithFormat, Sequence)): assert parent is not None, f"Can only insert dictionaries at the root level." - assert ( - len(curr_data) == 0 - ), f"Path part `{path}` points to an existing non-empty dictionary. Cannot insert tensors or sequences as this would overwrite the existing elements." + assert len(curr_data) == 0, ( + f"Path part `{path}` points to an existing non-empty dictionary. Cannot insert tensors or " + "sequences as this would overwrite the existing elements." + ) parent[path_parts[-1]] = value else: raise ValueError(f"Unsupported data type: {type(value)}") diff --git a/packages/optim_test_tools/docs/api.rst b/packages/optim_test_tools/docs/api.rst index 0cb0475..10e68b8 100644 --- a/packages/optim_test_tools/docs/api.rst +++ b/packages/optim_test_tools/docs/api.rst @@ -6,6 +6,13 @@ Complete API documentation for the optim_test_tools package. .. currentmodule:: accvlab.optim_test_tools .. automodule:: accvlab.optim_test_tools + :members: + :undoc-members: + :show-inheritance: + +.. currentmodule:: accvlab.optim_test_tools.numba_nvtx + +.. 
automodule:: accvlab.optim_test_tools.numba_nvtx :members: :undoc-members: :show-inheritance: \ No newline at end of file diff --git a/packages/optim_test_tools/docs/examples.rst b/packages/optim_test_tools/docs/examples.rst index e1ad691..0895d70 100644 --- a/packages/optim_test_tools/docs/examples.rst +++ b/packages/optim_test_tools/docs/examples.rst @@ -4,9 +4,12 @@ Examples This section contains runnable examples for the tools in ``accvlab.optim_test_tools``. The examples are kept concise and highlight common usage patterns. The corresponding classes in this package are singletons, allowing them to be used in different parts of the code without the need for coordination between the parts. + The stopwatch and NVTX range wrapper examples show this pattern explicitly, while the tensor dumper examples -focus mainly on the actual usage of the tools. Please see the individual examples for more details. These -are listed below in the recommended reading order. +focus mainly on the actual usage of the tools. + +The Numba NVTX example shows how to profile Numba-compiled code +with NVTX ranges. Please see the individual examples for more details. .. toctree:: :maxdepth: 1 @@ -15,6 +18,7 @@ are listed below in the recommended reading order. examples/nvtx_range_wrapper examples/tensor_dumper_comparison examples/tensor_dumper_dumping + examples/numba_nvtx .. seealso:: diff --git a/packages/optim_test_tools/docs/examples/numba_nvtx.rst b/packages/optim_test_tools/docs/examples/numba_nvtx.rst new file mode 100644 index 0000000..f22bcb8 --- /dev/null +++ b/packages/optim_test_tools/docs/examples/numba_nvtx.rst @@ -0,0 +1,42 @@ +NVTX Ranges in Numba Code Example +================================= + +This example shows how to annotate Numba JIT-compiled code with NVTX ranges using the +``accvlab.optim_test_tools.numba_nvtx`` module. +Numba-compiled functions run as native code; this module allows you to push and pop NVTX +ranges inside such functions. + +.. 
seealso:: + + The code of this example can be found in the repository under + ``packages/optim_test_tools/examples/numba_nvtx_example.py``. + +.. important:: + + The functionality described here is specifically aimed at adding NVTX ranges to Numba-compiled code, + where alternative solutions, e.g. the Python ``nvtx`` module or ``torch.cuda.nvtx``, cannot be used. + While ``accvlab.optim_test_tools.numba_nvtx`` also works for plain Python code, it is not intended for + that use-case and does not provide benefits over using available alternatives in this case. + + For profiling Python code, please also see the :doc:`nvtx_range_wrapper` example. + + +Overview +-------- + +- Import ``accvlab.optim_test_tools.numba_nvtx`` (e.g. as ``nvtx``). +- Register range names with ``register_string("...")`` to obtain integer handles; this must be done outside + the JIT function, before compilation. +- Inside an ``@numba.njit`` function, call ``nvtx.range_push(handle)`` and ``nvtx.range_pop()`` around + the region to profile. + +Example +------- + +Please see the notes in the code for more details. + +.. note-literalinclude:: ../../examples/numba_nvtx_example.py + :language: python + :caption: packages/optim_test_tools/examples/numba_nvtx_example.py + :linenos: + :lineno-match: diff --git a/packages/optim_test_tools/docs/examples/tensor_dumper_comparison.rst b/packages/optim_test_tools/docs/examples/tensor_dumper_comparison.rst index dc2ec28..7fa65dc 100644 --- a/packages/optim_test_tools/docs/examples/tensor_dumper_comparison.rst +++ b/packages/optim_test_tools/docs/examples/tensor_dumper_comparison.rst @@ -6,10 +6,6 @@ and compare a subsequent run against a stored reference. The :doc:`tensor_dumper more features (custom converters, per‑tensor overrides, RaggedBatch support, custom pre‑dump processing, early exit). -Note that the dumper is a singleton similar to the other tools (see :doc:`stopwatch` and -:doc:`nvtx_range_wrapper`). 
Therefore, it can be enabled once and used across multiple code parts. -However, this is not the focus of this example. - .. seealso:: The code of this example can be found in the repository under @@ -21,11 +17,21 @@ Overview - Enable the dumper and choose dump location. - Dump data in one run; compare in a later run (or next loop iteration here for demonstration). - Override dump formats to JSON for comparison (comparison supports only JSON). -- Gradients are auto-computed when you call ``set_gradients([...])`` with scalar losses. +- Gradients are auto-computed when you call ``set_gradients([...])`` with scalar losses. These gradients are + only used for dumping/comparison and do not influence the gradients computed elsewhere (e.g. during + training). Example ------- +.. important:: + + In this example, we do not divide the code into different parts which correspond to e.g. different source + files in the actual use case, to make the example more concise. However, as + :class:`~accvlab.optim_test_tools.TensorDumper` is a singleton, this can be easily done in practice. + Please see the :doc:`stopwatch` or the :doc:`nvtx_range_wrapper` for examples of how to do this. The same + approach can be used with the :class:`~accvlab.optim_test_tools.TensorDumper`. + Example Code ^^^^^^^^^^^^ diff --git a/packages/optim_test_tools/docs/examples/tensor_dumper_dumping.rst b/packages/optim_test_tools/docs/examples/tensor_dumper_dumping.rst index d842411..9c906f3 100644 --- a/packages/optim_test_tools/docs/examples/tensor_dumper_dumping.rst +++ b/packages/optim_test_tools/docs/examples/tensor_dumper_dumping.rst @@ -2,8 +2,8 @@ Tensor Dumper – Extended Dumping Example ======================================== This example demonstrates advanced features: custom converters, per‑tensor dump type/permute overrides, -:class:`RaggedBatch` handling, custom processing executed only when the dumper is enabled, and -early exit after a fixed number of dumps. 
+:class:`~accvlab.batching_helpers.RaggedBatch` handling, custom processing executed only when the dumper is +enabled, and early exit after a fixed number of dumps. .. seealso:: @@ -21,19 +21,28 @@ Overview - Register custom converters for non‑tensor containers. - Use per‑tensor overrides (format, axis permutation, exclusion) within nested structures. - Dump gradients by registering tensors first and providing scalar losses to ``set_gradients([...])``. -- RaggedBatch support: dump as per‑sample or as a structured :class:`RaggedBatch`. +- RaggedBatch support: dump as per‑sample or as a structured :class:`~accvlab.batching_helpers.RaggedBatch`. - Run custom pre‑dump logic only when enabled via ``run_if_enabled``. Details ------- +.. important:: + + In this example, we do not divide the code into different parts which correspond to e.g. different source + files in the actual use case, to make the example more concise. However, as + :class:`~accvlab.optim_test_tools.TensorDumper` is a singleton, this can be easily done in practice. + Please see the :doc:`stopwatch` or the :doc:`nvtx_range_wrapper` for examples of how to do this. The same + approach can be used with the :class:`~accvlab.optim_test_tools.TensorDumper`. + Below, we walk through the example section by section. Notes in the code are highlighted. -Create Synthetic Inputs Helpers -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Synthetic Inputs Generation Helpers +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Here, we define a helper function to create some synthetic inputs to be dumped as well as a wrapper class for -demonstrating the custom converter functionality. +First, helpers are defined to create dummy data which will be dumped in the example. Note that apart from +functions for image and bounding box data generation, a wrapper class is defined. That class is later used to +showcase the custom converter functionality of the dumper. .. 
note-literalinclude:: ../../examples/tensor_dumper_dumping_example.py :language: python :caption: packages/optim_test_tools/examples/tensor_dumper_dumping_example.py :linenos: :lineno-match: @@ -46,7 +55,9 @@ demonstrating the custom converter functionality. Initialize and Configure the Dumper ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Here, we initialize and configure the dumper. +Here, we initialize and configure the dumper. Note the use of +:meth:`~accvlab.optim_test_tools.TensorDumper.perform_after_dump_count` to exit the +program after a fixed number of dumps. This can e.g. be useful during debugging to only dump a few iterations. .. note-literalinclude:: ../../examples/tensor_dumper_dumping_example.py :language: python @@ -59,8 +70,11 @@ Here, we initialize and configure the dumper. Register Custom Converters ^^^^^^^^^^^^^^^^^^^^^^^^^^ -Here, we register a custom converter for the ``TensorWrapper`` class (which is a simple wrapper used for -demonstrating the custom converter functionality). +Here, we register a custom converter for the ``TensorWrapper`` class. + +The task of this converter is to convert the ``TensorWrapper`` object to a nested structure containing only +values supported by the dumper (tensors, NumPy arrays, types for which other custom converters are +registered, or simple types which can be written out as-is (e.g. strings)). .. note-literalinclude:: ../../examples/tensor_dumper_dumping_example.py :language: python @@ -74,6 +88,7 @@ Main Loop ^^^^^^^^^ Here, we loop over some iterations (e.g. training iterations) and dump the data (see following sections). +Note that all the following sections are inside the main loop. .. note-literalinclude:: ../../examples/tensor_dumper_dumping_example.py :language: python @@ -86,7 +101,7 @@ Here, we loop over some iterations (e.g. training iterations) and dump the data Create the Test Data ^^^^^^^^^^^^^^^^^^^^ -Here, we create some synthetic test data to be dumped. +Generate some synthetic test data to be dumped. ..
note-literalinclude:: ../../examples/tensor_dumper_dumping_example.py :language: python @@ -99,7 +114,7 @@ Here, we create some synthetic test data to be dumped. Add Tensors ^^^^^^^^^^^ -Here, we add the tensors to be dumped. +Add tensors to be dumped. .. note-literalinclude:: ../../examples/tensor_dumper_dumping_example.py :language: python @@ -112,7 +127,8 @@ Here, we add the tensors to be dumped. Add Gradients ^^^^^^^^^^^^^ -Here, we add the gradients to be dumped. +Add gradients to be dumped. Note that here, tensors are added and the corresponding gradients are computed +automatically based on output (loss) values later. .. note-literalinclude:: ../../examples/tensor_dumper_dumping_example.py :language: python @@ -125,7 +141,8 @@ Here, we add the gradients to be dumped. Custom Processing Prior to Dumping ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Here, we run some custom processing prior to dumping to enable dumping of in a more accessible format. +Run custom processing prior to dumping to enable dumping in a more accessible format. +Note that the processing is only executed if the dumper is enabled. .. note-literalinclude:: ../../examples/tensor_dumper_dumping_example.py :language: python @@ -138,7 +155,7 @@ Here, we run some custom processing prior to dumping to enable dumping of in a m RaggedBatch Dumping ^^^^^^^^^^^^^^^^^^^ -Here, we dump the RaggedBatch data. +Dump :class:`~accvlab.batching_helpers.RaggedBatch` data. .. note-literalinclude:: ../../examples/tensor_dumper_dumping_example.py :language: python @@ -146,39 +163,51 @@ Here, we dump the RaggedBatch data. :linenos: :lineno-match: :start-at: # ---------------------------- RaggedBatch dumping ---------------------------- - :end-before: # ------------------- Placeholder for e.g. loss computation ------------------- + :end-before: # --------------------------------- Inner loop -------------------------------- + -Placeholder for e.g. 
Loss Computation -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Use of Ranges for Disambiguation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Here, we place a placeholder for e.g. the loss computation to demonstrate how the gradients are computed -automatically. +Here, there is an inner loop where the same data is added to the dump in multiple iterations. To disambiguate +the names of the data entries, ranges can be used to add context to the data entries path. In this example, +a range containing the iteration index is used. .. note-literalinclude:: ../../examples/tensor_dumper_dumping_example.py :language: python :caption: packages/optim_test_tools/examples/tensor_dumper_dumping_example.py :linenos: :lineno-match: - :start-at: # ------------------- Placeholder for e.g. loss computation ------------------- - :end-before: # ----------------------------- Set the gradients ----------------------------- + :start-at: # --------------------------------- Inner loop -------------------------------- + :end-before: # ------------------- Placeholder for e.g. loss computation ------------------- + + +Obtaining Gradients +^^^^^^^^^^^^^^^^^^^ -Set the Gradients -^^^^^^^^^^^^^^^^^ +Previously, we added tensors for which gradients are to be dumped (using +:meth:`~accvlab.optim_test_tools.TensorDumper.add_grad_data`). +Here, we demonstrate how to obtain these gradients automatically based on output (e.g. loss) values. -Here, we set the gradients to be dumped. +We use a placeholder for the loss computation (``summed_3`` and ``summed_5``). +To obtain the gradients, the function :meth:`~accvlab.optim_test_tools.TensorDumper.set_gradients` is used. +This function takes a list of scalar (loss) tensors as input, and the gradients are computed from each of +these values and accumulated to obtain the final gradients. +Note that this computation is performed independently of the gradients computed elsewhere (e.g. in the +training loop). After calling this function, the gradients are ready to be dumped. .. 
note-literalinclude:: ../../examples/tensor_dumper_dumping_example.py :language: python :caption: packages/optim_test_tools/examples/tensor_dumper_dumping_example.py :linenos: :lineno-match: - :start-at: # ----------------------------- Set the gradients ----------------------------- + :start-at: # ------------------- Placeholder for e.g. loss computation ------------------- :end-before: # ---------------------------------- Dump ---------------------------------- Dump Data ^^^^^^^^^ -Finally, we dump the data. We invite the reader to run the example and inspect the dumped data. +Finally, we dump the data. .. note-literalinclude:: ../../examples/tensor_dumper_dumping_example.py :language: python @@ -187,6 +216,8 @@ Finally, we dump the data. We invite the reader to run the example and inspect t :lineno-match: :start-at: # ---------------------------------- Dump ---------------------------------- +We invite the reader to run the example and inspect the dumped data. + Related Examples ---------------- diff --git a/packages/optim_test_tools/docs/intro.rst b/packages/optim_test_tools/docs/intro.rst index e09fca5..a49005d 100644 --- a/packages/optim_test_tools/docs/intro.rst +++ b/packages/optim_test_tools/docs/intro.rst @@ -4,8 +4,10 @@ Introduction Design ------ -This package contains helpers which can be used for evaluating runtime optimizations as well as for debugging. -The helpers are designed to +This package contains helpers which can be used for evaluating runtime optimizations as well as for +debugging. + +Most of the helper functionalities are singleton classes which are designed to - have minimal overhead if no evaluation is performed (i.e. the helper is not enabled) - have a convenient way to be enabled & configured if used (e.g. from main training script) @@ -19,14 +21,14 @@ The helpers are designed to measurement - Enabling/disabling the helpers from the main training script - - it allows to use e.g.
start and end measurements in different parts of the code without the need to - coordinate the data exchange between the parts. All measurements can be combined centrally and and a - summary can be obtained without the need to coordinate the data exchange between the parts. + - it allows to e.g. start and end measurements in different parts of the code without the need to + coordinate the data exchange (such as time stamps) between the parts. All measurements can be combined + centrally and a summary can be obtained easily. Functionalities --------------- -This package contains the following helper classes: +This package contains the following helper classes (singletons): - :class:`~accvlab.optim_test_tools.Stopwatch`: Can be used to conveniently measure runtime of different parts of the code (including defining a warm-up phase, keeping track of the iterations, averaging measurements etc.). Additionally, it supports measuring average CPU usage. @@ -41,6 +43,15 @@ This package contains the following helper classes: used to detect e.g. differences due to bugs introduced in the code while working on optimization (which should not change the results). +Additionally, this package provides the :mod:`~accvlab.optim_test_tools.numba_nvtx` module for adding NVTX +ranges to Numba JIT-compiled (``@njit``) code. Numba-compiled functions run as native code, so Python-level +NVTX tools (such as the ``nvtx`` package or ``torch.cuda.nvtx``) cannot be called from within them. The +:mod:`~accvlab.optim_test_tools.numba_nvtx` module solves this by providing +:func:`~accvlab.optim_test_tools.numba_nvtx.range_push` and +:func:`~accvlab.optim_test_tools.numba_nvtx.range_pop` functions that can be used +directly inside ``@njit`` functions. It works by registering C-level NVTX symbols using Numba's +:func:`~numba.core.extending.overload` mechanism and emitting the corresponding calls in the compiled code. + .. 
seealso:: Please see :doc:`api` for more detailed information about the classes and :doc:`examples` on how to use diff --git a/packages/optim_test_tools/examples/numba_nvtx_example.py b/packages/optim_test_tools/examples/numba_nvtx_example.py new file mode 100644 index 0000000..cb790c3 --- /dev/null +++ b/packages/optim_test_tools/examples/numba_nvtx_example.py @@ -0,0 +1,41 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import accvlab.optim_test_tools.numba_nvtx as nvtx +from numba import njit + +# @NOTE +# Register range names outside the JIT function to obtain integer handles. +# Handles must be created before compilation because Numba compiles the function +# and the handle value is baked into the compiled code. +h_example_range = nvtx.register_string("example_range") +h_example_range_inner = nvtx.register_string("example_range_inner") + + +@njit +def compute(x): + # @NOTE: Push the range using the handle created above. + nvtx.range_push(h_example_range) + y = x - 1 + # @NOTE: Push the inner range. + nvtx.range_push(h_example_range_inner) + y = x + 2 + # @NOTE: Pop both ranges. 
+ nvtx.range_pop() + nvtx.range_pop() + return y + + +if __name__ == "__main__": + result = compute(41) diff --git a/packages/optim_test_tools/examples/tensor_dumper_comparison_example.py b/packages/optim_test_tools/examples/tensor_dumper_comparison_example.py index 97d8b6e..2b3179a 100644 --- a/packages/optim_test_tools/examples/tensor_dumper_comparison_example.py +++ b/packages/optim_test_tools/examples/tensor_dumper_comparison_example.py @@ -40,8 +40,9 @@ # @NOTE # Here, we dump tensor in iteration 0 and compare to the dumped data in iteration 1. -# In a typical use-case, one would dump the data in one run, store it, and then compare in future runs, -# while e.g. working on optimizations, to ensure that the optimizations do not introduce errors in the data. +# In a typical use-case, one would dump the data in one run, and then compare to the dumped data in subsequent +# runs, while e.g. working on optimizations, to ensure that the optimizations do not introduce errors (which +# would lead to a mismatch in the dumped data). for i in range(2): # @NOTE: Generate data to dump/compare to if not make_tensor_data_inconsistent: @@ -82,8 +83,9 @@ # reference data. However, as we set some of the data to be in BINARY format, we need to override this to # use the comparison functionality. # - # In practice, this may be useful to switch between dumping the data for manual inspection (original - # formats) or comparison (all JSON). + # In practice, this may be useful to switch between dumping the data for manual inspection (originally + # selected formats) or the automatic comparison functionality used in this example (all data in JSON + # format). 
dumper.set_dump_type_for_all(TensorDumper.Type.JSON) # @NOTE diff --git a/packages/optim_test_tools/examples/tensor_dumper_dumping_example.py b/packages/optim_test_tools/examples/tensor_dumper_dumping_example.py index e202443..4d9c7e1 100644 --- a/packages/optim_test_tools/examples/tensor_dumper_dumping_example.py +++ b/packages/optim_test_tools/examples/tensor_dumper_dumping_example.py @@ -29,7 +29,12 @@ # ------------------------- Helper: Create synthetic inputs ------------------------- -# @NOTE: Create a test tensor representing an image with smooth gradients + +# @NOTE +# Here, helper functions are defined to create dummy data which will be dumped. Additionally, a wrapper class +# for demonstrating the custom converter functionality is defined. + + def create_simple_gradient_image( height: int = 256, width: int = 256, blue_channel_value: float = 0.5 ) -> torch.Tensor: @@ -50,6 +55,7 @@ def create_simple_gradient_image( def create_bboxes(num_bboxes: int, image_shape: tuple[int, int]) -> torch.Tensor: + """Create a tensor representing bounding boxes.""" bboxes = [] for _ in range(num_bboxes): x1 = torch.randint(0, image_shape[1], (1,)) @@ -69,17 +75,20 @@ def __init__(self, tensor: torch.Tensor): # ------------------- Initialize and configure the dumper ------------------- -# @NOTE: Get instance and enable the dumper. Configure early‑exit after a fixed number of dumps. +# @NOTE: Get instance and enable the dumper. _current_dir = os.path.dirname(os.path.abspath(__file__)) dumper = TensorDumper() dumper.enable(os.path.join(_current_dir, "test_dump")) -# @NOTE: Exit the program after 3 dumps. Useful to capture only a few iterations without changing outer loops. +# @NOTE: Configure early‑exit after 3 dumps. This can be useful to only dump a few iterations and exit, +# without any code-changes for the exit functionality. Other functions can also be set up to be called after a +# fixed number of dumps. 
dumper.perform_after_dump_count(3, exit) # ------------------------- Register custom converters ------------------------- # @NOTE # Register a custom converter for `TensorWrapper`. Any tensors returned by the converter are treated -# the same as tensors added directly via `add_tensor_data`. +# the same as tensors added directly via `add_tensor_data`. Converters are applied recursively, i.e. if a +# converter returns types for which other custom converters are registered, these are called in turn. dumper.register_custom_converter( TensorWrapper, lambda x: {"tensor": x.tensor, "some_addition_text": x.some_addition_text} ) @@ -141,9 +150,9 @@ def __init__(self, tensor: torch.Tensor): # The custom handling is done by adding a custom extension to the dumper, which is then used to dump # the object (the custom converter is registered above). # 3. `unneeded_data` is excluded from the dump. - # This is useful to e.g. exclude data which is part of the structure, but either not needed in the dump, - # or which will be added to the dump later via custom processing logic (see below for bounding box - # images). + # This is useful to e.g. exclude data which is part of the structure, but either not needed in the + # dump, or which will be added to the dump later via custom processing logic (see below for bounding + # box images). dumper.add_tensor_data( "images.other_images", { @@ -232,21 +241,51 @@ def draw_bboxes(bboxes: torch.Tensor, image_shape: tuple[int, int]) -> torch.Ten # @NOTE: Dump RaggedBatch structures both as per‑sample and as full RaggedBatch objects. ragged_batch_1 = RaggedBatch(torch.randn(3, 5), sample_sizes=torch.tensor([3, 5, 1])) ragged_batch_2 = RaggedBatch(torch.randn(3, 5), sample_sizes=torch.tensor([3, 5, 1])) - # @NOTE: Demonstrate toggling `as_per_sample` and prefer JSON for structured RaggedBatch content. + # @NOTE + # Toggling `as_per_sample` and dumping the data in the desired format (per-sample or as a RaggedBatch + # structure). 
dumper.enable_ragged_batch_dumping(as_per_sample=True) dumper.add_tensor_data("ragged_batches.batch_1", ragged_batch_1, TensorDumper.Type.JSON) dumper.enable_ragged_batch_dumping(as_per_sample=False) dumper.add_tensor_data("ragged_batches.batch_2", ragged_batch_2, TensorDumper.Type.JSON) + # --------------------------------- Inner loop -------------------------------- + # @NOTE + # Here, the same data is added to the dump in multiple iterations. Care needs to be taken to disambiguate + # the names of the individual data entries. This can be conveniently done using ranges. Ranges will become + # part of the dump path, and the names of the data entries will be appended to the range name. + for inner_iteration in range(10): + # @NOTE + # The string formatting is performed inside a lambda function instead of passing the formatted string + # directly. `push_range` supports both strings and callables. If formatting is needed, a callable + # should be passed as it is called only if the dumper is enabled, avoiding unnecessary overhead for + # formatting if the dumper is not enabled. + dumper.push_range(lambda: f"inner_loop_{inner_iteration}") + dummy_tensor = torch.tensor(inner_iteration) + # @NOTE: While the path "dummy.tensor" remains the same, the range provides context and ensures + # that naming collisions are avoided. The range (or ranges) is added at the beginning of the path. + # For example, the json structure for `inner_iteration == 0` would be: + # "inner_loop_0": { + # "dummy": { + # "tensor": 0 + # } + dumper.add_tensor_data("dummy.tensor", dummy_tensor, TensorDumper.Type.JSON) + dumper.pop_range() + # ------------------- Placeholder for e.g. loss computation ------------------- - # @NOTE: Dummy loss computation to demonstrate auto-computing & dumping of gradients. + # @NOTE + # Dummy loss computation to demonstrate auto-computing & dumping of gradients. + # In a real use case, a final loss value would be used instead of `summed_3` and `summed_5`.
+ # Although the final loss is one value, we use `summed_3` and `summed_5` to demonstrate that the + # gradients can also be computed for more than one output tensor. image_sin_3 = torch.sin(test_image_3 * 2.0 * np.pi * 3.0) image_sin_5 = torch.sin(test_image_5 * 2.0 * np.pi * 3.0) summed_3 = torch.sum(image_sin_3) summed_5 = torch.sum(image_sin_5) # ----------------------------- Set the gradients ----------------------------- - # @NOTE: Provide scalar values from which gradients are computed for all tensors that require them. + # @NOTE + # Provide (scalar) loss tensors from which gradients are computed for all tensors that require them. dumper.set_gradients([summed_3, summed_5]) # ---------------------------------- Dump ---------------------------------- diff --git a/packages/optim_test_tools/ext_impl/CMakeLists.txt b/packages/optim_test_tools/ext_impl/CMakeLists.txt new file mode 100644 index 0000000..e99bc7b --- /dev/null +++ b/packages/optim_test_tools/ext_impl/CMakeLists.txt @@ -0,0 +1,62 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +cmake_minimum_required(VERSION 3.18) +project(accvlab_optim_test_tools_numba_ext LANGUAGES CXX) + +if(NOT DEFINED CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif() +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +find_package(Python COMPONENTS Interpreter Development REQUIRED) + +# Get pybind11 CMake directory +execute_process( + COMMAND "${Python_EXECUTABLE}" -m pybind11 --cmakedir + OUTPUT_VARIABLE pybind11_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +find_package(pybind11 REQUIRED) + +# NVTX headers are provided via a git submodule (mirrors on_demand_video_decoder, but separate instance). +set(ACCVLAB_NVTX_INCLUDE_DIR + "${CMAKE_CURRENT_LIST_DIR}/external/NVTX/c/include" +) + +if(NOT EXISTS "${ACCVLAB_NVTX_INCLUDE_DIR}/nvtx3/nvToolsExt.h") + message(FATAL_ERROR "NVTX submodule not found. Run: git submodule update --init --recursive") +endif() + +pybind11_add_module(accvlab_optim_test_tools_numba_ext MODULE + src/nvtx_numba.cpp +) + +set_target_properties(accvlab_optim_test_tools_numba_ext PROPERTIES + CXX_STANDARD 17 + OUTPUT_NAME "_nvtx_numba_ext" + PREFIX "" +) + +target_include_directories(accvlab_optim_test_tools_numba_ext PRIVATE + "${ACCVLAB_NVTX_INCLUDE_DIR}" +) + +install(TARGETS accvlab_optim_test_tools_numba_ext + LIBRARY DESTINATION . + RUNTIME DESTINATION . +) + + diff --git a/packages/optim_test_tools/ext_impl/external/NVTX b/packages/optim_test_tools/ext_impl/external/NVTX new file mode 160000 index 0000000..e170594 --- /dev/null +++ b/packages/optim_test_tools/ext_impl/external/NVTX @@ -0,0 +1 @@ +Subproject commit e170594ac7cf1dac584da473d4ca9301087090c1 diff --git a/packages/optim_test_tools/ext_impl/src/nvtx_numba.cpp b/packages/optim_test_tools/ext_impl/src/nvtx_numba.cpp new file mode 100644 index 0000000..27b3462 --- /dev/null +++ b/packages/optim_test_tools/ext_impl/src/nvtx_numba.cpp @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include <cstdint> +#include <string> + +#include <pybind11/pybind11.h> + +// NVTX v3 header-only implementation (includes dynamic injection loader). +// We rely on the repo-vendored NVTX headers (see ext_impl/CMakeLists.txt include path). +#include <nvtx3/nvToolsExt.h> + +namespace py = pybind11; + +namespace { + +static inline uintptr_t nvtx_register_string_a(const char* s) noexcept { + // If domain is NULL, the global domain is used. + nvtxStringHandle_t h = nvtxDomainRegisterStringA(nullptr, s); + return reinterpret_cast<uintptr_t>(h); +} + +static inline void nvtx_range_push_registered(uintptr_t handle) noexcept { + if (handle == 0) { + return; + } + + nvtxEventAttributes_t a{}; + a.version = NVTX_VERSION; + a.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE; + a.messageType = NVTX_MESSAGE_TYPE_REGISTERED; + a.message.registered = reinterpret_cast<nvtxStringHandle_t>(handle); + (void)nvtxRangePushEx(&a); +} + +static inline void nvtx_range_pop() noexcept { (void)nvtxRangePop(); } + +} // namespace + +#if defined(_WIN32) +#define ACCVLAB_NVTX_NUMBA_EXPORT extern "C" __declspec(dllexport) +#else +#define ACCVLAB_NVTX_NUMBA_EXPORT extern "C" __attribute__((visibility("default"))) +#endif + +// These exported symbols are what Numba will call from within @njit (via LLVM symbol binding).
+ACCVLAB_NVTX_NUMBA_EXPORT void accvlab_nvtx_range_push(std::uint64_t handle) noexcept { + nvtx_range_push_registered(static_cast<uintptr_t>(handle)); +} + +ACCVLAB_NVTX_NUMBA_EXPORT void accvlab_nvtx_range_pop() noexcept { nvtx_range_pop(); } + +static std::uint64_t py_register_string(const std::string& s) { + return static_cast<std::uint64_t>(nvtx_register_string_a(s.c_str())); +} + +static void py_range_push(std::uint64_t handle) noexcept { + nvtx_range_push_registered(static_cast<uintptr_t>(handle)); +} + +static void py_range_pop() noexcept { nvtx_range_pop(); } + +PYBIND11_MODULE(_nvtx_numba_ext, m) { + m.doc() = "NVTX registered-string range push/pop for Numba @njit (CPU) code."; + m.def("register_string", &py_register_string, py::arg("name"), + "Register a string with NVTX and return an integer handle (0 if NVTX is inactive)."); + m.def("range_push", &py_range_push, py::arg("handle"), + "Push an NVTX range using a previously-registered string handle."); + m.def("range_pop", &py_range_pop, "Pop an NVTX range."); +} diff --git a/packages/optim_test_tools/pyproject.toml b/packages/optim_test_tools/pyproject.toml index 670b48c..6802125 100644 --- a/packages/optim_test_tools/pyproject.toml +++ b/packages/optim_test_tools/pyproject.toml @@ -1,5 +1,11 @@ [build-system] -requires = ["setuptools>=64", "wheel", "torch>=2.0.0"] +requires = [ + "setuptools>=64", + "wheel", + "torch>=2.0.0", + "scikit-build>=0.17.0", + "pybind11>=2.10.0", +] build-backend = "setuptools.build_meta" [project] @@ -18,6 +24,8 @@ dependencies = [ optional = [ "opencv-python-headless", "pytest", + "numba", + "llvmlite", ] [tool.setuptools.packages.find] diff --git a/packages/optim_test_tools/setup.py b/packages/optim_test_tools/setup.py index 2d29f52..432aead 100644 --- a/packages/optim_test_tools/setup.py +++ b/packages/optim_test_tools/setup.py @@ -1,26 +1,32 @@ -from setuptools import setup -from torch.utils.cpp_extension import BuildExtension -import sys -from pathlib import Path +# Copyright (c) 2026, NVIDIA CORPORATION &
AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -from accvlab_build_config import run_external_build +from skbuild import setup +from setuptools import find_namespace_packages +from accvlab_build_config import build_cmake_args_from_env -def get_extensions(): - """Return all extensions""" - extensions = [] - return extensions - - -# Run external build before setup -run_external_build(Path(__file__).parent) +_cmake_args = build_cmake_args_from_env() setup( name="accvlab.optim_test_tools", version="0.1.0", description="Optimization Testing Tools Package (part of the ACCV-Lab package).", - ext_modules=get_extensions(), - cmdclass={"build_ext": BuildExtension}, - python_requires=">=3.8", + packages=find_namespace_packages(include=["accvlab.optim_test_tools*"]), + include_package_data=True, zip_safe=False, + cmake_source_dir="ext_impl", + cmake_install_dir="accvlab/optim_test_tools/numba_nvtx", + cmake_args=_cmake_args, ) diff --git a/packages/optim_test_tools/tests/test_numba_nvtx.py b/packages/optim_test_tools/tests/test_numba_nvtx.py new file mode 100644 index 0000000..2f9f2d3 --- /dev/null +++ b/packages/optim_test_tools/tests/test_numba_nvtx.py @@ -0,0 +1,45 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + + +def test_import_numba_nvtx_module(): + import accvlab.optim_test_tools.numba_nvtx as nvtx + + assert hasattr(nvtx, "register_string") + assert hasattr(nvtx, "range_push") + assert hasattr(nvtx, "range_pop") + + +def test_numba_njit_calls_do_not_crash(): + numba = pytest.importorskip("numba") + + import accvlab.optim_test_tools.numba_nvtx as nvtx + + h = nvtx.register_string("test_range") + assert isinstance(h, int) + + @numba.njit + def f(x): + nvtx.range_push(h) + y = x + 1 + nvtx.range_pop() + return y + + assert f(41) == 42 + + +if __name__ == "__main__": + pytest.main([__file__])