diff --git a/clams/app/__init__.py b/clams/app/__init__.py
index d6c6dcf..6550327 100644
--- a/clams/app/__init__.py
+++ b/clams/app/__init__.py
@@ -148,7 +148,7 @@ def annotate(self, mmif: Union[str, dict, Mmif], **runtime_params: List[str]) ->
         pretty = refined.get('pretty', False)
         t = datetime.now()
         with warnings.catch_warnings(record=True) as ws:
-            annotated = self._annotate(mmif, **refined)
+            annotated, cuda_profiler = self._profile_cuda_memory(self._annotate)(mmif, **refined)
         if ws:
             issued_warnings.extend(ws)
         if issued_warnings:
@@ -164,11 +164,16 @@ def annotate(self, mmif: Union[str, dict, Mmif], **runtime_params: List[str]) ->
             runtime_recs['architecture'] = platform.machine()
             # runtime_recs['processor'] = platform.processor()  # this only works on Windows
             runtime_recs['cuda'] = []
-            if shutil.which('nvidia-smi'):
+            # Use cuda_profiler data if available, otherwise fall back to nvidia-smi
+            if cuda_profiler:
+                for gpu_info, peak_memory_bytes in cuda_profiler.items():
+                    # Convert peak memory to human-readable format
+                    runtime_recs['cuda'].append(f"{gpu_info}, Used {self._cuda_memory_to_str(peak_memory_bytes)}")
+            elif shutil.which('nvidia-smi'):
                 for gpu in subprocess.run(['nvidia-smi', '--query-gpu=name,memory.total', '--format=csv,noheader'],
                                           stdout=subprocess.PIPE).stdout.decode('utf-8').strip().split('\n'):
                     name, mem = gpu.split(', ')
-                    runtime_recs['cuda'].append(f'{name} ({mem})')
+                    runtime_recs['cuda'].append(self._cuda_device_name_concat(name, mem))
         for annotated_view in annotated.views:
             if annotated_view.metadata.app == self.metadata.identifier:
                 if runningTime:
@@ -321,6 +326,67 @@ def validate_document_locations(mmif: Union[str, Mmif]) -> None:
                 # (https://github.com/clamsproject/mmif/issues/150) , here is a good place for additional check for
                 # file integrity
 
+    @staticmethod
+    def _cuda_memory_to_str(mem) -> str:
+        mib = mem / (1024 * 1024)
+        if mib >= 1024:
+            return f"{mib / 1024:.2f} GiB"
+        else:
+            return f"{mib:.1f} MiB"
+
+    @staticmethod
+    def _cuda_device_name_concat(name, mem):
+        if isinstance(mem, int):
+            mem = ClamsApp._cuda_memory_to_str(mem)
+        return f"{name}, With {mem}"
+
+    @staticmethod
+    def _profile_cuda_memory(func):
+        """
+        Decorator for profiling CUDA memory usage during _annotate execution.
+
+        :param func: The function to wrap (typically _annotate)
+        :return: Decorated function that returns (result, cuda_profiler)
+                 where cuda_profiler is a dict with "<device name>, With <total memory>" keys
+                 and peak memory usage values in bytes
+        """
+        def wrapper(*args, **kwargs):
+            cuda_profiler = {}
+            torch_available = False
+            cuda_available = False
+            device_count = 0
+
+            try:
+                import torch  # pytype: disable=import-error
+                torch_available = True
+                cuda_available = torch.cuda.is_available()
+                device_count = torch.cuda.device_count()
+                if cuda_available:
+                    # Reset peak memory stats for all devices
+                    for device_id in range(device_count):
+                        torch.cuda.reset_peak_memory_stats(device_id)
+            except ImportError:
+                pass
+
+            try:
+                result = func(*args, **kwargs)
+
+                if torch_available and cuda_available and device_count > 0:
+                    for device_id in range(device_count):
+                        device_id = f'cuda:{device_id}'
+                        peak_memory = torch.cuda.max_memory_allocated(device_id)
+                        gpu_name = torch.cuda.get_device_name(device_id)
+                        gpu_total_memory = torch.cuda.get_device_properties(device_id).total_memory
+                        key = ClamsApp._cuda_device_name_concat(gpu_name, gpu_total_memory)
+                        cuda_profiler[key] = peak_memory
+
+                return result, cuda_profiler
+            finally:
+                if torch_available and cuda_available:
+                    torch.cuda.empty_cache()
+
+        return wrapper
+
     @staticmethod
     @contextmanager
     def open_document_location(document: Union[str, Document], opener: Any = open, **openerargs):
diff --git a/docs/.doctrees/environment.pickle b/docs/.doctrees/environment.pickle
index 30a34df..e11b07c 100644
Binary files a/docs/.doctrees/environment.pickle and b/docs/.doctrees/environment.pickle differ
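
A minimal usage sketch of the profiling wrapper introduced above, assuming clams-python with this patch installed; the stand-in function and the sample output shape are hypothetical, not taken from the patch itself:

    from clams.app import ClamsApp

    def fake_annotate(mmif, **params):
        # Stand-in for a ClamsApp subclass's _annotate(); a real app would run
        # (possibly CUDA-backed) model inference here and return a Mmif object.
        return mmif

    # _profile_cuda_memory is a staticmethod, so it can wrap any callable.
    result, cuda_profiler = ClamsApp._profile_cuda_memory(fake_annotate)({})

    # On a CUDA machine, cuda_profiler maps
    #   "<device name>, With <total memory>" -> peak bytes allocated during the call
    # On a CPU-only machine (or when torch is not importable), it is an empty dict.
    for gpu_info, peak_bytes in cuda_profiler.items():
        # Mirrors the string annotate() appends to runtime_recs['cuda']
        print(f"{gpu_info}, Used {ClamsApp._cuda_memory_to_str(peak_bytes)}")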