diff --git a/CMakeLists.txt b/CMakeLists.txt index 63538e78..3b871c41 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,6 +52,9 @@ option(SGL_BUILD_STANDALONE "Build without external dependencies" ON) # This allows to build CUDA support without having CUDA SDK installed. option(SGL_USE_DYNAMIC_CUDA "Load CUDA driver API dynamically" ON) +# Enable NVTX profiling annotations +option(SGL_ENABLE_NVTX "Enable NVTX profiling annotations" ON) + # Disable asserts. # By default, asserts are enabled in both debug and release builds. # This option can be used to override the default behavior. @@ -119,6 +122,20 @@ if(NOT SGL_USE_DYNAMIC_CUDA) find_package(CUDAToolkit REQUIRED) endif() +# NVTX +# ----------------------------------------------------------------------------- + +if(SGL_ENABLE_NVTX) + find_package(CUDAToolkit) + if(CUDAToolkit_FOUND) + add_compile_definitions(SGL_ENABLE_NVTX=1) + message(STATUS "NVTX support enabled") + else() + message(WARNING "NVTX requested but CUDA Toolkit not found") + set(SGL_ENABLE_NVTX OFF) + endif() +endif() + # ----------------------------------------------------------------------------- # Global setup # ----------------------------------------------------------------------------- diff --git a/src/slangpy_ext/utils/slangpy.cpp b/src/slangpy_ext/utils/slangpy.cpp index 4e046b74..ff9df1c5 100644 --- a/src/slangpy_ext/utils/slangpy.cpp +++ b/src/slangpy_ext/utils/slangpy.cpp @@ -21,6 +21,10 @@ #include +#ifdef SGL_ENABLE_NVTX +#include <nvtx3/nvToolsExt.h> +#endif + namespace sgl { extern void write_shader_cursor(ShaderCursor& cursor, nb::object value); extern nb::ndarray buffer_to_numpy(Buffer* self); @@ -429,12 +433,22 @@ nb::object NativeCallData::exec( nb::kwargs kwargs ) { +#ifdef SGL_ENABLE_NVTX + nvtxRangePush("slangpy_exec_unpack_args"); +#endif // Unpack args and kwargs. 
nb::list unpacked_args = unpack_args(args); nb::dict unpacked_kwargs = unpack_kwargs(kwargs); +#ifdef SGL_ENABLE_NVTX + nvtxRangePop(); + nvtxRangePush("slangpy_exec_calc_shape"); +#endif // Calculate call shape. Shape call_shape = m_runtime->calculate_call_shape(m_call_dimensionality, unpacked_args, unpacked_kwargs, this); +#ifdef SGL_ENABLE_NVTX + nvtxRangePop(); +#endif m_last_call_shape = call_shape; // Setup context. @@ -656,23 +670,51 @@ nb::object NativeCallData::exec( } }; +#ifdef SGL_ENABLE_NVTX + nvtxRangePush("slangpy_exec_create_encoder"); +#endif // Create temporary command encoder if none is provided. ref<CommandEncoder> temp_command_encoder; if (command_encoder == nullptr) { temp_command_encoder = m_device->create_command_encoder(); command_encoder = temp_command_encoder.get(); } +#ifdef SGL_ENABLE_NVTX + nvtxRangePop(); +#endif bool is_ray_tracing = opts->is_ray_tracing(); if (!is_ray_tracing) { +#ifdef SGL_ENABLE_NVTX + nvtxRangePush("slangpy_exec_begin_compute_pass"); +#endif ref<ComputePassEncoder> pass_encoder = command_encoder->begin_compute_pass(); +#ifdef SGL_ENABLE_NVTX + nvtxRangePop(); + nvtxRangePush("slangpy_exec_bind_pipeline"); +#endif ComputePipeline* pipeline = dynamic_cast<ComputePipeline*>(m_pipeline.get()); SGL_ASSERT(pipeline != nullptr); ShaderCursor cursor(pass_encoder->bind_pipeline(pipeline)); +#ifdef SGL_ENABLE_NVTX + nvtxRangePop(); + nvtxRangePush("slangpy_exec_bind_call_data"); +#endif bind_call_data(cursor); +#ifdef SGL_ENABLE_NVTX + nvtxRangePop(); + nvtxRangePush("slangpy_exec_dispatch"); +#endif pass_encoder->dispatch(uint3(total_threads, 1, 1)); +#ifdef SGL_ENABLE_NVTX + nvtxRangePop(); + nvtxRangePush("slangpy_exec_end_pass"); +#endif pass_encoder->end(); +#ifdef SGL_ENABLE_NVTX + nvtxRangePop(); +#endif } else { ref<RayTracingPassEncoder> pass_encoder = command_encoder->begin_ray_tracing_pass(); RayTracingPipeline* pipeline = dynamic_cast<RayTracingPipeline*>(m_pipeline.get()); @@ -683,17 +725,26 @@ nb::object NativeCallData::exec( pass_encoder->end(); } +#ifdef SGL_ENABLE_NVTX + 
nvtxRangePush("slangpy_exec_submit_cmdbuf"); +#endif // If we created a temporary command encoder, we need to submit it. if (temp_command_encoder) { m_device->submit_command_buffer(temp_command_encoder->finish(), CommandQueueType::graphics, cuda_stream); command_encoder = nullptr; } +#ifdef SGL_ENABLE_NVTX + nvtxRangePop(); +#endif // If command_encoder is not null, return early. if (command_encoder != nullptr) { return nanobind::none(); } +#ifdef SGL_ENABLE_NVTX + nvtxRangePush("slangpy_exec_readback"); +#endif // Read call data post dispatch. // m_runtime->read_call_data_post_dispatch(context, call_data, unpacked_args, unpacked_kwargs); for (auto val : read_back) { @@ -703,6 +754,10 @@ nb::object NativeCallData::exec( auto rb_data = t[2]; bvr->python_type()->read_calldata(context, bvr.get(), rb_val, rb_data); } +#ifdef SGL_ENABLE_NVTX + nvtxRangePop(); + nvtxRangePush("slangpy_exec_pack_results"); +#endif // Pack updated 'this' values back. for (size_t i = 0; i < args.size(); ++i) { @@ -711,6 +766,9 @@ nb::object NativeCallData::exec( for (auto [k, v] : kwargs) { pack_arg(nb::cast<nb::object>(v), unpacked_kwargs[k]); } +#ifdef SGL_ENABLE_NVTX + nvtxRangePop(); +#endif // Handle return value based on call mode. 
if (m_call_mode == CallMode::prim) { diff --git a/src/slangpy_ext/utils/slangpyfunction.cpp b/src/slangpy_ext/utils/slangpyfunction.cpp index 3332a035..a2d3d6a4 100644 --- a/src/slangpy_ext/utils/slangpyfunction.cpp +++ b/src/slangpy_ext/utils/slangpyfunction.cpp @@ -5,6 +5,10 @@ #include "utils/slangpyfunction.h" #include +#ifdef SGL_ENABLE_NVTX +#include <nvtx3/nvToolsExt.h> +#endif + namespace sgl { template<> @@ -47,6 +51,10 @@ ref<NativeCallData> NativeFunctionNode::build_call_data(NativeCallDataCache* cac nb::object NativeFunctionNode::call(NativeCallDataCache* cache, nb::args args, nb::kwargs kwargs) { +#ifdef SGL_ENABLE_NVTX + nvtxRangePush("slangpy_native_call"); +#endif + auto options = make_ref<NativeCallRuntimeOptions>(); gather_runtime_options(options); @@ -55,6 +63,10 @@ nb::object NativeFunctionNode::call(NativeCallDataCache* cache, nb::args args, n args = nb::cast<nb::args>(nb::make_tuple(options->get_this()) + args); } +#ifdef SGL_ENABLE_NVTX + nvtxRangePush("slangpy_signature_build"); +#endif + auto builder = make_ref<SignatureBuilder>(); read_signature(builder); cache->get_args_signature(builder, args, kwargs); @@ -62,19 +74,46 @@ nb::object NativeFunctionNode::call(NativeCallDataCache* cache, nb::args args, n std::string sig = builder->str(); ref<NativeCallData> call_data = cache->find_call_data(sig); +#ifdef SGL_ENABLE_NVTX + nvtxRangePop(); +#endif + + nb::object result; if (call_data) { +#ifdef SGL_ENABLE_NVTX + nvtxRangePush("slangpy_call_cached"); +#endif if (call_data->is_torch_integration()) - return call_data->_py_torch_call(this, options, args, kwargs); + result = call_data->_py_torch_call(this, options, args, kwargs); else - return call_data->call(options, args, kwargs); + result = call_data->call(options, args, kwargs); +#ifdef SGL_ENABLE_NVTX + nvtxRangePop(); +#endif } else { +#ifdef SGL_ENABLE_NVTX + nvtxRangePush("slangpy_call_generate"); +#endif ref<NativeCallData> new_call_data = generate_call_data(args, kwargs); cache->add_call_data(sig, new_call_data); +#ifdef SGL_ENABLE_NVTX + nvtxRangePop(); + nvtxRangePush("slangpy_call_new"); +#endif if 
(new_call_data->is_torch_integration()) - return new_call_data->_py_torch_call(this, options, args, kwargs); + result = new_call_data->_py_torch_call(this, options, args, kwargs); else - return new_call_data->call(options, args, kwargs); + result = new_call_data->call(options, args, kwargs); +#ifdef SGL_ENABLE_NVTX + nvtxRangePop(); +#endif } + +#ifdef SGL_ENABLE_NVTX + nvtxRangePop(); +#endif + + return result; } void NativeFunctionNode::append_to(