diff --git a/CMakeLists.txt b/CMakeLists.txt index 63538e78..3b871c41 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,6 +52,9 @@ option(SGL_BUILD_STANDALONE "Build without external dependencies" ON) # This allows to build CUDA support without having CUDA SDK installed. option(SGL_USE_DYNAMIC_CUDA "Load CUDA driver API dynamically" ON) +# Enable NVTX profiling annotations +option(SGL_ENABLE_NVTX "Enable NVTX profiling annotations" ON) + # Disable asserts. # By default, asserts are enabled in both debug and release builds. # This option can be used to override the default behavior. @@ -119,6 +122,20 @@ if(NOT SGL_USE_DYNAMIC_CUDA) find_package(CUDAToolkit REQUIRED) endif() +# NVTX +# ----------------------------------------------------------------------------- + +if(SGL_ENABLE_NVTX) + find_package(CUDAToolkit) + if(CUDAToolkit_FOUND) + add_compile_definitions(SGL_ENABLE_NVTX=1) + message(STATUS "NVTX support enabled") + else() + message(WARNING "NVTX requested but CUDA Toolkit not found") + set(SGL_ENABLE_NVTX OFF) + endif() +endif() + # ----------------------------------------------------------------------------- # Global setup # ----------------------------------------------------------------------------- diff --git a/src/slangpy_ext/utils/slangpy.cpp b/src/slangpy_ext/utils/slangpy.cpp index 4e046b74..ff9df1c5 100644 --- a/src/slangpy_ext/utils/slangpy.cpp +++ b/src/slangpy_ext/utils/slangpy.cpp @@ -21,6 +21,10 @@ #include +#ifdef SGL_ENABLE_NVTX +#include <nvtx3/nvToolsExt.h> +#endif + namespace sgl { extern void write_shader_cursor(ShaderCursor& cursor, nb::object value); extern nb::ndarray buffer_to_numpy(Buffer* self); @@ -429,12 +433,22 @@ nb::object NativeCallData::exec( nb::kwargs kwargs ) { +#ifdef SGL_ENABLE_NVTX + nvtxRangePush("slangpy_exec_unpack_args"); +#endif // Unpack args and kwargs. 
nb::list unpacked_args = unpack_args(args); nb::dict unpacked_kwargs = unpack_kwargs(kwargs); +#ifdef SGL_ENABLE_NVTX + nvtxRangePop(); + nvtxRangePush("slangpy_exec_calc_shape"); +#endif // Calculate call shape. Shape call_shape = m_runtime->calculate_call_shape(m_call_dimensionality, unpacked_args, unpacked_kwargs, this); +#ifdef SGL_ENABLE_NVTX + nvtxRangePop(); +#endif m_last_call_shape = call_shape; // Setup context. @@ -656,23 +670,51 @@ nb::object NativeCallData::exec( } }; +#ifdef SGL_ENABLE_NVTX + nvtxRangePush("slangpy_exec_create_encoder"); +#endif // Create temporary command encoder if none is provided. ref<CommandEncoder> temp_command_encoder; if (command_encoder == nullptr) { temp_command_encoder = m_device->create_command_encoder(); command_encoder = temp_command_encoder.get(); } +#ifdef SGL_ENABLE_NVTX + nvtxRangePop(); +#endif bool is_ray_tracing = opts->is_ray_tracing(); if (!is_ray_tracing) { +#ifdef SGL_ENABLE_NVTX + nvtxRangePush("slangpy_exec_begin_compute_pass"); +#endif ref<ComputePassEncoder> pass_encoder = command_encoder->begin_compute_pass(); +#ifdef SGL_ENABLE_NVTX + nvtxRangePop(); + nvtxRangePush("slangpy_exec_bind_pipeline"); +#endif ComputePipeline* pipeline = dynamic_cast<ComputePipeline*>(m_pipeline.get()); SGL_ASSERT(pipeline != nullptr); ShaderCursor cursor(pass_encoder->bind_pipeline(pipeline)); +#ifdef SGL_ENABLE_NVTX + nvtxRangePop(); + nvtxRangePush("slangpy_exec_bind_call_data"); +#endif bind_call_data(cursor); +#ifdef SGL_ENABLE_NVTX + nvtxRangePop(); + nvtxRangePush("slangpy_exec_dispatch"); +#endif pass_encoder->dispatch(uint3(total_threads, 1, 1)); +#ifdef SGL_ENABLE_NVTX + nvtxRangePop(); + nvtxRangePush("slangpy_exec_end_pass"); +#endif pass_encoder->end(); +#ifdef SGL_ENABLE_NVTX + nvtxRangePop(); +#endif } else { ref<RayTracingPassEncoder> pass_encoder = command_encoder->begin_ray_tracing_pass(); RayTracingPipeline* pipeline = dynamic_cast<RayTracingPipeline*>(m_pipeline.get()); @@ -683,17 +725,26 @@ nb::object NativeCallData::exec( pass_encoder->end(); } +#ifdef SGL_ENABLE_NVTX + 
nvtxRangePush("slangpy_exec_submit_cmdbuf"); +#endif // If we created a temporary command encoder, we need to submit it. if (temp_command_encoder) { m_device->submit_command_buffer(temp_command_encoder->finish(), CommandQueueType::graphics, cuda_stream); command_encoder = nullptr; } +#ifdef SGL_ENABLE_NVTX + nvtxRangePop(); +#endif // If command_encoder is not null, return early. if (command_encoder != nullptr) { return nanobind::none(); } +#ifdef SGL_ENABLE_NVTX + nvtxRangePush("slangpy_exec_readback"); +#endif // Read call data post dispatch. // m_runtime->read_call_data_post_dispatch(context, call_data, unpacked_args, unpacked_kwargs); for (auto val : read_back) { @@ -703,6 +754,10 @@ nb::object NativeCallData::exec( auto rb_data = t[2]; bvr->python_type()->read_calldata(context, bvr.get(), rb_val, rb_data); } +#ifdef SGL_ENABLE_NVTX + nvtxRangePop(); + nvtxRangePush("slangpy_exec_pack_results"); +#endif // Pack updated 'this' values back. for (size_t i = 0; i < args.size(); ++i) { @@ -711,6 +766,9 @@ nb::object NativeCallData::exec( for (auto [k, v] : kwargs) { pack_arg(nb::cast<nb::object>(v), unpacked_kwargs[k]); } +#ifdef SGL_ENABLE_NVTX + nvtxRangePop(); +#endif // Handle return value based on call mode. 
if (m_call_mode == CallMode::prim) { diff --git a/src/slangpy_ext/utils/slangpyfunction.cpp b/src/slangpy_ext/utils/slangpyfunction.cpp index 3332a035..a2d3d6a4 100644 --- a/src/slangpy_ext/utils/slangpyfunction.cpp +++ b/src/slangpy_ext/utils/slangpyfunction.cpp @@ -5,6 +5,10 @@ #include "utils/slangpyfunction.h" #include +#ifdef SGL_ENABLE_NVTX +#include <nvtx3/nvToolsExt.h> +#endif + namespace sgl { template<> @@ -47,6 +51,10 @@ ref<NativeCallData> NativeFunctionNode::build_call_data(NativeCallDataCache* cac nb::object NativeFunctionNode::call(NativeCallDataCache* cache, nb::args args, nb::kwargs kwargs) { +#ifdef SGL_ENABLE_NVTX + nvtxRangePush("slangpy_native_call"); +#endif + auto options = make_ref<NativeCallRuntimeOptions>(); gather_runtime_options(options); @@ -55,6 +63,10 @@ nb::object NativeFunctionNode::call(NativeCallDataCache* cache, nb::args args, n args = nb::cast<nb::args>(nb::make_tuple(options->get_this()) + args); } +#ifdef SGL_ENABLE_NVTX + nvtxRangePush("slangpy_signature_build"); +#endif + auto builder = make_ref<SignatureBuilder>(); read_signature(builder); cache->get_args_signature(builder, args, kwargs); @@ -62,19 +74,46 @@ nb::object NativeFunctionNode::call(NativeCallDataCache* cache, nb::args args, n std::string sig = builder->str(); ref<NativeCallData> call_data = cache->find_call_data(sig); +#ifdef SGL_ENABLE_NVTX + nvtxRangePop(); +#endif + + nb::object result; if (call_data) { +#ifdef SGL_ENABLE_NVTX + nvtxRangePush("slangpy_call_cached"); +#endif if (call_data->is_torch_integration()) - return call_data->_py_torch_call(this, options, args, kwargs); + result = call_data->_py_torch_call(this, options, args, kwargs); else - return call_data->call(options, args, kwargs); + result = call_data->call(options, args, kwargs); +#ifdef SGL_ENABLE_NVTX + nvtxRangePop(); +#endif } else { +#ifdef SGL_ENABLE_NVTX + nvtxRangePush("slangpy_call_generate"); +#endif ref<NativeCallData> new_call_data = generate_call_data(args, kwargs); cache->add_call_data(sig, new_call_data); +#ifdef SGL_ENABLE_NVTX + nvtxRangePop(); + nvtxRangePush("slangpy_call_new"); +#endif if 
(new_call_data->is_torch_integration()) - return new_call_data->_py_torch_call(this, options, args, kwargs); + result = new_call_data->_py_torch_call(this, options, args, kwargs); else - return new_call_data->call(options, args, kwargs); + result = new_call_data->call(options, args, kwargs); +#ifdef SGL_ENABLE_NVTX + nvtxRangePop(); +#endif } + +#ifdef SGL_ENABLE_NVTX + nvtxRangePop(); +#endif + + return result; } void NativeFunctionNode::append_to(