diff --git a/Libraries/rocProfiler-SDK/api_buffered_tracing/main.cpp b/Libraries/rocProfiler-SDK/api_buffered_tracing/main.cpp index 28b740301..8c74455a1 100644 --- a/Libraries/rocProfiler-SDK/api_buffered_tracing/main.cpp +++ b/Libraries/rocProfiler-SDK/api_buffered_tracing/main.cpp @@ -331,7 +331,11 @@ void run_migrate(int rank, int tid, hipStream_t stream, int, char** argv) itr = init_v; } - test_page_migrate<<<1, 1024, 0, stream>>>(page_data.data(), incr_v); + auto page_data_dev_ptr = static_cast(nullptr); + HIP_CHECK( + hipHostGetDevicePointer(reinterpret_cast(&page_data_dev_ptr), page_data.data(), 0)); + + test_page_migrate<<<1, 1024, 0, stream>>>(page_data_dev_ptr, incr_v); HIP_CHECK(hipStreamSynchronize(stream)); diff --git a/Libraries/rocProfiler-SDK/external_correlation_id_request/client.cpp b/Libraries/rocProfiler-SDK/external_correlation_id_request/client.cpp index 818d7bba7..0853279b9 100644 --- a/Libraries/rocProfiler-SDK/external_correlation_id_request/client.cpp +++ b/Libraries/rocProfiler-SDK/external_correlation_id_request/client.cpp @@ -22,7 +22,7 @@ // // undefine NDEBUG so asserts are implemented #ifdef NDEBUG - #undef NDEBUG +# undef NDEBUG #endif /** @@ -39,7 +39,26 @@ #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include +#include +#include +#include +#include +#include +#include +#include namespace client { @@ -49,7 +68,6 @@ struct external_corr_id_data; using common::buffer_name_info; using common::call_stack_t; -using common::print_call_stack; using common::source_location; using kernel_symbol_data_t = rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t; @@ -57,22 +75,37 @@ using kernel_symbol_map_t = std::unordered_map; using retired_corr_id_set_t = std::unordered_set; -rocprofiler_client_id_t* client_id = nullptr; -rocprofiler_client_finalize_t client_fini_func = nullptr; -rocprofiler_context_id_t client_ctx = {0}; -rocprofiler_buffer_id_t client_buffer = {}; -buffer_name_info* client_name_info = new buffer_name_info{}; -kernel_symbol_map_t* client_kernels = new kernel_symbol_map_t{}; -auto client_mutex = std::shared_mutex{}; -auto client_external_corr_ids = external_corr_id_set_t{}; -auto client_retired_corr_ids = retired_corr_id_set_t{}; - -void tool_code_object_callback(rocprofiler_callback_tracing_record_t record, - rocprofiler_user_data_t* user_data, - void* callback_data) +// Maps correlation ID to maximum end timestamp observed for records with that ID +using corr_id_max_end_ts_map_t = std::unordered_map; + +// Maps correlation ID to retirement timestamp +using corr_id_retirement_ts_map_t = std::unordered_map; + +rocprofiler_client_id_t* client_id = nullptr; +rocprofiler_client_finalize_t client_fini_func = nullptr; +rocprofiler_context_id_t client_ctx = {0}; +rocprofiler_buffer_id_t client_buffer = {}; +buffer_name_info* client_name_info = new buffer_name_info{}; +kernel_symbol_map_t* client_kernels = new kernel_symbol_map_t{}; +auto client_mutex = std::shared_mutex{}; +auto client_external_corr_ids = external_corr_id_set_t{}; +auto client_retired_corr_ids = retired_corr_id_set_t{}; +auto client_corr_id_max_end_ts = corr_id_max_end_ts_map_t{}; +auto client_corr_id_retirement_ts = corr_id_retirement_ts_map_t{}; + +void +print_call_stack(const call_stack_t& _call_stack) { - if(record.kind == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT - && record.operation == ROCPROFILER_CODE_OBJECT_LOAD) + common::print_call_stack("external_correlation_id_request.log", _call_stack); +} + +void +tool_code_object_callback(rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* user_data, + void* callback_data) +{ + if(record.kind == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT && + record.operation == ROCPROFILER_CODE_OBJECT_LOAD) { if(record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD) { @@ -83,8 +116,8 @@ void tool_code_object_callback(rocprofiler_callback_tracing_record_t record, ROCPROFILER_CHECK(flush_status); } } - else if(record.kind == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT - && record.operation == ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER) + else if(record.kind == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT && + record.operation == ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER) { auto* data = static_cast(record.payload); if(record.phase == ROCPROFILER_CALLBACK_PHASE_LOAD) @@ -101,8 +134,8 @@ void tool_code_object_callback(rocprofiler_callback_tracing_record_t record, } } - (void)user_data; - (void)callback_data; + (void) user_data; + (void) callback_data; } struct external_corr_id_data @@ -122,8 +155,7 @@ struct external_corr_id_data friend std::ostream& operator<<(std::ostream& os, external_corr_id_data data) { - if(!data.valid()) - return os; + if(!data.valid()) return os; auto ss = std::stringstream{}; ss << "seen=" << data.seen_count << ", thr_id=" << data.thread_id << ", context_id=" << data.context_id.handle << ", kind=" << data.kind @@ -133,43 +165,46 @@ struct external_corr_id_data } }; -bool operator==(external_corr_id_data lhs, external_corr_id_data rhs) +bool +operator==(external_corr_id_data lhs, external_corr_id_data rhs) { return std::tie(lhs.thread_id, lhs.context_id.handle, lhs.kind, lhs.operation, lhs.internal_corr_id, - lhs.user_data) - == std::tie(rhs.thread_id, - rhs.context_id.handle, - rhs.kind, - rhs.operation, - rhs.internal_corr_id, - rhs.user_data); + lhs.user_data) == std::tie(rhs.thread_id, + rhs.context_id.handle, + rhs.kind, + rhs.operation, + rhs.internal_corr_id, + rhs.user_data); } -bool operator!=(external_corr_id_data lhs, external_corr_id_data rhs) +bool +operator!=(external_corr_id_data lhs, external_corr_id_data rhs) { return !(lhs == rhs); } -bool external_corr_id_data::valid() const +bool +external_corr_id_data::valid() const { static constexpr auto invalid_v = external_corr_id_data{}; return (*this != invalid_v); } -int set_external_correlation_id(rocprofiler_thread_id_t thr_id, - rocprofiler_context_id_t ctx_id, - rocprofiler_external_correlation_id_request_kind_t kind, - rocprofiler_tracing_operation_t op, - uint64_t internal_corr_id, - rocprofiler_user_data_t* external_corr_id, - void* user_data) +int +set_external_correlation_id(rocprofiler_thread_id_t thr_id, + rocprofiler_context_id_t ctx_id, + rocprofiler_external_correlation_id_request_kind_t kind, + rocprofiler_tracing_operation_t op, + uint64_t internal_corr_id, + rocprofiler_user_data_t* external_corr_id, + void* user_data) { - auto* _data - = new external_corr_id_data{thr_id, ctx_id, kind, op, internal_corr_id, user_data, 0}; + auto* _data = + new external_corr_id_data{thr_id, ctx_id, kind, op, internal_corr_id, user_data, 0}; { static auto _mtx = std::mutex{}; @@ -182,26 +217,30 @@ int set_external_correlation_id(rocprofiler_thread_id_t return 0; } -void tool_tracing_callback(rocprofiler_context_id_t context, - rocprofiler_buffer_id_t buffer_id, - rocprofiler_record_header_t** headers, - size_t num_headers, - void* user_data, - uint64_t /*drop_count*/) +// Helper to update the max end timestamp for a correlation ID +static void +track_record_end_timestamp(uint64_t corr_id, uint64_t end_timestamp) { - static const auto ensure_internal_correlation_id_retirement_ordering = [](uint64_t _corr_id) + auto _lk = std::unique_lock{client_mutex}; + auto it = client_corr_id_max_end_ts.find(corr_id); + if(it == client_corr_id_max_end_ts.end()) { - auto _lk = std::shared_lock{client_mutex}; - // this correlation ID should not have reported as retired yet so - // we are demoing the expectation here - if(client_retired_corr_ids.count(_corr_id) > 0) - { - auto msg = std::stringstream{}; - msg << "internal correlation id " << _corr_id << " was retired prematurely"; - throw std::runtime_error{msg.str()}; - } - }; + client_corr_id_max_end_ts[corr_id] = end_timestamp; + } + else + { + it->second = std::max(it->second, end_timestamp); + } +} +void +tool_tracing_callback(rocprofiler_context_id_t context, + rocprofiler_buffer_id_t buffer_id, + rocprofiler_record_header_t** headers, + size_t num_headers, + void* user_data, + uint64_t /*drop_count*/) +{ for(size_t i = 0; i < num_headers; ++i) { auto* header = headers[i]; @@ -223,17 +262,17 @@ void tool_tracing_callback(rocprofiler_context_id_t context, } } - if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING - && header->kind == ROCPROFILER_BUFFER_TRACING_HIP_RUNTIME_API) + if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING && + header->kind == ROCPROFILER_BUFFER_TRACING_HIP_RUNTIME_API) { - auto* record - = static_cast(header->payload); + auto* record = + static_cast(header->payload); // this should always be empty auto _extern_corr_id = external_corr_id_data{}; - // demonstrate reliability of correlation ID retirement ordering - ensure_internal_correlation_id_retirement_ordering(record->correlation_id.internal); + // Track the end timestamp for temporal ordering validation + track_record_end_timestamp(record->correlation_id.internal, record->end_timestamp); auto info = std::stringstream{}; info << "tid=" << record->thread_id << ", context=" << context.handle @@ -246,24 +285,24 @@ void tool_tracing_callback(rocprofiler_context_id_t context, static_cast(user_data)->emplace_back( source_location{__FUNCTION__, __FILE__, __LINE__, kind_name + info.str()}); } - else if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING - && header->kind == ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH) + else if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING && + header->kind == ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH) { - auto* record = static_cast( - header->payload); + auto* record = + static_cast(header->payload); - // demonstrate reliability of correlation ID retirement ordering - ensure_internal_correlation_id_retirement_ordering(record->correlation_id.internal); + // Track the end timestamp for temporal ordering validation + track_record_end_timestamp(record->correlation_id.internal, record->end_timestamp); auto _extern_corr_id = external_corr_id_data{}; if(record->correlation_id.external.ptr) { - auto* _extcid - = static_cast(record->correlation_id.external.ptr); + auto* _extcid = + static_cast(record->correlation_id.external.ptr); _extcid->seen_count++; _extern_corr_id = *_extcid; - // demonstrate reliability of correlation ID retirement ordering - ensure_internal_correlation_id_retirement_ordering(_extcid->internal_corr_id); + // Track the end timestamp for the external correlation ID's internal correlation ID + track_record_end_timestamp(_extcid->internal_corr_id, record->end_timestamp); } auto info = std::stringstream{}; @@ -282,24 +321,24 @@ void tool_tracing_callback(rocprofiler_context_id_t context, static_cast(user_data)->emplace_back( source_location{__FUNCTION__, __FILE__, __LINE__, kind_name + info.str()}); } - else if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING - && header->kind == ROCPROFILER_BUFFER_TRACING_MEMORY_COPY) + else if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING && + header->kind == ROCPROFILER_BUFFER_TRACING_MEMORY_COPY) { - auto* record - = static_cast(header->payload); + auto* record = + static_cast(header->payload); - // demonstrate reliability of correlation ID retirement ordering - ensure_internal_correlation_id_retirement_ordering(record->correlation_id.internal); + // Track the end timestamp for temporal ordering validation + track_record_end_timestamp(record->correlation_id.internal, record->end_timestamp); auto _extern_corr_id = external_corr_id_data{}; if(record->correlation_id.external.ptr) { - auto* _extcid - = static_cast(record->correlation_id.external.ptr); + auto* _extcid = + static_cast(record->correlation_id.external.ptr); _extcid->seen_count++; _extern_corr_id = *_extcid; - // demonstrate reliability of correlation ID retirement ordering - ensure_internal_correlation_id_retirement_ordering(_extcid->internal_corr_id); + // Track the end timestamp for the external correlation ID's internal correlation ID + track_record_end_timestamp(_extcid->internal_corr_id, record->end_timestamp); } auto info = std::stringstream{}; @@ -318,16 +357,18 @@ void tool_tracing_callback(rocprofiler_context_id_t context, static_cast(user_data)->emplace_back( source_location{__FUNCTION__, __FILE__, __LINE__, kind_name + info.str()}); } - else if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING - && header->kind == ROCPROFILER_BUFFER_TRACING_CORRELATION_ID_RETIREMENT) + else if(header->category == ROCPROFILER_BUFFER_CATEGORY_TRACING && + header->kind == ROCPROFILER_BUFFER_TRACING_CORRELATION_ID_RETIREMENT) { - auto* record - = static_cast( + auto* record = + static_cast( header->payload); { auto _lk = std::unique_lock{client_mutex}; client_retired_corr_ids.emplace(record->internal_correlation_id); + // Store the retirement timestamp for validation in tool_fini + client_corr_id_retirement_ts[record->internal_correlation_id] = record->timestamp; } auto _extern_corr_id = external_corr_id_data{}; @@ -352,14 +393,16 @@ void tool_tracing_callback(rocprofiler_context_id_t context, } } -template -auto make_array(Arg arg, Args&&... args) +template +auto +make_array(Arg arg, Args&&... args) { constexpr auto N = 1 + sizeof...(Args); return std::array{std::forward(arg), std::forward(args)...}; } -int tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) +int +tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) { assert(tool_data != nullptr); @@ -394,9 +437,9 @@ int tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) tool_data, &client_buffer)); - auto external_corr_id_request_kinds - = make_array(ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KERNEL_DISPATCH, - ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_MEMORY_COPY); + auto external_corr_id_request_kinds = + make_array(ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_KERNEL_DISPATCH, + ROCPROFILER_EXTERNAL_CORRELATION_REQUEST_MEMORY_COPY); ROCPROFILER_CHECK(rocprofiler_configure_external_correlation_id_request_service( client_ctx, @@ -406,14 +449,11 @@ int tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) nullptr)); auto hip_runtime_ops = std::vector{}; - const auto desired_hip_runtime_ops = std::unordered_set{"hipLaunchKernel", - "hipMemcpyAsync", - "hipMemsetAsync", - "hipMalloc"}; + const auto desired_hip_runtime_ops = std::unordered_set{ + "hipLaunchKernel", "hipMemcpyAsync", "hipMemsetAsync", "hipMalloc"}; for(auto [idx, itr] : (*client_name_info)[ROCPROFILER_BUFFER_TRACING_HIP_RUNTIME_API].items()) { - if(desired_hip_runtime_ops.count(*itr) > 0) - hip_runtime_ops.emplace_back(idx); + if(desired_hip_runtime_ops.count(*itr) > 0) hip_runtime_ops.emplace_back(idx); } if(desired_hip_runtime_ops.size() != hip_runtime_ops.size()) @@ -426,19 +466,11 @@ int tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) hip_runtime_ops.size(), client_buffer)); - ROCPROFILER_CHECK( - rocprofiler_configure_buffer_tracing_service(client_ctx, - ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH, - nullptr, - 0, - client_buffer)); + ROCPROFILER_CHECK(rocprofiler_configure_buffer_tracing_service( + client_ctx, ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH, nullptr, 0, client_buffer)); - ROCPROFILER_CHECK( - rocprofiler_configure_buffer_tracing_service(client_ctx, - ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, - nullptr, - 0, - client_buffer)); + ROCPROFILER_CHECK(rocprofiler_configure_buffer_tracing_service( + client_ctx, ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, nullptr, 0, client_buffer)); ROCPROFILER_CHECK(rocprofiler_configure_buffer_tracing_service( client_ctx, @@ -463,7 +495,8 @@ int tool_init(rocprofiler_client_finalize_t fini_func, void* tool_data) return 0; } -void tool_fini(void* tool_data) +void +tool_fini(void* tool_data) { assert(tool_data != nullptr); client_fini_func = nullptr; @@ -471,12 +504,46 @@ void tool_fini(void* tool_data) std::cout << "finalizing...\n" << std::flush; rocprofiler_stop_context(client_ctx); - ROCPROFILER_CHECK(rocprofiler_flush_buffer(client_buffer)); + // Buffer flush may return ERROR_FINALIZED if rocprofiler has already finalized + // and flushed buffers - this is not an error + auto flush_status = rocprofiler_flush_buffer(client_buffer); + if(flush_status != ROCPROFILER_STATUS_SUCCESS && + flush_status != ROCPROFILER_STATUS_ERROR_FINALIZED) + { + ROCPROFILER_CHECK(flush_status); + } auto* _call_stack = static_cast(tool_data); _call_stack->emplace_back(source_location{__FUNCTION__, __FILE__, __LINE__, ""}); - print_call_stack("external_correlation_id_request.log", *_call_stack); + print_call_stack(*_call_stack); + + // Validate temporal ordering: retirement timestamps should be >= max(end_timestamps) + // for records with the same correlation ID. Use a small tolerance for clock domain + // differences between GPU and CPU timestamps. + constexpr uint64_t timestamp_tolerance_ns = 1000; // 1 microsecond tolerance + size_t temporal_ordering_violations = 0; + for(const auto& [corr_id, max_end_ts] : client_corr_id_max_end_ts) + { + auto retirement_it = client_corr_id_retirement_ts.find(corr_id); + if(retirement_it != client_corr_id_retirement_ts.end()) + { + uint64_t retirement_ts = retirement_it->second; + // Check if retirement timestamp is before (max_end_ts - tolerance) + // This means retirement happened too early + if(retirement_ts + timestamp_tolerance_ns < max_end_ts) + { + std::cerr << "temporal ordering violation: correlation id " << corr_id + << " retired at timestamp " << retirement_ts + << " but has record with end_timestamp " << max_end_ts + << " (difference: " << (max_end_ts - retirement_ts) << " ns)\n" + << std::flush; + ++temporal_ordering_violations; + } + } + } + std::cerr << "temporal ordering violations : " << temporal_ordering_violations << "\n" + << std::flush; size_t unretired = 0; size_t unseen = 0; @@ -504,16 +571,17 @@ void tool_fini(void* tool_data) std::cerr << "external correlation IDs not seen : " << unseen << "\n" << std::flush; std::cerr << "internal correlation IDs not retired: " << unretired << "\n" << std::flush; - if(unseen > 0) - throw std::runtime_error{"unseen external correlation id data"}; - if(unretired > 0) - throw std::runtime_error{"unretired internal correlation id values"}; + if(unseen > 0) throw std::runtime_error{"unseen external correlation id data"}; + if(unretired > 0) throw std::runtime_error{"unretired internal correlation id values"}; + if(temporal_ordering_violations > 0) + throw std::runtime_error{"temporal ordering violation in correlation id retirement"}; delete _call_stack; } -} // namespace +} // namespace -void setup() +void +setup() { if(int status = 0; rocprofiler_is_initialized(&status) == ROCPROFILER_STATUS_SUCCESS && status == 0) @@ -522,21 +590,31 @@ void setup() } } -void shutdown() +void +shutdown() { if(client_id) { - ROCPROFILER_CHECK(rocprofiler_flush_buffer(client_buffer)); + // Buffer flush may return ERROR_FINALIZED if rocprofiler has already finalized + // and flushed buffers - this is not an error + auto flush_status = rocprofiler_flush_buffer(client_buffer); + if(flush_status != ROCPROFILER_STATUS_SUCCESS && + flush_status != ROCPROFILER_STATUS_ERROR_FINALIZED) + { + ROCPROFILER_CHECK(flush_status); + } client_fini_func(*client_id); } } -void start() +void +start() { ROCPROFILER_CHECK(rocprofiler_start_context(client_ctx)); } -void identify(uint64_t val) +void +identify(uint64_t val) { auto _tid = rocprofiler_thread_id_t{}; rocprofiler_get_thread_id(&_tid); @@ -545,16 +623,18 @@ void identify(uint64_t val) rocprofiler_push_external_correlation_id(client_ctx, _tid, user_data); } -void stop() +void +stop() { ROCPROFILER_CHECK(rocprofiler_stop_context(client_ctx)); } -} // namespace client +} // namespace client -extern "C" rocprofiler_tool_configure_result_t* rocprofiler_configure(uint32_t version, - const char* runtime_version, - uint32_t priority, - rocprofiler_client_id_t* id) +extern "C" rocprofiler_tool_configure_result_t* +rocprofiler_configure(uint32_t version, + const char* runtime_version, + uint32_t priority, + rocprofiler_client_id_t* id) { // set the client name id->name = "ExampleTool"; @@ -579,20 +659,18 @@ extern "C" rocprofiler_tool_configure_result_t* rocprofiler_configure(uint32_t client_tool_data->emplace_back( client::source_location{__FUNCTION__, __FILE__, __LINE__, info.str()}); - std::atexit( - []() - { - std::cout << "atexit handler...\n" << std::flush; - if(client::client_fini_func && client::client_id) - client::client_fini_func(*client::client_id); - }); + std::atexit([]() { + std::cout << "atexit handler...\n" << std::flush; + if(client::client_fini_func && client::client_id) + client::client_fini_func(*client::client_id); + }); // create configure data - static auto cfg - = rocprofiler_tool_configure_result_t{sizeof(rocprofiler_tool_configure_result_t), - &client::tool_init, - &client::tool_fini, - static_cast(client_tool_data)}; + static auto cfg = + rocprofiler_tool_configure_result_t{sizeof(rocprofiler_tool_configure_result_t), + &client::tool_init, + &client::tool_fini, + static_cast(client_tool_data)}; // return pointer to configure data return &cfg; diff --git a/Libraries/rocProfiler-SDK/external_correlation_id_request/main.cpp b/Libraries/rocProfiler-SDK/external_correlation_id_request/main.cpp index 7b53e2452..433642caf 100644 --- a/Libraries/rocProfiler-SDK/external_correlation_id_request/main.cpp +++ b/Libraries/rocProfiler-SDK/external_correlation_id_request/main.cpp @@ -322,7 +322,11 @@ void run_migrate(int rank, int tid, hipStream_t stream, int, char** argv) itr = init_v; } - test_page_migrate<<<1, 1024, 0, stream>>>(page_data.data(), incr_v); + auto page_data_dev_ptr = static_cast(nullptr); + HIP_CHECK( + hipHostGetDevicePointer(reinterpret_cast(&page_data_dev_ptr), page_data.data(), 0)); + + test_page_migrate<<<1, 1024, 0, stream>>>(page_data_dev_ptr, incr_v); HIP_CHECK(hipStreamSynchronize(stream));