diff --git a/third_party/xla/xla/backends/profiler/gpu/BUILD b/third_party/xla/xla/backends/profiler/gpu/BUILD index 4d0e82bf7c4822..cc1273597ef2a2 100644 --- a/third_party/xla/xla/backends/profiler/gpu/BUILD +++ b/third_party/xla/xla/backends/profiler/gpu/BUILD @@ -62,8 +62,8 @@ tsl_gpu_library( copts = tf_profiler_copts() + tsl_copts(), visibility = ["//visibility:public"], deps = [ - "@local_tsl//tsl/platform:macros", - "@local_tsl//tsl/platform:types", + "@tsl//tsl/platform:macros", + "@tsl//tsl/platform:types", ] + if_cuda(["//xla/tsl/cuda:cupti"]), ) @@ -76,7 +76,7 @@ tsl_gpu_library( ":cupti_interface", ], deps = [ - "@local_tsl//tsl/platform:test", + "@tsl//tsl/platform:test", ], ) @@ -92,9 +92,9 @@ tsl_gpu_library( visibility = ["//visibility:public"], deps = [ "@com_google_absl//absl/debugging:leak_check", - "@local_tsl//tsl/platform:logging", - "@local_tsl//tsl/platform:mutex", - "@local_tsl//tsl/platform:thread_annotations", + "@tsl//tsl/platform:logging", + "@tsl//tsl/platform:mutex", + "@tsl//tsl/platform:thread_annotations", ], ) @@ -106,12 +106,12 @@ xla_test( copts = tf_profiler_copts() + tsl_copts(), tags = [ "no_mac", - "cuda-only", #TODO(rocm): weekly-sync 24-12-10 + "cuda-only" ], deps = [ ":cupti_interface", "@com_google_googletest//:gtest_main", - "@local_tsl//tsl/platform:test", + "@tsl//tsl/platform:test", ] + if_cuda_is_configured([ ":cuda_test", ":cupti_error_manager", @@ -137,7 +137,7 @@ cuda_library( deps = [ "@local_config_cuda//cuda:cuda_headers", "@local_config_cuda//cuda:cuda_runtime", - "@local_tsl//tsl/platform:test", + "@tsl//tsl/platform:test", ], ) @@ -182,14 +182,13 @@ tsl_gpu_library( "@com_google_absl//absl/log", "@com_google_absl//absl/status", "@com_google_absl//absl/strings:string_view", - "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:span", - "@local_tsl//tsl/platform:env", - "@local_tsl//tsl/platform:errors", - "@local_tsl//tsl/platform:logging", - "@local_tsl//tsl/platform:macros", - "@local_tsl//tsl/platform:platform_port", - "@local_tsl//tsl/platform:types", + "@tsl//tsl/platform:env", + "@tsl//tsl/platform:errors", + "@tsl//tsl/platform:logging", + "@tsl//tsl/platform:macros", + "@tsl//tsl/platform:platform_port", + "@tsl//tsl/platform:types", ], ) @@ -208,13 +207,13 @@ tsl_gpu_library( "@com_google_absl//absl/container:node_hash_map", "@com_google_absl//absl/container:node_hash_set", "@com_google_absl//absl/status", - "@local_tsl//tsl/platform:env", - "@local_tsl//tsl/platform:errors", - "@local_tsl//tsl/platform:logging", - "@local_tsl//tsl/platform:macros", - "@local_tsl//tsl/platform:platform_port", - "@local_tsl//tsl/platform:types", - "@local_tsl//tsl/profiler/lib:scoped_annotation", + "@tsl//tsl/platform:env", + "@tsl//tsl/platform:errors", + "@tsl//tsl/platform:logging", + "@tsl//tsl/platform:macros", + "@tsl//tsl/platform:platform_port", + "@tsl//tsl/platform:types", + "@tsl//tsl/profiler/lib:scoped_annotation", ], ) @@ -246,7 +245,6 @@ tsl_gpu_library( "@com_google_absl//absl/container:node_hash_set", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", - "@com_google_absl//absl/types:optional", "@local_tsl//tsl/platform:abi", "@local_tsl//tsl/platform:env_time", "@local_tsl//tsl/platform:errors", @@ -265,6 +263,12 @@ tsl_gpu_library( srcs = if_rocm(["rocm_tracer.cc"]), hdrs = if_rocm(["rocm_tracer.h"]), copts = tf_profiler_copts() + tsl_copts(), + linkopts = select({ + "//conditions:default": [ + "-L/opt/rocm/lib", # search path for all ROCm shared objects + "-lrocprofiler-sdk", # the library that owns the missing symbols + ], + }), tags = [ "gpu", "rocm-only", @@ -284,7 +288,6 @@ tsl_gpu_library( "@com_google_absl//absl/container:node_hash_map", "@com_google_absl//absl/container:node_hash_set", "@com_google_absl//absl/status", - "@com_google_absl//absl/types:optional", "@local_tsl//tsl/platform:env", "@local_tsl//tsl/platform:errors", "@local_tsl//tsl/platform:logging", @@ -295,6 +298,58 @@ tsl_gpu_library( ], ) +xla_test( + name = "rocm_tracer_test", + size = "small", + srcs = ["rocm_tracer_test.cc"], + copts = tf_profiler_copts() + tsl_copts(), + tags = [ + "gpu", + "rocm", + "rocm-only", + ] + if_google([ + # Optional: only run internally if ROCm config is enabled + "manual", + ]), + deps = if_rocm([ + ":rocm_tracer", + ":rocm_collector", + ]) + [ + "//xla/tsl/profiler/utils:xplane_builder", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_googletest//:gtest_main", + "@local_tsl//tsl/platform:status_matchers", + "@local_tsl//tsl/platform:test", + "@local_tsl//tsl/profiler/protobuf:xplane_proto_cc", + ], +) + +xla_test( + name = "rocm_collector_test", + size = "small", + srcs = ["rocm_collector_test.cc"], + copts = tf_profiler_copts() + tsl_copts(), + tags = [ + "gpu", + "rocm", + "rocm-only", + ] + if_google([ + "manual", + ]), + deps = if_rocm([ + ":rocm_tracer", + ":rocm_collector", + ]) + [ + "//xla/tsl/profiler/utils:xplane_builder", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_googletest//:gtest_main", + "@local_tsl//tsl/platform:status_matchers", + "@local_tsl//tsl/platform:test", + "@local_tsl//tsl/profiler/protobuf:xplane_proto_cc", + ], + args = ["--gtest_shuffle"], +) + tsl_gpu_library( name = "nvtx_utils", srcs = if_cuda(["nvtx_utils.cc"]), @@ -302,8 +357,8 @@ tsl_gpu_library( copts = tf_profiler_copts() + tsl_copts(), deps = [ "@com_google_absl//absl/strings", - "@local_tsl//tsl/platform", - "@local_tsl//tsl/platform:macros", + "@tsl//tsl/platform", + "@tsl//tsl/platform:macros", ], ) @@ -331,12 +386,12 @@ tsl_gpu_library( "@com_google_absl//absl/log", "@com_google_absl//absl/log:check", "@com_google_absl//absl/strings", - "@local_tsl//tsl/platform:abi", - "@local_tsl//tsl/platform:mutex", - "@local_tsl//tsl/platform:platform_port", - "@local_tsl//tsl/platform:thread_annotations", - "@local_tsl//tsl/platform:types", - "@local_tsl//tsl/profiler/protobuf:xplane_proto_cc", + "@tsl//tsl/platform:abi", + "@tsl//tsl/platform:mutex", + "@tsl//tsl/platform:platform_port", + "@tsl//tsl/platform:thread_annotations", + "@tsl//tsl/platform:types", + "@tsl//tsl/profiler/protobuf:xplane_proto_cc", ] + if_cuda([ "//xla/tsl/cuda:cupti", "//xla/tsl/cuda", @@ -360,10 +415,10 @@ tsl_gpu_library( "@com_google_absl//absl/status", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:string_view", - "@local_tsl//tsl/platform:errors", - "@local_tsl//tsl/platform:mutex", - "@local_tsl//tsl/platform:platform_port", - "@local_tsl//tsl/platform:thread_annotations", + "@tsl//tsl/platform:errors", + "@tsl//tsl/platform:mutex", + "@tsl//tsl/platform:platform_port", + "@tsl//tsl/platform:thread_annotations", ] + if_cuda(["//xla/tsl/cuda:cupti"]), ) @@ -377,8 +432,8 @@ tsl_gpu_library( ":cupti_wrapper", "@com_google_absl//absl/base", "@com_google_absl//absl/memory", - "@local_tsl//tsl/platform:logging", - "@local_tsl//tsl/platform:stringpiece", + "@tsl//tsl/platform:logging", + "@tsl//tsl/platform:stringpiece", "//xla/tsl/util:env_var", ], visibility = ["//visibility:public"], @@ -396,11 +451,9 @@ xla_test( ], deps = [ ":cupti_buffer_events", - ":cupti_collector", - ":cupti_tracer", ":cupti_utils", "@com_google_googletest//:gtest_main", - "@local_tsl//tsl/platform:test", + "@tsl//tsl/platform:test", ], ) @@ -421,9 +474,9 @@ xla_test( "//xla/tsl/profiler/utils:xplane_builder", "@com_google_absl//absl/container:flat_hash_map", "@com_google_googletest//:gtest_main", - "@local_tsl//tsl/platform:status_matchers", - "@local_tsl//tsl/platform:test", - "@local_tsl//tsl/profiler/protobuf:xplane_proto_cc", + "@tsl//tsl/platform:status_matchers", + "@tsl//tsl/platform:test", + "@tsl//tsl/profiler/protobuf:xplane_proto_cc", ], ) @@ -439,6 +492,7 @@ cuda_library( local_defines = if_oss(["NVTX_VERSION_3_1=1"]), tags = ["cuda-only"], visibility = ["//visibility:public"], + deps = ["@local_config_cuda//cuda:cuda_headers"], ) xla_test( diff --git a/third_party/xla/xla/backends/profiler/gpu/device_tracer_rocm.cc b/third_party/xla/xla/backends/profiler/gpu/device_tracer_rocm.cc index 09b457ee3d38c8..abb2a55a3b307c 100644 --- a/third_party/xla/xla/backends/profiler/gpu/device_tracer_rocm.cc +++ b/third_party/xla/xla/backends/profiler/gpu/device_tracer_rocm.cc @@ -45,23 +45,8 @@ namespace xla { namespace profiler { using tensorflow::ProfileOptions; -using tsl::mutex; -using tsl::mutex_lock; -using tsl::profiler::Annotation; using tsl::profiler::AnnotationStack; -using tsl::profiler::FindOrAddMutablePlaneWithName; -using tsl::profiler::GetStatTypeStr; -using tsl::profiler::GpuPlaneName; -using tsl::profiler::kDeviceVendorAMD; -using tsl::profiler::kThreadIdOverhead; -using tsl::profiler::ParseAnnotationStack; using tsl::profiler::ProfilerInterface; -using tsl::profiler::RegisterProfilerFactory; -using tsl::profiler::StatType; -using tsl::profiler::XEventBuilder; -using tsl::profiler::XEventMetadata; -using tsl::profiler::XLineBuilder; -using tsl::profiler::XPlaneBuilder; using tsl::profiler::XSpace; // GpuTracer for ROCm GPU. @@ -82,7 +67,6 @@ class GpuTracer : public profiler::ProfilerInterface { absl::Status DoStop(); RocmTracerOptions GetRocmTracerOptions(); - RocmTraceCollectorOptions GetRocmTraceCollectorOptions(uint32_t num_gpus); enum State { @@ -99,10 +83,9 @@ class GpuTracer : public profiler::ProfilerInterface { }; RocmTracerOptions GpuTracer::GetRocmTracerOptions() { - // TODO(rocm-profiler): We need support for context similar to CUDA RocmTracerOptions options; +#if TF_ROCM_VERSION < 60300 std::vector empty_vec; - // clang formatting does not preserve one entry per line // clang-format off std::vector hip_api_domain_ops{ @@ -172,7 +155,9 @@ RocmTracerOptions GpuTracer::GetRocmTracerOptions() { options.api_callbacks.emplace(ACTIVITY_DOMAIN_HIP_API, empty_vec); options.activity_tracing.emplace(ACTIVITY_DOMAIN_HIP_OPS, empty_vec); - +#else + options.max_annotation_strings = 1024 * 1024; +#endif return options; } @@ -187,20 +172,16 @@ RocmTraceCollectorOptions GpuTracer::GetRocmTraceCollectorOptions( } absl::Status GpuTracer::DoStart() { - if (!rocm_tracer_->IsAvailable()) { - return tsl::errors::Unavailable("Another profile session running."); - } - AnnotationStack::Enable(true); + uint64_t start_gputime_ns = RocmTracer::GetTimestamp(); + uint64_t start_walltime_ns = tsl::EnvTime::NowNanos(); + RocmTracerOptions tracer_options = GetRocmTracerOptions(); RocmTraceCollectorOptions trace_collector_options = GetRocmTraceCollectorOptions(rocm_tracer_->NumGpus()); - uint64_t start_gputime_ns = RocmTracer::GetTimestamp(); - uint64_t start_walltime_ns = tsl::EnvTime::NowNanos(); rocm_trace_collector_ = CreateRocmCollector( trace_collector_options, start_walltime_ns, start_gputime_ns); - RocmTracerOptions tracer_options = GetRocmTracerOptions(); rocm_tracer_->Enable(tracer_options, rocm_trace_collector_.get()); return absl::OkStatus(); @@ -259,12 +240,16 @@ std::unique_ptr CreateGpuTracer( if (options.device_type() != ProfileOptions::GPU && options.device_type() != ProfileOptions::UNSPECIFIED) return nullptr; - +#if TF_ROCM_VERSION < 60300 profiler::RocmTracer* rocm_tracer = profiler::RocmTracer::GetRocmTracerSingleton(); if (!rocm_tracer->IsAvailable()) return nullptr; - return std::make_unique(rocm_tracer); +#else + auto& rocm_tracer = profiler::RocmTracer::i(); + if (!rocm_tracer.IsAvailable()) return nullptr; + return std::make_unique(&rocm_tracer); +#endif } auto register_rocm_gpu_tracer_factory = [] { diff --git a/third_party/xla/xla/backends/profiler/gpu/rocm_collector.cc b/third_party/xla/xla/backends/profiler/gpu/rocm_collector.cc index dac0a853c76365..12053c4a5b4c2e 100644 --- a/third_party/xla/xla/backends/profiler/gpu/rocm_collector.cc +++ b/third_party/xla/xla/backends/profiler/gpu/rocm_collector.cc @@ -1,4 +1,3 @@ - /* Copyright 2024 The OpenXLA Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); @@ -18,17 +17,12 @@ limitations under the License. #include "absl/container/fixed_array.h" #include "absl/container/flat_hash_set.h" -#include "absl/container/node_hash_map.h" + #include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" #include "absl/strings/str_join.h" -#include "absl/types/optional.h" -#include "xla/stream_executor/rocm/roctracer_wrapper.h" + #include "xla/tsl/profiler/backends/cpu/annotation_stack.h" -#include "xla/tsl/profiler/utils/parse_annotation.h" -#include "xla/tsl/profiler/utils/xplane_builder.h" -#include "xla/tsl/profiler/utils/xplane_schema.h" -#include "xla/tsl/profiler/utils/xplane_utils.h" #include "xla/tsl/util/env_var.h" #include "tsl/platform/abi.h" #include "tsl/platform/env_time.h" @@ -44,10 +38,8 @@ limitations under the License. namespace xla { namespace profiler { -namespace se = ::stream_executor; using tensorflow::ProfileOptions; -using tsl::mutex; -using tsl::mutex_lock; +using tsl::Status; using tsl::profiler::Annotation; using tsl::profiler::AnnotationStack; using tsl::profiler::FindOrAddMutablePlaneWithName; @@ -57,8 +49,8 @@ using tsl::profiler::kDeviceVendorAMD; using tsl::profiler::kThreadIdOverhead; using tsl::profiler::ParseAnnotationStack; using tsl::profiler::ProfilerInterface; -// using tsl::profiler::RegisterProfilerFactory; using tsl::profiler::StatType; +using tsl::profiler::XEvent; using tsl::profiler::XEventBuilder; using tsl::profiler::XEventMetadata; using tsl::profiler::XLineBuilder; @@ -110,18 +102,17 @@ std::string GetDeviceXLineName( return absl::StrCat(line_name, "(", absl::StrJoin(type_names, ","), ")"); } -} // namespace - -static void DumpRocmTracerEvent(const RocmTracerEvent& event, - uint64_t start_walltime_ns, - uint64_t start_gputime_ns, - const std::string& message) { +void PrintRocmTracerEvent(const RocmTracerEvent& event, + const std::string& message = {}, + uint64_t start_walltime_ns = 0, + uint64_t start_gputime_ns = 0) { std::ostringstream oss; oss << "correlation_id=" << event.correlation_id; oss << ",type=" << GetRocmTracerEventTypeName(event.type); oss << ",source=" << GetRocmTracerEventSourceName(event.source); oss << ",domain=" << GetRocmTracerEventDomainName(event.domain); oss << ",name=" << event.name; + oss << ",corr_id=" << event.correlation_id; oss << ",annotation=" << event.annotation; oss << ",start_time_us=" << (start_walltime_ns + (start_gputime_ns - event.start_time_ns)) / 1000; @@ -136,7 +127,6 @@ static void DumpRocmTracerEvent(const RocmTracerEvent& event, case RocmTracerEventType::MemcpyD2H: case RocmTracerEventType::MemcpyH2D: case RocmTracerEventType::MemcpyD2D: - case RocmTracerEventType::MemcpyP2P: oss << ",num_bytes=" << event.memcpy_info.num_bytes; oss << ",destination=" << event.memcpy_info.destination; oss << ",async=" << event.memcpy_info.async; @@ -154,10 +144,12 @@ static void DumpRocmTracerEvent(const RocmTracerEvent& event, DCHECK(false); break; } - oss << message; - VLOG(3) << oss.str(); + VLOG(3) << oss.str() << ' ' << message; } +#if TF_ROCM_VERSION < 60300 + +namespace se = ::stream_executor; static uint64_t get_timestamp() { uint64_t ts; if (se::wrap::roctracer_get_timestamp(&ts) != ROCTRACER_STATUS_SUCCESS) { @@ -169,526 +161,411 @@ static uint64_t get_timestamp() { } return ts; } - -struct RocmDeviceOccupancyParams { - hipFuncAttributes attributes = {}; - int block_size = 0; - size_t dynamic_smem_size = 0; - void* func_ptr; - - friend bool operator==(const RocmDeviceOccupancyParams& lhs, - const RocmDeviceOccupancyParams& rhs) { - return 0 == memcmp(&lhs, &rhs, sizeof(lhs)); +#else +uint64_t get_timestamp() { + uint64_t ts; + rocprofiler_status_t CHECKSTATUS = rocprofiler_get_timestamp(&ts); + if (CHECKSTATUS != ROCPROFILER_STATUS_SUCCESS) { + const char* errstr = rocprofiler_get_status_string(CHECKSTATUS); + LOG(ERROR) << "function rocprofiler_get_timestamp failed with error " + << errstr; + return 0; } + return ts; +} +#endif - template - friend H AbslHashValue(H hash_state, - const RocmDeviceOccupancyParams& params) { - return H::combine( - std::move(hash_state), params.attributes.maxThreadsPerBlock, - params.attributes.numRegs, params.attributes.sharedSizeBytes, - params.attributes.maxDynamicSharedSizeBytes, params.block_size, - params.dynamic_smem_size, params.func_ptr); - } -}; - -struct OccupancyStats { - double occupancy_pct = 0.0; - int min_grid_size = 0; - int suggested_block_size = 0; -}; - -struct CorrelationInfo { - CorrelationInfo(uint32_t t, uint32_t e) : thread_id(t), enqueue_time_ns(e) {} - uint32_t thread_id; - uint64_t enqueue_time_ns; -}; - -class PerDeviceCollector { - private: - OccupancyStats GetOccupancy(const RocmDeviceOccupancyParams& params) const { - // TODO(rocm-profiler): hipOccupancyMaxActiveBlocksPerMultiprocessor only - // return hipSuccess for HIP_API_ID_hipLaunchKernel - - OccupancyStats stats; - int number_of_active_blocks; - hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( - &number_of_active_blocks, params.func_ptr, params.block_size, - params.dynamic_smem_size); - - if (err != hipError_t::hipSuccess) { - return {}; - } +} // namespace - stats.occupancy_pct = number_of_active_blocks * params.block_size * 100; - stats.occupancy_pct /= device_properties_.maxThreadsPerMultiProcessor; +OccupancyStats PerDeviceCollector::GetOccupancy( + const RocmDeviceOccupancyParams& params) const { + // TODO(rocm-profiler): hipOccupancyMaxActiveBlocksPerMultiprocessor only + // return hipSuccess for HIP_API_ID_hipLaunchKernel + OccupancyStats stats; + int number_of_active_blocks; + hipError_t err = hipOccupancyMaxActiveBlocksPerMultiprocessor( + &number_of_active_blocks, params.func_ptr, params.block_size, + params.dynamic_smem_size); + + if (err != hipError_t::hipSuccess) { + return {}; + } - err = hipOccupancyMaxPotentialBlockSize( - &stats.min_grid_size, &stats.suggested_block_size, - static_cast(params.func_ptr), params.dynamic_smem_size, 0); + stats.occupancy_pct = number_of_active_blocks * params.block_size * 100; + stats.occupancy_pct /= device_properties_.maxThreadsPerMultiProcessor; - if (err != hipError_t::hipSuccess) { - return {}; - } + err = hipOccupancyMaxPotentialBlockSize( + &stats.min_grid_size, &stats.suggested_block_size, + static_cast(params.func_ptr), params.dynamic_smem_size, 0); - return stats; + if (err != hipError_t::hipSuccess) { + return {}; } - void CreateXEvent(const RocmTracerEvent& event, XPlaneBuilder* plane, - uint64_t start_gpu_ns, uint64_t end_gpu_ns, - XLineBuilder* line) { - if (event.start_time_ns < start_gpu_ns || event.end_time_ns > end_gpu_ns || - event.start_time_ns > event.end_time_ns) { - VLOG(2) << "events have abnormal timestamps:" << event.name - << " start time(ns): " << event.start_time_ns - << " end time(ns): " << event.end_time_ns - << " start gpu(ns):" << start_gpu_ns - << " end gpu(ns):" << end_gpu_ns - << " corr. id:" << event.correlation_id; - return; - } - std::string kernel_name = tsl::port::MaybeAbiDemangle(event.name.c_str()); - if (kernel_name.empty()) { - kernel_name = GetRocmTracerEventTypeName(event.type); - } - XEventMetadata* event_metadata = - plane->GetOrCreateEventMetadata(std::move(kernel_name)); - XEventBuilder xevent = line->AddEvent(*event_metadata); - VLOG(7) << "Adding event to line=" << line->Id(); - xevent.SetTimestampNs(event.start_time_ns); - xevent.SetEndTimestampNs(event.end_time_ns); - if (event.source == RocmTracerEventSource::ApiCallback) { - xevent.AddStatValue( - *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kDeviceId)), - event.device_id); - } - if (event.correlation_id != RocmTracerEvent::kInvalidCorrelationId) { - xevent.AddStatValue(*plane->GetOrCreateStatMetadata( - GetStatTypeStr(StatType::kCorrelationId)), - event.correlation_id); - } - if (!event.roctx_range.empty()) { - xevent.AddStatValue( - *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kNVTXRange)), - *plane->GetOrCreateStatMetadata(event.roctx_range)); - } + return stats; +} - if (event.type == RocmTracerEventType::Kernel && - event.source == RocmTracerEventSource::Activity) { - RocmDeviceOccupancyParams params{}; - params.attributes.maxThreadsPerBlock = INT_MAX; - params.attributes.numRegs = - static_cast(event.kernel_info.registers_per_thread); - params.attributes.sharedSizeBytes = - event.kernel_info.static_shared_memory_usage; - // params.attributes.partitionedGCConfig = PARTITIONED_GC_OFF; - // params.attributes.shmemLimitConfig = FUNC_SHMEM_LIMIT_DEFAULT; - params.attributes.maxDynamicSharedSizeBytes = 0; - params.block_size = static_cast(event.kernel_info.block_x * - event.kernel_info.block_y * - event.kernel_info.block_z); - - params.dynamic_smem_size = event.kernel_info.dynamic_shared_memory_usage; - params.func_ptr = event.kernel_info.func_ptr; - - OccupancyStats& occ_stats = occupancy_cache_[params]; - if (occ_stats.occupancy_pct == 0.0) { - occ_stats = GetOccupancy(params); - } - xevent.AddStatValue(*plane->GetOrCreateStatMetadata(GetStatTypeStr( - StatType::kTheoreticalOccupancyPct)), - occ_stats.occupancy_pct); - xevent.AddStatValue(*plane->GetOrCreateStatMetadata( - GetStatTypeStr(StatType::kOccupancyMinGridSize)), - static_cast(occ_stats.min_grid_size)); - xevent.AddStatValue( - *plane->GetOrCreateStatMetadata( - GetStatTypeStr(StatType::kOccupancySuggestedBlockSize)), - static_cast(occ_stats.suggested_block_size)); - xevent.AddStatValue(*plane->GetOrCreateStatMetadata( - GetStatTypeStr(StatType::kKernelDetails)), - *plane->GetOrCreateStatMetadata(ToXStat( - event.kernel_info, occ_stats.occupancy_pct))); - } else if (event.type == RocmTracerEventType::MemcpyH2D || - event.type == RocmTracerEventType::MemcpyD2H || - event.type == RocmTracerEventType::MemcpyD2D || - event.type == RocmTracerEventType::MemcpyP2P || - event.type == RocmTracerEventType::MemcpyOther) { - VLOG(7) << "Add Memcpy stat"; - const auto& memcpy_info = event.memcpy_info; - std::string memcpy_details = absl::StrCat( - // TODO(rocm-profiler): we need to discover the memory kind similar - // to CUDA - "kind:", "Unknown", " size:", memcpy_info.num_bytes, - " dest:", memcpy_info.destination, " async:", memcpy_info.async); - xevent.AddStatValue( - *plane->GetOrCreateStatMetadata( - GetStatTypeStr(StatType::kMemcpyDetails)), - *plane->GetOrCreateStatMetadata(std::move(memcpy_details))); - } else if (event.type == RocmTracerEventType::MemoryAlloc) { - VLOG(7) << "Add MemAlloc stat"; - std::string value = - // TODO(rocm-profiler): we need to discover the memory kind similar - // to CUDA - absl::StrCat("kind:", "Unknown", - " num_bytes:", event.memalloc_info.num_bytes); - xevent.AddStatValue(*plane->GetOrCreateStatMetadata( - GetStatTypeStr(StatType::kMemallocDetails)), - *plane->GetOrCreateStatMetadata(std::move(value))); - } else if (event.type == RocmTracerEventType::MemoryFree) { - VLOG(7) << "Add MemFree stat"; - std::string value = - // TODO(rocm-profiler): we need to discover the memory kind similar - // to CUDA - absl::StrCat("kind:", "Unknown", - " num_bytes:", event.memalloc_info.num_bytes); - xevent.AddStatValue(*plane->GetOrCreateStatMetadata( - GetStatTypeStr(StatType::kMemFreeDetails)), - *plane->GetOrCreateStatMetadata(std::move(value))); - } else if (event.type == RocmTracerEventType::Memset) { - VLOG(7) << "Add Memset stat"; - auto value = - // TODO(rocm-profiler): we need to discover the memory kind similar - // to CUDA - absl::StrCat("kind:", "Unknown", - " num_bytes:", event.memset_info.num_bytes, - " async:", event.memset_info.async); - xevent.AddStatValue(*plane->GetOrCreateStatMetadata( - GetStatTypeStr(StatType::kMemsetDetails)), - *plane->GetOrCreateStatMetadata(std::move(value))); - } - // TODO(rocm-profiler): we need to support the following event type - /* else if (event.type == CuptiTracerEventType::MemoryResidency) { - VLOG(7) << "Add MemoryResidency stat"; - std::string value = absl::StrCat( - "kind:", GetMemoryKindName(event.memory_residency_info.kind), - " num_bytes:", event.memory_residency_info.num_bytes, - " addr:", event.memory_residency_info.address); - xevent.AddStatValue(*plane->GetOrCreateStatMetadata(GetStatTypeStr( - StatType::kMemoryResidencyDetails)), - *plane->GetOrCreateStatMetadata(std::move(value))); - } */ - - std::vector annotation_stack = - ParseAnnotationStack(event.annotation); - if (!annotation_stack.empty()) { - xevent.AddStatValue( - *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kTfOp)), - *plane->GetOrCreateStatMetadata(annotation_stack.begin()->name)); - } - // If multiple metadata have the same key name, show the values from the - // top of the stack (innermost annotation). Concatenate the values from - // "hlo_op". - absl::flat_hash_set key_set; - - for (auto annotation = annotation_stack.rbegin(); - annotation != annotation_stack.rend(); ++annotation) { - for (const Annotation::Metadata& metadata : annotation->metadata) { - if (key_set.insert(metadata.key).second) { - xevent.ParseAndAddStatValue( - *plane->GetOrCreateStatMetadata(metadata.key), metadata.value); - } - } - } +void PerDeviceCollector::CreateXEvent(const RocmTracerEvent& event, + XPlaneBuilder* plane, + uint64_t start_gpu_ns, + uint64_t end_gpu_ns, XLineBuilder* line) { + if (event.start_time_ns < start_gpu_ns || event.end_time_ns > end_gpu_ns || + event.start_time_ns > event.end_time_ns) { + VLOG(2) << "events have abnormal timestamps:" << event.name + << " start time(ns): " << event.start_time_ns + << " end time(ns): " << event.end_time_ns + << " start gpu(ns):" << start_gpu_ns + << " end gpu(ns):" << end_gpu_ns + << " corr. id:" << event.correlation_id; + return; } - - void SortByStartTime() { - mutex_lock lock(events_mutex); - std::sort(events.begin(), events.end(), - [](const RocmTracerEvent& event1, const RocmTracerEvent& event2) { - return event1.start_time_ns < event2.start_time_ns; - }); + std::string kernel_name = tsl::port::MaybeAbiDemangle(event.name.c_str()); + if (kernel_name.empty()) { + kernel_name = GetRocmTracerEventTypeName(event.type); + } + XEventMetadata* event_metadata = + plane->GetOrCreateEventMetadata(std::move(kernel_name)); + XEventBuilder xevent = line->AddEvent(*event_metadata); + VLOG(7) << "Adding event to line=" << line->Id(); + xevent.SetTimestampNs(event.start_time_ns); + xevent.SetEndTimestampNs(event.end_time_ns); + if (event.source == RocmTracerEventSource::ApiCallback) { + xevent.AddStatValue( + *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kDeviceId)), + event.device_id); + } + if (event.correlation_id != RocmTracerEvent::kInvalidCorrelationId) { + xevent.AddStatValue(*plane->GetOrCreateStatMetadata( + GetStatTypeStr(StatType::kCorrelationId)), + event.correlation_id); + } + if (!event.roctx_range.empty()) { + xevent.AddStatValue( + *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kNVTXRange)), + *plane->GetOrCreateStatMetadata(event.roctx_range)); } - bool IsHostEvent(const RocmTracerEvent& event, tsl::int64* line_id) { - // DriverCallback(i.e. kernel launching) events are host events. - if (event.source == RocmTracerEventSource::ApiCallback) { - *line_id = event.thread_id; - return true; - } else { // activities - *line_id = event.stream_id; - return false; - } - - // TODO(rocm-profiler): do we have such a report in rocm? - // Non-overhead activity events are device events. - /* if (event.type != CuptiTracerEventType::Overhead) { - *line_id = event.stream_id; - return false; - } */ - // Overhead events can be associated with a thread or a stream, etc. - // If a valid thread id is specified, we consider it as a host event. - // - - if (event.stream_id != RocmTracerEvent::kInvalidStreamId) { - *line_id = event.stream_id; - return false; - } else if (event.thread_id != RocmTracerEvent::kInvalidThreadId && - event.thread_id != 0) { - *line_id = event.thread_id; - return true; - } else { - *line_id = tsl::profiler::kThreadIdOverhead; - return false; + if (event.type == RocmTracerEventType::Kernel && + event.source == RocmTracerEventSource::Activity) { + RocmDeviceOccupancyParams params{}; + params.attributes.maxThreadsPerBlock = INT_MAX; + params.attributes.numRegs = + static_cast(event.kernel_info.registers_per_thread); + params.attributes.sharedSizeBytes = + event.kernel_info.static_shared_memory_usage; + // params.attributes.partitionedGCConfig = PARTITIONED_GC_OFF; + // params.attributes.shmemLimitConfig = FUNC_SHMEM_LIMIT_DEFAULT; + params.attributes.maxDynamicSharedSizeBytes = 0; + params.block_size = + static_cast(event.kernel_info.block_x * event.kernel_info.block_y * + event.kernel_info.block_z); + + params.dynamic_smem_size = event.kernel_info.dynamic_shared_memory_usage; + params.func_ptr = event.kernel_info.func_ptr; + } else if (event.type == RocmTracerEventType::MemcpyH2D || + event.type == RocmTracerEventType::MemcpyD2H || + event.type == RocmTracerEventType::MemcpyD2D || + event.type == RocmTracerEventType::MemcpyOther) { + VLOG(7) << "Add Memcpy stat"; + const auto& memcpy_info = event.memcpy_info; + std::string memcpy_details = absl::StrCat( + // TODO(rocm-profiler): we need to discover the memory kind similar + // to CUDA + "kind:", "Unknown", " size:", memcpy_info.num_bytes, + " dest:", memcpy_info.destination, " async:", memcpy_info.async); + xevent.AddStatValue( + *plane->GetOrCreateStatMetadata( + GetStatTypeStr(StatType::kMemcpyDetails)), + *plane->GetOrCreateStatMetadata(std::move(memcpy_details))); + } else if (event.type == RocmTracerEventType::MemoryAlloc) { + VLOG(7) << "Add MemAlloc stat"; + std::string value = + // TODO(rocm-profiler): we need to discover the memory kind similar + // to CUDA + absl::StrCat("kind:", "Unknown", + " num_bytes:", event.memalloc_info.num_bytes); + xevent.AddStatValue(*plane->GetOrCreateStatMetadata( + GetStatTypeStr(StatType::kMemallocDetails)), + *plane->GetOrCreateStatMetadata(std::move(value))); + } else if (event.type == RocmTracerEventType::MemoryFree) { + VLOG(7) << "Add MemFree stat"; + std::string value = + // TODO(rocm-profiler): we need to discover the memory kind similar + // to CUDA + absl::StrCat("kind:", "Unknown", + " num_bytes:", event.memalloc_info.num_bytes); + xevent.AddStatValue(*plane->GetOrCreateStatMetadata( + GetStatTypeStr(StatType::kMemFreeDetails)), + *plane->GetOrCreateStatMetadata(std::move(value))); + } else if (event.type == RocmTracerEventType::Memset) { + VLOG(7) << "Add Memset stat"; + auto value = + // TODO(rocm-profiler): we need to discover the memory kind similar + // to CUDA + absl::StrCat("kind:", "Unknown", + " num_bytes:", event.memset_info.num_bytes, + " async:", event.memset_info.async); + xevent.AddStatValue(*plane->GetOrCreateStatMetadata( + GetStatTypeStr(StatType::kMemsetDetails)), + *plane->GetOrCreateStatMetadata(std::move(value))); + } + // TODO(rocm-profiler): we need to support the following event type + /* else if (event.type == CuptiTracerEventType::MemoryResidency) { + VLOG(7) << "Add MemoryResidency stat"; + std::string value = absl::StrCat( + "kind:", GetMemoryKindName(event.memory_residency_info.kind), + " num_bytes:", event.memory_residency_info.num_bytes, + " addr:", event.memory_residency_info.address); + xevent.AddStatValue(*plane->GetOrCreateStatMetadata(GetStatTypeStr( + StatType::kMemoryResidencyDetails)), + *plane->GetOrCreateStatMetadata(std::move(value))); + } */ + + std::vector annotation_stack = + ParseAnnotationStack(event.annotation); + if (!annotation_stack.empty()) { + xevent.AddStatValue( + *plane->GetOrCreateStatMetadata(GetStatTypeStr(StatType::kTfOp)), + *plane->GetOrCreateStatMetadata(annotation_stack.begin()->name)); + } + // If multiple metadata have the same key name, show the values from the + // top of the stack (innermost annotation). Concatenate the values from + // "hlo_op". + absl::flat_hash_set key_set; + + for (auto annotation = annotation_stack.rbegin(); + annotation != annotation_stack.rend(); ++annotation) { + for (const Annotation::Metadata& metadata : annotation->metadata) { + if (key_set.insert(metadata.key).second) { + xevent.ParseAndAddStatValue( + *plane->GetOrCreateStatMetadata(metadata.key), metadata.value); + } } } +} - public: - void Export(uint64_t start_walltime_ns, uint64_t start_gputime_ns, - uint64_t end_gputime_ns, XPlaneBuilder* device_plane, - XPlaneBuilder* host_plane) { - int host_ev_cnt = 0, dev_ev_cnt = 0; - mutex_lock l(events_mutex); - // Tracking event types per line. - absl::flat_hash_map> - events_types_per_line; - for (const RocmTracerEvent& event : events) { - int64_t line_id = RocmTracerEvent::kInvalidThreadId; - bool is_host_event = IsHostEvent(event, &line_id); - - if (is_host_event) { - host_ev_cnt++; - } else { - dev_ev_cnt++; - } +void PerDeviceCollector::SortByStartTime() { + mutex_lock lock(events_mutex_); + std::sort(events_.begin(), events_.end(), + [](const RocmTracerEvent& event1, const RocmTracerEvent& event2) { + return event1.start_time_ns < event2.start_time_ns; + }); +} - if (line_id == RocmTracerEvent::kInvalidThreadId || - line_id == RocmTracerEvent::kInvalidStreamId) { - VLOG(3) << "Ignoring event, type=" << static_cast(event.type); - continue; - } - auto* plane = is_host_event ? host_plane : device_plane; - VLOG(9) << "Event" << " type=" << static_cast(event.type) - << " line_id=" << line_id - << (is_host_event ? " host plane=" : " device plane=") - << plane->Name(); - XLineBuilder line = plane->GetOrCreateLine(line_id); - line.SetTimestampNs(start_gputime_ns); - CreateXEvent(event, plane, start_gputime_ns, end_gputime_ns, &line); - events_types_per_line[line_id].emplace(event.type); - } - device_plane->ForEachLine([&](XLineBuilder line) { - line.SetName( - GetDeviceXLineName(line.Id(), events_types_per_line[line.Id()])); - }); - host_plane->ForEachLine([&](XLineBuilder line) { - line.SetName(absl::StrCat("Host Threads/", line.Id())); - }); - events.clear(); +bool PerDeviceCollector::IsHostEvent(const RocmTracerEvent& event, + tsl::int64* line_id) { + // DriverCallback(i.e. kernel launching) events are host events. + if (event.source == RocmTracerEventSource::ApiCallback) { + *line_id = event.thread_id; + return true; + } else { // activities + *line_id = event.stream_id; + return false; } - PerDeviceCollector() = default; + // TODO(rocm-profiler): do we have such a report in rocm? + // Non-overhead activity events are device events. + /* if (event.type != CuptiTracerEventType::Overhead) { + *line_id = event.stream_id; + return false; + } */ + // Overhead events can be associated with a thread or a stream, etc. + // If a valid thread id is specified, we consider it as a host event. + // + + if (event.stream_id != RocmTracerEvent::kInvalidStreamId) { + *line_id = event.stream_id; + return false; + } else if (event.thread_id != RocmTracerEvent::kInvalidThreadId && + event.thread_id != 0) { + *line_id = event.thread_id; + return true; + } else { + *line_id = tsl::profiler::kThreadIdOverhead; + return false; + } +} - void AddEvent(const RocmTracerEvent& event) { - mutex_lock l(events_mutex); - if (event.source == RocmTracerEventSource::ApiCallback) { - // Cupti api callback events were used to populate launch times etc. - if (event.correlation_id != RocmTracerEvent::kInvalidCorrelationId) { - correlation_info_.insert( - {event.correlation_id, - CorrelationInfo(event.thread_id, event.start_time_ns)}); - } - events.emplace_back(std::move(event)); - } else { - // Cupti activity events measure device times etc. - events.emplace_back(std::move(event)); +void PerDeviceCollector::Export(uint64_t start_walltime_ns, + uint64_t start_gputime_ns, + uint64_t end_gputime_ns, + XPlaneBuilder* device_plane, + XPlaneBuilder* host_plane) { + int host_ev_cnt = 0, dev_ev_cnt = 0; + mutex_lock l(events_mutex_); + // Tracking event types per line. + absl::flat_hash_map> + events_types_per_line; + + for (const RocmTracerEvent& event : events_) { + int64_t line_id = RocmTracerEvent::kInvalidThreadId; + bool is_host_event = IsHostEvent(event, &line_id); + + if (is_host_event) + host_ev_cnt++; + else + dev_ev_cnt++; + + if (line_id == RocmTracerEvent::kInvalidThreadId || + line_id == RocmTracerEvent::kInvalidStreamId) { + VLOG(3) << "Ignoring event, type=" << static_cast(event.type); + continue; } + auto* plane = is_host_event ? host_plane : device_plane; + VLOG(9) << "Event" << " type=" << static_cast(event.type) + << " line_id=" << line_id + << (is_host_event ? " host plane=" : " device plane=") + << plane->Name(); + + XLineBuilder line = plane->GetOrCreateLine(line_id); + line.SetTimestampNs(start_gputime_ns); + CreateXEvent(event, plane, start_gputime_ns, end_gputime_ns, &line); } - void GetDeviceCapabilities(int32_t device_ordinal, - XPlaneBuilder* device_plane) { - device_plane->AddStatValue(*device_plane->GetOrCreateStatMetadata( - GetStatTypeStr(StatType::kDevVendor)), - kDeviceVendorAMD); - - if (hipGetDeviceProperties(&device_properties_, device_ordinal) != - hipSuccess) - return; - - auto clock_rate_in_khz = - device_properties_.clockRate; // this is also in Khz - if (clock_rate_in_khz) { - device_plane->AddStatValue( - *device_plane->GetOrCreateStatMetadata( - GetStatTypeStr(StatType::kDevCapClockRateKHz)), - clock_rate_in_khz); - } + device_plane->ForEachLine([&](XLineBuilder line) { + line.SetName( + GetDeviceXLineName(line.Id(), events_types_per_line[line.Id()])); + }); + host_plane->ForEachLine([&](XLineBuilder line) { + line.SetName(absl::StrCat("Host Threads/", line.Id())); + }); + events_.clear(); +} - auto core_count = device_properties_.multiProcessorCount; - if (core_count) { - device_plane->AddStatValue( - *device_plane->GetOrCreateStatMetadata( - GetStatTypeStr(StatType::kDevCapCoreCount)), - core_count); - } +void PerDeviceCollector::AddEvent(RocmTracerEvent&& event) { + mutex_lock l(events_mutex_); + events_.emplace_back(std::move(event)); +} - auto mem_clock_khz = device_properties_.memoryClockRate; - auto mem_bus_width_bits = device_properties_.memoryBusWidth; - - if (mem_clock_khz && mem_bus_width_bits) { - // Times 2 because HBM is DDR memory; it gets two data bits per each - // data lane. - auto memory_bandwidth = - uint64_t{2} * (mem_clock_khz) * 1000 * (mem_bus_width_bits) / 8; - device_plane->AddStatValue( - *device_plane->GetOrCreateStatMetadata( - GetStatTypeStr(StatType::kDevCapMemoryBandwidth)), - memory_bandwidth); - } +void PerDeviceCollector::GetDeviceCapabilities(int32_t device_ordinal, + XPlaneBuilder* device_plane) { + device_plane->AddStatValue(*device_plane->GetOrCreateStatMetadata( + GetStatTypeStr(StatType::kDevVendor)), + kDeviceVendorAMD); + + if (hipGetDeviceProperties(&device_properties_, device_ordinal) != hipSuccess) + return; + + auto clock_rate_in_khz = device_properties_.clockRate; // this is also in Khz + if (clock_rate_in_khz) { + device_plane->AddStatValue( + *device_plane->GetOrCreateStatMetadata( + GetStatTypeStr(StatType::kDevCapClockRateKHz)), + clock_rate_in_khz); + } - size_t total_memory = device_properties_.totalGlobalMem; - if (total_memory) { - device_plane->AddStatValue( - *device_plane->GetOrCreateStatMetadata( - GetStatTypeStr(StatType::kDevCapMemorySize)), - static_cast(total_memory)); - } + auto core_count = device_properties_.multiProcessorCount; + if (core_count) { + device_plane->AddStatValue(*device_plane->GetOrCreateStatMetadata( + GetStatTypeStr(StatType::kDevCapCoreCount)), + core_count); + } - auto compute_capability_major = device_properties_.major; - if (compute_capability_major) { - device_plane->AddStatValue( - *device_plane->GetOrCreateStatMetadata( - GetStatTypeStr(StatType::kDevCapComputeCapMajor)), - compute_capability_major); - } - auto compute_capability_minor = device_properties_.minor; - if (compute_capability_minor) { - device_plane->AddStatValue( - *device_plane->GetOrCreateStatMetadata( - GetStatTypeStr(StatType::kDevCapComputeCapMinor)), - compute_capability_minor); - } + auto mem_clock_khz = device_properties_.memoryClockRate; + auto mem_bus_width_bits = device_properties_.memoryBusWidth; + + if (mem_clock_khz && mem_bus_width_bits) { + // Times 2 because HBM is DDR memory; it gets two data bits per each + // data lane. + auto memory_bandwidth = + uint64_t{2} * (mem_clock_khz) * 1000 * (mem_bus_width_bits) / 8; + device_plane->AddStatValue( + *device_plane->GetOrCreateStatMetadata( + GetStatTypeStr(StatType::kDevCapMemoryBandwidth)), + memory_bandwidth); } - private: - mutex events_mutex; - std::vector events TF_GUARDED_BY(events_mutex); - absl::flat_hash_map correlation_info_ - TF_GUARDED_BY(events_mutex); - absl::flat_hash_map - occupancy_cache_; - hipDeviceProp_t device_properties_; -}; - -class RocmTraceCollectorImpl : public profiler::RocmTraceCollector { - public: - RocmTraceCollectorImpl(const RocmTraceCollectorOptions& options, - uint64_t start_walltime_ns, uint64_t start_gputime_ns) - : RocmTraceCollector(options), - num_callback_events_(0), - num_activity_events_(0), - start_walltime_ns_(start_walltime_ns), - start_gputime_ns_(start_gputime_ns), - num_gpus_(options.num_gpus) {} - - void AddEvent(RocmTracerEvent&& event, bool is_auxiliary) override; - void Flush() override; - void Export(XSpace* space) override; - - void OnEventsDropped(const std::string& reason, - uint32_t correlation_id) override { - LOG(INFO) << "RocmTracerEvent dropped (correlation_id=" << correlation_id - << ",) : " << reason << "."; + size_t total_memory = device_properties_.totalGlobalMem; + if (total_memory) { + device_plane->AddStatValue(*device_plane->GetOrCreateStatMetadata( + GetStatTypeStr(StatType::kDevCapMemorySize)), + static_cast(total_memory)); } - private: - std::atomic num_callback_events_; - std::atomic num_activity_events_; - uint64_t start_walltime_ns_; - uint64_t start_gputime_ns_; - int num_gpus_; - - mutex event_maps_mutex_; - absl::flat_hash_map api_events_map_ - TF_GUARDED_BY(event_maps_mutex_); - - /* Some apis such as MEMSETD32 (based on an observation with ResNet50), - trigger multiple HIP ops domain activities. We keep them in a vector and - merge them with api activities at flush time. - */ - absl::flat_hash_map> - activity_ops_events_map_ TF_GUARDED_BY(event_maps_mutex_); - // This is for the APIs that we track because we need some information from - // them to populate the corresponding activity that we actually track. - absl::flat_hash_map auxiliary_api_events_map_ - TF_GUARDED_BY(event_maps_mutex_); - - const std::vector ApiActivityInfoExchange() - TF_EXCLUSIVE_LOCKS_REQUIRED(event_maps_mutex_); - - absl::node_hash_map per_device_collector_; -}; -//========== + auto compute_capability_major = device_properties_.major; + if (compute_capability_major) { + device_plane->AddStatValue( + *device_plane->GetOrCreateStatMetadata( + GetStatTypeStr(StatType::kDevCapComputeCapMajor)), + compute_capability_major); + } + auto compute_capability_minor = device_properties_.minor; + if (compute_capability_minor) { + device_plane->AddStatValue( + *device_plane->GetOrCreateStatMetadata( + GetStatTypeStr(StatType::kDevCapComputeCapMinor)), + compute_capability_minor); + } +} void RocmTraceCollectorImpl::AddEvent(RocmTracerEvent&& event, bool is_auxiliary) { mutex_lock lock(event_maps_mutex_); - if (event.source == RocmTracerEventSource::ApiCallback && !is_auxiliary) { - if (num_callback_events_ > options_.max_callback_api_events) { - OnEventsDropped("max callback event capacity reached", - event.correlation_id); - DumpRocmTracerEvent(event, 0, 0, ". Dropped!"); - return; + if (event.source == RocmTracerEventSource::ApiCallback) { + if (!is_auxiliary) { + if (num_callback_events_ >= options_.max_callback_api_events) { + OnEventsDropped("max callback event capacity reached", + event.correlation_id); + PrintRocmTracerEvent(event, ". Dropped!"); + return; + } + num_callback_events_++; } - num_callback_events_++; - } else if (event.source == RocmTracerEventSource::Activity && - event.domain == RocmTracerEventDomain::HIP_API) { - // we do not count HIP_OPS activities. - if (num_activity_events_ > options_.max_activity_api_events) { - OnEventsDropped("max activity event capacity reached", + auto& map = is_auxiliary ? auxiliary_api_events_map_ : api_events_map_; + auto [it, added] = map.emplace(event.correlation_id, std::move(event)); + + if (!added) { + OnEventsDropped("event with duplicate correlation_id was received.", event.correlation_id); - DumpRocmTracerEvent(event, 0, 0, ". Dropped!"); - return; + PrintRocmTracerEvent(event, ". Dropped!"); } - num_activity_events_++; - } - - bool emplace_result = false; - if (event.source == RocmTracerEventSource::ApiCallback) { - auto& target_api_event_map = - (is_auxiliary) ? auxiliary_api_events_map_ : api_events_map_; - std::tie(std::ignore, emplace_result) = - target_api_event_map.emplace(event.correlation_id, std::move(event)); } else if (event.source == RocmTracerEventSource::Activity) { - auto result = activity_ops_events_map_.emplace( + if (event.domain == RocmTracerEventDomain::HIP_API) { + // we do not count HIP_OPS activities. + if (num_activity_events_ >= options_.max_activity_api_events) { + OnEventsDropped("max activity event capacity reached", + event.correlation_id); + PrintRocmTracerEvent(event, ". Dropped!"); + return; + } + num_activity_events_++; + } + + auto [it, _] = activity_ops_events_map_.emplace( event.correlation_id, std::vector{}); - result.first->second.push_back(std::move(event)); - emplace_result = true; // we always accept Hip-Ops events - } - if (!emplace_result) { - OnEventsDropped("event with duplicate correlation_id was received.", - event.correlation_id); - DumpRocmTracerEvent(event, 0, 0, ". Dropped!"); + it->second.push_back(std::move(event)); + } else { + VLOG(3) << "Dropping unknown event: " << (int)event.source + << " domain: " << (int)event.domain; } } void RocmTraceCollectorImpl::Flush() { mutex_lock lock(event_maps_mutex_); - auto& aggregated_events_ = ApiActivityInfoExchange(); + auto aggregated_events = ApiActivityInfoExchange(); VLOG(3) << "RocmTraceCollector collected " << num_callback_events_ - << " callback events, " << num_activity_events_ - << " activity events, and aggregated them into " - << aggregated_events_.size() << " events."; + << " callback events, " << num_activity_events_ + << " activity events, and aggregated them into " + << aggregated_events.size() << " events."; // device ids for GPUs filled in by roctracer are not zero indexed. // They are offset by number of CPUs on the machine tsl::uint32 min_device_id = INT32_MAX; - ; - for (auto& event : aggregated_events_) { + + for (const auto& event : aggregated_events) { if (event.device_id < min_device_id) { min_device_id = event.device_id; } } - for (auto event : aggregated_events_) { - event.device_id = event.device_id - min_device_id; - if (event.device_id < num_gpus_) { - per_device_collector_[event.device_id].AddEvent(event); + for (auto& event : aggregated_events) { + auto id = event.device_id - min_device_id; + if (id < num_gpus_) { + per_device_collector_[id].AddEvent(std::move(event)); } else { - OnEventsDropped("Invalid device id for an event.", event.correlation_id); - DumpRocmTracerEvent(event, 0, 0, ". Dropped!"); + PrintRocmTracerEvent(event, ". Dropped due to invalid device ID!"); } } @@ -702,24 +579,72 @@ void RocmTraceCollectorImpl::Export(XSpace* space) { XPlaneBuilder host_plane(FindOrAddMutablePlaneWithName( space, tsl::profiler::kRoctracerApiPlaneName)); - for (int device_ordinal = 0; device_ordinal < num_gpus_; ++device_ordinal) { - std::string name = GpuPlaneName(device_ordinal); + VLOG(3) << "Calling RocmTraceCollectorImpl::Export num_gpus " << num_gpus_; + + for (int id = 0; id < num_gpus_; id++) { + std::string name = GpuPlaneName(id); XPlaneBuilder device_plane(FindOrAddMutablePlaneWithName(space, name)); - device_plane.SetId(device_ordinal); + device_plane.SetId(id); // Calculate device capabilities before flushing, so that device // properties are available to the occupancy calculator in export(). - per_device_collector_[device_ordinal].GetDeviceCapabilities(device_ordinal, - &device_plane); - per_device_collector_[device_ordinal].Export( - start_walltime_ns_, start_gputime_ns_, end_gputime_ns, &device_plane, - &host_plane); + per_device_collector_[id].GetDeviceCapabilities(id, &device_plane); + per_device_collector_[id].Export(start_walltime_ns_, start_gputime_ns_, + end_gputime_ns, &device_plane, + &host_plane); NormalizeTimeStamps(&device_plane, start_walltime_ns_); } NormalizeTimeStamps(&host_plane, start_walltime_ns_); } -const std::vector -RocmTraceCollectorImpl::ApiActivityInfoExchange() { +#if TF_ROCM_VERSION < 60300 + +static void DumpRocmTracerEvent(const RocmTracerEvent& event, + uint64_t start_walltime_ns, + uint64_t start_gputime_ns, + const std::string& message) { + std::ostringstream oss; + oss << "correlation_id=" << event.correlation_id; + oss << ",type=" << GetRocmTracerEventTypeName(event.type); + oss << ",source=" << GetRocmTracerEventSourceName(event.source); + oss << ",domain=" << GetRocmTracerEventDomainName(event.domain); + oss << ",name=" << event.name; + oss << ",annotation=" << event.annotation; + oss << ",start_time_us=" + << (start_walltime_ns + (start_gputime_ns - event.start_time_ns)) / 1000; + oss << ",duration=" << (event.end_time_ns - event.start_time_ns) / 1000; + oss << ",device_id=" << event.device_id; + oss << ",thread_id=" << event.thread_id; + oss << ",stream_id=" << event.stream_id; + + switch (event.type) { + case RocmTracerEventType::Kernel: + break; + case RocmTracerEventType::MemcpyD2H: + case RocmTracerEventType::MemcpyH2D: + case RocmTracerEventType::MemcpyD2D: + case RocmTracerEventType::MemcpyP2P: + oss << ",num_bytes=" << event.memcpy_info.num_bytes; + oss << ",destination=" << event.memcpy_info.destination; + oss << ",async=" << event.memcpy_info.async; + break; + case RocmTracerEventType::MemoryAlloc: + oss << ",num_bytes=" << event.memalloc_info.num_bytes; + break; + case RocmTracerEventType::MemcpyOther: + case RocmTracerEventType::MemoryFree: + case RocmTracerEventType::Memset: + case RocmTracerEventType::Synchronization: + case RocmTracerEventType::Generic: + break; + default: + DCHECK(false); + break; + } + oss << message; + VLOG(3) << oss.str(); +} + +std::vector RocmTraceCollectorImpl::ApiActivityInfoExchange() { /* Different from CUDA, roctracer activity records are not enough to fill a TF event. For most of the activities, we need to enable the corresponding API callsbacks (we call them auxiliary API callbacks) to capture the @@ -843,6 +768,125 @@ RocmTraceCollectorImpl::ApiActivityInfoExchange() { return aggregated_events; } +#else + +std::vector RocmTraceCollectorImpl::ApiActivityInfoExchange() { + /* Different from CUDA, roctracer activity records are not enough to fill a + TF event. For most of the activities, we need to enable the corresponding + API callsbacks (we call them auxiliary API callbacks) to capture the + necessary fields from them using the correlation id. The purpose of this + function is to let APIs and activities exchange information to reach a + state very similar to TF CUDA and getting ready to dump the event. + */ + + std::vector aggregated_events; + aggregated_events.reserve(api_events_map_.size()); + + // Copy info from activity events to API callback events + for (auto& [key, api_event] : api_events_map_) { + auto iact = activity_ops_events_map_.find(api_event.correlation_id); + + if (iact == activity_ops_events_map_.end()) { + PrintRocmTracerEvent(api_event, ". Dropped!"); + VLOG(1) << api_event.name << " could not find activity counterpart!"; + continue; + } + const auto& item = iact->second.front(); + api_event.device_id = item.device_id; + api_event.stream_id = item.stream_id; + switch (api_event.type) { + case RocmTracerEventType::Kernel: + case RocmTracerEventType::Memset: + case RocmTracerEventType::MemoryAlloc: + case RocmTracerEventType::MemoryFree: + case RocmTracerEventType::Synchronization: { + aggregated_events.push_back(api_event); + break; + } + case RocmTracerEventType::MemcpyD2H: + case RocmTracerEventType::MemcpyH2D: + case RocmTracerEventType::MemcpyD2D: + case RocmTracerEventType::MemcpyOther: { + // api_event.memcpy_info.destination = item.device_id; + api_event.memcpy_info = item.memcpy_info; + aggregated_events.push_back(api_event); + break; + } + default: + OnEventsDropped("Missing API-Activity information exchange. Dropped!", + api_event.correlation_id); + PrintRocmTracerEvent(api_event, ". Dropped!"); + LOG(WARNING) << "A ROCm API event type with unimplemented activity " + "merge dropped! " + "Type=" + << GetRocmTracerEventTypeName(api_event.type); + } // switch + } // for + + // Make sure for all activity events we have API callback events + for (auto& activity_iter : activity_ops_events_map_) { + RocmTracerEvent& activity_event = activity_iter.second.front(); + + auto api_event = api_events_map_.find(activity_event.correlation_id); + + if (api_event == api_events_map_.end()) { + api_event = auxiliary_api_events_map_.find(activity_event.correlation_id); + } + + if (api_event == auxiliary_api_events_map_.end()) { + OnEventsDropped( + "An event from activity was discarded." + "Could not find the counterpart HIP API.", + activity_event.correlation_id); + PrintRocmTracerEvent(activity_event, ". Dropped!"); + continue; + } + + switch (activity_event.type) { + case RocmTracerEventType::Kernel: + activity_event.kernel_info = api_event->second.kernel_info; + PrintRocmTracerEvent(activity_event, + ". activity event from api_event."); + aggregated_events.push_back(activity_event); + break; + + case RocmTracerEventType::MemcpyD2H: + case RocmTracerEventType::MemcpyH2D: + case RocmTracerEventType::MemcpyD2D: + case RocmTracerEventType::MemcpyOther: + // activity_event.memcpy_info = api_event->second.memcpy_info; + aggregated_events.push_back(activity_event); + break; + case RocmTracerEventType::Memset: + activity_event.memset_info = api_event->second.memset_info; + aggregated_events.push_back(activity_event); + break; + + case RocmTracerEventType::MemoryAlloc: + case RocmTracerEventType::MemoryFree: + activity_event.device_id = api_event->second.device_id; + aggregated_events.push_back(activity_event); + break; + + case RocmTracerEventType::Synchronization: + activity_event.device_id = api_event->second.device_id; + aggregated_events.push_back(activity_event); + break; + default: + OnEventsDropped("Missing API-Activity information exchange. Dropped!", + activity_event.correlation_id); + PrintRocmTracerEvent(activity_event, ". Dropped!"); + LOG(WARNING) << "A ROCm activity event with unimplemented API " + "callback merge dropped! " + "Type=" + << GetRocmTracerEventTypeName(activity_event.type); + } // switch + } // for + + return aggregated_events; +} +#endif + std::unique_ptr CreateRocmCollector( const RocmTraceCollectorOptions& options, const uint64_t start_walltime_ns, const uint64_t start_gputime_ns) { @@ -851,4 +895,4 @@ std::unique_ptr CreateRocmCollector( } } // namespace profiler -} // namespace xla +} // namespace xla \ No newline at end of file diff --git a/third_party/xla/xla/backends/profiler/gpu/rocm_collector.h b/third_party/xla/xla/backends/profiler/gpu/rocm_collector.h index 46e8e71eee77f0..369046fa0f9074 100644 --- a/third_party/xla/xla/backends/profiler/gpu/rocm_collector.h +++ b/third_party/xla/xla/backends/profiler/gpu/rocm_collector.h @@ -20,12 +20,25 @@ limitations under the License. #include #include "absl/container/flat_hash_map.h" +#include "absl/container/node_hash_map.h" #include "absl/container/node_hash_set.h" +#include "xla/stream_executor/rocm/roctracer_wrapper.h" +#include "tsl/profiler/protobuf/xplane.pb.h" +#include "tsl/profiler/lib/profiler_factory.h" +#include "tsl/profiler/lib/profiler_interface.h" +#include "xla/tsl/profiler/utils/parse_annotation.h" #include "xla/tsl/profiler/utils/xplane_builder.h" +#include "xla/tsl/profiler/utils/xplane_schema.h" +#include "xla/tsl/profiler/utils/xplane_utils.h" namespace xla { namespace profiler { +using tsl::mutex; +using tsl::mutex_lock; +using tsl::profiler::XEvent; +using tsl::profiler::XLineBuilder; +using tsl::profiler::XPlaneBuilder; using tsl::profiler::XSpace; struct MemcpyDetails { @@ -116,6 +129,7 @@ enum class RocmTracerEventDomain { HIP_API, HIP_OPS, }; + const char* GetRocmTracerEventDomainName(const RocmTracerEventDomain& domain); // RocmTracerSyncTypes forward declaration enum class RocmTracerSyncTypes; @@ -146,7 +160,8 @@ struct RocmTracerEvent { uint32_t device_id = kInvalidDeviceId; uint32_t correlation_id = kInvalidCorrelationId; uint64_t thread_id = kInvalidThreadId; - int64_t stream_id = kInvalidStreamId; + uint64_t stream_id = kInvalidStreamId; + union { MemcpyDetails memcpy_info; // If type == Memcpy* MemsetDetails memset_info; // If type == Memset* @@ -194,6 +209,9 @@ class AnnotationMap { AnnotationMap& operator=(const AnnotationMap&) = delete; }; +// for roctracer (v1) +#if TF_ROCM_VERSION < 60300 + class RocmTraceCollector { public: explicit RocmTraceCollector(const RocmTraceCollectorOptions& options) @@ -220,6 +238,137 @@ class RocmTraceCollector { RocmTraceCollector& operator=(const RocmTraceCollector&) = delete; }; +#else +// for rocprofiler-sdk (v3) + +class RocmTraceCollector { + public: + explicit RocmTraceCollector(const RocmTraceCollectorOptions& options) + : options_(options) {} + virtual ~RocmTraceCollector() {} + + virtual void AddEvent(RocmTracerEvent&& event, bool is_auxiliary) = 0; + virtual void OnEventsDropped(const std::string& reason, + uint32_t num_events) = 0; + virtual void Flush() = 0; + virtual void Export(XSpace* space) = 0; + + protected: + RocmTraceCollectorOptions options_; + + public: + // Disable copy and move. + RocmTraceCollector(const RocmTraceCollector&) = delete; + RocmTraceCollector& operator=(const RocmTraceCollector&) = delete; +}; +#endif + +struct RocmDeviceOccupancyParams { + hipFuncAttributes attributes = {}; + int block_size = 0; + size_t dynamic_smem_size = 0; + void* func_ptr; + + friend bool operator==(const RocmDeviceOccupancyParams& lhs, + const RocmDeviceOccupancyParams& rhs) { + return 0 == memcmp(&lhs, &rhs, sizeof(lhs)); + } + + template + friend H AbslHashValue(H hash_state, + const RocmDeviceOccupancyParams& params) { + return H::combine( + std::move(hash_state), params.attributes.maxThreadsPerBlock, + params.attributes.numRegs, params.attributes.sharedSizeBytes, + params.attributes.maxDynamicSharedSizeBytes, params.block_size, + params.dynamic_smem_size, params.func_ptr); + } +}; + +// FIXME: rocprofiler-sdk does not have this one yet +struct OccupancyStats { + double occupancy_pct = 0.0; + int min_grid_size = 0; + int suggested_block_size = 0; +}; + +class PerDeviceCollector { + public: + void Export(uint64_t start_walltime_ns, uint64_t start_gputime_ns, + uint64_t end_gputime_ns, XPlaneBuilder* device_plane, + XPlaneBuilder* host_plane); + + PerDeviceCollector() = default; + + void AddEvent(RocmTracerEvent&& event); + void GetDeviceCapabilities(int32_t device_ordinal, + XPlaneBuilder* device_plane); + + private: + OccupancyStats GetOccupancy(const RocmDeviceOccupancyParams& params) const; + void CreateXEvent(const RocmTracerEvent& event, XPlaneBuilder* plane, + uint64_t start_gpu_ns, uint64_t end_gpu_ns, + XLineBuilder* line); + void SortByStartTime(); + bool IsHostEvent(const RocmTracerEvent& event, tsl::int64* line_id); + + private: + mutex events_mutex_; + std::vector events_ TF_GUARDED_BY(events_mutex_); + absl::flat_hash_map + occupancy_cache_; + hipDeviceProp_t device_properties_; +}; // PerDeviceCollector + +class RocmTraceCollectorImpl : public RocmTraceCollector { + public: + RocmTraceCollectorImpl(const RocmTraceCollectorOptions& options, + uint64_t start_walltime_ns, uint64_t start_gputime_ns) + : RocmTraceCollector(options), + num_callback_events_(0), + num_activity_events_(0), + start_walltime_ns_(start_walltime_ns), + start_gputime_ns_(start_gputime_ns), + num_gpus_(options.num_gpus) {} + + void AddEvent(RocmTracerEvent&& event, bool is_auxiliary) override; + void Flush() override; + void Export(XSpace* space) override; + + void OnEventsDropped(const std::string& reason, + uint32_t correlation_id) override { + LOG(INFO) << "RocmTracerEvent dropped (correlation_id=" << correlation_id + << ",) : " << reason << "."; + } + + private: + std::atomic num_callback_events_; + std::atomic num_activity_events_; + uint64_t start_walltime_ns_; + uint64_t start_gputime_ns_; + int num_gpus_; + + mutex event_maps_mutex_; + absl::flat_hash_map api_events_map_ + TF_GUARDED_BY(event_maps_mutex_); + + /* Some apis such as MEMSETD32 (based on an observation with ResNet50), + trigger multiple HIP ops domain activities. We keep them in a vector and + merge them with api activities at flush time. + */ + absl::flat_hash_map> + activity_ops_events_map_ TF_GUARDED_BY(event_maps_mutex_); + // This is for the APIs that we track because we need some information from + // them to populate the corresponding activity that we actually track. + absl::flat_hash_map auxiliary_api_events_map_ + TF_GUARDED_BY(event_maps_mutex_); + + std::vector ApiActivityInfoExchange() + TF_EXCLUSIVE_LOCKS_REQUIRED(event_maps_mutex_); + + absl::node_hash_map per_device_collector_; +}; // RocmTraceCollectorImpl + std::unique_ptr CreateRocmCollector( const RocmTraceCollectorOptions& options, const uint64_t start_walltime_ns, const uint64_t start_gputime_ns); @@ -227,4 +376,4 @@ std::unique_ptr CreateRocmCollector( } // namespace profiler } // namespace xla -#endif // XLA_BACKENDS_PROFILER_GPU_ROCM_COLLECTOR_H_ +#endif // XLA_BACKENDS_PROFILER_GPU_ROCM_COLLECTOR_H_ \ No newline at end of file diff --git a/third_party/xla/xla/backends/profiler/gpu/rocm_collector_test.cc b/third_party/xla/xla/backends/profiler/gpu/rocm_collector_test.cc new file mode 100644 index 00000000000000..480fe4f8e05eea --- /dev/null +++ b/third_party/xla/xla/backends/profiler/gpu/rocm_collector_test.cc @@ -0,0 +1,121 @@ +/* Copyright 2025 The OpenXLA Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tsl/platform/test.h" +#include + +#include "xla/backends/profiler/gpu/rocm_collector.h" + +#if TF_ROCM_VERSION >= 60300 + +namespace xla { +namespace profiler { +namespace test { + +using tsl::profiler::FindOrAddMutablePlaneWithName; +using tsl::profiler::GetStatTypeStr; +using tsl::profiler::GpuPlaneName; +using tsl::profiler::kDeviceVendorAMD; +using tsl::profiler::kThreadIdOverhead; +using tsl::profiler::ParseAnnotationStack; +using tsl::profiler::ProfilerInterface; +using tsl::profiler::StatType; +using tsl::profiler::XEvent; +using tsl::profiler::XEventBuilder; +using tsl::profiler::XEventMetadata; +using tsl::profiler::XLineBuilder; +using tsl::profiler::XPlaneBuilder; +using tsl::profiler::XSpace; + +TEST(RocmCollectorTest, TestAddKernelEventAndExport) { + RocmTraceCollectorOptions options; + options.max_callback_api_events = 100; + options.max_activity_api_events = 100; + options.max_annotation_strings = 100; + options.num_gpus = 1; + + const uint64_t start_walltime_ns = 1000; + const uint64_t start_gputime_ns = 2000; + + RocmTraceCollectorImpl collector(options, start_walltime_ns, + start_gputime_ns); + + const uint32_t correlation_id = 42; + const uint64_t start_time_ns = 3000; + const uint64_t end_time_ns = 4000; + + // === 1. Add API Callback Event === + RocmTracerEvent api_event; + api_event.type = RocmTracerEventType::Kernel; + api_event.source = RocmTracerEventSource::ApiCallback; + api_event.domain = RocmTracerEventDomain::HIP_API; + api_event.name = "test_rocm_kernel"; + api_event.correlation_id = correlation_id; + api_event.thread_id = 999; + api_event.kernel_info = {.registers_per_thread = 32, + .static_shared_memory_usage = 1024, + .dynamic_shared_memory_usage = 0, + .block_x = 256, + .block_y = 1, + .block_z = 1, + .grid_x = 100, + .grid_y = 1, + .grid_z = 1, + .func_ptr = reinterpret_cast(0xdeadbeef)}; + + collector.AddEvent(std::move(api_event), /*is_auxiliary=*/false); + + // === 2. Add Activity Event === + RocmTracerEvent activity_event; + activity_event.type = RocmTracerEventType::Kernel; + activity_event.source = RocmTracerEventSource::Activity; + activity_event.domain = RocmTracerEventDomain::HIP_OPS; + activity_event.name = "test_rocm_kernel"; // will be filled from api_event + activity_event.correlation_id = correlation_id; + activity_event.start_time_ns = start_time_ns; + activity_event.end_time_ns = end_time_ns; + activity_event.device_id = 100; // Will be adjusted in Flush() + activity_event.stream_id = 123; + + collector.AddEvent(std::move(activity_event), /*is_auxiliary=*/false); + + // === 3. Finalize and Export === + collector.Flush(); + + tensorflow::profiler::XSpace space; + collector.Export(&space); + + // === 4. Check results === + ASSERT_GE(space.planes_size(), 1); + const auto* gpu_plane = + FindOrAddMutablePlaneWithName(&space, "/device:GPU:0"); + ASSERT_NE(gpu_plane, nullptr); + + ASSERT_GT(gpu_plane->lines_size(), 0); + const auto& line = gpu_plane->lines(0); + ASSERT_GT(line.events_size(), 0); + + const auto& event = line.events(0); + EXPECT_EQ(event.offset_ps(), (start_time_ns - start_gputime_ns) * 1000); + EXPECT_EQ(event.duration_ps(), (end_time_ns - start_time_ns) * 1000); + EXPECT_EQ(gpu_plane->event_metadata().at(event.metadata_id()).name(), + "test_rocm_kernel"); +} + +} // namespace test +} // namespace profiler +} // namespace xla + +#endif // TF_ROCM_VERSION \ No newline at end of file diff --git a/third_party/xla/xla/backends/profiler/gpu/rocm_tracer.cc b/third_party/xla/xla/backends/profiler/gpu/rocm_tracer.cc index fcf51dbac9a667..edf801feb07a4b 100644 --- a/third_party/xla/xla/backends/profiler/gpu/rocm_tracer.cc +++ b/third_party/xla/xla/backends/profiler/gpu/rocm_tracer.cc @@ -13,13 +13,29 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +// This translation unit is **self‑contained**: it provides minimal stub +// implementations for the rocprofiler callbacks that XLA needs to register +// (toolInit / toolFinialize / code_object_callback). They do nothing except +// keep the compiler and linker happy. Once real logging is implemented, you +// can replace the stubs with the actual logic. + #include "xla/backends/profiler/gpu/rocm_tracer.h" #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include // For standard sysconf #include "absl/container/flat_hash_map.h" #include "absl/container/node_hash_map.h" -#include "absl/status/status.h" #include "rocm/rocm_config.h" #include "xla/tsl/profiler/backends/cpu/annotation_stack.h" #include "xla/tsl/profiler/utils/time_utils.h" @@ -29,6 +45,8 @@ limitations under the License. #include "tsl/platform/macros.h" #include "tsl/platform/mem.h" +// for roctracer (v1) +#if TF_ROCM_VERSION < 60300 namespace xla { namespace profiler { @@ -36,7 +54,6 @@ namespace se = ::stream_executor; using tsl::mutex; using tsl::mutex_lock; using tsl::profiler::AnnotationStack; - constexpr uint32_t RocmTracerEvent::kInvalidDeviceId; #define RETURN_IF_ROCTRACER_ERROR(expr) \ @@ -297,7 +314,7 @@ absl::Status RocmApiCallbackImpl::operator()(uint32_t domain, uint32_t cbid, // DumpApiCallbackData(domain, cbid, cbdata); - if (domain != ACTIVITY_DOMAIN_HIP_API) return absl::OkStatus(); + if (domain != ACTIVITY_DOMAIN_HIP_API) return tsl::OkStatus(); const hip_api_data_t* data = reinterpret_cast(cbdata); @@ -325,7 +342,7 @@ absl::Status RocmApiCallbackImpl::operator()(uint32_t domain, uint32_t cbid, } else { LOG(WARNING) << "An API exit callback received without API enter " "with same correlation id. Event droped!"; - return absl::OkStatus(); // This API does not belong to us. + return tsl::OkStatus(); // This API does not belong to us. } exit_time = RocmTracer::GetTimestamp(); } @@ -407,7 +424,7 @@ absl::Status RocmApiCallbackImpl::operator()(uint32_t domain, uint32_t cbid, break; } } - return absl::OkStatus(); + return tsl::OkStatus(); } void RocmApiCallbackImpl::AddKernelEventUponApiExit(uint32_t cbid, @@ -1012,7 +1029,7 @@ absl::Status RocmActivityCallbackImpl::operator()(const char* begin, )); } - return absl::OkStatus(); + return tsl::OkStatus(); } void RocmActivityCallbackImpl::AddHipKernelActivityEvent( @@ -1389,11 +1406,11 @@ absl::Status RocmTracer::ApiCallbackHandler(uint32_t domain, uint32_t cbid, const void* cbdata) { if (api_tracing_enabled_) TF_RETURN_IF_ERROR((*api_cb_impl_)(domain, cbid, cbdata)); - return absl::OkStatus(); + return tsl::OkStatus(); } absl::Status RocmTracer::EnableApiTracing() { - if (api_tracing_enabled_) return absl::OkStatus(); + if (api_tracing_enabled_) return tsl::OkStatus(); api_tracing_enabled_ = true; for (auto& iter : options_->api_callbacks) { @@ -1415,11 +1432,11 @@ absl::Status RocmTracer::EnableApiTracing() { } } } - return absl::OkStatus(); + return tsl::OkStatus(); } absl::Status RocmTracer::DisableApiTracing() { - if (!api_tracing_enabled_) return absl::OkStatus(); + if (!api_tracing_enabled_) return tsl::OkStatus(); api_tracing_enabled_ = false; for (auto& iter : options_->api_callbacks) { @@ -1441,7 +1458,7 @@ absl::Status RocmTracer::DisableApiTracing() { } } } - return absl::OkStatus(); + return tsl::OkStatus(); } void ActivityCallback(const char* begin, const char* end, void* user_data) { @@ -1475,11 +1492,11 @@ absl::Status RocmTracer::ActivityCallbackHandler(const char* begin, } VLOG(3) << "Dropped Activity Records End"; } - return absl::OkStatus(); + return tsl::OkStatus(); } absl::Status RocmTracer::EnableActivityTracing() { - if (activity_tracing_enabled_) return absl::OkStatus(); + if (activity_tracing_enabled_) return tsl::OkStatus(); activity_tracing_enabled_ = true; if (!options_->activity_tracing.empty()) { @@ -1517,11 +1534,11 @@ absl::Status RocmTracer::EnableActivityTracing() { } } - return absl::OkStatus(); + return tsl::OkStatus(); } absl::Status RocmTracer::DisableActivityTracing() { - if (!activity_tracing_enabled_) return absl::OkStatus(); + if (!activity_tracing_enabled_) return tsl::OkStatus(); for (auto& iter : options_->activity_tracing) { activity_domain_t domain = iter.first; @@ -1572,7 +1589,7 @@ absl::Status RocmTracer::DisableActivityTracing() { activity_tracing_enabled_ = false; - return absl::OkStatus(); + return tsl::OkStatus(); } /*static*/ uint64_t RocmTracer::GetTimestamp() { @@ -1589,3 +1606,599 @@ absl::Status RocmTracer::DisableActivityTracing() { } // namespace profiler } // namespace xla + +#else +// for rocprofiler-sdk +namespace xla { +namespace profiler { + +using tsl::profiler::AnnotationStack; + +// represents the maximum number of chars +static constexpr int kMaxSymbolSize = 1024; +// represents an invalid or uninitialized device ID used in RocmTracer events. +constexpr uint32_t RocmTracerEvent::kInvalidDeviceId; + +std::string demangle(const char* name) { +#ifndef _MSC_VER + if (!name) { + return ""; + } + + if (strlen(name) > kMaxSymbolSize) { + return name; + } + + int status; + size_t len = 0; + char* demangled = abi::__cxa_demangle(name, nullptr, &len, &status); + if (status != 0) { + return name; + } + std::string res(demangled); + // The returned buffer must be freed! + free(demangled); + return res; +#else + // TODO: demangling on Windows + if (!name) { + return ""; + } else { + return name; + } +#endif +} + +std::string demangle(const std::string& name) { return demangle(name.c_str()); } + +inline auto GetCallbackTracingNames() { + return rocprofiler::sdk::get_callback_tracing_names(); +} + +std::vector GetGpuDeviceAgents(); + +//----------------------------------------------------------------------------- +const char* GetRocmTracerEventSourceName(const RocmTracerEventSource& source) { + switch (source) { + case RocmTracerEventSource::ApiCallback: + return "ApiCallback"; + break; + case RocmTracerEventSource::Activity: + return "Activity"; + break; + case RocmTracerEventSource::Invalid: + return "Invalid"; + break; + default: + DCHECK(false); + return ""; + } + return ""; +} + +// FIXME(rocm-profiler): These domain names are not consistent with the +// GetActivityDomainName function +const char* GetRocmTracerEventDomainName(const RocmTracerEventDomain& domain) { + switch (domain) { + case RocmTracerEventDomain::HIP_API: + return "HIP_API"; + break; + case RocmTracerEventDomain::HIP_OPS: + return "HIP_OPS"; + break; + default: + LOG(WARNING) << "RocmTracerEventDomain::InvalidDomain"; + DCHECK(false); + return ""; + } + return ""; +} + +const char* GetRocmTracerEventTypeName(const RocmTracerEventType& type) { +#define OO(x) \ + case RocmTracerEventType::x: \ + return #x; + switch (type) { + OO(Kernel) + OO(MemcpyH2D) + OO(MemcpyD2H) + OO(MemcpyD2D) + OO(MemcpyOther) + OO(MemoryAlloc) + OO(MemoryFree) + OO(Memset) + OO(Synchronization) + OO(Generic) + default:; + } +#undef OO + DCHECK(false); + return ""; +} + +//----------------------------------------------------------------------------- +// copy api calls +bool isCopyApi(uint32_t id) { + switch (id) { + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpy: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpy2D: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpy2DAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpy2DFromArray: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpy2DFromArrayAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpy2DToArray: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpy2DToArrayAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpy3D: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpy3DAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyAtoH: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyDtoD: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyDtoDAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyDtoH: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyDtoHAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyFromArray: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyFromSymbol: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyFromSymbolAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyHtoA: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyHtoD: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyHtoDAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyParam2D: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyParam2DAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyPeer: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyPeerAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyToArray: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyToSymbol: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyToSymbolAsync: + case ROCPROFILER_HIP_RUNTIME_API_ID_hipMemcpyWithStream: + return true; + break; + default:; + } + return false; +} + +// ---------------------------------------------------------------------------- +// Stub implementations for RocmTracer static functions expected by +// rocprofiler-sdk. +// ---------------------------------------------------------------------------- +RocmTracer& RocmTracer::i() { + static RocmTracer obj; + return obj; +} + +bool RocmTracer::IsAvailable() const { + return !activity_tracing_enabled_ && !api_tracing_enabled_; // &&NumGpus() +} + +/*static*/ uint64_t RocmTracer::GetTimestamp() { + uint64_t ts; + if (rocprofiler_get_timestamp(&ts) != ROCPROFILER_STATUS_SUCCESS) { + LOG(ERROR) << "function rocprofiler_get_timestamp failed with error "; + return 0; + } + return ts; +} + +void RocmTracer::Enable(const RocmTracerOptions& options, + RocmTraceCollector* collector) { + absl::MutexLock lock(&collector_mutex_); + if (collector_ != nullptr) { + LOG(WARNING) << "ROCM tracer is already running!"; + return; + } + options_ = options; + collector_ = collector; + AnnotationMap(options_->max_annotation_strings); + api_tracing_enabled_ = true; + activity_tracing_enabled_ = true; + rocprofiler_start_context(context_); + LOG(INFO) << "GpuTracer started with number of GPUs" << NumGpus(); +} + +void RocmTracer::HipApiEvent(const rocprofiler_record_header_t* hdr, + RocmTracerEvent* traced_event) { + const auto& rec = + *static_cast( + hdr->payload); + + traced_event->type = RocmTracerEventType::Kernel; + traced_event->source = RocmTracerEventSource::ApiCallback; + traced_event->domain = RocmTracerEventDomain::HIP_API; + traced_event->name = "??"; + traced_event->start_time_ns = rec.start_timestamp; + traced_event->end_time_ns = rec.end_timestamp; + traced_event->device_id = RocmTracerEvent::kInvalidDeviceId; + traced_event->correlation_id = rec.correlation_id.internal; + traced_event->annotation = + annotation_map()->LookUp(traced_event->correlation_id); + traced_event->thread_id = rec.thread_id; + traced_event->stream_id = RocmTracerEvent::kInvalidStreamId; + traced_event->kernel_info = KernelDetails{}; + + absl::MutexLock lock(&kernel_lock_); + if (static_cast(rec.kind) < name_info_.size()) { + auto& vec = name_info_[rec.kind]; + traced_event->name = vec[rec.operation]; + } + if (isCopyApi(rec.operation)) { + // actually one needs to set the real type + traced_event->type = RocmTracerEventType::MemcpyOther; + } +} + +void RocmTracer::MemcpyEvent(const rocprofiler_record_header_t* hdr, + RocmTracerEvent* traced_event) { + const auto& rec = + *static_cast( + hdr->payload); + +#define OO(src, target) \ + case ROCPROFILER_MEMORY_COPY_##src: \ + traced_event->type = RocmTracerEventType::target; \ + traced_event->name = #target; \ + break; + + switch (rec.operation) { + OO(NONE, MemcpyOther) + OO(HOST_TO_HOST, MemcpyOther) + OO(HOST_TO_DEVICE, MemcpyH2D) + OO(DEVICE_TO_HOST, MemcpyD2H) + OO(DEVICE_TO_DEVICE, MemcpyD2D) + default: + LOG(WARNING) << "Unexpected memcopy operation " << rec.operation; + traced_event->type = RocmTracerEventType::MemcpyOther; + } +#undef OO + const auto &src_gpu = agents_[static_cast(rec.src_agent_id.handle)], + &dst_gpu = agents_[static_cast(rec.dst_agent_id.handle)]; + + // Assign device_id based on copy direction + if (traced_event->type == RocmTracerEventType::MemcpyH2D && + dst_gpu.type == ROCPROFILER_AGENT_TYPE_GPU) { + traced_event->device_id = dst_gpu.id.handle; // Destination is GPU + } else if (traced_event->type == RocmTracerEventType::MemcpyD2H && + src_gpu.type == ROCPROFILER_AGENT_TYPE_GPU) { + traced_event->device_id = src_gpu.id.handle; // Source is GPU + } else if (traced_event->type == RocmTracerEventType::MemcpyD2D) { + // Prefer destination GPU for D2D + traced_event->device_id = dst_gpu.id.handle; + } else { + // Fallback for MemcpyOther or HOST_TO_HOST + if (dst_gpu.type == ROCPROFILER_AGENT_TYPE_GPU) { + traced_event->device_id = dst_gpu.id.handle; + } else if (src_gpu.type == ROCPROFILER_AGENT_TYPE_GPU) { + traced_event->device_id = src_gpu.id.handle; + } else { + LOG(WARNING) << "No GPU ID available for memory copy operation: " + << traced_event->name << ", src_agent_type=" << src_gpu.type + << ", dst_agent_type=" << dst_gpu.type; + traced_event->device_id = 0; // Invalid ID or default + } + } + + traced_event->source = RocmTracerEventSource::Activity; + traced_event->domain = RocmTracerEventDomain::HIP_OPS; + traced_event->start_time_ns = rec.start_timestamp; + traced_event->end_time_ns = rec.end_timestamp; + traced_event->correlation_id = rec.correlation_id.internal; + traced_event->annotation = + annotation_map()->LookUp(traced_event->correlation_id); + traced_event->thread_id = rec.thread_id; + // we do not know valid stream ID for memcpy + // rec.stream_id.handle; + traced_event->stream_id = RocmTracerEvent::kInvalidStreamId; + traced_event->memcpy_info = MemcpyDetails{ + .num_bytes = rec.bytes, + .destination = static_cast(dst_gpu.id.handle), + .async = false, + }; + + LOG(INFO) << "copy bytes: " << traced_event->memcpy_info.num_bytes + << " stream: " << traced_event->stream_id << " src_id " + << traced_event->device_id << " dst_id " + << traced_event->memcpy_info.destination; +} + +void RocmTracer::KernelEvent(const rocprofiler_record_header_t* hdr, + RocmTracerEvent* traced_event) { + const auto& rec = + *static_cast( + hdr->payload); + + const auto& kinfo = rec.dispatch_info; + traced_event->type = RocmTracerEventType::Kernel; + traced_event->source = RocmTracerEventSource::Activity; + traced_event->domain = RocmTracerEventDomain::HIP_OPS; + traced_event->name = "??"; + traced_event->start_time_ns = rec.start_timestamp; + traced_event->end_time_ns = rec.end_timestamp; + traced_event->device_id = agents_[kinfo.agent_id.handle].id.handle; + traced_event->correlation_id = rec.correlation_id.internal; + traced_event->annotation = + annotation_map()->LookUp(traced_event->correlation_id); + traced_event->thread_id = rec.thread_id; + traced_event->stream_id = kinfo.queue_id.handle; + traced_event->kernel_info = KernelDetails{ + .registers_per_thread = 0, + .static_shared_memory_usage = 0, + .dynamic_shared_memory_usage = 0, + .block_x = kinfo.workgroup_size.x, + .block_y = kinfo.workgroup_size.y, + .block_z = kinfo.workgroup_size.z, + .grid_x = kinfo.grid_size.x, + .grid_y = kinfo.grid_size.y, + .grid_z = kinfo.grid_size.z, + .func_ptr = nullptr, + }; + + auto it = kernel_info_.find(kinfo.kernel_id); + if (it != kernel_info_.end()) traced_event->name = it->second.name; +} + +void RocmTracer::TracingCallback(rocprofiler_context_id_t context, + rocprofiler_buffer_id_t buffer_id, + rocprofiler_record_header_t** headers, + size_t num_headers, uint64_t drop_count) { + if (collector() == nullptr) return; + if (num_headers == 0) return; + assert(drop_count == 0 && "drop count should be zero for lossless policy"); + + if (headers == nullptr) { + LOG(ERROR) + << "rocprofiler invoked a buffer callback with a null pointer to the " + "array of headers. this should never happen"; + return; + } + + for (size_t i = 0; i < num_headers; i++) { + RocmTracerEvent event; + auto header = headers[i]; + + if (header->category != ROCPROFILER_BUFFER_CATEGORY_TRACING) continue; + + switch (header->kind) { + case ROCPROFILER_BUFFER_TRACING_HIP_RUNTIME_API: + HipApiEvent(header, &event); + break; + + case ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH: + KernelEvent(header, &event); + break; + + case ROCPROFILER_BUFFER_TRACING_MEMORY_COPY: + MemcpyEvent(header, &event); + break; + + default: + continue; + } // switch + + absl::MutexLock lock(&collector_mutex_); + if (collector()) { + collector()->AddEvent(std::move(event), false); + } + } // for +} + +void RocmTracer::CodeObjectCallback( + rocprofiler_callback_tracing_record_t record, void* callback_data) { + if (record.kind == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT && + record.operation == ROCPROFILER_CODE_OBJECT_LOAD) { + if (record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD) { + // mainly for debugging + LOG(WARNING) + << "Callback phase unload without registering kernel names ..."; + } + } else if (record.kind == ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT && + record.operation == + ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER) { + auto* data = static_cast(record.payload); + if (record.phase == ROCPROFILER_CALLBACK_PHASE_LOAD) { + absl::MutexLock lock(&kernel_lock_); + kernel_info_.emplace( + data->kernel_id, + ProfilerKernelInfo{demangle(data->kernel_name), *data}); + } else if (record.phase == ROCPROFILER_CALLBACK_PHASE_UNLOAD) { + // FIXME: clear these? At minimum need kernel names at shutdown, async + // completion We don't erase it just in case a buffer callback still needs + // this kernel_info_.erase(data->kernel_id); + } + } +} + +static void code_object_callback(rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t* user_data, + void* callback_data) { + RocmTracer::i().CodeObjectCallback(record, callback_data); +} + +static void tool_tracing_callback(rocprofiler_context_id_t context, + rocprofiler_buffer_id_t buffer_id, + rocprofiler_record_header_t** headers, + size_t num_headers, void* user_data, + uint64_t drop_count) { + RocmTracer::i().TracingCallback(context, buffer_id, headers, num_headers, + drop_count); +} + +int RocmTracer::toolInit(rocprofiler_client_finalize_t fini_func, + void* tool_data) { + // Gather API names + name_info_ = GetCallbackTracingNames(); + + // Gather agent info + num_gpus_ = 0; + for (const auto& agent : GetGpuDeviceAgents()) { + LOG(INFO) << "agent id = " << agent.id.handle + << ", dev = " << agent.device_id + << ", name = " << (agent.name ? agent.name : "null"); + agents_[agent.id.handle] = agent; + if (agent.type == ROCPROFILER_AGENT_TYPE_GPU) { + num_gpus_++; + } + } + + // Utility context to gather code‑object info + rocprofiler_create_context(&utility_context_); + + // buffered tracing + auto code_object_ops = std::vector{ + ROCPROFILER_CODE_OBJECT_DEVICE_KERNEL_SYMBOL_REGISTER}; + + rocprofiler_configure_callback_tracing_service( + utility_context_, ROCPROFILER_CALLBACK_TRACING_CODE_OBJECT, + code_object_ops.data(), code_object_ops.size(), code_object_callback, + nullptr); + + rocprofiler_start_context(utility_context_); + LOG(INFO) << "rocprofiler start utilityContext"; + + // a multiple of the page size, and the gap allows the buffer to absorb bursts + // of GPU events + constexpr auto buffer_size_bytes = 20 * 4096; + constexpr auto buffer_watermark_bytes = 2 * 4096; + + // Utility context to gather code‑object info + rocprofiler_create_context(&context_); + + rocprofiler_create_buffer(context_, buffer_size_bytes, buffer_watermark_bytes, + ROCPROFILER_BUFFER_POLICY_LOSSLESS, + tool_tracing_callback, tool_data, &buffer_); + + rocprofiler_configure_buffer_tracing_service( + context_, ROCPROFILER_BUFFER_TRACING_HIP_RUNTIME_API, nullptr, 0, + buffer_); + + rocprofiler_configure_buffer_tracing_service( + context_, ROCPROFILER_BUFFER_TRACING_KERNEL_DISPATCH, nullptr, 0, + buffer_); + + rocprofiler_configure_buffer_tracing_service( + context_, ROCPROFILER_BUFFER_TRACING_MEMORY_COPY, nullptr, 0, buffer_); + + { + // for annotations + const rocprofiler_tracing_operation_t* hip_ops = nullptr; + size_t hip_ops_count = 0; + + rocprofiler_configure_callback_tracing_service( + context_, ROCPROFILER_CALLBACK_TRACING_HIP_RUNTIME_API, hip_ops, + hip_ops_count, + [](rocprofiler_callback_tracing_record_t record, + rocprofiler_user_data_t*, void*) { + if (record.phase == ROCPROFILER_CALLBACK_PHASE_ENTER) { + const std::string& annotation = + tsl::profiler::AnnotationStack::Get(); + if (!annotation.empty()) { + RocmTracer::i().annotation_map()->Add( + record.correlation_id.internal, annotation); + } + } + }, + nullptr); + } + + auto client_thread = rocprofiler_callback_thread_t{}; + rocprofiler_create_callback_thread(&client_thread); + rocprofiler_assign_callback_thread(buffer_, client_thread); + + int isValid = 0; + rocprofiler_context_is_valid(context_, &isValid); + if (isValid == 0) { + context_.handle = 0; // Leak on failure. + return -1; + } + + return 0; +} + +void RocmTracer::toolFinalize(void* tool_data) { + auto& obj = RocmTracer::i(); + LOG(INFO) << "Calling toolFinalize!"; + rocprofiler_stop_context(obj.utility_context_); + obj.utility_context_.handle = 0; + rocprofiler_stop_context(obj.context_); + // flush buffer here or in disable? + obj.context_.handle = 0; +} + +void RocmTracer::Disable() { + absl::MutexLock lock(&collector_mutex_); + collector_->Flush(); + collector_ = nullptr; + api_tracing_enabled_ = false; + activity_tracing_enabled_ = false; + LOG(INFO) << "GpuTracer stopped"; +} + +// ---------------------------------------------------------------------------- +// Helper that returns all device agents (GPU + CPU for now). +// ---------------------------------------------------------------------------- +std::vector GetGpuDeviceAgents() { + std::vector agents; + + rocprofiler_query_available_agents_cb_t iterate_cb = + [](rocprofiler_agent_version_t agents_ver, const void** agents_arr, + size_t num_agents, void* udata) { + if (agents_ver != ROCPROFILER_AGENT_INFO_VERSION_0) { + LOG(ERROR) << "unexpected rocprofiler agent version: " << agents_ver; + return ROCPROFILER_STATUS_ERROR; + } + auto* agents_vec = + static_cast*>(udata); + for (size_t i = 0; i < num_agents; ++i) { + const auto* agent = + static_cast(agents_arr[i]); + agents_vec->push_back(*agent); + } + return ROCPROFILER_STATUS_SUCCESS; + }; + + rocprofiler_query_available_agents(ROCPROFILER_AGENT_INFO_VERSION_0, + iterate_cb, sizeof(rocprofiler_agent_t), + static_cast(&agents)); + return agents; +} + +static int toolInitStatic(rocprofiler_client_finalize_t finalize_func, + void* tool_data) { + return RocmTracer::i().toolInit(finalize_func, tool_data); +} + +// ---------------------------------------------------------------------------- +// C‑linkage entry‑point expected by rocprofiler-sdk. +// ---------------------------------------------------------------------------- +extern "C" rocprofiler_tool_configure_result_t* rocprofiler_configure( + uint32_t version, const char* runtime_version, uint32_t priority, + rocprofiler_client_id_t* id) { + auto& obj = RocmTracer::i(); // Ensure constructed, critical for tracing. + + id->name = "XLA-with-rocprofiler-sdk"; + obj.client_id_ = id; + + LOG(INFO) << "Configure rocprofiler-sdk..."; + + const uint32_t major = version / 10000; + const uint32_t minor = (version % 10000) / 100; + const uint32_t patch = version % 100; + + std::stringstream info; + info << id->name << " Configure XLA with rocprofv3... (priority=" << priority + << ") is using rocprofiler-sdk v" << major << '.' << minor << '.' + << patch << " (" << runtime_version << ')'; + LOG(INFO) << info.str(); + + static rocprofiler_tool_configure_result_t cfg{ + sizeof(rocprofiler_tool_configure_result_t), &toolInitStatic, + &RocmTracer::toolFinalize, nullptr}; + + return &cfg; +} + +} // namespace profiler +} // namespace xla + +void __attribute__((constructor)) init_rocm_lib() { + rocprofiler_force_configure(xla::profiler::rocprofiler_configure); +} + +#endif // \ No newline at end of file diff --git a/third_party/xla/xla/backends/profiler/gpu/rocm_tracer.h b/third_party/xla/xla/backends/profiler/gpu/rocm_tracer.h index b82a1e66d0092a..8bb5467a0f30d0 100644 --- a/third_party/xla/xla/backends/profiler/gpu/rocm_tracer.h +++ b/third_party/xla/xla/backends/profiler/gpu/rocm_tracer.h @@ -28,6 +28,9 @@ limitations under the License. #include "tsl/platform/status.h" #include "tsl/platform/types.h" +// for roctracer (v1) +#if TF_ROCM_VERSION < 60300 + namespace xla { namespace profiler { @@ -210,4 +213,115 @@ class RocmTracer { } // namespace profiler } // namespace xla -#endif // XLA_BACKENDS_PROFILER_GPU_ROCM_TRACER_H_ + +#else +// for rocprofiler-sdk (v3) + +namespace xla { +namespace profiler { + +std::string demangle(const char* name); +std::string demangle(const std::string& name); + +struct RocmTracerOptions { + // maximum number of annotation strings that AnnotationMap in RocmTracer can + // store. e.g. 1M + uint64_t max_annotation_strings; +}; + +// The class use to enable rocprofiler-sdk buffered callback/activity tracing +// and forward the collected trace events to RocmTraceCollector. There should be +// only one RocmTracer per process. +class RocmTracer { + public: + // Returns a reference to the singleton instance of RocmTracer. + // This ensures that only one global instance exists throughout the process + // lifetime. The first call to this function lazily constructs the instance in + // a thread-safe manner. Subsequent calls return the same instance, enabling + // centralized tracer state management. + static RocmTracer& i(); + + // Only one profile session can be live in the same time. + bool IsAvailable() const; + + void Enable(const RocmTracerOptions& options, RocmTraceCollector* collector_); + void Disable(); + + static uint64_t GetTimestamp(); + uint32_t NumGpus() const { return num_gpus_; }; + RocmTraceCollector* collector() { return collector_; } + + int toolInit(rocprofiler_client_finalize_t finalize_func, void* tool_data); + static void toolFinalize(void* tool_data); + + void TracingCallback(rocprofiler_context_id_t context, + rocprofiler_buffer_id_t buffer_id, + rocprofiler_record_header_t** headers, + size_t num_headers, uint64_t drop_count); + + void CodeObjectCallback(rocprofiler_callback_tracing_record_t record, + void* callback_data); + + AnnotationMap* annotation_map() { return &annotation_map_; } + + protected: + // protected constructor for injecting mock cupti interface for testing. + RocmTracer() = default; + + void HipApiEvent(const rocprofiler_record_header_t* hdr, RocmTracerEvent* ev); + void KernelEvent(const rocprofiler_record_header_t* hdr, RocmTracerEvent* ev); + void MemcpyEvent(const rocprofiler_record_header_t* hdr, RocmTracerEvent* ev); + + private: + uint32_t num_gpus_{0}; + std::optional options_; + RocmTraceCollector* collector_{nullptr}; + absl::Mutex collector_mutex_; + + bool api_tracing_enabled_{false}; + bool activity_tracing_enabled_{false}; + + AnnotationMap annotation_map_{/* default size, e.g. */ 1024 * 1024}; + + public: + using kernel_symbol_data_t = + rocprofiler_callback_tracing_code_object_kernel_symbol_register_data_t; + + struct ProfilerKernelInfo { + std::string name; + kernel_symbol_data_t data; + }; + + using kernel_info_map_t = + std::unordered_map; + + using agent_info_map_t = std::unordered_map; + + using callback_name_info = rocprofiler::sdk::callback_name_info; + + rocprofiler_client_id_t* client_id_{nullptr}; + // Contexts ---------------------------------------------------------- + // for registering kernel names + rocprofiler_context_id_t utility_context_{}; + // for buffered callback services + rocprofiler_context_id_t context_{}; + rocprofiler_buffer_id_t buffer_{}; + + // Maps & misc ------------------------------------------------------- + kernel_info_map_t kernel_info_{}; + absl::Mutex kernel_lock_; + + callback_name_info name_info_; + agent_info_map_t agents_; + + public: + // Disable copy and move. + RocmTracer(const RocmTracer&) = delete; + RocmTracer& operator=(const RocmTracer&) = delete; +}; + +} // namespace profiler +} // namespace xla +#endif // TF_ROCM_VERSION + +#endif // XLA_BACKENDS_PROFILER_GPU_ROCM_TRACER_H_ \ No newline at end of file diff --git a/third_party/xla/xla/backends/profiler/gpu/rocm_tracer_test.cc b/third_party/xla/xla/backends/profiler/gpu/rocm_tracer_test.cc new file mode 100644 index 00000000000000..a9712c15383e0a --- /dev/null +++ b/third_party/xla/xla/backends/profiler/gpu/rocm_tracer_test.cc @@ -0,0 +1,124 @@ +/* Copyright 2025 The OpenXLA Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tsl/platform/test.h" +#include + +#include "xla/backends/profiler/gpu/rocm_tracer.h" +#include "xla/backends/profiler/gpu/rocm_collector.h" + +#if TF_ROCM_VERSION >= 60300 + +namespace xla { +namespace profiler { +namespace test { + +// Minimal mock collector implementation based on RocmTraceCollectorImpl. +class TestRocmTraceCollector : public RocmTraceCollectorImpl { + public: + TestRocmTraceCollector(const RocmTraceCollectorOptions& options, + uint64_t start_walltime_ns, uint64_t start_gputime_ns) + : RocmTraceCollectorImpl(options, start_walltime_ns, start_gputime_ns) {} + + void Export(XSpace* space) override { + exported_ = true; + exported_space_ = space; + } + + void OnEventsDropped(const std::string& reason, + uint32_t correlation_id) override { + dropped_reason_ = reason; + dropped_id_ = correlation_id; + } + + bool exported() const { return exported_; } + const std::string& dropped_reason() const { return dropped_reason_; } + uint32_t dropped_id() const { return dropped_id_; } + XSpace* exported_space() const { return exported_space_; } + + private: + bool exported_ = false; + std::string dropped_reason_; + uint32_t dropped_id_ = 0; + XSpace* exported_space_ = nullptr; +}; + +// Utility to create valid options for the test collector. +std::unique_ptr CreateTestCollector() { + RocmTraceCollectorOptions options; + options.max_callback_api_events = 2 * 1024 * 1024; + options.max_activity_api_events = 2 * 1024 * 1024; + options.max_annotation_strings = 1024 * 1024; + options.num_gpus = 1; + + uint64_t walltime_ns = RocmTracer::GetTimestamp(); + uint64_t gputime_ns = RocmTracer::GetTimestamp(); + + return std::make_unique(options, walltime_ns, + gputime_ns); +} + +TEST(RocmTracerTest, SingletonInstance) { + LOG(INFO) << "Before RocmTracer::i()"; + RocmTracer& tracer1 = RocmTracer::i(); + RocmTracer& tracer2 = RocmTracer::i(); + LOG(INFO) << "Before RocmTracer::i()"; + EXPECT_EQ(&tracer1, &tracer2) << "RocmTracer must be a singleton"; +} + +TEST(RocmTracerTest, InitialStateIsAvailable) { + RocmTracer& tracer = RocmTracer::i(); + EXPECT_TRUE(tracer.IsAvailable()) + << "Tracer should be available before Enable()"; +} + +TEST(RocmTracerTest, EnableAndDisableLifecycle) { + RocmTracer& tracer = RocmTracer::i(); + auto collector = CreateTestCollector(); + + RocmTracerOptions tracer_options{/*max_annotation_strings=*/128}; + tracer.Enable(tracer_options, collector.get()); + + EXPECT_FALSE(tracer.IsAvailable()) + << "Tracer should not be available after Enable()"; + EXPECT_EQ(tracer.collector(), collector.get()) + << "Collector should be set after Enable()"; + ASSERT_NE(tracer.annotation_map(), nullptr) + << "Annotation map should be initialized"; + + tracer.Disable(); + + EXPECT_TRUE(tracer.IsAvailable()) + << "Tracer should be available after Disable()"; +} + +TEST(RocmTracerTest, AnnotationMapWorks) { + RocmTracer& tracer = RocmTracer::i(); + auto* map = tracer.annotation_map(); + ASSERT_NE(map, nullptr); + + uint64_t id = 42; + std::string annotation = "matmul_fused_op"; + map->Add(id, annotation); + + absl::string_view result = map->LookUp(id); + EXPECT_EQ(result, annotation); +} + +} // namespace test +} // namespace profiler +} // namespace xla + +#endif // TF_ROCM_VERSION \ No newline at end of file diff --git a/third_party/xla/xla/stream_executor/rocm/roctracer_wrapper.h b/third_party/xla/xla/stream_executor/rocm/roctracer_wrapper.h index 4ba18752824f0e..81f764027c6d8c 100644 --- a/third_party/xla/xla/stream_executor/rocm/roctracer_wrapper.h +++ b/third_party/xla/xla/stream_executor/rocm/roctracer_wrapper.h @@ -1,3 +1,4 @@ + /* Copyright 2021 The OpenXLA Authors. Licensed under the Apache License, Version 2.0 (the "License"); @@ -20,16 +21,26 @@ limitations under the License. #ifndef XLA_STREAM_EXECUTOR_ROCM_ROCTRACER_WRAPPER_H_ #define XLA_STREAM_EXECUTOR_ROCM_ROCTRACER_WRAPPER_H_ -#include "rocm/include/roctracer/roctracer.h" -#include "rocm/include/roctracer/roctracer_hip.h" #include "rocm/rocm_config.h" -#if TF_ROCM_VERSION >= 50300 + +#if TF_ROCM_VERSION >= 60300 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#elif TF_ROCM_VERSION >= 50300 #include "rocm/include/roctracer/roctracer_roctx.h" -#else -#include "rocm/include/roctracer/roctracer_hcc.h" +#include "rocm/include/roctracer/roctracer.h" +#include "rocm/include/roctracer/roctracer_hip.h" #endif -#include "xla/tsl/platform/env.h" + #include "tsl/platform/dso_loader.h" +#include "tsl/platform/env.h" #include "tsl/platform/platform.h" namespace stream_executor { @@ -64,7 +75,28 @@ namespace wrap { #endif // PLATFORM_GOOGLE -#if TF_ROCM_VERSION >= 50300 +#if TF_ROCM_VERSION >= 60300 +#define FOREACH_ROCTRACER_API(DO_FUNC) \ + DO_FUNC(rocprofiler_configure) \ + DO_FUNC(rocprofiler_at_internal_thread_create) \ + DO_FUNC(rocprofiler_create_buffer) \ + DO_FUNC(rocprofiler_create_context) \ + DO_FUNC(rocprofiler_flush_buffer) \ + DO_FUNC(rocprofiler_get_status_string) \ + DO_FUNC(rocprofiler_context_is_valid) \ + DO_FUNC(rocprofiler_start_context) \ + DO_FUNC(rocprofiler_stop_context) \ + DO_FUNC(rocprofiler_configure_callback_tracing_service) \ + DO_FUNC(rocprofiler_configure_buffer_tracing_service) \ + DO_FUNC(rocprofiler_get_timestamp) \ + DO_FUNC(rocprofiler_query_available_agents) \ + DO_FUNC(rocprofiler_iterate_callback_tracing_kinds) \ + DO_FUNC(rocprofiler_assign_callback_thread) \ + DO_FUNC(rocprofiler_create_callback_thread) \ + DO_FUNC(rocprofiler_query_callback_tracing_kind_name) \ + DO_FUNC(rocprofiler_iterate_callback_tracing_kind_operations) \ + DO_FUNC(rocprofiler_query_callback_tracing_kind_operation_name) +#elif TF_ROCM_VERSION >= 50300 && TF_ROCM_VERSION < 60300 #define FOREACH_ROCTRACER_API(DO_FUNC) \ DO_FUNC(roctracer_default_pool_expl) \ DO_FUNC(roctracer_disable_domain_activity) \ @@ -108,4 +140,4 @@ FOREACH_ROCTRACER_API(ROCTRACER_API_WRAPPER) } // namespace wrap } // namespace stream_executor -#endif // XLA_STREAM_EXECUTOR_ROCM_ROCTRACER_WRAPPER_H_ +#endif // XLA_STREAM_EXECUTOR_ROCM_ROCTRACER_WRAPPER_H_ \ No newline at end of file