Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 99 additions & 45 deletions third_party/xla/xla/backends/profiler/gpu/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,8 @@ tsl_gpu_library(
copts = tf_profiler_copts() + tsl_copts(),
visibility = ["//visibility:public"],
deps = [
"@local_tsl//tsl/platform:macros",
"@local_tsl//tsl/platform:types",
"@tsl//tsl/platform:macros",
"@tsl//tsl/platform:types",
] + if_cuda(["//xla/tsl/cuda:cupti"]),
)

Expand All @@ -76,7 +76,7 @@ tsl_gpu_library(
":cupti_interface",
],
deps = [
"@local_tsl//tsl/platform:test",
"@tsl//tsl/platform:test",
],
)

Expand All @@ -92,9 +92,9 @@ tsl_gpu_library(
visibility = ["//visibility:public"],
deps = [
"@com_google_absl//absl/debugging:leak_check",
"@local_tsl//tsl/platform:logging",
"@local_tsl//tsl/platform:mutex",
"@local_tsl//tsl/platform:thread_annotations",
"@tsl//tsl/platform:logging",
"@tsl//tsl/platform:mutex",
"@tsl//tsl/platform:thread_annotations",
],
)

Expand All @@ -106,12 +106,12 @@ xla_test(
copts = tf_profiler_copts() + tsl_copts(),
tags = [
"no_mac",
"cuda-only", #TODO(rocm): weekly-sync 24-12-10
"cuda-only"
],
deps = [
":cupti_interface",
"@com_google_googletest//:gtest_main",
"@local_tsl//tsl/platform:test",
"@tsl//tsl/platform:test",
] + if_cuda_is_configured([
":cuda_test",
":cupti_error_manager",
Expand All @@ -137,7 +137,7 @@ cuda_library(
deps = [
"@local_config_cuda//cuda:cuda_headers",
"@local_config_cuda//cuda:cuda_runtime",
"@local_tsl//tsl/platform:test",
"@tsl//tsl/platform:test",
],
)

Expand Down Expand Up @@ -182,14 +182,13 @@ tsl_gpu_library(
"@com_google_absl//absl/log",
"@com_google_absl//absl/status",
"@com_google_absl//absl/strings:string_view",
"@com_google_absl//absl/types:optional",
"@com_google_absl//absl/types:span",
"@local_tsl//tsl/platform:env",
"@local_tsl//tsl/platform:errors",
"@local_tsl//tsl/platform:logging",
"@local_tsl//tsl/platform:macros",
"@local_tsl//tsl/platform:platform_port",
"@local_tsl//tsl/platform:types",
"@tsl//tsl/platform:env",
"@tsl//tsl/platform:errors",
"@tsl//tsl/platform:logging",
"@tsl//tsl/platform:macros",
"@tsl//tsl/platform:platform_port",
"@tsl//tsl/platform:types",
],
)

Expand All @@ -208,13 +207,13 @@ tsl_gpu_library(
"@com_google_absl//absl/container:node_hash_map",
"@com_google_absl//absl/container:node_hash_set",
"@com_google_absl//absl/status",
"@local_tsl//tsl/platform:env",
"@local_tsl//tsl/platform:errors",
"@local_tsl//tsl/platform:logging",
"@local_tsl//tsl/platform:macros",
"@local_tsl//tsl/platform:platform_port",
"@local_tsl//tsl/platform:types",
"@local_tsl//tsl/profiler/lib:scoped_annotation",
"@tsl//tsl/platform:env",
"@tsl//tsl/platform:errors",
"@tsl//tsl/platform:logging",
"@tsl//tsl/platform:macros",
"@tsl//tsl/platform:platform_port",
"@tsl//tsl/platform:types",
"@tsl//tsl/profiler/lib:scoped_annotation",
],
)

Expand Down Expand Up @@ -246,7 +245,6 @@ tsl_gpu_library(
"@com_google_absl//absl/container:node_hash_set",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/strings:str_format",
"@com_google_absl//absl/types:optional",
"@local_tsl//tsl/platform:abi",
"@local_tsl//tsl/platform:env_time",
"@local_tsl//tsl/platform:errors",
Expand All @@ -265,6 +263,12 @@ tsl_gpu_library(
srcs = if_rocm(["rocm_tracer.cc"]),
hdrs = if_rocm(["rocm_tracer.h"]),
copts = tf_profiler_copts() + tsl_copts(),
linkopts = select({
"//conditions:default": [
"-L/opt/rocm/lib", # search path for all ROCm shared objects
"-lrocprofiler-sdk", # the library that owns the missing symbols
],
}),
tags = [
"gpu",
"rocm-only",
Expand All @@ -284,7 +288,6 @@ tsl_gpu_library(
"@com_google_absl//absl/container:node_hash_map",
"@com_google_absl//absl/container:node_hash_set",
"@com_google_absl//absl/status",
"@com_google_absl//absl/types:optional",
"@local_tsl//tsl/platform:env",
"@local_tsl//tsl/platform:errors",
"@local_tsl//tsl/platform:logging",
Expand All @@ -295,15 +298,67 @@ tsl_gpu_library(
],
)

# Small test target built from rocm_tracer_test.cc.
xla_test(
    name = "rocm_tracer_test",
    size = "small",
    srcs = ["rocm_tracer_test.cc"],
    copts = tf_profiler_copts() + tsl_copts(),
    tags = [
        "gpu",
        "rocm",
        "rocm-only",
    ] + if_google(
        # Internal (if_google) builds additionally tag the test "manual".
        # NOTE(review): presumably so it only runs there when a ROCm
        # configuration is requested explicitly - confirm.
        ["manual"],
    ),
    # The tracer and collector libraries are only pulled in through if_rocm();
    # the remaining dependencies are listed unconditionally.
    deps = if_rocm([
        ":rocm_tracer",
        ":rocm_collector",
    ]) + [
        "//xla/tsl/profiler/utils:xplane_builder",
        "@com_google_absl//absl/container:flat_hash_map",
        "@com_google_googletest//:gtest_main",
        "@local_tsl//tsl/platform:status_matchers",
        "@local_tsl//tsl/platform:test",
        "@local_tsl//tsl/profiler/protobuf:xplane_proto_cc",
    ],
)

# Small test target built from rocm_collector_test.cc.
xla_test(
    name = "rocm_collector_test",
    size = "small",
    srcs = ["rocm_collector_test.cc"],
    # Extra command-line flag handed to the test binary; --gtest_shuffle is
    # GoogleTest's option for running the tests in a randomized order.
    args = ["--gtest_shuffle"],
    copts = tf_profiler_copts() + tsl_copts(),
    tags = [
        "gpu",
        "rocm",
        "rocm-only",
    ] + if_google(
        # Internal (if_google) builds additionally tag the test "manual".
        ["manual"],
    ),
    # The tracer and collector libraries are only pulled in through if_rocm();
    # the remaining dependencies are listed unconditionally.
    deps = if_rocm([
        ":rocm_tracer",
        ":rocm_collector",
    ]) + [
        "//xla/tsl/profiler/utils:xplane_builder",
        "@com_google_absl//absl/container:flat_hash_map",
        "@com_google_googletest//:gtest_main",
        "@local_tsl//tsl/platform:status_matchers",
        "@local_tsl//tsl/platform:test",
        "@local_tsl//tsl/profiler/protobuf:xplane_proto_cc",
    ],
)

tsl_gpu_library(
name = "nvtx_utils",
srcs = if_cuda(["nvtx_utils.cc"]),
hdrs = if_cuda(["nvtx_utils.h"]),
copts = tf_profiler_copts() + tsl_copts(),
deps = [
"@com_google_absl//absl/strings",
"@local_tsl//tsl/platform",
"@local_tsl//tsl/platform:macros",
"@tsl//tsl/platform",
"@tsl//tsl/platform:macros",
],
)

Expand Down Expand Up @@ -331,12 +386,12 @@ tsl_gpu_library(
"@com_google_absl//absl/log",
"@com_google_absl//absl/log:check",
"@com_google_absl//absl/strings",
"@local_tsl//tsl/platform:abi",
"@local_tsl//tsl/platform:mutex",
"@local_tsl//tsl/platform:platform_port",
"@local_tsl//tsl/platform:thread_annotations",
"@local_tsl//tsl/platform:types",
"@local_tsl//tsl/profiler/protobuf:xplane_proto_cc",
"@tsl//tsl/platform:abi",
"@tsl//tsl/platform:mutex",
"@tsl//tsl/platform:platform_port",
"@tsl//tsl/platform:thread_annotations",
"@tsl//tsl/platform:types",
"@tsl//tsl/profiler/protobuf:xplane_proto_cc",
] + if_cuda([
"//xla/tsl/cuda:cupti",
"//xla/tsl/cuda",
Expand All @@ -360,10 +415,10 @@ tsl_gpu_library(
"@com_google_absl//absl/status",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/strings:string_view",
"@local_tsl//tsl/platform:errors",
"@local_tsl//tsl/platform:mutex",
"@local_tsl//tsl/platform:platform_port",
"@local_tsl//tsl/platform:thread_annotations",
"@tsl//tsl/platform:errors",
"@tsl//tsl/platform:mutex",
"@tsl//tsl/platform:platform_port",
"@tsl//tsl/platform:thread_annotations",
] + if_cuda(["//xla/tsl/cuda:cupti"]),
)

Expand All @@ -377,8 +432,8 @@ tsl_gpu_library(
":cupti_wrapper",
"@com_google_absl//absl/base",
"@com_google_absl//absl/memory",
"@local_tsl//tsl/platform:logging",
"@local_tsl//tsl/platform:stringpiece",
"@tsl//tsl/platform:logging",
"@tsl//tsl/platform:stringpiece",
"//xla/tsl/util:env_var",
],
visibility = ["//visibility:public"],
Expand All @@ -396,11 +451,9 @@ xla_test(
],
deps = [
":cupti_buffer_events",
":cupti_collector",
":cupti_tracer",
":cupti_utils",
"@com_google_googletest//:gtest_main",
"@local_tsl//tsl/platform:test",
"@tsl//tsl/platform:test",
],
)

Expand All @@ -421,9 +474,9 @@ xla_test(
"//xla/tsl/profiler/utils:xplane_builder",
"@com_google_absl//absl/container:flat_hash_map",
"@com_google_googletest//:gtest_main",
"@local_tsl//tsl/platform:status_matchers",
"@local_tsl//tsl/platform:test",
"@local_tsl//tsl/profiler/protobuf:xplane_proto_cc",
"@tsl//tsl/platform:status_matchers",
"@tsl//tsl/platform:test",
"@tsl//tsl/profiler/protobuf:xplane_proto_cc",
],
)

Expand All @@ -439,6 +492,7 @@ cuda_library(
local_defines = if_oss(["NVTX_VERSION_3_1=1"]),
tags = ["cuda-only"],
visibility = ["//visibility:public"],
deps = ["@local_config_cuda//cuda:cuda_headers"],
)

xla_test(
Expand Down
41 changes: 13 additions & 28 deletions third_party/xla/xla/backends/profiler/gpu/device_tracer_rocm.cc
Original file line number Diff line number Diff line change
Expand Up @@ -45,23 +45,8 @@ namespace xla {
namespace profiler {

using tensorflow::ProfileOptions;
using tsl::mutex;
using tsl::mutex_lock;
using tsl::profiler::Annotation;
using tsl::profiler::AnnotationStack;
using tsl::profiler::FindOrAddMutablePlaneWithName;
using tsl::profiler::GetStatTypeStr;
using tsl::profiler::GpuPlaneName;
using tsl::profiler::kDeviceVendorAMD;
using tsl::profiler::kThreadIdOverhead;
using tsl::profiler::ParseAnnotationStack;
using tsl::profiler::ProfilerInterface;
using tsl::profiler::RegisterProfilerFactory;
using tsl::profiler::StatType;
using tsl::profiler::XEventBuilder;
using tsl::profiler::XEventMetadata;
using tsl::profiler::XLineBuilder;
using tsl::profiler::XPlaneBuilder;
using tsl::profiler::XSpace;

// GpuTracer for ROCm GPU.
Expand All @@ -82,7 +67,6 @@ class GpuTracer : public profiler::ProfilerInterface {
absl::Status DoStop();

RocmTracerOptions GetRocmTracerOptions();

RocmTraceCollectorOptions GetRocmTraceCollectorOptions(uint32_t num_gpus);

enum State {
Expand All @@ -99,10 +83,9 @@ class GpuTracer : public profiler::ProfilerInterface {
};

RocmTracerOptions GpuTracer::GetRocmTracerOptions() {
// TODO(rocm-profiler): We need support for context similar to CUDA
RocmTracerOptions options;
#if TF_ROCM_VERSION < 60300
std::vector<uint32_t> empty_vec;

// clang formatting does not preserve one entry per line
// clang-format off
std::vector<uint32_t> hip_api_domain_ops{
Expand Down Expand Up @@ -172,7 +155,9 @@ RocmTracerOptions GpuTracer::GetRocmTracerOptions() {
options.api_callbacks.emplace(ACTIVITY_DOMAIN_HIP_API, empty_vec);

options.activity_tracing.emplace(ACTIVITY_DOMAIN_HIP_OPS, empty_vec);

#else
options.max_annotation_strings = 1024 * 1024;
#endif
return options;
}

Expand All @@ -187,20 +172,16 @@ RocmTraceCollectorOptions GpuTracer::GetRocmTraceCollectorOptions(
}

absl::Status GpuTracer::DoStart() {
if (!rocm_tracer_->IsAvailable()) {
return tsl::errors::Unavailable("Another profile session running.");
}

AnnotationStack::Enable(true);
uint64_t start_gputime_ns = RocmTracer::GetTimestamp();
uint64_t start_walltime_ns = tsl::EnvTime::NowNanos();

RocmTracerOptions tracer_options = GetRocmTracerOptions();
RocmTraceCollectorOptions trace_collector_options =
GetRocmTraceCollectorOptions(rocm_tracer_->NumGpus());
uint64_t start_gputime_ns = RocmTracer::GetTimestamp();
uint64_t start_walltime_ns = tsl::EnvTime::NowNanos();
rocm_trace_collector_ = CreateRocmCollector(
trace_collector_options, start_walltime_ns, start_gputime_ns);

RocmTracerOptions tracer_options = GetRocmTracerOptions();
rocm_tracer_->Enable(tracer_options, rocm_trace_collector_.get());

return absl::OkStatus();
Expand Down Expand Up @@ -259,12 +240,16 @@ std::unique_ptr<profiler::ProfilerInterface> CreateGpuTracer(
if (options.device_type() != ProfileOptions::GPU &&
options.device_type() != ProfileOptions::UNSPECIFIED)
return nullptr;

#if TF_ROCM_VERSION < 60300
profiler::RocmTracer* rocm_tracer =
profiler::RocmTracer::GetRocmTracerSingleton();
if (!rocm_tracer->IsAvailable()) return nullptr;

return std::make_unique<profiler::GpuTracer>(rocm_tracer);
#else
auto& rocm_tracer = profiler::RocmTracer::i();
if (!rocm_tracer.IsAvailable()) return nullptr;
return std::make_unique<profiler::GpuTracer>(&rocm_tracer);
#endif
}

auto register_rocm_gpu_tracer_factory = [] {
Expand Down
Loading