From 21af86b61154e809cc4c18a446d72098c0e8c53c Mon Sep 17 00:00:00 2001 From: poursoul Date: Fri, 13 Feb 2026 15:14:30 +0800 Subject: [PATCH] =?UTF-8?q?device=5Frunner.cpp=20=E2=80=94=20poll=5Fand=5F?= =?UTF-8?q?collect=20=E6=94=B9=E4=B8=BA=E5=9C=A8=E7=8B=AC=E7=AB=8B?= =?UTF-8?q?=E7=BA=BF=E7=A8=8B=E4=B8=AD=E4=B8=8E=20rtStreamSynchronize=20?= =?UTF-8?q?=E5=B9=B6=E8=A1=8C=E8=BF=90=E8=A1=8C=20performance=5Fcollector.?= =?UTF-8?q?cpp=20=E2=80=94=20=E6=B6=88=E9=99=A4=E5=85=88=E7=AD=89=20total?= =?UTF-8?q?=5Ftasks=20=E5=86=8D=E6=B6=88=E8=B4=B9=20buffer=20=E7=9A=84?= =?UTF-8?q?=E6=AD=BB=E9=94=81=EF=BC=8C=E6=94=B9=E4=B8=BA=E7=AB=8B=E5=8D=B3?= =?UTF-8?q?=E6=B6=88=E8=B4=B9=20buffer=20=E5=B9=B6=E5=8A=A8=E6=80=81?= =?UTF-8?q?=E5=8F=91=E7=8E=B0=20total=5Ftasks=20platform=5Fconfig.h=20?= =?UTF-8?q?=E2=80=94=20=E8=B6=85=E6=97=B6=E4=BB=8E=202s=20=E6=94=B9?= =?UTF-8?q?=E4=B8=BA=2060s?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/platform/a2a3/host/device_runner.cpp | 20 ++++++- src/platform/include/common/platform_config.h | 2 +- src/platform/src/performance_collector.cpp | 58 +++++++++---------- 3 files changed, 46 insertions(+), 34 deletions(-) diff --git a/src/platform/a2a3/host/device_runner.cpp b/src/platform/a2a3/host/device_runner.cpp index a5ece59e..e35a3fee 100644 --- a/src/platform/a2a3/host/device_runner.cpp +++ b/src/platform/a2a3/host/device_runner.cpp @@ -8,6 +8,7 @@ #include "device_runner.h" #include +#include // Include HAL constants from CANN (header only, library loaded dynamically) #include "ascend_hal.h" @@ -397,9 +398,15 @@ int DeviceRunner::run(Runtime& runtime, return rc; } - // Poll and collect performance data (must be before stream sync) + // Poll and collect performance data in a separate thread so it runs + // concurrently with stream synchronization. This prevents a deadlock + // where AICPU spins waiting for the host to consume a full perf buffer + // while the host has already stopped polling. + std::thread collector_thread; if (runtime.enable_profiling) { - poll_and_collect_performance_data(runtime.worker_count, runtime.get_task_count()); + collector_thread = std::thread([this, &runtime]() { + poll_and_collect_performance_data(runtime.worker_count, runtime.get_task_count()); + }); } std::cout << "\n=== rtStreamSynchronize stream_aicpu_===" << '\n'; @@ -407,6 +414,7 @@ int DeviceRunner::run(Runtime& runtime, rc = rtStreamSynchronize(stream_aicpu_); if (rc != 0) { LOG_ERROR("rtStreamSynchronize (AICPU) failed: %d", rc); + if (collector_thread.joinable()) collector_thread.join(); kernel_args_.finalize_runtime_args(); return rc; } @@ -415,11 +423,17 @@ int DeviceRunner::run(Runtime& runtime, rc = rtStreamSynchronize(stream_aicore_); if (rc != 0) { LOG_ERROR("rtStreamSynchronize (AICore) failed: %d", rc); + if (collector_thread.joinable()) collector_thread.join(); kernel_args_.finalize_runtime_args(); return rc; } - // Print collected performance data (after stream sync) + // Wait for collector thread to finish + if (collector_thread.joinable()) { + collector_thread.join(); + } + + // Export collected performance data (after stream sync + collection done) if (runtime.enable_profiling) { export_swimlane_json(); } diff --git a/src/platform/include/common/platform_config.h b/src/platform/include/common/platform_config.h index 5992f563..bbdc57d4 100644 --- a/src/platform/include/common/platform_config.h +++ b/src/platform/include/common/platform_config.h @@ -94,7 +94,7 @@ constexpr uint64_t PLATFORM_PROF_SYS_CNT_FREQ = 50000000; // 50 MHz /** * Timeout duration for performance data collection (seconds) */ -constexpr int PLATFORM_PROF_TIMEOUT_SECONDS = 2; +constexpr int PLATFORM_PROF_TIMEOUT_SECONDS = 60; /** * Number of empty polling iterations before checking timeout diff --git a/src/platform/src/performance_collector.cpp b/src/platform/src/performance_collector.cpp index 53a9145e..f3675abc 100644 --- a/src/platform/src/performance_collector.cpp +++ b/src/platform/src/performance_collector.cpp @@ -129,47 +129,44 @@ void PerformanceCollector::poll_and_collect(int num_aicore, int expected_tasks) const auto timeout_duration = std::chrono::seconds(PLATFORM_PROF_TIMEOUT_SECONDS); std::optional idle_start; - // Poll for total_tasks if not provided - if (expected_tasks <= 0) { - LOG_INFO("Waiting for AICPU to write total_tasks to PerfDataHeader..."); - idle_start = std::chrono::steady_clock::now(); - - while (true) { - rmb(); - expected_tasks = static_cast(header->total_tasks); - - if (expected_tasks > 0) { - LOG_INFO("Task count read from PerfDataHeader: %d", expected_tasks); - break; - } - - auto elapsed = std::chrono::steady_clock::now() - idle_start.value(); - if (elapsed >= timeout_duration) { - LOG_ERROR("Timeout waiting for total_tasks from AICPU after %ld seconds", - std::chrono::duration_cast(elapsed).count()); - LOG_ERROR("AICPU may not have initialized performance profiling"); - return; - } - } - } - - LOG_DEBUG("Expected tasks: %d", expected_tasks); - uint32_t capacity = PLATFORM_PROF_READYQUEUE_SIZE; int total_records_collected = 0; int buffers_processed = 0; collected_perf_records_.clear(); - idle_start.reset(); int empty_poll_count = 0; - // Poll the ready queue - while (total_records_collected < expected_tasks) { + // When expected_tasks is unknown (<=0), we start consuming buffers immediately + // and dynamically check header->total_tasks to learn the final count. + bool tasks_known = (expected_tasks > 0); + if (!tasks_known) { + LOG_INFO("Task count unknown, will poll total_tasks from PerfDataHeader while consuming buffers"); + } + + // Unified polling loop: consume ready buffers and check for total_tasks + while (true) { + // Dynamically discover total_tasks if not yet known + if (!tasks_known) { + rmb(); + int discovered = static_cast(header->total_tasks); + if (discovered > 0) { + expected_tasks = discovered; + tasks_known = true; + LOG_INFO("Task count read from PerfDataHeader: %d", expected_tasks); + } + } + + // Termination: collected enough records + if (tasks_known && total_records_collected >= expected_tasks) { + break; + } + rmb(); uint32_t head = header->queue_head; uint32_t tail = header->queue_tail; if (head == tail) { + // Queue empty — track idle time for timeout if (!idle_start.has_value()) { idle_start = std::chrono::steady_clock::now(); } @@ -189,6 +186,7 @@ void PerformanceCollector::poll_and_collect(int num_aicore, int expected_tasks) continue; } + // Got data — reset idle tracking idle_start.reset(); empty_poll_count = 0; @@ -230,7 +228,7 @@ void PerformanceCollector::poll_and_collect(int num_aicore, int expected_tasks) LOG_INFO("Total buffers processed: %d", buffers_processed); LOG_INFO("Total records collected: %d", total_records_collected); - if (total_records_collected < expected_tasks) { + if (tasks_known && total_records_collected < expected_tasks) { LOG_WARN("Incomplete collection (%d / %d records)", total_records_collected, expected_tasks); }