diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index 7e53a57b7b0..cfd0d5ad12c 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -98,6 +98,11 @@ function(ggml_add_cpu_backend_variant_impl tag_name) target_link_libraries(${GGML_CPU_NAME} PUBLIC memkind) endif() + if (GGML_CPU_OP_PROFILING) + message(STATUS "Enabling operator-level CPU profiling for ggml-cpu target") + target_compile_definitions(${GGML_CPU_NAME} PRIVATE GGML_CPU_OP_PROFILING) + endif() + if (GGML_SYSTEM_ARCH STREQUAL "ARM") message(STATUS "ARM detected") list(APPEND GGML_CPU_SOURCES diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 3247af8bb03..5c94f803cf7 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -701,6 +701,62 @@ static void ggml_init_arm_arch_features(void) { #endif // __ARM_ARCH + +// +// Profiling +// +#ifdef GGML_CPU_OP_PROFILING +static atomic_uint_fast64_t start_clock_us; +static FILE * g_performance_log_csv = NULL; + +static void write_time(const char * op_name) { + const uint64_t end_time_us = ggml_time_us(); + const uint64_t start_time_us = atomic_load(&start_clock_us); + const uint64_t duration_us = end_time_us - start_time_us; + + if (g_performance_log_csv) { + fprintf(g_performance_log_csv, "%s,%.4f\n", op_name, duration_us / 1000.0); + fflush(g_performance_log_csv); + } + atomic_store(&start_clock_us, ggml_time_us()); +} + +static void write_barrier(struct ggml_threadpool * tp) { + int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed); + if (n_threads == 1) { + return; + } + +#ifdef GGML_USE_OPENMP + #pragma omp barrier +#else + int n_passed = atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed); + + // enter barrier (full seq-cst fence) + int n_barrier = atomic_fetch_add_explicit(&tp->n_barrier, 1, memory_order_seq_cst); + + if (n_barrier == (n_threads - 1)) { + atomic_store_explicit(&tp->n_barrier, 0, memory_order_relaxed); + atomic_fetch_add_explicit(&tp->n_barrier_passed, 1, memory_order_seq_cst); + return; + } + + // wait for other threads + while (atomic_load_explicit(&tp->n_barrier_passed, memory_order_relaxed) == n_passed) { + ggml_thread_cpu_relax(); + } + + // exit barrier (full seq-cst fence) + // TSAN doesn't support standalone fence yet, we use a dummy read-modify-write instead + #ifdef GGML_TSAN_ENABLED + atomic_fetch_add_explicit(&tp->n_barrier_passed, 0, memory_order_seq_cst); + #else + atomic_thread_fence(memory_order_seq_cst); + #endif +#endif +} +#endif + struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value) { GGML_ASSERT(!ggml_get_no_alloc(ctx)); @@ -2912,6 +2968,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { /*.threadpool=*/ tp, }; +#ifdef GGML_CPU_OP_PROFILING + if (state->ith == 0) { + atomic_store(&start_clock_us, ggml_time_us()); + } +#endif + for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) { struct ggml_tensor * node = cgraph->nodes[node_n]; @@ -2931,6 +2993,12 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { if (node_n + 1 < cgraph->n_nodes) { ggml_barrier(state->threadpool); } +#ifdef GGML_CPU_OP_PROFILING + if (state->ith == 0) { + write_time(ggml_op_name(node->op)); + } + write_barrier(state->threadpool); +#endif } ggml_barrier(state->threadpool); @@ -3169,6 +3237,11 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl GGML_ASSERT(cplan); GGML_ASSERT(cplan->n_threads > 0); GGML_ASSERT(cplan->work_size == 0 || cplan->work_data != NULL); +#ifdef GGML_CPU_OP_PROFILING + if (g_performance_log_csv == NULL) { + g_performance_log_csv = fopen("op_profiling.csv", "a"); + } +#endif int n_threads = cplan->n_threads; struct ggml_threadpool * threadpool = cplan->threadpool; @@ -3237,6 +3310,13 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl ggml_threadpool_free(threadpool); } +#ifdef GGML_CPU_OP_PROFILING + if (g_performance_log_csv != NULL) { + fclose(g_performance_log_csv); + g_performance_log_csv = NULL; + } +#endif + return ret; }