From 715d1ecfb429eb21a9259d3b605f6bf021d39f1c Mon Sep 17 00:00:00 2001 From: Vidas Date: Mon, 20 Oct 2025 19:35:29 +0300 Subject: [PATCH 1/2] ggml-et: Retrieve kernel errors --- ggml/src/ggml-et/ggml-et-kernels.cpp | 38 ++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/ggml/src/ggml-et/ggml-et-kernels.cpp b/ggml/src/ggml-et/ggml-et-kernels.cpp index f09f269516d..3981fbbcf3d 100644 --- a/ggml/src/ggml-et/ggml-et-kernels.cpp +++ b/ggml/src/ggml-et/ggml-et-kernels.cpp @@ -142,6 +142,44 @@ bool ggml_et_launch_kernel(ggml_backend_et_device_context* dev_ctx, const std::s // Wait for completion (synchronous execution) runtime->waitForStream(dev_ctx->default_stream); + // Check for kernel execution errors + auto errors = runtime->retrieveStreamErrors(dev_ctx->default_stream); + for (const auto& error : errors) { + // Check if there are error contexts (indicates kernel-level errors) + if (error.errorContext_.has_value() && !error.errorContext_->empty()) { + bool found_kernel_error = false; + + for (const auto& ctx : *error.errorContext_) { + // Type 4 = CM_CONTEXT_TYPE_USER_KERNEL_ERROR (kernel returned non-zero) + // Skip uninitialized contexts (debug fill pattern 0xcdcdcdcdcdcdcdcd) + if (ctx.type_ == 4 && ctx.hartId_ != 0xcdcdcdcdcdcdcdcdULL) { + int64_t kernel_return_code = ctx.userDefinedError_; + GGML_LOG_ERROR("ET: Kernel '%s' returned error code %lld on hart %lld (shire %lld)\n", + kernel_name.c_str(), + (long long)kernel_return_code, + (long long)ctx.hartId_, + (long long)(ctx.hartId_ / 64)); + found_kernel_error = true; + + // Only log first failing hart to avoid spam + break; + } + } + + if (found_kernel_error) { + return false; + } + } + + // Handle errors without contexts (other device-level errors) + if (error.errorCode_ != rt::DeviceErrorCode::Unknown) { + GGML_LOG_ERROR("ET: Kernel '%s' failed with device error code %d\n", + kernel_name.c_str(), + (int)error.errorCode_); + return false; + } + } + GGML_LOG_DEBUG("ET: Kernel %s completed successfully\n", kernel_name.c_str()); return true; From d88b94c4ce115b2a8a01ad7f301ddf66e88f888d Mon Sep 17 00:00:00 2001 From: Vidas Date: Mon, 10 Nov 2025 13:39:29 +0200 Subject: [PATCH 2/2] ggml-et: Print device id --- ggml/src/ggml-et/ggml-et-kernels.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-et/ggml-et-kernels.cpp b/ggml/src/ggml-et/ggml-et-kernels.cpp index 3981fbbcf3d..54b7e741005 100644 --- a/ggml/src/ggml-et/ggml-et-kernels.cpp +++ b/ggml/src/ggml-et/ggml-et-kernels.cpp @@ -154,9 +154,10 @@ bool ggml_et_launch_kernel(ggml_backend_et_device_context* dev_ctx, const std::s // Skip uninitialized contexts (debug fill pattern 0xcdcdcdcdcdcdcdcd) if (ctx.type_ == 4 && ctx.hartId_ != 0xcdcdcdcdcdcdcdcdULL) { int64_t kernel_return_code = ctx.userDefinedError_; - GGML_LOG_ERROR("ET: Kernel '%s' returned error code %lld on hart %lld (shire %lld)\n", + GGML_LOG_ERROR("ET: Kernel '%s' returned error code %lld on device %d, hart %lld (shire %lld)\n", kernel_name.c_str(), (long long)kernel_return_code, + dev_ctx->devidx, (long long)ctx.hartId_, (long long)(ctx.hartId_ / 64)); found_kernel_error = true; @@ -173,9 +174,10 @@ bool ggml_et_launch_kernel(ggml_backend_et_device_context* dev_ctx, const std::s // Handle errors without contexts (other device-level errors) if (error.errorCode_ != rt::DeviceErrorCode::Unknown) { - GGML_LOG_ERROR("ET: Kernel '%s' failed with device error code %d\n", + GGML_LOG_ERROR("ET: Kernel '%s' failed with device error code %d on device %d\n", kernel_name.c_str(), - (int)error.errorCode_); + (int)error.errorCode_, + dev_ctx->devidx); return false; } }