From 525d7b0e5e1d7a62ef3a933e7cf492ddd6056899 Mon Sep 17 00:00:00 2001 From: Marco Edoardo Santimaria Date: Mon, 23 Jun 2025 10:30:42 +0200 Subject: [PATCH 1/8] Added try catch block on exception handling --- src/posix/libcapio_posix.cpp | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/src/posix/libcapio_posix.cpp b/src/posix/libcapio_posix.cpp index 93dc776eb..d91739c78 100644 --- a/src/posix/libcapio_posix.cpp +++ b/src/posix/libcapio_posix.cpp @@ -409,7 +409,28 @@ static int hook(long syscall_number, long arg0, long arg1, long arg2, long arg3, } LOG("Handling syscall NO %ld (max num is %ld)", syscall_number, CAPIO_NR_SYSCALLS); - return syscallTable[syscall_number](arg0, arg1, arg2, arg3, arg4, arg5, result); + try { + return syscallTable[syscall_number](arg0, arg1, arg2, arg3, arg4, arg5, result); + } catch (const std::exception &exception) { + syscall_no_intercept_flag = true; + + std::cout + << std::endl + << "~~~~~~~~~~~~~~[\033[31mlibcapio_posix.so: FATAL EXCEPTION\033[0m]~~~~~~~~~~~~~~" + << std::endl + << "| Exception thrown while handling syscall " << syscall_number << std::endl + << "| TID of offending thread: " << syscall_no_intercept(SYS_gettid) << std::endl + << "| PID of offending thread: " << syscall_no_intercept(SYS_getpid) << std::endl + << "| PPID of offending thread: " << syscall_no_intercept(SYS_getppid) << std::endl + << "| " << std::endl + << "| `" << typeid(exception).name() << ": " << exception.what() << std::endl + << "|" << std::endl + << "~~~~~~~~~~~~~~[\033[31mlibcapio_posix.so: FATAL EXCEPTION\033[0m]~~~~~~~~~~~~~~" + << std::endl + << std::endl; + + exit(EXIT_FAILURE); + } } static __attribute__((constructor)) void init() { @@ -434,4 +455,4 @@ static __attribute__((constructor)) void init() { intercept_hook_point_clone_parent = hook_clone_parent; intercept_hook_point = hook; START_SYSCALL_LOGGING(); -} +} \ No newline at end of file From 58804258391ac861faa9ea12dbac16d3d6878b01 Mon Sep 17 
00:00:00 2001 From: Marco Edoardo Santimaria Date: Mon, 23 Jun 2025 11:53:21 +0200 Subject: [PATCH 2/8] Fixed startup issue when child_Stack on SYS_clone == NULL --- README.md | 2 +- src/posix/syscall_intercept/CMakeLists.txt | 5 +++-- src/posix/utils/clone.hpp | 19 +++++++++++++++++-- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index ed472d009..4bd88bf4b 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ the [CAPIO-CL Docs](https://capio.hpc4ai.it/docs/coord-language/) for details. To launch your workflow with capio you can follow two routes: -#### A) Use `capiorun` for simplfied operations +#### A) Use `capiorun` for simplified operations You can simplify the execution of workflow steps with CAPIO using the `capiorun` utility. See the [`capiorun` documentation](capiorun/readme.md) for usage and examples. `capiorun` provides an easier way to manage diff --git a/src/posix/syscall_intercept/CMakeLists.txt b/src/posix/syscall_intercept/CMakeLists.txt index 517f76217..825aada2d 100644 --- a/src/posix/syscall_intercept/CMakeLists.txt +++ b/src/posix/syscall_intercept/CMakeLists.txt @@ -14,11 +14,12 @@ include(ExternalProject) # Import external project from git ##################################### ExternalProject_Add(syscall_intercept - GIT_REPOSITORY https://github.com/pmem/syscall_intercept.git - GIT_TAG ca4b13531f883597c2f04a40e095f76f6c3a6d22 + GIT_REPOSITORY https://github.com/alpha-unito/syscall_intercept + GIT_TAG 3e8695cdf62d17af041eaee987bb26fcee2691ff PREFIX ${CMAKE_CURRENT_BINARY_DIR} CMAKE_ARGS -DBUILD_TESTS=OFF + -DSTATIC_CAPSTONE=ON -DBUILD_EXAMPLES=OFF -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX:PATH= diff --git a/src/posix/utils/clone.hpp b/src/posix/utils/clone.hpp index 0795c8c0d..f24ff92d2 100644 --- a/src/posix/utils/clone.hpp +++ b/src/posix/utils/clone.hpp @@ -21,6 +21,7 @@ inline void register_capio_tid(const pid_t tid) { START_LOG(syscall_no_intercept(SYS_gettid), "call(tid=%ld)", 
tid); const std::lock_guard lg(clone_mutex); tids->insert(tid); + LOG("Pid inserted ? %s", tids->find(tid) == tids->end() ? "NO" : "YES"); } inline void remove_capio_tid(const pid_t tid) { @@ -79,7 +80,7 @@ inline void hook_clone_child() { #endif std::unique_lock lock(clone_mutex); - clone_cv.wait(lock, [&tid] { return tids->find(tid) != tids->end(); }); + clone_cv.wait(lock, [] { return true; }); /** * Freeing memory here through `tids.erase()` can cause a SIGSEGV error @@ -88,6 +89,20 @@ inline void hook_clone_child() { * is removed from the `tids` set only when the thread terminates. */ lock.unlock(); + +#ifdef CAPIO_LOG + /* + * Needed to enable logging when SYS_clone is called with child_stack==NULL. + * In this case, the thread_local variables are initialized and not set to a nullptr. + * For this reason we reset them here + */ + if (logfileOpen) { + logfileOpen = false; + logfileFD = -1; + bzero(logfile_path, PATH_MAX); + } +#endif + START_SYSCALL_LOGGING(); START_LOG(tid, "call()"); LOG("Initializing child thread %d", tid); @@ -114,4 +129,4 @@ inline void hook_clone_parent(long child_tid) { clone_cv.notify_all(); } -#endif // CAPIO_POSIX_UTILS_CLONE_HPP +#endif // CAPIO_POSIX_UTILS_CLONE_HPP \ No newline at end of file From 0c4b7347f9ad307788600297f41476887dda65f0 Mon Sep 17 00:00:00 2001 From: Marco Edoardo Santimaria Date: Mon, 23 Jun 2025 15:56:27 +0200 Subject: [PATCH 3/8] Investigating strange deadlock] --- src/common/capio/queue.hpp | 1 + src/posix/handlers/exit.hpp | 1 + src/posix/libcapio_posix.cpp | 10 ++++++ src/posix/utils/cache.hpp | 3 ++ src/posix/utils/clone.hpp | 67 ++++++++++++++++++++++-------------- src/posix/utils/requests.hpp | 4 +++ 6 files changed, 60 insertions(+), 26 deletions(-) diff --git a/src/common/capio/queue.hpp b/src/common/capio/queue.hpp index fcdb3ed15..83b953e27 100644 --- a/src/common/capio/queue.hpp +++ b/src/common/capio/queue.hpp @@ -71,6 +71,7 @@ template class Queue { _last_elem = (long int *) 
create_shm(_last_elem_name, sizeof(long int)); _shm = get_shm_if_exist(_shm_name); if (_shm == nullptr) { + LOG("Creating shared memory"); *_first_elem = 0; *_last_elem = 0; _shm = create_shm(_shm_name, _buff_size); diff --git a/src/posix/handlers/exit.hpp b/src/posix/handlers/exit.hpp index c966035fc..8ba1c96d0 100644 --- a/src/posix/handlers/exit.hpp +++ b/src/posix/handlers/exit.hpp @@ -28,6 +28,7 @@ int exit_handler(long arg0, long arg1, long arg2, long arg3, long arg4, long arg } delete_caches(); + delete_queues(); LOG("Removed caches"); if (const auto itm = bufs_response->find(tid); itm != bufs_response->end()) { diff --git a/src/posix/libcapio_posix.cpp b/src/posix/libcapio_posix.cpp index d91739c78..11efdb3e9 100644 --- a/src/posix/libcapio_posix.cpp +++ b/src/posix/libcapio_posix.cpp @@ -408,6 +408,16 @@ static int hook(long syscall_number, long arg0, long arg1, long arg2, long arg3, return 1; } + if (syscall_number == SYS_clone +#ifdef SYS_clone3 + || syscall_number == SYS_clone3 +#endif + ) { + clone_after_null_child_stack = arg1 == 0; + LOG("Clone will occur with child_stack == NULL ? %s ", + clone_after_null_child_stack ? 
"true" : "false"); + } + LOG("Handling syscall NO %ld (max num is %ld)", syscall_number, CAPIO_NR_SYSCALLS); try { return syscallTable[syscall_number](arg0, arg1, arg2, arg3, arg4, arg5, result); diff --git a/src/posix/utils/cache.hpp b/src/posix/utils/cache.hpp index 67e07977f..393e2b43e 100644 --- a/src/posix/utils/cache.hpp +++ b/src/posix/utils/cache.hpp @@ -31,7 +31,10 @@ inline void delete_caches() { delete consent_request_cache_fs; delete write_request_cache_mem; delete read_request_cache_mem; +} +inline void delete_queues() { + START_LOG(capio_syscall(SYS_gettid), "call()"); delete cts_queue; LOG("Removed cts_queue"); delete stc_queue; diff --git a/src/posix/utils/clone.hpp b/src/posix/utils/clone.hpp index f24ff92d2..fc7f9cb68 100644 --- a/src/posix/utils/clone.hpp +++ b/src/posix/utils/clone.hpp @@ -12,6 +12,8 @@ inline std::mutex clone_mutex; inline std::condition_variable clone_cv; inline std::unordered_set *tids; +inline bool clone_after_null_child_stack = false; + inline bool is_capio_tid(const pid_t tid) { const std::lock_guard lg(clone_mutex); return tids->find(tid) != tids->end(); @@ -40,13 +42,14 @@ inline void init_process(pid_t tid) { syscall_no_intercept_flag = true; auto *p_buf_response = new ResponseQueue(SHM_COMM_CHAN_NAME_RESP + std::to_string(tid)); - bufs_response->insert(std::make_pair(tid, p_buf_response)); - + LOG("Created buf response"); + bufs_response->insert({tid, p_buf_response}); LOG("Created request response buffer with name: %s", (SHM_COMM_CHAN_NAME_RESP + std::to_string(tid)).c_str()); const char *capio_app_name = get_capio_app_name(); auto pid = static_cast(syscall_no_intercept(SYS_gettid)); + LOG("sending handshake with tid=%ld, pid=%ld", tid, pid); /** * The previous if, for an anonymous handshake was present, however the get_capio_app_name() @@ -61,15 +64,13 @@ inline void init_process(pid_t tid) { inline void hook_clone_child() { auto tid = static_cast(syscall_no_intercept(SYS_gettid)); -#ifdef __CAPIO_POSIX - 
syscall_no_intercept_flag = true; - /* * This piece of code is aimed at addressing issues with applications that spawn several * thousand threads that only do computations. When this occurs, under some circumstances CAPIO * might fail to allocate shared memory objects. As such, if child threads ONLY do computation, * we can safely ignore them with CAPIO. */ + syscall_no_intercept_flag = true; thread_local char *skip_child = std::getenv("CAPIO_IGNORE_CHILD_THREADS"); if (skip_child != nullptr) { auto skip_child_str = std::string(skip_child); @@ -77,31 +78,46 @@ inline void hook_clone_child() { return; } } + syscall_no_intercept_flag = false; -#endif - std::unique_lock lock(clone_mutex); - clone_cv.wait(lock, [] { return true; }); - - /** - * Freeing memory here through `tids.erase()` can cause a SIGSEGV error - * in the libc, which tries to load the `__ctype_b_loc` table but fails - * because it is not initialized yet. For this reason, a thread's `tid` - * is removed from the `tids` set only when the thread terminates. - */ - lock.unlock(); + if (!clone_after_null_child_stack) { + syscall_no_intercept_flag = true; + std::unique_lock lock(clone_mutex); + clone_cv.wait(lock, [&tid] { return tids->find(tid) != tids->end(); }); + /** + * Freeing memory here through `tids.erase()` can cause a SIGSEGV error + * in the libc, which tries to load the `__ctype_b_loc` table but fails + * because it is not initialized yet. For this reason, a thread's `tid` + * is removed from the `tids` set only when the thread terminates. + */ + lock.unlock(); + syscall_no_intercept_flag = false; + } else { + /* + * Needed to enable logging when SYS_clone is called with child_stack==NULL. + * In this case, the thread_local variables are initialized and not set to a nullptr. + * For this reason, we reset them here + */ #ifdef CAPIO_LOG - /* - * Needed to enable logging when SYS_clone is called with child_stack==NULL. 
- * In this case, the thread_local variables are initialized and not set to a nullptr. - * For this reason we reset them here - */ - if (logfileOpen) { logfileOpen = false; logfileFD = -1; bzero(logfile_path, PATH_MAX); - } #endif + // We cannot perform delete, as it will destroy also shm objects. put ptr to nullptr + // and accept a small memory leak + bufs_response = nullptr; + buf_requests = nullptr; + stc_queue = nullptr; + cts_queue = nullptr; + write_request_cache_fs = nullptr; + read_request_cache_fs = nullptr; + consent_request_cache_fs = nullptr; + write_request_cache_mem = nullptr; + read_request_cache_mem = nullptr; + init_client(); + clone_after_null_child_stack = false; + } START_SYSCALL_LOGGING(); START_LOG(tid, "call()"); @@ -110,9 +126,6 @@ inline void hook_clone_child() { LOG("Child thread %d initialized", tid); LOG("Starting child thread %d", tid); init_caches(); -#ifdef __CAPIO_POSIX - syscall_no_intercept_flag = false; -#endif } inline void hook_clone_parent(long child_tid) { @@ -125,6 +138,8 @@ inline void hook_clone_parent(long child_tid) { return; } + clone_after_null_child_stack = false; + register_capio_tid(child_tid); clone_cv.notify_all(); } diff --git a/src/posix/utils/requests.hpp b/src/posix/utils/requests.hpp index 661642cad..c7828726d 100644 --- a/src/posix/utils/requests.hpp +++ b/src/posix/utils/requests.hpp @@ -26,9 +26,13 @@ inline thread_local SPSCQueue *stc_queue; * @return */ inline void init_client() { + START_LOG(capio_syscall(SYS_gettid), "call()"); + buf_requests = new CircularBuffer(SHM_COMM_CHAN_NAME, CAPIO_REQ_BUFF_CNT, CAPIO_REQ_MAX_SIZE); + LOG("Initialized buf_requests"); bufs_response = new std::unordered_map(); + LOG("Initialized bufs_response"); } /** From 078ccf419365f1e957ec43424ea488b002008831 Mon Sep 17 00:00:00 2001 From: Marco Edoardo Santimaria <39337626+marcoSanti@users.noreply.github.com> Date: Tue, 24 Jun 2025 12:18:30 +0200 Subject: [PATCH 4/8] Bump version of taywee/args (#142) Updated dependency 
version of taywee/args to address deprecation of cmake <= 3.10 Co-authored-by: Marco Edoardo Santimaria --- src/server/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/server/CMakeLists.txt b/src/server/CMakeLists.txt index 612791f88..31d0fbf9b 100644 --- a/src/server/CMakeLists.txt +++ b/src/server/CMakeLists.txt @@ -13,7 +13,7 @@ set(TARGET_SOURCES FetchContent_Declare( args GIT_REPOSITORY https://github.com/Taywee/args.git - GIT_TAG 6.4.6 + GIT_TAG 6.4.7 ) FetchContent_Declare( simdjson From ce37d68625944aef7e267105f57dd018d328bb7c Mon Sep 17 00:00:00 2001 From: Marco Edoardo Santimaria Date: Tue, 24 Jun 2025 12:36:46 +0000 Subject: [PATCH 5/8] bump syscall intercept commit --- src/posix/syscall_intercept/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/posix/syscall_intercept/CMakeLists.txt b/src/posix/syscall_intercept/CMakeLists.txt index 825aada2d..99ee799a9 100644 --- a/src/posix/syscall_intercept/CMakeLists.txt +++ b/src/posix/syscall_intercept/CMakeLists.txt @@ -15,7 +15,7 @@ include(ExternalProject) ##################################### ExternalProject_Add(syscall_intercept GIT_REPOSITORY https://github.com/alpha-unito/syscall_intercept - GIT_TAG 3e8695cdf62d17af041eaee987bb26fcee2691ff + GIT_TAG 623aa8415b2ff0b9b81bd08f791850723edea18c PREFIX ${CMAKE_CURRENT_BINARY_DIR} CMAKE_ARGS -DBUILD_TESTS=OFF From 68185ed139afbb8ff589d6ee37968bfcab65f8bd Mon Sep 17 00:00:00 2001 From: Marco Edoardo Santimaria Date: Tue, 24 Jun 2025 13:01:37 +0000 Subject: [PATCH 6/8] fix? --- src/posix/utils/clone.hpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/posix/utils/clone.hpp b/src/posix/utils/clone.hpp index fc7f9cb68..1aae88957 100644 --- a/src/posix/utils/clone.hpp +++ b/src/posix/utils/clone.hpp @@ -106,8 +106,6 @@ inline void hook_clone_child() { #endif // We cannot perform delete, as it will destroy also shm objects. 
put ptr to nullptr // and accept a small memory leak - bufs_response = nullptr; - buf_requests = nullptr; stc_queue = nullptr; cts_queue = nullptr; write_request_cache_fs = nullptr; @@ -115,8 +113,8 @@ inline void hook_clone_child() { consent_request_cache_fs = nullptr; write_request_cache_mem = nullptr; read_request_cache_mem = nullptr; - init_client(); clone_after_null_child_stack = false; + } START_SYSCALL_LOGGING(); From 6fadd09f9d2920ce2c8ed7b7e8d168ed4e62f9da Mon Sep 17 00:00:00 2001 From: Marco Edoardo Santimaria Date: Wed, 25 Jun 2025 09:33:27 +0200 Subject: [PATCH 7/8] wip --- src/posix/libcapio_posix.cpp | 2 +- src/posix/utils/clone.hpp | 29 +++++++++++++++++++---------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/src/posix/libcapio_posix.cpp b/src/posix/libcapio_posix.cpp index 11efdb3e9..8f110833e 100644 --- a/src/posix/libcapio_posix.cpp +++ b/src/posix/libcapio_posix.cpp @@ -410,7 +410,7 @@ static int hook(long syscall_number, long arg0, long arg1, long arg2, long arg3, if (syscall_number == SYS_clone #ifdef SYS_clone3 - || syscall_number == SYS_clone3 + || syscall_number == SYS_clone3 #endif ) { clone_after_null_child_stack = arg1 == 0; diff --git a/src/posix/utils/clone.hpp b/src/posix/utils/clone.hpp index 1aae88957..ba7c556d8 100644 --- a/src/posix/utils/clone.hpp +++ b/src/posix/utils/clone.hpp @@ -42,8 +42,18 @@ inline void init_process(pid_t tid) { syscall_no_intercept_flag = true; auto *p_buf_response = new ResponseQueue(SHM_COMM_CHAN_NAME_RESP + std::to_string(tid)); - LOG("Created buf response"); - bufs_response->insert({tid, p_buf_response}); + + DBG(tid, [](auto bufs_response, auto tid) { + START_LOG(tid, "call(DBG)"); + LOG("Created buf response. buf_response map initialized ? %s", + bufs_response != nullptr ? 
"YES" : "NO"); + LOG("buf_Response size for tid %d: %ld", tid, bufs_response->size()); + for (auto &[fst, snd] : *bufs_response) { + LOG("Found entry for tid %ld", fst); + } + }(bufs_response, tid)); + + bufs_response->insert(std::make_pair(tid, p_buf_response)); LOG("Created request response buffer with name: %s", (SHM_COMM_CHAN_NAME_RESP + std::to_string(tid)).c_str()); @@ -106,15 +116,14 @@ inline void hook_clone_child() { #endif // We cannot perform delete, as it will destroy also shm objects. put ptr to nullptr // and accept a small memory leak - stc_queue = nullptr; - cts_queue = nullptr; - write_request_cache_fs = nullptr; - read_request_cache_fs = nullptr; - consent_request_cache_fs = nullptr; - write_request_cache_mem = nullptr; - read_request_cache_mem = nullptr; + stc_queue = nullptr; + cts_queue = nullptr; + write_request_cache_fs = nullptr; + read_request_cache_fs = nullptr; + consent_request_cache_fs = nullptr; + write_request_cache_mem = nullptr; + read_request_cache_mem = nullptr; clone_after_null_child_stack = false; - } START_SYSCALL_LOGGING(); From deb9f13e99f5ded5749c2f15194a0c38e277b049 Mon Sep 17 00:00:00 2001 From: Marco Edoardo Santimaria Date: Wed, 25 Jun 2025 14:21:53 +0200 Subject: [PATCH 8/8] fix? 
--- src/posix/libcapio_posix.cpp | 16 ++++++++++++++-- src/posix/utils/clone.hpp | 17 +++++++++-------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/src/posix/libcapio_posix.cpp b/src/posix/libcapio_posix.cpp index 8f110833e..2ea764587 100644 --- a/src/posix/libcapio_posix.cpp +++ b/src/posix/libcapio_posix.cpp @@ -392,8 +392,8 @@ static int hook(long syscall_number, long arg0, long arg1, long arg2, long arg3, #ifdef CAPIO_LOG CAPIO_LOG_LEVEL = get_capio_log_level(); #endif - - START_LOG(syscall_no_intercept(SYS_gettid), "call(syscall_number=%ld)", syscall_number); + long tid = syscall_no_intercept(SYS_gettid); + START_LOG(tid, "call(syscall_number=%ld)", syscall_number); // If the syscall_number is higher than the maximum // syscall captured by CAPIO, simply return @@ -408,6 +408,18 @@ static int hook(long syscall_number, long arg0, long arg1, long arg2, long arg3, return 1; } + if (clone_after_null_child_stack) { + + LOG("Initializing bufs_response to new empty object."); + bufs_response = new std::unordered_map(); + LOG("Inizializing process"); + init_process(tid); + LOG("Child thread %d initialized", tid); + LOG("Starting child thread %d", tid); + init_caches(); + clone_after_null_child_stack = false; + } + if (syscall_number == SYS_clone #ifdef SYS_clone3 || syscall_number == SYS_clone3 diff --git a/src/posix/utils/clone.hpp b/src/posix/utils/clone.hpp index ba7c556d8..18241d8dc 100644 --- a/src/posix/utils/clone.hpp +++ b/src/posix/utils/clone.hpp @@ -116,14 +116,15 @@ inline void hook_clone_child() { #endif // We cannot perform delete, as it will destroy also shm objects. 
put ptr to nullptr // and accept a small memory leak - stc_queue = nullptr; - cts_queue = nullptr; - write_request_cache_fs = nullptr; - read_request_cache_fs = nullptr; - consent_request_cache_fs = nullptr; - write_request_cache_mem = nullptr; - read_request_cache_mem = nullptr; - clone_after_null_child_stack = false; + stc_queue = nullptr; + cts_queue = nullptr; + write_request_cache_fs = nullptr; + read_request_cache_fs = nullptr; + consent_request_cache_fs = nullptr; + write_request_cache_mem = nullptr; + read_request_cache_mem = nullptr; + START_SYSCALL_LOGGING(); + return; } START_SYSCALL_LOGGING();