From 9bc7a73a31c48fd4fbdaf840aa972db33ab886f3 Mon Sep 17 00:00:00 2001 From: mike dupont Date: Sat, 4 Nov 2023 00:20:47 -0400 Subject: [PATCH 01/12] save these for later review --- CMakeLists.txt | 6 ++ cmake/FindCuda.cmake | 20 ++--- common/DcgmLogging.h | 2 +- dcgmlib/format.hpp | 147 ++++++++++++++++++++++++++++++++++++ nvvs/include/JsonResult.hpp | 5 ++ 5 files changed, 169 insertions(+), 11 deletions(-) create mode 100644 dcgmlib/format.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 4ffbaa30..c92d1e7e 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,8 @@ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +set(CUDA12_INCLUDE_DIR "/usr/local/cuda-12.3/include") +set(CUDA12_INCLUDE_DIR "/usr/local/cuda-12.3/include") +include_directories("/usr/local/cuda-12.3/targets/x86_64-linux/include") +include_directories(${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -312,6 +316,8 @@ add_custom_command( ) add_custom_target(dcgm_decode_db ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/dcgm_decode_db.txt) + + set(DCGM_PRIVATE_DIR "${CMAKE_SOURCE_DIR}/dcgm_private") if (EXISTS ${DCGM_PRIVATE_DIR}) add_subdirectory(${DCGM_PRIVATE_DIR}) diff --git a/cmake/FindCuda.cmake b/cmake/FindCuda.cmake index 88dc0284..90dde51a 100644 --- a/cmake/FindCuda.cmake +++ b/cmake/FindCuda.cmake @@ -16,8 +16,8 @@ include(utils) -set(Cuda10_prefix usr/local/cuda-10.0) -set(Cuda11_prefix usr/local/cuda-11.8) +#set(Cuda10_prefix usr/local/cuda-10.0) +#set(Cuda11_prefix usr/local/cuda-11.8) set(Cuda12_prefix usr/local/cuda-12.0) macro (load_cuda cuda_version) @@ -95,18 +95,18 @@ macro (load_cuda cuda_version) endmacro() -if (NOT DEFINED CUDA10_INCLUDE_DIR AND NOT ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64") - load_cuda(10) -endif() +#if (NOT DEFINED CUDA10_INCLUDE_DIR AND NOT ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64") +# load_cuda(10) +#endif() -if (NOT DEFINED CUDA11_INCLUDE_DIR) - load_cuda(11) -endif() +#if (NOT DEFINED CUDA11_INCLUDE_DIR) +# load_cuda(11) +#endif() if (NOT DEFINED CUDA12_INCLUDE_DIR) load_cuda(12) endif() -unset(Cuda10_prefix) -unset(Cuda11_prefix) +#unset(Cuda10_prefix) +#unset(Cuda11_prefix) unset(Cuda12_prefix) diff --git a/common/DcgmLogging.h b/common/DcgmLogging.h index 013d16f8..533002ce 100755 --- a/common/DcgmLogging.h +++ b/common/DcgmLogging.h @@ -31,7 +31,7 @@ #include #include #include - +#include "format.hpp" // now define formatting for classes in this module #define DCGM_LOGGING_SEVERITY_OPTIONS "NONE, FATAL, ERROR, WARN, INFO, DEBUG, VERB" #define DCGM_LOGGING_SEVERITY_STRING_VERBOSE "VERB" diff --git a/dcgmlib/format.hpp b/dcgmlib/format.hpp new file mode 100644 index 00000000..36752f1d --- /dev/null +++ b/dcgmlib/format.hpp @@ -0,0 +1,147 @@ +#ifndef DCGM_FORMAT_H +#define DCGM_FORMAT_H +//#define FMT_HEADER_ONLY +//#include +//#include // std::FILE + +// template +// FMT_CONSTEXPR auto format(const T& val, FormatContext& ctx) const +// -> decltype(ctx.out()); + +// template<> +// struct fmt::formatter +// : formatter { +// constexpr auto format(dcgmReturn_enum value, auto& format_context) +// { +// return formatter::format((int)value, format_context); +// } +// }; +// template<> +// struct fmt::formatter +// : formatter { +// constexpr auto format(dcgm_field_entity_group_t value, auto& format_context) +// { +// return formatter::format((int)value, format_context); +// } +// }; + +inline int format_as(dcgmReturn_enum type) { + return static_cast(type); +} +inline int format_as(dcgm_field_entity_group_t type) { + return static_cast(type); +} + +inline int format_as(dcgmGroupType_enum type) { + return static_cast(type); +} + +inline int format_as(dcgmModuleId_t type) { + return static_cast(type); +} + +inline int format_as(dcgmDiagnosticLevel_t type) { + return static_cast(type); +} + +inline int format_as(dcgmPolicyValidation_enum type) { + return static_cast(type); +} + +inline int format_as(dcgmHealthSystems_enum type) { + return static_cast(type); +} + +inline int format_as(dcgmPolicyCondition_enum type) { + return static_cast(type); +} + +inline int format_as(dcgmConfigType_enum type) { + return static_cast(type); +} + +inline int format_as(dcgmOrder_enum type) { + return static_cast(type); +} + + + + + + + + + + + +// //dcgmModuleId_t +// // A rudimentary dcgmReturn_enum formatter. +// template struct fmt::formatter { +// public: +// FMT_CONSTEXPR auto parse(basic_format_parse_context& ctx) +// -> decltype(ctx.begin()) { +// auto begin = ctx.begin(), end = ctx.end(); +// return begin; +// } + +// template +// auto format(dcgmReturn_enum wd, FormatContext& ctx) const -> decltype(ctx.out()) { +// int w = static_cast(wd); +// return "TODO";// What goes here? +// } + +// }; + +// //std::ostream &operator<<(std::ostream& os, ) { +// /// return os << static_cast(c); +// //} +// template +// struct fmt::formatter : fmt::formatter +// { +// constexpr auto format(T value, auto& format_context) +// { +// return formatter::format(magic_enum::enum_name(value), format_context); +// } +// }; + +// namespace fmt +// { + +// // Specialize fmt::formatter for dcgmReturn_enum +// template <> +// class formatter { +// public: +// // Parse the format specification +// //auto parse(format_parse_context ctx) -> decltype(ctx.begin()) { +// // // Use the base class method to parse the format specification +// // return 0;//formatter::parse(ctx); +// //} + +// // Format the value of dcgmReturn_enum +// template +// auto format(dcgmReturn_enum value, FormatContext& ctx) -> decltype(ctx.out()) { +// // Use the base class method to format the value as an int +// return formatter::format(static_cast(value), ctx); +// } +// }; + +// template <> +// class formatter { +// public: +// // Parse the format specification +// auto parse(format_parse_context& ctx) -> decltype(ctx.begin()) { +// // Use the base class method to parse the format specification +// return 0;//formatter::parse(ctx); +// } + +// // Format the value of dcgmReturn_enum +// template +// auto format(dcgmReturn_enum &value, FormatContext& ctx) -> decltype(ctx.out()) { +// // Use the base class method to format the value as an int +// return formatter::format(static_cast(value), ctx); +// } +// }; + +// }; + +#endif diff --git a/nvvs/include/JsonResult.hpp b/nvvs/include/JsonResult.hpp index 7f44ac0e..fd35f836 100644 --- a/nvvs/include/JsonResult.hpp +++ b/nvvs/include/JsonResult.hpp @@ -28,6 +28,10 @@ #include #include +inline int format_as(nvvsReturn_t type) { + return static_cast(type); +} + /* * This file contains the JSON serialization and deserialization logic and structures for the NVVS JSON result format. * Here is an example of the JSON format: @@ -69,6 +73,7 @@ namespace DcgmNs::Nvvs::Json { + struct Info { std::vector messages; From 8182a651bb3ff4464af80bdccdd215a49d4f7a82 Mon Sep 17 00:00:00 2001 From: mike dupont Date: Sat, 4 Nov 2023 00:52:55 -0400 Subject: [PATCH 02/12] starting to compile all the way still have to remove duplicate decls --- common/DcgmMutex.cpp | 2 + common/DcgmWatchTable.cpp | 2 +- dcgmlib/format.hpp | 167 +++------------------ dcgmlib/src/DcgmCacheManager.cpp | 6 +- dcgmlib/src/DcgmGroupManager.cpp | 2 +- dcgmlib/src/DcgmHostEngineHandler.cpp | 2 +- dcgmlib/src/DcgmVgpu.cpp | 2 +- modules/nvswitch/DcgmNvSwitchManager.cpp | 5 +- nvml-injection/include/InjectionArgument.h | 6 + 9 files changed, 43 insertions(+), 151 deletions(-) diff --git a/common/DcgmMutex.cpp b/common/DcgmMutex.cpp index 1fa19488..bf727efc 100644 --- a/common/DcgmMutex.cpp +++ b/common/DcgmMutex.cpp @@ -21,6 +21,8 @@ #include #include +inline int format_as(dcgmMutexSt type) { return static_cast(type);} + /*****************************************************************************/ DcgmMutex::DcgmMutex(int timeoutMs) // Cast to long long to avoid overflowing before widening to a long long diff --git a/common/DcgmWatchTable.cpp b/common/DcgmWatchTable.cpp index 26b1ef87..077ac5b7 100755 --- a/common/DcgmWatchTable.cpp +++ b/common/DcgmWatchTable.cpp @@ -18,7 +18,7 @@ #include "DcgmLogging.h" #include "DcgmUtilities.h" #include "DcgmWatchTable.h" - +inline int format_as(DcgmWatcherType_t type) { return static_cast(type);} /*****************************************************************************/ DcgmWatchTable::DcgmWatchTable() : m_entityWatchHashTable() diff --git a/dcgmlib/format.hpp b/dcgmlib/format.hpp index 36752f1d..2d85a999 100644 --- a/dcgmlib/format.hpp +++ b/dcgmlib/format.hpp @@ -1,147 +1,30 @@ #ifndef DCGM_FORMAT_H #define DCGM_FORMAT_H -//#define FMT_HEADER_ONLY -//#include -//#include // std::FILE - -// template -// FMT_CONSTEXPR auto format(const T& val, FormatContext& ctx) const -// -> decltype(ctx.out()); - -// template<> -// struct fmt::formatter -// : formatter { -// constexpr auto format(dcgmReturn_enum value, auto& format_context) -// { -// return formatter::format((int)value, format_context); -// } -// }; -// template<> -// struct fmt::formatter -// : formatter { -// constexpr auto format(dcgm_field_entity_group_t value, auto& format_context) -// { -// return formatter::format((int)value, format_context); -// } -// }; - -inline int format_as(dcgmReturn_enum type) { - return static_cast(type); -} -inline int format_as(dcgm_field_entity_group_t type) { - return static_cast(type); -} - -inline int format_as(dcgmGroupType_enum type) { - return static_cast(type); -} - -inline int format_as(dcgmModuleId_t type) { - return static_cast(type); -} - -inline int format_as(dcgmDiagnosticLevel_t type) { - return static_cast(type); -} - -inline int format_as(dcgmPolicyValidation_enum type) { - return static_cast(type); -} - -inline int format_as(dcgmHealthSystems_enum type) { - return static_cast(type); -} - -inline int format_as(dcgmPolicyCondition_enum type) { - return static_cast(type); -} - -inline int format_as(dcgmConfigType_enum type) { - return static_cast(type); -} - -inline int format_as(dcgmOrder_enum type) { - return static_cast(type); -} - - - - - - - - - - - -// //dcgmModuleId_t -// // A rudimentary dcgmReturn_enum formatter. -// template struct fmt::formatter { -// public: -// FMT_CONSTEXPR auto parse(basic_format_parse_context& ctx) -// -> decltype(ctx.begin()) { -// auto begin = ctx.begin(), end = ctx.end(); -// return begin; -// } - -// template -// auto format(dcgmReturn_enum wd, FormatContext& ctx) const -> decltype(ctx.out()) { -// int w = static_cast(wd); -// return "TODO";// What goes here? -// } - -// }; - -// //std::ostream &operator<<(std::ostream& os, ) { -// /// return os << static_cast(c); -// //} -// template -// struct fmt::formatter : fmt::formatter -// { -// constexpr auto format(T value, auto& format_context) -// { -// return formatter::format(magic_enum::enum_name(value), format_context); -// } -// }; - -// namespace fmt -// { - -// // Specialize fmt::formatter for dcgmReturn_enum -// template <> -// class formatter { -// public: -// // Parse the format specification -// //auto parse(format_parse_context ctx) -> decltype(ctx.begin()) { -// // // Use the base class method to parse the format specification -// // return 0;//formatter::parse(ctx); -// //} - -// // Format the value of dcgmReturn_enum -// template -// auto format(dcgmReturn_enum value, FormatContext& ctx) -> decltype(ctx.out()) { -// // Use the base class method to format the value as an int -// return formatter::format(static_cast(value), ctx); -// } -// }; - -// template <> -// class formatter { -// public: -// // Parse the format specification -// auto parse(format_parse_context& ctx) -> decltype(ctx.begin()) { -// // Use the base class method to parse the format specification -// return 0;//formatter::parse(ctx); -// } - -// // Format the value of dcgmReturn_enum -// template -// auto format(dcgmReturn_enum &value, FormatContext& ctx) -> decltype(ctx.out()) { -// // Use the base class method to format the value as an int -// return formatter::format(static_cast(value), ctx); -// } -// }; +#ifdef DcgmWatcherType_t +inline int format_as(DcgmWatcherType_t type) { return static_cast(type);} +#endif +inline int format_as(dcgmChipArchitecture_enum type) { return static_cast(type);} +inline int format_as(dcgmConfigType_enum type) { return static_cast(type);} +inline int format_as(dcgmDiagnosticLevel_t type) { return static_cast(type);} +//inline int format_as(dcgmEntityStatusType_enum type) { return static_cast(type);} +inline int format_as(dcgmGroupType_enum type) { return static_cast(type);} +inline int format_as(dcgmHealthSystems_enum type) { return static_cast(type);} +inline int format_as(dcgmModuleId_t type) { return static_cast(type);} +inline int format_as(dcgmModuleStatus_t type) { return static_cast(type);} +//inline int format_as(dcgmMutexSt type) { return static_cast(type);} + +#ifdef dcgmNvLinkLinkState +inline int format_as(dcgmNvLinkLinkState type) { return static_cast(type);} +#endif +inline int format_as(dcgmOrder_enum type) { return static_cast(type);} +inline int format_as(dcgmPolicyCondition_enum type) { return static_cast(type);} +inline int format_as(dcgmPolicyValidation_enum type) { return static_cast(type);} +inline int format_as(dcgmPolicyAction_enum type) { return static_cast(type);} -// }; +inline int format_as(dcgmReturn_enum type) { return static_cast(type);} +inline int format_as(dcgm_field_entity_group_t type) { return static_cast(type);} +#ifdef nvmlReturn_enum +inline int format_as(nvmlReturn_enum type) { return static_cast(type);} +#endif #endif diff --git a/dcgmlib/src/DcgmCacheManager.cpp b/dcgmlib/src/DcgmCacheManager.cpp index e62d81ac..ad81c1ed 100644 --- a/dcgmlib/src/DcgmCacheManager.cpp +++ b/dcgmlib/src/DcgmCacheManager.cpp @@ -27,10 +27,8 @@ #include #include #include - #include #include - #include #include #include @@ -43,6 +41,10 @@ #include #include +inline int format_as(nvmlReturn_enum type) { return static_cast(type);} +inline int format_as(dcgmNvLinkLinkState_t type) { return static_cast(type);} +inline int format_as(DcgmWatcherType_t type) { return static_cast(type);} +inline int format_as(dcgmEntityStatusType_enum type) { return static_cast(type);} #define DRIVER_VERSION_510 510 diff --git a/dcgmlib/src/DcgmGroupManager.cpp b/dcgmlib/src/DcgmGroupManager.cpp index f49d9b18..180fad8b 100755 --- a/dcgmlib/src/DcgmGroupManager.cpp +++ b/dcgmlib/src/DcgmGroupManager.cpp @@ -25,7 +25,7 @@ #include "DcgmSettings.h" #include #include - +inline int format_as(dcgmEntityStatusType_enum type) { return static_cast(type);} /***************************************************************************** * Implementation for Group Manager Class *****************************************************************************/ diff --git a/dcgmlib/src/DcgmHostEngineHandler.cpp b/dcgmlib/src/DcgmHostEngineHandler.cpp index 1a9b2a0e..01c9aab5 100644 --- a/dcgmlib/src/DcgmHostEngineHandler.cpp +++ b/dcgmlib/src/DcgmHostEngineHandler.cpp @@ -45,7 +45,7 @@ #include #include #endif - +inline int format_as(DcgmWatcherType_t type) { return static_cast(type);} DcgmHostEngineHandler *DcgmHostEngineHandler::mpHostEngineHandlerInstance = nullptr; DcgmModuleCore DcgmHostEngineHandler::mModuleCoreObj; diff --git a/dcgmlib/src/DcgmVgpu.cpp b/dcgmlib/src/DcgmVgpu.cpp index 2b260f0c..5f49f747 100644 --- a/dcgmlib/src/DcgmVgpu.cpp +++ b/dcgmlib/src/DcgmVgpu.cpp @@ -21,7 +21,7 @@ #include "dcgm_structs.h" #include "dcgm_structs_internal.h" #include - +inline int format_as(nvmlReturn_enum type) { return static_cast(type);} /*****************************************************************************/ static std::string_view ConvertNvmlGridLicenseStateToString(unsigned int licenseState) { diff --git a/modules/nvswitch/DcgmNvSwitchManager.cpp b/modules/nvswitch/DcgmNvSwitchManager.cpp index aaf189d6..41b09155 100755 --- a/modules/nvswitch/DcgmNvSwitchManager.cpp +++ b/modules/nvswitch/DcgmNvSwitchManager.cpp @@ -17,16 +17,15 @@ #include #include #include - #include #include - #include "FieldIds.h" #include "NvSwitchData.h" #include "UpdateFunctions.h" - #include "DcgmNvSwitchManager.h" +inline int format_as(dcgmNvLinkLinkState_t type) { return static_cast(type);} + namespace DcgmNs { using phys_id_t = uint32_t; diff --git a/nvml-injection/include/InjectionArgument.h b/nvml-injection/include/InjectionArgument.h index 4338aab0..975121af 100644 --- a/nvml-injection/include/InjectionArgument.h +++ b/nvml-injection/include/InjectionArgument.h @@ -2039,12 +2039,18 @@ class InjectionArgument return m_value.processInfo_v1Ptr; } + /* +nvml-injection/include/InjectionArgument.h:2042:5: error: ‘InjectionArgument::InjectionArgument(nvmlProcessInfo_v2_t*)’ cannot be overloaded with ‘InjectionArgument::InjectionArgument(nvmlProcessInfo_t*)’ + 2042 | InjectionArgument(nvmlProcessInfo_v2_t *processInfo_v2Ptr) + | ^~~~~~~~~~~~~~~~~ + InjectionArgument(nvmlProcessInfo_v2_t *processInfo_v2Ptr) : m_type(INJECTION_PROCESSINFO_V2_PTR) { memset(&m_value, 0, sizeof(m_value)); m_value.processInfo_v2Ptr = processInfo_v2Ptr; } + */ nvmlProcessInfo_v2_t *AsProcessInfo_v2Ptr() const { return m_value.processInfo_v2Ptr; From 45ae4e56e12f783c6ec8ae3d6eacaaeb9c3d41c6 Mon Sep 17 00:00:00 2001 From: mike dupont Date: Sat, 4 Nov 2023 13:51:28 -0400 Subject: [PATCH 03/12] starting to link --- CMakeLists.txt | 7 +++ cmake/FindJsoncpp.cmake | 4 +- cmake/FindLibevent.cmake | 69 +++++--------------------- common/CMakeLists.txt | 6 +++ common/CudaWorker/CMakeLists.txt | 18 +++---- common/CudaWorker/CudaWorkerThread.hpp | 10 ++++ common/tests/TaskRunnerTests.cpp | 4 +- cublas_proxy/CMakeLists.txt | 12 +++-- cublas_proxy/Cuda10/CMakeLists.txt | 4 +- cublas_proxy/Cuda11/CMakeLists.txt | 2 +- 10 files changed, 61 insertions(+), 75 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c92d1e7e..65cf05c8 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,10 @@ # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +set (CMAKE_CUDA_ARCHITECTURES "native") +set (CUDA_TOOLKIT_ROOT_DIR "/usr/local/cuda-12.3") +set (CMAKE_CUDA_COMPILER "/usr/local/cuda-12.3/bin/nvcc") +list(APPEND LIBRARIES ${CUDA_CUBLAS_LIBRARIES}) +enable_language(CUDA) + set(CUDA12_INCLUDE_DIR "/usr/local/cuda-12.3/include") set(CUDA12_INCLUDE_DIR "/usr/local/cuda-12.3/include") include_directories("/usr/local/cuda-12.3/targets/x86_64-linux/include") @@ -453,3 +459,4 @@ configure_file( @ONLY) include(CPack) +cmake_minimum_required(VERSION 3.18) diff --git a/cmake/FindJsoncpp.cmake b/cmake/FindJsoncpp.cmake index 3168f188..2162c834 100644 --- a/cmake/FindJsoncpp.cmake +++ b/cmake/FindJsoncpp.cmake @@ -16,8 +16,8 @@ if (NOT TARGET JsonCpp::JsonCpp) find_package(jsoncpp REQUIRED CONFIG) - set(JSONCPP_STATIC_LIBS jsoncpp_static) - set(JSONCPP_INCLUDE_PATH $) + set(JSONCPP_STATIC_LIBS jsoncpp) + set(JSONCPP_INCLUDE_PATH $) endif() # set(Jsoncpp_PATH_PREFIXES /usr/local "${Jsoncpp_ROOT}" "$ENV{HOME}") # foreach(prefix ${Jsoncpp_PATH_PREFIXES}) diff --git a/cmake/FindLibevent.cmake b/cmake/FindLibevent.cmake index 7bdcdb26..f33fd0c7 100644 --- a/cmake/FindLibevent.cmake +++ b/cmake/FindLibevent.cmake @@ -1,61 +1,18 @@ -# -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# - Find Libevent (a cross event library) -# This module defines -# LIBEVENT_INCLUDE_DIR, where to find Libevent headers -# LIBEVENT_STATIC_LIBS, Libevent static libraries -# Libevent_FOUND, If false, do not try to use libevent +include(FindPackageHandleStandardArgs) -set(Libevent_EXTRA_PREFIXES / /lib /lib64 /usr/local /opt/local "$ENV{HOME}" "${Libevent_ROOT}") -foreach(prefix ${Libevent_EXTRA_PREFIXES}) - list(APPEND Libevent_INCLUDE_PATHS "${prefix}/include") - list(APPEND Libevent_LIB_PATHS "${prefix}/lib" "${prefix}/lib64") -endforeach() +find_library(LIBEVENT_LIBRARY event + PATHS ${LIBEVENT_LIBRARYDIR}) -find_path(LIBEVENT_INCLUDE_DIR evhttp.h event.h PATHS ${Libevent_INCLUDE_PATHS}) -find_library(LIBEVENT_STATIC_LIB NAMES libevent.a libevent_core.a libevent_extra.a PATHS ${Libevent_LIB_PATHS}) -find_library(LIBEVENT_PTHREAD_STATIC_LIB NAMES libevent_pthreads.a PATHS ${Libevent_LIB_PATHS}) +find_path(LIBEVENT_INCLUDE_DIR event.h + PATHS ${LIBEVENT_INCLUDEDIR}) -if (LIBEVENT_INCLUDE_DIR AND LIBEVENT_STATIC_LIB AND LIBEVENT_PTHREAD_STATIC_LIB) - set(Libevent_FOUND TRUE) - add_library(libevent_event_static STATIC IMPORTED) - set_target_properties(libevent_event_static PROPERTIES IMPORTED_LOCATION ${LIBEVENT_STATIC_LIB}) - add_library(libevent_event_pthread STATIC IMPORTED) - set_target_properties(libevent_event_pthread PROPERTIES IMPORTED_LOCATION ${LIBEVENT_PTHREAD_STATIC_LIB}) - set(LIBEVENT_STATIC_LIBS libevent_event_static libevent_event_pthread) -else () - set(Libevent_FOUND FALSE) -endif () - -if (Libevent_FOUND) - if (NOT Libevent_FIND_QUIETLY) - message(STATUS "Found libevent: ${LIBEVENT_LIB}") - endif () -else () - if (Libevent_FIND_REQUIRED) - message(FATAL_ERROR "Could NOT find libevent and libevent_pthread.") - endif () - message(STATUS "libevent and libevent_pthread NOT found.") -endif () - -unset(Libevent_EXTRA_PREFIXES) -unset(LIBEVENT_PTHREAD_STATIC_LIB) -unset(LIBEVENT_STATIC_LIB) +find_package_handle_standard_args(libevent DEFAULT_MSG + LIBEVENT_LIBRARY + LIBEVENT_INCLUDE_DIR) mark_as_advanced( - LIBEVENT_STATIC_LIBS - LIBEVENT_INCLUDE_DIR - ) + LIBEVENT_LIBRARY + LIBEVENT_INCLUDE_DIR) + +set(LIBEVENT_LIBRARIES ${LIBEVENT_LIBRARY}) +set(LIBEVENT_INCLUDE_DIRS ${LIBEVENT_INCLUDE_DIR}) diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index eab6fdcb..57475cfc 100755 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -86,7 +86,11 @@ target_link_libraries(dcgm_common PUBLIC sdk_nvml_essentials_objects fmt::fmt dl + ) +target_link_libraries(dcgm_common PRIVATE "/usr/local/lib/libevent.so") +target_link_libraries(dcgm_common PRIVATE "/usr/local/lib/libevent_pthreads.so") + add_library(common_watch_objects STATIC) target_sources(common_watch_objects PRIVATE @@ -97,3 +101,5 @@ target_sources(common_watch_objects PRIVATE DcgmWatchTable.h ) target_link_libraries(common_watch_objects PUBLIC common_interface) + + diff --git a/common/CudaWorker/CMakeLists.txt b/common/CudaWorker/CMakeLists.txt index f2b0f136..d49ba3e8 100644 --- a/common/CudaWorker/CMakeLists.txt +++ b/common/CudaWorker/CMakeLists.txt @@ -31,21 +31,21 @@ macro(define_dcgm_cuda_worker cuda_version) CudaWorkerThread.hpp ) - if (${cuda_version} GREATER 10) + # if (${cuda_version} GREATER 10) target_sources(${dcgm_cuda_worker_lib} PRIVATE DcgmDgemm.cpp) - else() - target_compile_options(${dcgm_cuda_worker_lib} PRIVATE -Wno-volatile) - endif() + #else() + # target_compile_options(${dcgm_cuda_worker_lib} PRIVATE -Wno-volatile) + #endif() set(LOCAL_DCGM_CUDA_WORKER "${LOCAL_DCGM_CUDA_WORKER};${dcgm_cuda_worker_lib}" PARENT_SCOPE) endmacro() set(LOCAL_DCGM_CUDA_WORKER "") -if (NOT ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64") - define_dcgm_cuda_worker(10) -endif() -define_dcgm_cuda_worker(11) +#if (NOT ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64") +# define_dcgm_cuda_worker(10) +#endif() +#define_dcgm_cuda_worker(11) define_dcgm_cuda_worker(12) -set(DCGM_CUDA_WORKER ${LOCAL_DCGM_CUDA_WORKER} PARENT_SCOPE) \ No newline at end of file +set(DCGM_CUDA_WORKER ${LOCAL_DCGM_CUDA_WORKER} PARENT_SCOPE) diff --git a/common/CudaWorker/CudaWorkerThread.hpp b/common/CudaWorker/CudaWorkerThread.hpp index e912c0c6..f37ac1f3 100644 --- a/common/CudaWorker/CudaWorkerThread.hpp +++ b/common/CudaWorker/CudaWorkerThread.hpp @@ -18,6 +18,16 @@ #include + +inline int format_as(cudaError_enum type) { + return static_cast(type); +} + +inline int format_as(cublasStatus_t type) { + return static_cast(type); +} + + #include "FieldWorkers.hpp" #include diff --git a/common/tests/TaskRunnerTests.cpp b/common/tests/TaskRunnerTests.cpp index 45c6c117..7dd8ed5a 100755 --- a/common/tests/TaskRunnerTests.cpp +++ b/common/tests/TaskRunnerTests.cpp @@ -288,7 +288,7 @@ TEST_CASE("TaskRunner: Limited Queue") tr.Stop(); fmt::print("Wait iterations elapsed: {}\n", cWaitIterations - waitIterations); - fmt::print("Iterations: {}\nExecutions: {}\nFailed to add: {}\n", iterations, executed, failedToAdd); + fmt::print("Iterations: {}\nExecutions: {}\nFailed to add: {}\n", (int)iterations, (int)executed, (int)failedToAdd); REQUIRE(executed >= cTaskRunnerCapacity); REQUIRE((failedToAdd + executed) == iterations); } @@ -310,4 +310,4 @@ TEST_CASE("TaskRunner: Task with attempts") REQUIRE(fut.has_value()); REQUIRE_THROWS_AS((*fut).get(), std::future_error); tr.Stop(); -} \ No newline at end of file +} diff --git a/cublas_proxy/CMakeLists.txt b/cublas_proxy/CMakeLists.txt index 7a84ddf2..b476e105 100644 --- a/cublas_proxy/CMakeLists.txt +++ b/cublas_proxy/CMakeLists.txt @@ -28,15 +28,21 @@ macro(define_proxy_lib CUDA_VER) set(cublas_proxy_include_dir "${CMAKE_CURRENT_SOURCE_DIR}/..") get_absolute_path(${cublas_proxy_include_dir} cublas_proxy_include_dir) target_include_directories(${lib_name} PUBLIC $) + target_include_directories(${lib_name} PUBLIC $) target_link_libraries(${lib_name} PRIVATE dl rt) target_link_libraries(${lib_name} PRIVATE dcgm dcgm_common dcgm_logging dcgm_mutex) - target_link_libraries(${lib_name} PRIVATE ${CUDA${CUDA_VER}_STATIC_CUBLAS_LIBS}) - target_link_libraries(${lib_name} PRIVATE ${CUDA${CUDA_VER}_STATIC_LIBS}) - target_link_libraries(${lib_name} PRIVATE ${CUDA${CUDA_VER}_LIBS}) +# target_link_libraries(${lib_name} PRIVATE ${CUDA${CUDA_VER}_STATIC_CUBLAS_LIBS}) +# target_link_libraries(${lib_name} PRIVATE ${CUDA${CUDA_VER}_STATIC_LIBS}) + # target_link_libraries(${lib_name} PRIVATE ${CUDA_CUBLAS_LIBRARIES})) + #set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} + target_link_libraries(${lib_name} PRIVATE "/usr/local/cuda-12.3/lib64/libcublas.so" ) + target_link_libraries(${lib_name} PRIVATE "/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so" ) + set(LOCAL_CUBLAS_PROXY "${LOCAL_CUBLAS_PROXY};${lib_name}" PARENT_SCOPE) + endmacro() set(LOCAL_CUBLAS_PROXY "") diff --git a/cublas_proxy/Cuda10/CMakeLists.txt b/cublas_proxy/Cuda10/CMakeLists.txt index 01ac0041..83a59ac3 100644 --- a/cublas_proxy/Cuda10/CMakeLists.txt +++ b/cublas_proxy/Cuda10/CMakeLists.txt @@ -11,5 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -define_proxy_lib(10) -target_compile_options(${lib_name} PRIVATE -Wno-volatile) +#define_proxy_lib(10) +#target_compile_options(${lib_name} PRIVATE -Wno-volatile) diff --git a/cublas_proxy/Cuda11/CMakeLists.txt b/cublas_proxy/Cuda11/CMakeLists.txt index cdcc329f..af4d3b98 100644 --- a/cublas_proxy/Cuda11/CMakeLists.txt +++ b/cublas_proxy/Cuda11/CMakeLists.txt @@ -11,4 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -define_proxy_lib(11) +#define_proxy_lib(11) From ad576bbdd52fe9268e35fdd539be5a32529038b3 Mon Sep 17 00:00:00 2001 From: mike dupont Date: Sat, 4 Nov 2023 13:58:27 -0400 Subject: [PATCH 04/12] adding format changes --- modules/policy/DcgmPolicyManager.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/policy/DcgmPolicyManager.h b/modules/policy/DcgmPolicyManager.h index 13a45726..c877e5d1 100644 --- a/modules/policy/DcgmPolicyManager.h +++ b/modules/policy/DcgmPolicyManager.h @@ -167,4 +167,6 @@ class DcgmPolicyManager dcgmReturn_t WatchFields(dcgm_connection_id_t connectionId); }; +inline int format_as(DcgmViolationPolicyAlert_enum type) { return static_cast(type);} + #endif // DCGMPOLICYMANAGER_H From 80c034ba922f2dc4902c84109aa65cd04ad17cd2 Mon Sep 17 00:00:00 2001 From: mike dupont Date: Sat, 4 Nov 2023 16:04:38 -0400 Subject: [PATCH 05/12] work in progress --- common/CudaWorker/CudaWorkerThread.hpp | 3 --- common/CudaWorker/FieldWorkers.hpp | 7 +++++- dcgmlib/format.hpp | 2 +- dcgmproftester/CMakeLists.txt | 31 ++++++++++++++++++++------ dcgmproftester/DcgmProfTester.cpp | 2 +- dcgmproftester/PhysicalGpu.cpp | 3 ++- 6 files changed, 34 insertions(+), 14 deletions(-) diff --git a/common/CudaWorker/CudaWorkerThread.hpp b/common/CudaWorker/CudaWorkerThread.hpp index f37ac1f3..245247c2 100644 --- a/common/CudaWorker/CudaWorkerThread.hpp +++ b/common/CudaWorker/CudaWorkerThread.hpp @@ -23,9 +23,6 @@ inline int format_as(cudaError_enum type) { return static_cast(type); } -inline int format_as(cublasStatus_t type) { - return static_cast(type); -} #include "FieldWorkers.hpp" diff --git a/common/CudaWorker/FieldWorkers.hpp b/common/CudaWorker/FieldWorkers.hpp index 354b0a05..6ce8888a 100644 --- a/common/CudaWorker/FieldWorkers.hpp +++ b/common/CudaWorker/FieldWorkers.hpp @@ -15,7 +15,8 @@ */ #pragma once -#include +//#include +#include "../cublas_proxy/cublas_proxy.hpp" #include #if (CUDA_VERSION_USED >= 11) @@ -23,8 +24,12 @@ #endif #include + #include #include +inline int format_as(cublasStatus_t type) { + return static_cast(type); + } using namespace Dcgm; diff --git a/dcgmlib/format.hpp b/dcgmlib/format.hpp index 2d85a999..bd6b35e4 100644 --- a/dcgmlib/format.hpp +++ b/dcgmlib/format.hpp @@ -6,7 +6,7 @@ inline int format_as(DcgmWatcherType_t type) { return static_cast(type);} inline int format_as(dcgmChipArchitecture_enum type) { return static_cast(type);} inline int format_as(dcgmConfigType_enum type) { return static_cast(type);} inline int format_as(dcgmDiagnosticLevel_t type) { return static_cast(type);} -//inline int format_as(dcgmEntityStatusType_enum type) { return static_cast(type);} + inline int format_as(dcgmGroupType_enum type) { return static_cast(type);} inline int format_as(dcgmHealthSystems_enum type) { return static_cast(type);} inline int format_as(dcgmModuleId_t type) { return static_cast(type);} diff --git a/dcgmproftester/CMakeLists.txt b/dcgmproftester/CMakeLists.txt index 2b5d0b39..f9a3a414 100755 --- a/dcgmproftester/CMakeLists.txt +++ b/dcgmproftester/CMakeLists.txt @@ -20,6 +20,17 @@ macro(define_dcgmproftester cuda_version) add_executable(${dcgmproftester_exec}) list(APPEND DCGMPROFTESTER_TARGETS ${dcgmproftester_exec}) target_compile_definitions(${dcgmproftester_exec} PRIVATE CUDA_VERSION_USED=${cuda_version}) + + target_link_libraries(${dcgmproftester_exec} + PRIVATE + "/usr/local/cuda-12.3/lib64/libcublas.so" ) + target_link_libraries(${dcgmproftester_exec} + PRIVATE + "/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so" ) + target_link_libraries(${dcgmproftester_exec} + PRIVATE + "/usr/lib/x86_64-linux-gnu/libcuda.so" ) + target_link_libraries( ${dcgmproftester_exec} PRIVATE @@ -33,6 +44,12 @@ macro(define_dcgmproftester cuda_version) fmt::fmt ) target_include_directories(${dcgmproftester_exec} PRIVATE ${CUDA${cuda_version}_INCLUDE_DIR}) + + set(cublas_proxy_include_dir "${CMAKE_CURRENT_SOURCE_DIR}/..") + get_absolute_path(${cublas_proxy_include_dir} cublas_proxy_include_dir) + target_include_directories(${dcgmproftester_exec} PUBLIC $) + + target_link_libraries(${dcgmproftester_exec} PRIVATE dl rt) target_link_libraries(${dcgmproftester_exec} PRIVATE dcgm_common) target_link_libraries(${dcgmproftester_exec} PRIVATE dcgm_cublas_proxy${cuda_version}) @@ -57,19 +74,19 @@ macro(define_dcgmproftester cuda_version) ${COMMON_SRCS} ) - if (${cuda_version} EQUAL 10) - target_compile_options(${dcgmproftester_exec} PRIVATE -Wno-volatile) - endif() +# if (${cuda_version} EQUAL 10) +# target_compile_options(${dcgmproftester_exec} PRIVATE -Wno-volatile) +# endif() find_package(Threads REQUIRED) target_link_libraries(${dcgmproftester_exec} PRIVATE ${CMAKE_THREAD_LIBS_INIT}) target_link_options(${dcgmproftester_exec} PRIVATE -Wl,--version-script,${CMAKE_CURRENT_SOURCE_DIR}/proftester.linux_def) endmacro() -if (NOT ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64") - define_dcgmproftester(10) -endif() -define_dcgmproftester(11) +#if (NOT ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64") +# define_dcgmproftester(10) +#endif() +#define_dcgmproftester(11) define_dcgmproftester(12) set(DCGMPROFTESTER ${DCGMPROFTESTER_TARGETS} PARENT_SCOPE) diff --git a/dcgmproftester/DcgmProfTester.cpp b/dcgmproftester/DcgmProfTester.cpp index ce571056..1be7646f 100755 --- a/dcgmproftester/DcgmProfTester.cpp +++ b/dcgmproftester/DcgmProfTester.cpp @@ -26,7 +26,7 @@ #include "dcgm_structs.h" #include "timelib.h" #include "vector_types.h" -#include +#include "../cublas_proxy/cublas_proxy.hpp" #include #include diff --git a/dcgmproftester/PhysicalGpu.cpp b/dcgmproftester/PhysicalGpu.cpp index 834c0ea9..da66a35c 100644 --- a/dcgmproftester/PhysicalGpu.cpp +++ b/dcgmproftester/PhysicalGpu.cpp @@ -25,7 +25,8 @@ #include "dcgm_fields_internal.hpp" #include "timelib.h" #include "vector_types.h" -#include +//#include +#include "../cublas_proxy/cublas_proxy.hpp" #include #include From 0a7066da2e4e6b489db2adfed590e3dee6cf7f76 Mon Sep 17 00:00:00 2001 From: mike dupont Date: Sat, 4 Nov 2023 17:26:32 -0400 Subject: [PATCH 06/12] fixing more bugs --- nvvs/plugin_src/contextcreate/Cuda10/CMakeLists.txt | 2 +- nvvs/plugin_src/contextcreate/Cuda11/CMakeLists.txt | 2 +- nvvs/plugin_src/diagnostic/Cuda10/CMakeLists.txt | 2 +- nvvs/plugin_src/diagnostic/Cuda11/CMakeLists.txt | 2 +- nvvs/plugin_src/memory/Cuda10/CMakeLists.txt | 2 +- nvvs/plugin_src/memory/Cuda11/CMakeLists.txt | 2 +- nvvs/plugin_src/memtest/Cuda10/CMakeLists.txt | 2 +- nvvs/plugin_src/memtest/Cuda11/CMakeLists.txt | 2 +- nvvs/plugin_src/memtest/Memtest.cpp | 1 + nvvs/plugin_src/pcie/Cuda10/CMakeLists.txt | 2 +- nvvs/plugin_src/pcie/Cuda11/CMakeLists.txt | 2 +- nvvs/plugin_src/pcie/Pcie.h | 3 ++- nvvs/plugin_src/software/Cuda10/CMakeLists.txt | 2 +- nvvs/plugin_src/software/Cuda11/CMakeLists.txt | 2 +- nvvs/plugin_src/targetedpower/Cuda10/CMakeLists.txt | 2 +- nvvs/plugin_src/targetedpower/Cuda11/CMakeLists.txt | 2 +- nvvs/plugin_src/targetedpower/TargetedPower_wrapper.h | 3 ++- nvvs/plugin_src/targetedstress/Cuda10/CMakeLists.txt | 2 +- nvvs/plugin_src/targetedstress/Cuda11/CMakeLists.txt | 2 +- nvvs/src/NvidiaValidationSuite.cpp | 3 +++ nvvs/src/Plugin.cpp | 5 +++++ 21 files changed, 29 insertions(+), 18 deletions(-) diff --git a/nvvs/plugin_src/contextcreate/Cuda10/CMakeLists.txt b/nvvs/plugin_src/contextcreate/Cuda10/CMakeLists.txt index 20d2e646..bf2c42ad 100644 --- a/nvvs/plugin_src/contextcreate/Cuda10/CMakeLists.txt +++ b/nvvs/plugin_src/contextcreate/Cuda10/CMakeLists.txt @@ -11,4 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -define_plugin(ContextCreate 10) +#define_plugin(ContextCreate 10) diff --git a/nvvs/plugin_src/contextcreate/Cuda11/CMakeLists.txt b/nvvs/plugin_src/contextcreate/Cuda11/CMakeLists.txt index 9e901702..2187f330 100644 --- a/nvvs/plugin_src/contextcreate/Cuda11/CMakeLists.txt +++ b/nvvs/plugin_src/contextcreate/Cuda11/CMakeLists.txt @@ -11,4 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -define_plugin(ContextCreate 11) +#define_plugin(ContextCreate 11) diff --git a/nvvs/plugin_src/diagnostic/Cuda10/CMakeLists.txt b/nvvs/plugin_src/diagnostic/Cuda10/CMakeLists.txt index 1889b800..79c919b8 100644 --- a/nvvs/plugin_src/diagnostic/Cuda10/CMakeLists.txt +++ b/nvvs/plugin_src/diagnostic/Cuda10/CMakeLists.txt @@ -11,4 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -define_plugin(Diagnostic 10) +#define_plugin(Diagnostic 10) diff --git a/nvvs/plugin_src/diagnostic/Cuda11/CMakeLists.txt b/nvvs/plugin_src/diagnostic/Cuda11/CMakeLists.txt index 6bb948ef..f15ad9cb 100644 --- a/nvvs/plugin_src/diagnostic/Cuda11/CMakeLists.txt +++ b/nvvs/plugin_src/diagnostic/Cuda11/CMakeLists.txt @@ -11,4 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -define_plugin(Diagnostic 11) +#define_plugin(Diagnostic 11) diff --git a/nvvs/plugin_src/memory/Cuda10/CMakeLists.txt b/nvvs/plugin_src/memory/Cuda10/CMakeLists.txt index 3d6048d0..102b1d94 100644 --- a/nvvs/plugin_src/memory/Cuda10/CMakeLists.txt +++ b/nvvs/plugin_src/memory/Cuda10/CMakeLists.txt @@ -11,4 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -define_plugin(Memory 10) +#define_plugin(Memory 10) diff --git a/nvvs/plugin_src/memory/Cuda11/CMakeLists.txt b/nvvs/plugin_src/memory/Cuda11/CMakeLists.txt index 06b6577d..799e0f5d 100644 --- a/nvvs/plugin_src/memory/Cuda11/CMakeLists.txt +++ b/nvvs/plugin_src/memory/Cuda11/CMakeLists.txt @@ -11,4 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -define_plugin(Memory 11) +#define_plugin(Memory 11) diff --git a/nvvs/plugin_src/memtest/Cuda10/CMakeLists.txt b/nvvs/plugin_src/memtest/Cuda10/CMakeLists.txt index 06b130a6..fcbab9d8 100644 --- a/nvvs/plugin_src/memtest/Cuda10/CMakeLists.txt +++ b/nvvs/plugin_src/memtest/Cuda10/CMakeLists.txt @@ -11,4 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -define_plugin(Memtest 10) +#define_plugin(Memtest 10) diff --git a/nvvs/plugin_src/memtest/Cuda11/CMakeLists.txt b/nvvs/plugin_src/memtest/Cuda11/CMakeLists.txt index 740b1de7..2540e51c 100644 --- a/nvvs/plugin_src/memtest/Cuda11/CMakeLists.txt +++ b/nvvs/plugin_src/memtest/Cuda11/CMakeLists.txt @@ -11,4 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -define_plugin(Memtest 11) +#define_plugin(Memtest 11) diff --git a/nvvs/plugin_src/memtest/Memtest.cpp b/nvvs/plugin_src/memtest/Memtest.cpp index 90d42c59..09e392bd 100644 --- a/nvvs/plugin_src/memtest/Memtest.cpp +++ b/nvvs/plugin_src/memtest/Memtest.cpp @@ -49,6 +49,7 @@ #include #include +inline int format_as(dcgmDiagnosticLevel_t type) { return static_cast(type);} const unsigned int NUM_ITERATIONS = 1000; static __thread unsigned long *err_addr; diff --git a/nvvs/plugin_src/pcie/Cuda10/CMakeLists.txt b/nvvs/plugin_src/pcie/Cuda10/CMakeLists.txt index 5b4fc3e4..2f79ba62 100644 --- a/nvvs/plugin_src/pcie/Cuda10/CMakeLists.txt +++ b/nvvs/plugin_src/pcie/Cuda10/CMakeLists.txt @@ -11,6 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -define_plugin(Pcie 10) +#define_plugin(Pcie 10) define_bw_checker(BwChecker 10) diff --git a/nvvs/plugin_src/pcie/Cuda11/CMakeLists.txt b/nvvs/plugin_src/pcie/Cuda11/CMakeLists.txt index 5de17923..3253cae6 100644 --- a/nvvs/plugin_src/pcie/Cuda11/CMakeLists.txt +++ b/nvvs/plugin_src/pcie/Cuda11/CMakeLists.txt @@ -11,6 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -define_plugin(Pcie 11) +#define_plugin(Pcie 11) define_bw_checker(BwChecker 11) diff --git a/nvvs/plugin_src/pcie/Pcie.h b/nvvs/plugin_src/pcie/Pcie.h index 1d004a83..b62cb4aa 100644 --- a/nvvs/plugin_src/pcie/Pcie.h +++ b/nvvs/plugin_src/pcie/Pcie.h @@ -21,7 +21,8 @@ #include "PluginDevice.h" -#include +//#include +#include "../cublas_proxy/cublas_proxy.hpp" #include #include #include diff --git a/nvvs/plugin_src/software/Cuda10/CMakeLists.txt b/nvvs/plugin_src/software/Cuda10/CMakeLists.txt index 8e9aa99a..dc4374a7 100644 --- a/nvvs/plugin_src/software/Cuda10/CMakeLists.txt +++ b/nvvs/plugin_src/software/Cuda10/CMakeLists.txt @@ -11,4 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -define_plugin(Software 10) +#define_plugin(Software 10) diff --git a/nvvs/plugin_src/software/Cuda11/CMakeLists.txt b/nvvs/plugin_src/software/Cuda11/CMakeLists.txt index 58f47594..30556949 100644 --- a/nvvs/plugin_src/software/Cuda11/CMakeLists.txt +++ b/nvvs/plugin_src/software/Cuda11/CMakeLists.txt @@ -11,4 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -define_plugin(Software 11) +#define_plugin(Software 11) diff --git a/nvvs/plugin_src/targetedpower/Cuda10/CMakeLists.txt b/nvvs/plugin_src/targetedpower/Cuda10/CMakeLists.txt index a414ee99..5d04e81f 100644 --- a/nvvs/plugin_src/targetedpower/Cuda10/CMakeLists.txt +++ b/nvvs/plugin_src/targetedpower/Cuda10/CMakeLists.txt @@ -11,4 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -define_plugin(TargetedPower 10) +#define_plugin(TargetedPower 10) diff --git a/nvvs/plugin_src/targetedpower/Cuda11/CMakeLists.txt b/nvvs/plugin_src/targetedpower/Cuda11/CMakeLists.txt index 1cc90105..f1cf9cb7 100644 --- a/nvvs/plugin_src/targetedpower/Cuda11/CMakeLists.txt +++ b/nvvs/plugin_src/targetedpower/Cuda11/CMakeLists.txt @@ -11,4 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -define_plugin(TargetedPower 11) +#define_plugin(TargetedPower 11) diff --git a/nvvs/plugin_src/targetedpower/TargetedPower_wrapper.h b/nvvs/plugin_src/targetedpower/TargetedPower_wrapper.h index 29ef0e07..6749a22a 100644 --- a/nvvs/plugin_src/targetedpower/TargetedPower_wrapper.h +++ b/nvvs/plugin_src/targetedpower/TargetedPower_wrapper.h @@ -29,7 +29,8 @@ #include #include -#include +//#include +#include "../cublas_proxy/cublas_proxy.hpp" #include #define TP_MAX_DIMENSION 8192 /* Maximum single dimension */ diff --git a/nvvs/plugin_src/targetedstress/Cuda10/CMakeLists.txt b/nvvs/plugin_src/targetedstress/Cuda10/CMakeLists.txt index d871f832..d060321b 100644 --- a/nvvs/plugin_src/targetedstress/Cuda10/CMakeLists.txt +++ b/nvvs/plugin_src/targetedstress/Cuda10/CMakeLists.txt @@ -11,4 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -define_plugin(TargetedStress 10) +#define_plugin(TargetedStress 10) diff --git a/nvvs/plugin_src/targetedstress/Cuda11/CMakeLists.txt b/nvvs/plugin_src/targetedstress/Cuda11/CMakeLists.txt index c9d2290d..0d00fa7e 100644 --- a/nvvs/plugin_src/targetedstress/Cuda11/CMakeLists.txt +++ b/nvvs/plugin_src/targetedstress/Cuda11/CMakeLists.txt @@ -11,4 +11,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -define_plugin(TargetedStress 11) +#define_plugin(TargetedStress 11) diff --git a/nvvs/src/NvidiaValidationSuite.cpp b/nvvs/src/NvidiaValidationSuite.cpp index f2e086ba..f8706cc6 100644 --- a/nvvs/src/NvidiaValidationSuite.cpp +++ b/nvvs/src/NvidiaValidationSuite.cpp @@ -38,8 +38,11 @@ #include #include + using namespace DcgmNs::Nvvs; +inline int format_as(dcgmDiagnosticLevel_t type) { return static_cast(type);} +inline int format_as(cudaError_enum type) { return static_cast(type);} DcgmHandle dcgmHandle; DcgmSystem dcgmSystem; NvvsCommon nvvsCommon __attribute__((visibility("default"))); diff --git a/nvvs/src/Plugin.cpp b/nvvs/src/Plugin.cpp index 3f34725f..1cf26175 100644 --- a/nvvs/src/Plugin.cpp +++ b/nvvs/src/Plugin.cpp @@ -16,6 +16,11 @@ #include "Plugin.h" #include "PluginStrings.h" + +inline int format_as(nvvsPluginResult_enum type) { + return static_cast(type); +} + const double DUMMY_TEMPERATURE_VALUE = 30.0; /*************************************************************************/ From 40eedcdc23f65f6f5f789462906e1f6f9976730c Mon Sep 17 00:00:00 2001 From: mike dupont Date: Sat, 4 Nov 2023 18:31:51 -0400 Subject: [PATCH 07/12] working better some more linker errors and inlcude errors --- nvvs/plugin_src/memtest/Memtest.cpp | 6 +++++- nvvs/src/NvidiaValidationSuite.cpp | 7 ++++--- nvvs/src/PluginLib.cpp | 4 ++++ testing/TestDiagResponseWrapper.cpp | 5 +++++ 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/nvvs/plugin_src/memtest/Memtest.cpp b/nvvs/plugin_src/memtest/Memtest.cpp index 09e392bd..a47cf0e9 100644 --- a/nvvs/plugin_src/memtest/Memtest.cpp +++ b/nvvs/plugin_src/memtest/Memtest.cpp @@ -49,7 +49,11 @@ #include #include -inline int format_as(dcgmDiagnosticLevel_t type) { return static_cast(type);} +//inline int format_as(dcgmDiagnosticLevel_t type) { return static_cast(type);} +inline int format_as(cudaError_enum type) { return static_cast(type);} +inline int format_as(cudaError type) { return static_cast(type);} + + const unsigned int NUM_ITERATIONS = 1000; static __thread unsigned long *err_addr; diff --git a/nvvs/src/NvidiaValidationSuite.cpp b/nvvs/src/NvidiaValidationSuite.cpp index f8706cc6..54f73847 100644 --- a/nvvs/src/NvidiaValidationSuite.cpp +++ b/nvvs/src/NvidiaValidationSuite.cpp @@ -39,10 +39,11 @@ #include -using namespace DcgmNs::Nvvs; -inline int format_as(dcgmDiagnosticLevel_t type) { return static_cast(type);} -inline int format_as(cudaError_enum type) { return static_cast(type);} +using namespace DcgmNs::Nvvs; +inline int format_as(dcgmEntityStatusType_enum type) { return static_cast(type);} +//inline int format_as(dcgmDiagnosticLevel_t type) { return static_cast(type);} +//inline int format_as(cudaError_enum type) { return static_cast(type);} DcgmHandle dcgmHandle; DcgmSystem dcgmSystem; NvvsCommon nvvsCommon __attribute__((visibility("default"))); diff --git a/nvvs/src/PluginLib.cpp b/nvvs/src/PluginLib.cpp index e878c8c7..0fb4d6ec 100644 --- a/nvvs/src/PluginLib.cpp +++ b/nvvs/src/PluginLib.cpp @@ -19,6 +19,10 @@ #include +inline int format_as(dcgmDiagAuxDataType type) { + return static_cast(type); +} + /*****************************************************************************/ PluginLib::PluginLib() diff --git a/testing/TestDiagResponseWrapper.cpp b/testing/TestDiagResponseWrapper.cpp index 51f42a6d..2e4c956b 100644 --- a/testing/TestDiagResponseWrapper.cpp +++ b/testing/TestDiagResponseWrapper.cpp @@ -22,6 +22,11 @@ #include "NvvsJsonStrings.h" #include "TestDiagResponseWrapper.h" + +inline int format_as(dcgmDiagResult_enum type) { + return static_cast(type); +} + TestDiagResponseWrapper::TestDiagResponseWrapper() = default; TestDiagResponseWrapper::~TestDiagResponseWrapper() = default; From 202d2de71f84a8d1c422e556eaae9e98940353d2 Mon Sep 17 00:00:00 2001 From: mike dupont Date: Sat, 4 Nov 2023 18:46:36 -0400 Subject: [PATCH 08/12] now the random is no more, using constants raised issue https://github.com/NVIDIA/DCGM/issues/129 --- nvvs/plugin_src/memory/l1tag.cu | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/nvvs/plugin_src/memory/l1tag.cu b/nvvs/plugin_src/memory/l1tag.cu index 7a4ebf40..fe87aaf4 100644 --- a/nvvs/plugin_src/memory/l1tag.cu +++ b/nvvs/plugin_src/memory/l1tag.cu @@ -15,7 +15,7 @@ */ #include "L1TagCuda.h" -#include "newrandom.h" +//#include "newrandom.h" __device__ void ReportError ( @@ -66,11 +66,11 @@ extern "C" __global__ void InitL1Data(const L1TagParams params) // Init RNG (each SM data region will have the same data) unsigned64 s[2]; - InitRand<2>(s, params.randSeed + threadIdx.x); + //InitRand<2>(s, params.randSeed + threadIdx.x); for (uint32_t i = threadIdx.x; i < smidDataBytes / sizeof(*buf); i += blockDim.x) { - const uint16_t rnd = static_cast(FastRand(s) >> 48); + const uint16_t rnd = 2;//static_cast(FastRand(s) >> 48); buf[i] = EncodeOffset(i, rnd); } } @@ -92,8 +92,8 @@ extern "C" __global__ void L1TagTest(const L1TagParams params) // Init RNG (each SM will use the same seed, for equivalent data accesses) unsigned64 s[2]; - InitRand<2>(s, params.randSeed + hwtid); - uint32_t rnd = static_cast(FastRand(s)); + //InitRand<2>(s, params.randSeed + hwtid); + uint32_t rnd = 1;//static_cast(FastRand(s)); // Run the test for the specified iterations for (uint64_t iter = 0; iter < params.iterations; iter++) @@ -168,7 +168,7 @@ extern "C" __global__ void L1TagTest(const L1TagParams params) } // Always use a new random offset - rnd = static_cast(FastRand(s)); + rnd = 3;//static_cast(FastRand(s)); } } } From 6b14aecbe801313736fbd411d88310e0284efa5d Mon Sep 17 00:00:00 2001 From: mike dupont Date: Sat, 4 Nov 2023 23:40:30 -0400 Subject: [PATCH 09/12] linked --- nvvs/plugin_src/pcie/CMakeLists.txt | 16 +++++++++++++--- nvvs/plugin_src/pcie/Cuda10/CMakeLists.txt | 2 +- nvvs/plugin_src/pcie/Cuda11/CMakeLists.txt | 2 +- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/nvvs/plugin_src/pcie/CMakeLists.txt b/nvvs/plugin_src/pcie/CMakeLists.txt index 8d363cf0..cd55e57e 100644 --- a/nvvs/plugin_src/pcie/CMakeLists.txt +++ b/nvvs/plugin_src/pcie/CMakeLists.txt @@ -32,6 +32,19 @@ macro(define_bw_checker CHECKER_NAME CUDA_VER) ) target_link_libraries(${CHECKER_NAME}_${CUDA_VER} PRIVATE dcgm_cuda_worker12) + target_link_libraries(${CHECKER_NAME}_${CUDA_VER} PRIVATE "/usr/local/cuda-12.3/lib64/libcublas.so" ) + target_link_libraries(${CHECKER_NAME}_${CUDA_VER} PRIVATE "/usr/local/cuda-12.3/targets/x86_64-linux/lib/libcublasLt.so" ) + + target_link_libraries(${CHECKER_NAME}_${CUDA_VER} + PRIVATE + "/usr/lib/x86_64-linux-gnu/libcuda.so" ) + target_link_libraries(${CHECKER_NAME}_${CUDA_VER} + PRIVATE + "/usr/local/cuda-12.3/lib64/libcudart.so.12" ) + + + + target_include_directories(${CHECKER_NAME}_${CUDA_VER} PRIVATE ${PCIE_PLUGIN_BASE_DIR}/bw_checker) target_include_directories(${CHECKER_NAME}_${CUDA_VER} PRIVATE ${JSONCPP_INCLUDE_DIR}) target_include_directories(${CHECKER_NAME}_${CUDA_VER} PRIVATE ${CUDA${CUDA_VER}_INCLUDE_DIR}) @@ -46,9 +59,6 @@ macro(define_bw_checker CHECKER_NAME CUDA_VER) set_target_properties(${CHECKER_NAME}_${CUDA_VER} PROPERTIES INSTALL_TO_DIR ${DCGM_NVVS_PLUGINS_INSTALL_DIR}/cuda${CUDA_VER}) - if ((${CUDA_VER} EQUAL 10)) - target_compile_options(${CHECKER_NAME}_${CUDA_VER} PRIVATE -Wno-volatile) - endif() set(LOCAL_BW_CHECKERS "${LOCAL_BW_CHECKERS};${CHECKER_NAME}_${CUDA_VER}" PARENT_SCOPE) endmacro() diff --git a/nvvs/plugin_src/pcie/Cuda10/CMakeLists.txt b/nvvs/plugin_src/pcie/Cuda10/CMakeLists.txt index 2f79ba62..1b56d6e4 100644 --- a/nvvs/plugin_src/pcie/Cuda10/CMakeLists.txt +++ b/nvvs/plugin_src/pcie/Cuda10/CMakeLists.txt @@ -13,4 +13,4 @@ # limitations under the License. #define_plugin(Pcie 10) -define_bw_checker(BwChecker 10) +#define_bw_checker(BwChecker 10) diff --git a/nvvs/plugin_src/pcie/Cuda11/CMakeLists.txt b/nvvs/plugin_src/pcie/Cuda11/CMakeLists.txt index 3253cae6..6b5f17b5 100644 --- a/nvvs/plugin_src/pcie/Cuda11/CMakeLists.txt +++ b/nvvs/plugin_src/pcie/Cuda11/CMakeLists.txt @@ -13,4 +13,4 @@ # limitations under the License. #define_plugin(Pcie 11) -define_bw_checker(BwChecker 11) +#define_bw_checker(BwChecker 11) From f7360046d262d2fe7329176126b29d44b38fbcb4 Mon Sep 17 00:00:00 2001 From: mike dupont Date: Sat, 4 Nov 2023 23:48:55 -0400 Subject: [PATCH 10/12] adding submodules --- .gitmodules | 18 ++++++++++++++++++ vendor/Catch2 | 1 + vendor/fmt | 1 + vendor/jsoncpp | 1 + vendor/libevent | 1 + vendor/plog | 1 + vendor/yaml-cpp | 1 + 7 files changed, 24 insertions(+) create mode 100644 .gitmodules create mode 160000 vendor/Catch2 create mode 160000 vendor/fmt create mode 160000 vendor/jsoncpp create mode 160000 vendor/libevent create mode 160000 vendor/plog create mode 160000 vendor/yaml-cpp diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 00000000..064ca2dc --- /dev/null +++ b/.gitmodules @@ -0,0 +1,18 @@ +[submodule "vendor/Catch2"] + path = vendor/Catch2 + url = https://github.com/catchorg/Catch2 +[submodule "vendor/fmt"] + path = vendor/fmt + url = https://github.com/fmtlib/fmt +[submodule "vendor/jsoncpp"] + path = vendor/jsoncpp + url = https://github.com/open-source-parsers/jsoncpp +[submodule "vendor/libevent"] + path = vendor/libevent + url = https://github.com/libevent/libevent +[submodule "vendor/plog"] + path = vendor/plog + url = https://github.com/SergiusTheBest/plog +[submodule "vendor/yaml-cpp"] + path = vendor/yaml-cpp + url = https://github.com/jbeder/yaml-cpp diff --git a/vendor/Catch2 b/vendor/Catch2 new file mode 160000 index 00000000..d4b0b345 --- /dev/null +++ b/vendor/Catch2 @@ -0,0 +1 @@ +Subproject commit d4b0b34561a0ffe71216894caa35694e1aabdd5b diff --git a/vendor/fmt b/vendor/fmt new file mode 160000 index 00000000..7f716e41 --- /dev/null +++ b/vendor/fmt @@ -0,0 +1 @@ +Subproject commit 7f716e41707c5bf23f9d5d9dbe49b4be70543016 diff --git a/vendor/jsoncpp b/vendor/jsoncpp new file mode 160000 index 00000000..69098a18 --- /dev/null +++ b/vendor/jsoncpp @@ -0,0 +1 @@ +Subproject commit 69098a18b9af0c47549d9a271c054d13ca92b006 diff --git a/vendor/libevent b/vendor/libevent new file mode 160000 index 00000000..cfb2b89a --- /dev/null +++ b/vendor/libevent @@ -0,0 +1 @@ +Subproject commit cfb2b89a1d0642abd6389913e237f49c662502e4 diff --git a/vendor/plog b/vendor/plog new file mode 160000 index 00000000..a2b61131 --- /dev/null +++ b/vendor/plog @@ -0,0 +1 @@ +Subproject commit a2b61131975ebc69f2f0ecbe6e69e1bf82133f91 diff --git a/vendor/yaml-cpp b/vendor/yaml-cpp new file mode 160000 index 00000000..db03655d --- /dev/null +++ b/vendor/yaml-cpp @@ -0,0 +1 @@ +Subproject commit db03655d58c66f31952c772718d0394eac2e5481 From 3b2199c73a8a753d44cd3db067b4eafe6f5d2618 Mon Sep 17 00:00:00 2001 From: mike dupont Date: Sat, 4 Nov 2023 23:50:09 -0400 Subject: [PATCH 11/12] removing 10 and 11 --- cublas_proxy/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cublas_proxy/CMakeLists.txt b/cublas_proxy/CMakeLists.txt index b476e105..501545f5 100644 --- a/cublas_proxy/CMakeLists.txt +++ b/cublas_proxy/CMakeLists.txt @@ -48,10 +48,10 @@ endmacro() set(LOCAL_CUBLAS_PROXY "") add_subdirectory(Cuda12) -add_subdirectory(Cuda11) -if (NOT ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64") - add_subdirectory(Cuda10) -endif() +#add_subdirectory(Cuda11) +#if (NOT ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64") +# add_subdirectory(Cuda10) +#endif() set(DCGM_CUBLAS_PROXY ${LOCAL_CUBLAS_PROXY} PARENT_SCOPE) From 282bb8f82139727248aabaf4cd069678d82eb0fa Mon Sep 17 00:00:00 2001 From: mike dupont Date: Sat, 4 Nov 2023 23:53:45 -0400 Subject: [PATCH 12/12] adding change for fmt --- vendor/fmt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/fmt b/vendor/fmt index 7f716e41..270fcf0a 160000 --- a/vendor/fmt +++ b/vendor/fmt @@ -1 +1 @@ -Subproject commit 7f716e41707c5bf23f9d5d9dbe49b4be70543016 +Subproject commit 270fcf0a525b83f0b33f414ec99625283714ae0e