diff --git a/lib/src/Utils/Device.cpp b/lib/src/Utils/Device.cpp index fab39c99..daf79304 100644 --- a/lib/src/Utils/Device.cpp +++ b/lib/src/Utils/Device.cpp @@ -1,6 +1,8 @@ #include #include +#include +#include bool cl::util::opencl_c_version_contains(const cl::Device& device, const cl::string& version_fragment) @@ -29,3 +31,73 @@ bool cl::util::supports_feature(const cl::Device& device, != c_features.cend(); } #endif + +std::string to_lower(const std::string& s) { + std::string r = ""; + for(int i=0; i<(int)s.length(); i++) { + const char c = s.at(i); + r += c>64&&c<91 ? c+32 : c; + } + return r; +} +bool contains(const std::string& s, const std::string& match) { + return s.find(match)!=std::string::npos; +} +bool contains_any(const std::string& s, const std::vector& matches) { + for(int i=0; i<(int)matches.size(); i++) if(contains(s, matches[i])) return true; + return false; +} + +std::vector cl::util::get_devices() { // returns all available devices + std::vector cl_devices; + std::vector cl_platforms; // get all platforms (drivers) + cl::Platform::get(&cl_platforms); + for(int i=0; i<(int)cl_platforms.size(); i++) { + std::vector cl_devices_available; + cl_platforms[i].getDevices(CL_DEVICE_TYPE_ALL, &cl_devices_available); + for(int j=0; j<(int)cl_devices_available.size(); j++) { + cl_devices.push_back(cl_devices_available[j]); + } + } + return cl_devices; +} + +cl::Device cl::util::select_device_with_most_flops(const std::vector& cl_devices=cl::util::get_devices()) { // returns device with best floating-point performance + float best_value = 0.0f; + int best_i = 0; + for(int i=0; i<(int)cl_devices.size(); i++) { // find device with highest (estimated) floating point performance + const std::string name = cl_devices[i].getInfo(); // device name + const std::string vendor = cl_devices[i].getInfo(); // device vendor + const int compute_units = (int)cl_devices[i].getInfo(); // compute units (CUs) can contain multiple cores depending on the microarchitecture + const int clock_frequency = (int)cl_devices[i].getInfo(); // in MHz + const bool is_gpu = cl_devices[i].getInfo()==CL_DEVICE_TYPE_GPU; + const int ipc = is_gpu?2:32; // IPC (instructions per cycle) is 2 for GPUs and 32 for most modern CPUs + const bool nvidia_192_cores_per_cu = contains_any(to_lower(name), {" 6", " 7", "ro k", "la k"}) || (clock_frequency<1000&&contains(to_lower(name), "titan")); // identify Kepler GPUs + const bool nvidia_64_cores_per_cu = contains_any(to_lower(name), {"p100", "v100", "a100", "a30", " 16", " 20", "titan v", "titan rtx", "ro t", "la t", "ro rtx"}) && !contains(to_lower(name), "rtx a"); // identify P100, Volta, Turing, A100 + const bool amd_128_cores_per_dualcu = contains(to_lower(name), "gfx10"); // identify RDNA/RDNA2 GPUs where dual CUs are reported + const float nvidia = (float)(contains(to_lower(vendor), "nvidia"))*(nvidia_192_cores_per_cu?192.0f:(nvidia_64_cores_per_cu?64.0f:128.0f)); // Nvidia GPUs have 192 cores/CU (Kepler), 128 cores/CU (Maxwell, Pascal, Ampere) or 64 cores/CU (P100, Volta, Turing, A100) + const float amd = (float)(contains_any(to_lower(vendor), {"amd", "advanced"}))*(is_gpu?(amd_128_cores_per_dualcu?128.0f:64.0f):0.5f); // AMD GPUs have 64 cores/CU (GCN, CDNA) or 128 cores/dualCU (RDNA, RDNA2), AMD CPUs (with SMT) have 1/2 core/CU + const float intel = (float)(contains(to_lower(vendor), "intel"))*(is_gpu?8.0f:0.5f); // Intel integrated GPUs usually have 8 cores/CU, Intel CPUs (with HT) have 1/2 core/CU + const float arm = (float)(contains(to_lower(vendor), "arm"))*(is_gpu?8.0f:1.0f); // ARM GPUs usually have 8 cores/CU, ARM CPUs have 1 core/CU + const int cores = (int)((float)compute_units*(nvidia+amd+intel+arm)+0.5f); // for CPUs, compute_units is the number of threads (twice the number of cores with hyperthreading) + const float tflops = 1E-6f*(float)cores*(float)ipc*(float)clock_frequency; // estimated device floating point performance in TeraFLOPs/s + if(tflops>best_value) { + best_value = tflops; + best_i = i; + } + } + return cl_devices[best_i]; +} + +cl::Device cl::util::select_device_with_most_memory(const std::vector& cl_devices=cl::util::get_devices()) { // returns device with largest memory capacity + int best_value = 0; + int best_i = 0; + for(int i=0; i<(int)cl_devices.size(); i++) { // find device with most memory + const int memory = (int)(cl_devices[i].getInfo()/1048576ull); // global memory in MB + if(memory>best_value) { + best_value = memory; + best_i = i; + } + } + return cl_devices[best_i]; +}