From f3e8600cfb6c0266c121d7e1f8b2e5dba9c5fab1 Mon Sep 17 00:00:00 2001
From: Moritz Lehmann <90851087+ProjectPhysX@users.noreply.github.com>
Date: Thu, 10 Feb 2022 17:10:02 +0100
Subject: [PATCH] Functions to automatically select Device with most
 flops/memory

Added utility functions to automatically select the fastest Device or the Device with largest memory capacity from all available Devices.

For selecting the fastest Device, the TFLOPs/s performance of the Device is estimated. For Nvidia and AMD GPUs, the estimate is challenging due to the different number of cores per CU depending on the microarchitecture and even GPU model:
- AMD GCN, CDNA: 64 cores/CU
- AMD RDNA, RDNA2: 128 cores/CU (dual CUs are reported as CUs in OpenCL)
- Nvidia Kepler: 192 cores/CU
- Nvidia Maxwell, Pascal, Ampere: 128 cores/CU
- Nvidia P100, Volta, Turing, A100, A30: 64 cores/CU

The vast majority of GPUs are captured with the correct estimate, but for some rare/old GPUs, the estimate could be wrong by a factor of 2.
For CPUs without SMT/HT as well as for very old CPUs with IPC<32 or very new CPUs with IPC=64 (AVX-512), the estimate is wrong.

Overall, however, the estimated values are good enough to identify the fastest device in systems with one CPU and one or multiple GPUs.
---
 lib/src/Utils/Device.cpp | 72 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)
diff --git a/lib/src/Utils/Device.cpp b/lib/src/Utils/Device.cpp
index fab39c99..daf79304 100644
--- a/lib/src/Utils/Device.cpp
+++ b/lib/src/Utils/Device.cpp
@@ -1,6 +1,8 @@
 #include <CL/Utils/Device.hpp>
 
 #include <algorithm>
+#include <vector>
+#include <string>
 
 bool cl::util::opencl_c_version_contains(const cl::Device& device,
                                          const cl::string& version_fragment)
@@ -29,3 +31,73 @@ bool cl::util::supports_feature(const cl::Device& device,
         != c_features.cend();
 }
 #endif
+
+// Returns a copy of s in which the ASCII letters 'A'..'Z' are replaced by 'a'..'z'; every other byte is kept as it is.
+std::string to_lower(const std::string& s) {
+    std::string lowered(s); // work on a copy and rewrite it in place
+    for(char& ch : lowered) {
+        if(ch>='A' && ch<='Z') ch = (char)(ch-'A'+'a'); // fixed ASCII offset, no locale involved
+    }
+    return lowered;
+}
+bool contains(const std::string& s, const std::string& match) { // true if match occurs somewhere in s (case-sensitive); an empty match is found in every string
+    return !(s.find(match)==std::string::npos);
+}
+bool contains_any(const std::string& s, const std::vector<std::string>& matches) { // true if s contains at least one entry of matches; false for an empty list
+    return std::any_of(matches.begin(), matches.end(),
+                       [&s](const std::string& candidate) { return contains(s, candidate); });
+}
+
+std::vector<cl::Device> cl::util::get_devices() { // returns the devices of all available platforms in one list
+    std::vector<cl::Platform> platforms; // all platforms (drivers) reported by the OpenCL runtime
+    cl::Platform::get(&platforms);
+    std::vector<cl::Device> all_devices; // result, filled in platform enumeration order
+    for(auto& platform : platforms) {
+        std::vector<cl::Device> platform_devices;
+        // ask this platform for its devices of every type
+        platform.getDevices(CL_DEVICE_TYPE_ALL, &platform_devices);
+        // append them behind the devices of the platforms visited before
+        all_devices.insert(all_devices.end(), platform_devices.begin(), platform_devices.end());
+    }
+    return all_devices;
+}
+
+cl::Device cl::util::select_device_with_most_flops(const std::vector<cl::Device>& cl_devices=cl::util::get_devices()) { // returns the device with the highest estimated floating-point performance (the first one on ties), or a default-constructed cl::Device if cl_devices is empty
+    float best_value = 0.0f; // highest estimate found so far, in TeraFLOPs/s
+    int best_i = 0; // index of the device that produced best_value; stays 0 if every estimate is 0
+    for(int i=0; i<(int)cl_devices.size(); i++) { // estimate every device as cores * IPC * clock frequency
+        const std::string name = to_lower(cl_devices[i].getInfo<CL_DEVICE_NAME>()); // device name, lower-cased once so that all matches below are case-insensitive
+        const std::string vendor = to_lower(cl_devices[i].getInfo<CL_DEVICE_VENDOR>()); // device vendor, lower-cased once
+        const int compute_units = (int)cl_devices[i].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(); // compute units (CUs) can contain multiple cores depending on the microarchitecture
+        const int clock_frequency = (int)cl_devices[i].getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>(); // in MHz
+        const bool is_gpu = (cl_devices[i].getInfo<CL_DEVICE_TYPE>()&CL_DEVICE_TYPE_GPU)!=0; // CL_DEVICE_TYPE is a bitfield that may carry several type bits, so test the GPU bit instead of comparing for equality
+        const int ipc = is_gpu?2:32; // IPC (instructions per cycle) is 2 for GPUs and 32 for most modern CPUs
+        const bool nvidia_192_cores_per_cu = contains_any(name, {" 6", " 7", "ro k", "la k"}) || (clock_frequency<1000&&contains(name, "titan")); // identify Kepler GPUs
+        const bool nvidia_64_cores_per_cu = contains_any(name, {"p100", "v100", "a100", "a30", " 16", " 20", "titan v", "titan rtx", "ro t", "la t", "ro rtx"}) && !contains(name, "rtx a"); // identify P100, Volta, Turing, A100, A30; names with "rtx a" are excluded (presumably cards like "RTX A3000" that would otherwise match "a30" - confirm)
+        const bool amd_128_cores_per_dualcu = contains(name, "gfx10"); // identify RDNA/RDNA2 GPUs where dual CUs are reported
+        const float nvidia = (float)(contains(vendor, "nvidia"))*(nvidia_192_cores_per_cu?192.0f:(nvidia_64_cores_per_cu?64.0f:128.0f)); // Nvidia GPUs have 192 cores/CU (Kepler), 128 cores/CU (Maxwell, Pascal, Ampere) or 64 cores/CU (P100, Volta, Turing, A100)
+        const float amd = (float)(contains_any(vendor, {"amd", "advanced"}))*(is_gpu?(amd_128_cores_per_dualcu?128.0f:64.0f):0.5f); // AMD GPUs have 64 cores/CU (GCN, CDNA) or 128 cores/dualCU (RDNA, RDNA2), AMD CPUs (with SMT) have 1/2 core/CU
+        const float intel = (float)(contains(vendor, "intel"))*(is_gpu?8.0f:0.5f); // Intel integrated GPUs usually have 8 cores/CU, Intel CPUs (with HT) have 1/2 core/CU
+        const float arm = (float)(contains(vendor, "arm"))*(is_gpu?8.0f:1.0f); // ARM GPUs usually have 8 cores/CU, ARM CPUs have 1 core/CU
+        const int cores = (int)((float)compute_units*(nvidia+amd+intel+arm)+0.5f); // rounded to the nearest integer; for CPUs, compute_units is the number of threads (twice the number of cores with hyperthreading); 0 for vendors not matched above
+        const float tflops = 1E-6f*(float)cores*(float)ipc*(float)clock_frequency; // estimated device floating point performance in TeraFLOPs/s
+        if(tflops>best_value) { // strictly greater, so the earlier device wins a tie
+            best_value = tflops;
+            best_i = i;
+        }
+    }
+    return cl_devices.empty() ? cl::Device() : cl_devices[best_i]; // never index into an empty list (that would be an out-of-bounds read)
+}
+
+cl::Device cl::util::select_device_with_most_memory(const std::vector<cl::Device>& cl_devices=cl::util::get_devices()) { // returns the device with the largest global memory capacity (the first one on ties), or a default-constructed cl::Device if cl_devices is empty
+    unsigned long long best_value = 0ull; // largest capacity found so far, in MB
+    int best_i = 0; // index of the device that produced best_value; stays 0 if no device reports at least 1 MB
+    for(int i=0; i<(int)cl_devices.size(); i++) { // find device with most memory
+        const unsigned long long memory = cl_devices[i].getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>()/1048576ull; // global memory in whole MB, kept in 64 bits so the value is never narrowed
+        if(memory>best_value) { // strictly greater, so the earlier device wins a tie
+            best_value = memory;
+            best_i = i;
+        }
+    }
+    return cl_devices.empty() ? cl::Device() : cl_devices[best_i]; // never index into an empty list (that would be an out-of-bounds read)
+}