Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 72 additions & 0 deletions lib/src/Utils/Device.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#include <CL/Utils/Device.hpp>

#include <algorithm>
#include <vector>
#include <string>

bool cl::util::opencl_c_version_contains(const cl::Device& device,
const cl::string& version_fragment)
Expand Down Expand Up @@ -29,3 +31,73 @@ bool cl::util::supports_feature(const cl::Device& device,
!= c_features.cend();
}
#endif

std::string to_lower(const std::string& s) {
std::string r = "";
for(int i=0; i<(int)s.length(); i++) {
const char c = s.at(i);
r += c>64&&c<91 ? c+32 : c;
}
return r;
}
bool contains(const std::string& s, const std::string& match) {
return s.find(match)!=std::string::npos;
}
bool contains_any(const std::string& s, const std::vector<std::string>& matches) {
for(int i=0; i<(int)matches.size(); i++) if(contains(s, matches[i])) return true;
return false;
}

std::vector<cl::Device> cl::util::get_devices() { // returns all available devices
std::vector<cl::Device> cl_devices;
std::vector<cl::Platform> cl_platforms; // get all platforms (drivers)
cl::Platform::get(&cl_platforms);
for(int i=0; i<(int)cl_platforms.size(); i++) {
std::vector<cl::Device> cl_devices_available;
cl_platforms[i].getDevices(CL_DEVICE_TYPE_ALL, &cl_devices_available);
for(int j=0; j<(int)cl_devices_available.size(); j++) {
cl_devices.push_back(cl_devices_available[j]);
}
}
return cl_devices;
}

cl::Device cl::util::select_device_with_most_flops(const std::vector<cl::Device>& cl_devices=cl::util::get_devices()) { // returns device with best floating-point performance
float best_value = 0.0f;
int best_i = 0;
for(int i=0; i<(int)cl_devices.size(); i++) { // find device with highest (estimated) floating point performance
const std::string name = cl_devices[i].getInfo<CL_DEVICE_NAME>(); // device name
const std::string vendor = cl_devices[i].getInfo<CL_DEVICE_VENDOR>(); // device vendor
const int compute_units = (int)cl_devices[i].getInfo<CL_DEVICE_MAX_COMPUTE_UNITS>(); // compute units (CUs) can contain multiple cores depending on the microarchitecture
const int clock_frequency = (int)cl_devices[i].getInfo<CL_DEVICE_MAX_CLOCK_FREQUENCY>(); // in MHz
const bool is_gpu = cl_devices[i].getInfo<CL_DEVICE_TYPE>()==CL_DEVICE_TYPE_GPU;
const int ipc = is_gpu?2:32; // IPC (instructions per cycle) is 2 for GPUs and 32 for most modern CPUs
const bool nvidia_192_cores_per_cu = contains_any(to_lower(name), {" 6", " 7", "ro k", "la k"}) || (clock_frequency<1000&&contains(to_lower(name), "titan")); // identify Kepler GPUs
const bool nvidia_64_cores_per_cu = contains_any(to_lower(name), {"p100", "v100", "a100", "a30", " 16", " 20", "titan v", "titan rtx", "ro t", "la t", "ro rtx"}) && !contains(to_lower(name), "rtx a"); // identify P100, Volta, Turing, A100
const bool amd_128_cores_per_dualcu = contains(to_lower(name), "gfx10"); // identify RDNA/RDNA2 GPUs where dual CUs are reported
const float nvidia = (float)(contains(to_lower(vendor), "nvidia"))*(nvidia_192_cores_per_cu?192.0f:(nvidia_64_cores_per_cu?64.0f:128.0f)); // Nvidia GPUs have 192 cores/CU (Kepler), 128 cores/CU (Maxwell, Pascal, Ampere) or 64 cores/CU (P100, Volta, Turing, A100)
const float amd = (float)(contains_any(to_lower(vendor), {"amd", "advanced"}))*(is_gpu?(amd_128_cores_per_dualcu?128.0f:64.0f):0.5f); // AMD GPUs have 64 cores/CU (GCN, CDNA) or 128 cores/dualCU (RDNA, RDNA2), AMD CPUs (with SMT) have 1/2 core/CU
const float intel = (float)(contains(to_lower(vendor), "intel"))*(is_gpu?8.0f:0.5f); // Intel integrated GPUs usually have 8 cores/CU, Intel CPUs (with HT) have 1/2 core/CU
const float arm = (float)(contains(to_lower(vendor), "arm"))*(is_gpu?8.0f:1.0f); // ARM GPUs usually have 8 cores/CU, ARM CPUs have 1 core/CU
const int cores = (int)((float)compute_units*(nvidia+amd+intel+arm)+0.5f); // for CPUs, compute_units is the number of threads (twice the number of cores with hyperthreading)
const float tflops = 1E-6f*(float)cores*(float)ipc*(float)clock_frequency; // estimated device floating point performance in TeraFLOPs/s
if(tflops>best_value) {
best_value = tflops;
best_i = i;
}
}
return cl_devices[best_i];
}

cl::Device cl::util::select_device_with_most_memory(const std::vector<cl::Device>& cl_devices=cl::util::get_devices()) { // returns device with largest memory capacity
int best_value = 0;
int best_i = 0;
for(int i=0; i<(int)cl_devices.size(); i++) { // find device with most memory
const int memory = (int)(cl_devices[i].getInfo<CL_DEVICE_GLOBAL_MEM_SIZE>()/1048576ull); // global memory in MB
if(memory>best_value) {
best_value = memory;
best_i = i;
}
}
return cl_devices[best_i];
}