@@ -271,7 +271,9 @@ void analyzeDeviceProperties() {
271271 printf (" Multiprocessors: %d\n " , prop.multiProcessorCount );
272272 printf (" Cores per MP: %d (estimated)\n " , _ConvertSMVer2Cores (prop.major , prop.minor ));
273273 printf (" Total Cores: %d (estimated)\n " , prop.multiProcessorCount * _ConvertSMVer2Cores (prop.major , prop.minor ));
274- printf (" GPU Clock Rate: %.2f GHz\n " , prop.clockRate / 1e6 );
274+ int gpuClockKHz = 0 ;
275+ cudaDeviceGetAttribute (&gpuClockKHz, cudaDevAttrClockRate, device);
276+ printf (" GPU Clock Rate: %.2f GHz\n " , gpuClockKHz / 1e6 );
275277 int memClockKHz = 0 , busWidthBits = 0 ;
276278 cudaDeviceGetAttribute (&memClockKHz, cudaDevAttrMemoryClockRate, device);
277279 cudaDeviceGetAttribute (&busWidthBits, cudaDevAttrGlobalMemoryBusWidth, device);
@@ -421,7 +423,9 @@ void calculateTheoreticalLimits() {
421423 // Compute throughput estimation
422424 int coresPerSM = _ConvertSMVer2Cores (prop.major , prop.minor );
423425 int totalCores = prop.multiProcessorCount * coresPerSM;
424- double computeThroughput = totalCores * prop.clockRate / 1e6 ; // GFLOPS (single precision)
426+ int gpuClockKHz = 0 ;
427+ cudaDeviceGetAttribute (&gpuClockKHz, cudaDevAttrClockRate, device);
428+ double computeThroughput = totalCores * gpuClockKHz / 1e6 ; // GFLOPS (single precision)
425429 printf (" Estimated Peak Compute (SP): %.1f GFLOPS\n " , computeThroughput);
426430
427431 // Roofline model breakpoint
0 commit comments