@@ -272,10 +272,13 @@ void analyzeDeviceProperties() {
272272 printf (" Cores per MP: %d (estimated)\n " , _ConvertSMVer2Cores (prop.major , prop.minor ));
273273 printf (" Total Cores: %d (estimated)\n " , prop.multiProcessorCount * _ConvertSMVer2Cores (prop.major , prop.minor ));
274274 printf (" GPU Clock Rate: %.2f GHz\n " , prop.clockRate / 1e6 );
275- printf (" Memory Clock Rate: %.2f GHz\n " , prop.memoryClockRate / 1e6 );
276- printf (" Memory Bus Width: %d bits\n " , prop.memoryBusWidth );
275+ int memClockKHz = 0 , busWidthBits = 0 ;
276+ cudaDeviceGetAttribute (&memClockKHz, cudaDevAttrMemoryClockRate, device);
277+ cudaDeviceGetAttribute (&busWidthBits, cudaDevAttrGlobalMemoryBusWidth, device);
278+ printf (" Memory Clock Rate: %.2f GHz\n " , memClockKHz / 1e6 );
279+ printf (" Memory Bus Width: %d bits\n " , busWidthBits);
277280 printf (" Peak Memory Bandwidth: %.1f GB/s\n " ,
278- 2.0 * prop. memoryClockRate * (prop. memoryBusWidth / 8 ) / 1.0e6 );
281+ 2.0 * (memClockKHz / 1e6 ) * (busWidthBits / 8.0 ) );
279282 printf (" Global Memory: %.1f GB\n " , prop.totalGlobalMem / (1024.0 * 1024.0 * 1024.0 ));
280283 printf (" Shared Memory per Block: %zu KB\n " , prop.sharedMemPerBlock / 1024 );
281284 printf (" Max Threads per Block: %d\n " , prop.maxThreadsPerBlock );
@@ -409,7 +412,10 @@ void calculateTheoreticalLimits() {
409412 printf (" === Theoretical Performance Limits ===\n " );
410413
411414 // Memory bandwidth calculation
412- double memoryBandwidth = 2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8 ) / 1.0e6 ; // GB/s
415+ int memClockKHz = 0 , busWidthBits = 0 ;
416+ cudaDeviceGetAttribute (&memClockKHz, cudaDevAttrMemoryClockRate, device);
417+ cudaDeviceGetAttribute (&busWidthBits, cudaDevAttrGlobalMemoryBusWidth, device);
418+ double memoryBandwidth = 2.0 * (memClockKHz / 1e6 ) * (busWidthBits / 8.0 ); // GB/s
413419 printf (" Peak Memory Bandwidth: %.1f GB/s\n " , memoryBandwidth);
414420
415421 // Compute throughput estimation
0 commit comments