Skip to content

Commit 5ba9023

Browse files
committed
Fix CUDA build error (arch flags): completed
Remove Python pip from Dockerfile: completed (your build log confirms success)
Repo-wide sweep for hardcoded sm_XX: completed
1 parent d0860cb commit 5ba9023

File tree

3 files changed

+10
-6
lines changed

3 files changed

+10
-6
lines changed

modules/module2/content.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -302,7 +302,6 @@ Texture memory provides:
302302

303303
```cuda
304304
#include <cuda_runtime.h>
305-
#include <texture_fetch_functions.h>
306305
307306
__global__ void textureKernel(cudaTextureObject_t texObj, float *output,
308307
int width, int height) {

modules/module2/examples/03_texture_memory_cuda.cu

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
#include <cuda_runtime.h>
2-
#include <texture_fetch_functions.h>
32
#include <stdio.h>
43
#include <stdlib.h>
54
#include <math.h>

modules/module5/examples/01_gpu_profiling_cuda.cu

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -272,10 +272,13 @@ void analyzeDeviceProperties() {
272272
printf("Cores per MP: %d (estimated)\n", _ConvertSMVer2Cores(prop.major, prop.minor));
273273
printf("Total Cores: %d (estimated)\n", prop.multiProcessorCount * _ConvertSMVer2Cores(prop.major, prop.minor));
274274
printf("GPU Clock Rate: %.2f GHz\n", prop.clockRate / 1e6);
275-
printf("Memory Clock Rate: %.2f GHz\n", prop.memoryClockRate / 1e6);
276-
printf("Memory Bus Width: %d bits\n", prop.memoryBusWidth);
275+
int memClockKHz = 0, busWidthBits = 0;
276+
cudaDeviceGetAttribute(&memClockKHz, cudaDevAttrMemoryClockRate, device);
277+
cudaDeviceGetAttribute(&busWidthBits, cudaDevAttrGlobalMemoryBusWidth, device);
278+
printf("Memory Clock Rate: %.2f GHz\n", memClockKHz / 1e6);
279+
printf("Memory Bus Width: %d bits\n", busWidthBits);
277280
printf("Peak Memory Bandwidth: %.1f GB/s\n",
278-
2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8) / 1.0e6);
281+
2.0 * (memClockKHz / 1e6) * (busWidthBits / 8.0));
279282
printf("Global Memory: %.1f GB\n", prop.totalGlobalMem / (1024.0 * 1024.0 * 1024.0));
280283
printf("Shared Memory per Block: %zu KB\n", prop.sharedMemPerBlock / 1024);
281284
printf("Max Threads per Block: %d\n", prop.maxThreadsPerBlock);
@@ -409,7 +412,10 @@ void calculateTheoreticalLimits() {
409412
printf("=== Theoretical Performance Limits ===\n");
410413

411414
// Memory bandwidth calculation
412-
double memoryBandwidth = 2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8) / 1.0e6; // GB/s
415+
int memClockKHz = 0, busWidthBits = 0;
416+
cudaDeviceGetAttribute(&memClockKHz, cudaDevAttrMemoryClockRate, device);
417+
cudaDeviceGetAttribute(&busWidthBits, cudaDevAttrGlobalMemoryBusWidth, device);
418+
double memoryBandwidth = 2.0 * (memClockKHz / 1e6) * (busWidthBits / 8.0); // GB/s
413419
printf("Peak Memory Bandwidth: %.1f GB/s\n", memoryBandwidth);
414420

415421
// Compute throughput estimation

0 commit comments

Comments (0)