Skip to content

Commit d0860cb

Browse files
committed
Fix the container build error: remove pip installs from the CUDA Dockerfile to avoid the PEP 668 "externally-managed-environment" error.
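Fix the nvcc compile error: stop forcing hardcoded -arch flags (sm_70, sm_75, etc.) and detect the actual GPU architecture at build time via nvidia-smi, falling back to sm_90 when detection fails. Clean up source code broken by CUDA 13 deprecations: replace uses of the memoryClockRate and memoryBusWidth fields of cudaDeviceProp with cudaDeviceGetAttribute queries (cudaDevAttrMemoryClockRate and cudaDevAttrGlobalMemoryBusWidth).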
1 parent 5b3e406 commit d0860cb

File tree

16 files changed

+51
-27
lines changed

16 files changed

+51
-27
lines changed

modules/module1/examples/03_matrix_multiplication_cuda.cu

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -210,8 +210,13 @@ int main() {
210210
cudaDeviceProp props;
211211
CUDA_CHECK(cudaGetDeviceProperties(&props, 0));
212212
printf("Running on: %s\n", props.name);
213-
printf("Peak memory bandwidth: %.1f GB/s\n",
214-
2.0 * props.memoryClockRate * (props.memoryBusWidth / 8) / 1.0e6);
213+
// CUDA 13: use cudaDeviceGetAttribute for memory metrics
214+
int memClockKHz = 0;
215+
int busWidthBits = 0;
216+
cudaDeviceGetAttribute(&memClockKHz, cudaDevAttrMemoryClockRate, 0);
217+
cudaDeviceGetAttribute(&busWidthBits, cudaDevAttrGlobalMemoryBusWidth, 0);
218+
double peakGBs = 2.0 * (memClockKHz / 1e6) * (busWidthBits / 8.0);
219+
printf("Peak memory bandwidth: %.1f GB/s\n", peakGBs);
215220

216221
// Cleanup
217222
free(h_A); free(h_B); free(h_C); free(h_C_ref);

modules/module1/examples/04_device_info_cuda.cu

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,14 @@ int main() {
2727
props.maxThreadsDim[0], props.maxThreadsDim[1], props.maxThreadsDim[2]);
2828
printf(" Max Grid Size: (%d, %d, %d)\n",
2929
props.maxGridSize[0], props.maxGridSize[1], props.maxGridSize[2]);
30-
printf(" Memory Clock Rate: %.2f GHz\n", props.memoryClockRate / 1e6);
31-
printf(" Memory Bus Width: %d bits\n", props.memoryBusWidth);
30+
int memClockKHz = 0;
31+
int busWidthBits = 0;
32+
cudaDeviceGetAttribute(&memClockKHz, cudaDevAttrMemoryClockRate, i);
33+
cudaDeviceGetAttribute(&busWidthBits, cudaDevAttrGlobalMemoryBusWidth, i);
34+
printf(" Memory Clock Rate: %.2f GHz\n", memClockKHz / 1e6);
35+
printf(" Memory Bus Width: %d bits\n", busWidthBits);
3236
printf(" Peak Memory Bandwidth: %.2f GB/s\n",
33-
2.0 * props.memoryClockRate * (props.memoryBusWidth / 8) / 1.0e6);
37+
2.0 * (memClockKHz / 1e6) * (busWidthBits / 8.0));
3438
printf(" Multiprocessor Count: %d\n", props.multiProcessorCount);
3539
printf(" L2 Cache Size: %d bytes\n", props.l2CacheSize);
3640
printf(" Max Threads per Multiprocessor: %d\n", props.maxThreadsPerMultiProcessor);

modules/module1/examples/05_performance_comparison_cuda.cu

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -160,8 +160,11 @@ int main() {
160160
cudaDeviceProp props;
161161
CUDA_CHECK(cudaGetDeviceProperties(&props, 0));
162162
printf("GPU: %s\n", props.name);
163+
int memClockKHz = 0, busWidthBits = 0;
164+
cudaDeviceGetAttribute(&memClockKHz, cudaDevAttrMemoryClockRate, 0);
165+
cudaDeviceGetAttribute(&busWidthBits, cudaDevAttrGlobalMemoryBusWidth, 0);
163166
printf("Peak Memory Bandwidth: %.2f GB/s\n",
164-
2.0 * props.memoryClockRate * (props.memoryBusWidth / 8) / 1.0e6);
167+
2.0 * (memClockKHz / 1e6) * (busWidthBits / 8.0));
165168

166169
return 0;
167170
}

modules/module1/examples/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ GPU_VENDOR = NONE
2626
endif
2727

2828
# CUDA architecture detection (prefer actual GPU via nvidia-smi; fallback sm_90)
29-
CUDA_ARCH ?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null | head -1 | awk -F. '/^[0-9]+\.[0-9]+$/ {printf "sm_%d%d", $$1, $$2}')
29+
CUDA_ARCH ?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null | head -1 | tr -dc '0-9' | sed -e 's/^/sm_/')
3030
ifeq ($(strip $(CUDA_ARCH)),)
3131
CUDA_ARCH := sm_90
3232
endif

modules/module2/examples/02_memory_coalescing_cuda.cu

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -354,10 +354,13 @@ int main() {
354354
cudaDeviceProp props;
355355
CUDA_CHECK(cudaGetDeviceProperties(&props, 0));
356356
printf("Running on: %s\n", props.name);
357+
int memClockKHz = 0, busWidthBits = 0;
358+
cudaDeviceGetAttribute(&memClockKHz, cudaDevAttrMemoryClockRate, 0);
359+
cudaDeviceGetAttribute(&busWidthBits, cudaDevAttrGlobalMemoryBusWidth, 0);
357360
printf("Global memory bandwidth: %.1f GB/s\n",
358-
2.0 * props.memoryClockRate * (props.memoryBusWidth / 8) / 1.0e6);
359-
printf("Memory bus width: %d bits\n", props.memoryBusWidth);
360-
printf("Memory clock rate: %d MHz\n", props.memoryClockRate / 1000);
361+
2.0 * (memClockKHz / 1e6) * (busWidthBits / 8.0));
362+
printf("Memory bus width: %d bits\n", busWidthBits);
363+
printf("Memory clock rate: %d MHz\n", memClockKHz / 1000);
361364
printf("Warp size: %d threads\n", props.warpSize);
362365

363366
// Run benchmarks

modules/module2/examples/05_memory_bandwidth_optimization_cuda.cu

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -417,10 +417,13 @@ int main() {
417417
CUDA_CHECK(cudaGetDeviceProperties(&props, 0));
418418
printf("Running on: %s\n", props.name);
419419

420-
double theoretical_bandwidth = 2.0 * props.memoryClockRate * (props.memoryBusWidth / 8) / 1.0e6;
420+
int memClockKHz = 0, busWidthBits = 0;
421+
cudaDeviceGetAttribute(&memClockKHz, cudaDevAttrMemoryClockRate, 0);
422+
cudaDeviceGetAttribute(&busWidthBits, cudaDevAttrGlobalMemoryBusWidth, 0);
423+
double theoretical_bandwidth = 2.0 * (memClockKHz / 1e6) * (busWidthBits / 8.0);
421424
printf("Theoretical peak bandwidth: %.1f GB/s\n", theoretical_bandwidth);
422-
printf("Memory clock rate: %d MHz\n", props.memoryClockRate / 1000);
423-
printf("Memory bus width: %d bits\n", props.memoryBusWidth);
425+
printf("Memory clock rate: %d MHz\n", memClockKHz / 1000);
426+
printf("Memory bus width: %d bits\n", busWidthBits);
424427
printf("L2 cache size: %d MB\n", props.l2CacheSize / (1024 * 1024));
425428

426429
// Run benchmarks

modules/module2/examples/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ GPU_VENDOR = NONE
2626
endif
2727

2828
# CUDA architecture detection (prefer actual GPU via nvidia-smi; fallback sm_90)
29-
CUDA_ARCH ?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null | head -1 | awk -F. '/^[0-9]+\.[0-9]+$/ {printf "sm_%d%d", $$1, $$2}')
29+
CUDA_ARCH ?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null | head -1 | tr -dc '0-9' | sed -e 's/^/sm_/')
3030
ifeq ($(strip $(CUDA_ARCH)),)
3131
CUDA_ARCH := sm_90
3232
endif

modules/module3/examples/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ GPU_VENDOR = NONE
2626
endif
2727

2828
# CUDA architecture detection (prefer actual GPU via nvidia-smi; fallback sm_90)
29-
CUDA_ARCH ?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null | head -1 | awk -F. '/^[0-9]+\.[0-9]+$/ {printf "sm_%d%d", $$1, $$2}')
29+
CUDA_ARCH ?= $(shell nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null | head -1 | tr -dc '0-9' | sed -e 's/^/sm_/')
3030
ifeq ($(strip $(CUDA_ARCH)),)
3131
CUDA_ARCH := sm_90
3232
endif

modules/module4/examples/01_cuda_streams_basics.cu

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -330,8 +330,11 @@ int main() {
330330
printf("Compute Capability: %d.%d\n", prop.major, prop.minor);
331331
printf("Concurrent Kernels: %s\n", prop.concurrentKernels ? "Yes" : "No");
332332
printf("Async Engine Count: %d\n", prop.asyncEngineCount);
333-
printf("Memory Bus Width: %d bits\n", prop.memoryBusWidth);
334-
printf("Memory Clock Rate: %.2f MHz\n\n", prop.memoryClockRate / 1000.0f);
333+
int memClockKHz = 0, busWidthBits = 0;
334+
cudaDeviceGetAttribute(&memClockKHz, cudaDevAttrMemoryClockRate, 0);
335+
cudaDeviceGetAttribute(&busWidthBits, cudaDevAttrGlobalMemoryBusWidth, 0);
336+
printf("Memory Bus Width: %d bits\n", busWidthBits);
337+
printf("Memory Clock Rate: %.2f MHz\n\n", memClockKHz / 1000.0f);
335338

336339
// Allocate host memory
337340
const int totalSize = TOTAL_SIZE;

modules/module4/examples/02_multi_gpu_programming.cu

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -83,10 +83,13 @@ void printDeviceInfo() {
8383
printf("Device %d: %s\n", i, prop.name);
8484
printf(" Compute Capability: %d.%d\n", prop.major, prop.minor);
8585
printf(" Global Memory: %.2f GB\n", prop.totalGlobalMem / (1024.0*1024.0*1024.0));
86-
printf(" Memory Clock Rate: %.2f MHz\n", prop.memoryClockRate / 1000.0);
87-
printf(" Memory Bus Width: %d bits\n", prop.memoryBusWidth);
88-
printf(" Peak Memory Bandwidth: %.2f GB/s\n",
89-
2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8) / 1.0e6);
86+
int memClockKHz = 0, busWidthBits = 0;
87+
cudaDeviceGetAttribute(&memClockKHz, cudaDevAttrMemoryClockRate, i);
88+
cudaDeviceGetAttribute(&busWidthBits, cudaDevAttrGlobalMemoryBusWidth, i);
89+
printf(" Memory Clock Rate: %.2f MHz\n", memClockKHz / 1000.0);
90+
printf(" Memory Bus Width: %d bits\n", busWidthBits);
91+
printf(" Peak Memory Bandwidth: %.2f GB/s\n",
92+
2.0 * (memClockKHz / 1e6) * (busWidthBits / 8.0));
9093
printf(" Multiprocessors: %d\n", prop.multiProcessorCount);
9194
printf(" Concurrent Kernels: %s\n", prop.concurrentKernels ? "Yes" : "No");
9295

0 commit comments

Comments
 (0)