Fix CUDA build error (arch flags): completed

coketaste · coketaste · commit 5ba90235f82c · 2025-09-22T14:01:03.000-04:00
Remove Python pip from Dockerfile: completed (your build log confirms success)
Repo-wide sweep for hardcoded sm_XX: completed
diff --git a/modules/module2/content.md b/modules/module2/content.md
@@ -302,7 +302,6 @@ Texture memory provides:
 
 ```cuda
 #include <cuda_runtime.h>
-#include <texture_fetch_functions.h>
 
 __global__ void textureKernel(cudaTextureObject_t texObj, float *output, 
                              int width, int height) {
diff --git a/modules/module2/examples/03_texture_memory_cuda.cu b/modules/module2/examples/03_texture_memory_cuda.cu
@@ -1,5 +1,4 @@
 #include <cuda_runtime.h>
-#include <texture_fetch_functions.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <math.h>
diff --git a/modules/module5/examples/01_gpu_profiling_cuda.cu b/modules/module5/examples/01_gpu_profiling_cuda.cu
@@ -272,10 +272,13 @@ void analyzeDeviceProperties() {
     printf("Cores per MP: %d (estimated)\n", _ConvertSMVer2Cores(prop.major, prop.minor));
     printf("Total Cores: %d (estimated)\n", prop.multiProcessorCount * _ConvertSMVer2Cores(prop.major, prop.minor));
     printf("GPU Clock Rate: %.2f GHz\n", prop.clockRate / 1e6);
-    printf("Memory Clock Rate: %.2f GHz\n", prop.memoryClockRate / 1e6);
-    printf("Memory Bus Width: %d bits\n", prop.memoryBusWidth);
+    int memClockKHz = 0, busWidthBits = 0;
+    cudaDeviceGetAttribute(&memClockKHz, cudaDevAttrMemoryClockRate, device);
+    cudaDeviceGetAttribute(&busWidthBits, cudaDevAttrGlobalMemoryBusWidth, device);
+    printf("Memory Clock Rate: %.2f GHz\n", memClockKHz / 1e6);
+    printf("Memory Bus Width: %d bits\n", busWidthBits);
     printf("Peak Memory Bandwidth: %.1f GB/s\n", 
-           2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8) / 1.0e6);
+        2.0 * (memClockKHz / 1e6) * (busWidthBits / 8.0));
     printf("Global Memory: %.1f GB\n", prop.totalGlobalMem / (1024.0 * 1024.0 * 1024.0));
     printf("Shared Memory per Block: %zu KB\n", prop.sharedMemPerBlock / 1024);
     printf("Max Threads per Block: %d\n", prop.maxThreadsPerBlock);
@@ -409,7 +412,10 @@ void calculateTheoreticalLimits() {
     printf("=== Theoretical Performance Limits ===\n");
     
     // Memory bandwidth calculation
-    double memoryBandwidth = 2.0 * prop.memoryClockRate * (prop.memoryBusWidth / 8) / 1.0e6; // GB/s
+    int memClockKHz = 0, busWidthBits = 0;
+    cudaDeviceGetAttribute(&memClockKHz, cudaDevAttrMemoryClockRate, device);
+    cudaDeviceGetAttribute(&busWidthBits, cudaDevAttrGlobalMemoryBusWidth, device);
+    double memoryBandwidth = 2.0 * (memClockKHz / 1e6) * (busWidthBits / 8.0); // GB/s
     printf("Peak Memory Bandwidth: %.1f GB/s\n", memoryBandwidth);
     
     // Compute throughput estimation

Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,4 @@`
`1`	`1`	`#include <cuda_runtime.h>`
`2`		`-#include <texture_fetch_functions.h>`
`3`	`2`	`#include <stdio.h>`
`4`	`3`	`#include <stdlib.h>`
`5`	`4`	`#include <math.h>`