diff --git a/Makefile b/Makefile
index 73b83720c..5ae3a716b 100644
--- a/Makefile
+++ b/Makefile
@@ -195,6 +195,30 @@ else
 endif
 
 # Check if NCCL is available, include if so, for multi-GPU training
+# If exactly 1 GPU is detected and FORCE_MULTI_GPU != 1, default to NO_MULTI_GPU
+ifneq ($(OS), Windows_NT)
+  ifneq ($(call file_exists_in_path, nvidia-smi),)
+    GPU_COUNT := $(strip $(shell nvidia-smi -L 2>/dev/null | grep -c "^GPU "))
+  endif
+endif
+
+ifeq ($(strip $(FORCE_MULTI_GPU)),1)
+  ifeq ($(GPU_COUNT),1)
+    $(info → Detected 1 GPU but FORCE_MULTI_GPU=1; building with MULTI_GPU)
+  endif
+else
+  ifeq ($(GPU_COUNT),1)
+    ifeq ($(origin NO_MULTI_GPU), undefined)
+      NO_MULTI_GPU := 1
+      $(info → Detected exactly 1 GPU; defaulting to NO_MULTI_GPU. Set FORCE_MULTI_GPU=1 to override.)
+    endif
+    ifeq ($(origin NO_USE_MPI), undefined)
+      NO_USE_MPI := 1
+      $(info → Disabling MPI as well since NO_MULTI_GPU is active.)
+    endif
+  endif
+endif
+
 ifeq ($(NO_MULTI_GPU), 1)
 $(info → Multi-GPU (NCCL) is manually disabled)
 else
diff --git a/llmc/cuda_utils.cuh b/llmc/cuda_utils.cuh
index 030ec073e..976ec9444 100644
--- a/llmc/cuda_utils.cuh
+++ b/llmc/cuda_utils.cuh
@@ -217,7 +217,17 @@ int cudaMallocConditionallyManaged(void** out, size_t bytes, const char *file, i
         // if we OOM, fallback to a managed allocation. slower but at least won't crash.
         cudaGetLastError(); // reset the error before the next API call
         cudaCheck_(cudaMallocManaged(out, bytes), file, line);
-        cudaCheck_(cudaMemAdvise(*out, bytes, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId), file, line);
+        // Prefer host (CPU) location for the managed fallback allocation.
+        // The struct-taking advise API is cudaMemAdvise_v2, introduced in CUDA 12.2
+        // (plain cudaMemAdvise takes an int device); keep the legacy call on older toolkits
+        // so the preferred-location hint is never silently dropped.
+#if defined(CUDART_VERSION) && (CUDART_VERSION >= 12020)
+        cudaMemLocation loc{};
+        loc.type = cudaMemLocationTypeHost;
+        loc.id = cudaCpuDeviceId;
+        cudaCheck_(cudaMemAdvise_v2(*out, bytes, cudaMemAdviseSetPreferredLocation, loc), file, line);
+#else
+        cudaCheck_(cudaMemAdvise(*out, bytes, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId), file, line);
+#endif
         return 1;
     } else {
         cudaCheck_(err, file, line);