diff --git a/Makefile b/Makefile
index 73b83720c..5ae3a716b 100644
--- a/Makefile
+++ b/Makefile
@@ -195,6 +195,30 @@ else
 endif
 
 # Check if NCCL is available, include if so, for multi-GPU training
+# If exactly 1 GPU is detected and FORCE_MULTI_GPU != 1, default to NO_MULTI_GPU
+ifneq ($(OS), Windows_NT)
+  ifneq ($(call file_exists_in_path, nvidia-smi),)
+    GPU_COUNT := $(strip $(shell nvidia-smi -L 2>/dev/null | grep -c "^GPU "))
+  endif
+endif
+
+ifeq ($(strip $(FORCE_MULTI_GPU)),1)
+  ifeq ($(GPU_COUNT),1)
+    $(info → Detected 1 GPU but FORCE_MULTI_GPU=1; building with MULTI_GPU)
+  endif
+else
+  ifeq ($(GPU_COUNT),1)
+    ifeq ($(origin NO_MULTI_GPU), undefined)
+      NO_MULTI_GPU := 1
+      $(info → Detected exactly 1 GPU; defaulting to NO_MULTI_GPU. Set FORCE_MULTI_GPU=1 to override.)
+    endif
+    ifeq ($(origin NO_USE_MPI), undefined)
+      NO_USE_MPI := 1
+      $(info → Disabling MPI as well since NO_MULTI_GPU is active.)
+    endif
+  endif
+endif
+
 ifeq ($(NO_MULTI_GPU), 1)
 $(info → Multi-GPU (NCCL) is manually disabled)
 else
diff --git a/llmc/cuda_utils.cuh b/llmc/cuda_utils.cuh
index 030ec073e..976ec9444 100644
--- a/llmc/cuda_utils.cuh
+++ b/llmc/cuda_utils.cuh
@@ -217,7 +217,17 @@ int cudaMallocConditionallyManaged(void** out, size_t bytes, const char *file, i
         // if we OOM, fallback to a managed allocation. slower but at least won't crash.
         cudaGetLastError(); // reset the error before the next API call
         cudaCheck_(cudaMallocManaged(out, bytes), file, line);
-        cudaCheck_(cudaMemAdvise(*out, bytes, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId), file, line);
+        // Prefer host (CPU) location for the managed fallback allocation.
+        // The struct-taking advise API is cudaMemAdvise_v2, introduced in CUDA 12.2
+        // (plain cudaMemAdvise takes an int device); keep the legacy call on older toolkits
+        // so the preferred-location hint is never silently dropped.
+#if defined(CUDART_VERSION) && (CUDART_VERSION >= 12020)
+        cudaMemLocation loc{};
+        loc.type = cudaMemLocationTypeHost;
+        loc.id = cudaCpuDeviceId;
+        cudaCheck_(cudaMemAdvise_v2(*out, bytes, cudaMemAdviseSetPreferredLocation, loc), file, line);
+#else
+        cudaCheck_(cudaMemAdvise(*out, bytes, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId), file, line);
+#endif
         return 1;
     } else {
         cudaCheck_(err, file, line);