diff --git a/.appveyor.yml b/.appveyor.yml index 28c669c..30efbc5 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -9,31 +9,27 @@ environment: global: STACK_ROOT: "c:\\sr" matrix: - - GHC: "8.10" - - GHC: "8.8" - - GHC: "8.6" + - GHC: "9.10" + - GHC: "9.6" + - GHC: "9.0" - GHC: "8.4" - - GHC: "8.2" - - GHC: "8.0" - # - GHC: "7.10" - # - GHC: "7.8" # failed to install ghc: https://ci.appveyor.com/project/tmcdonell/cuda/build/1.0.4/job/ufhtj0klyq73psas#L149 before_build: # http://help.appveyor.com/discussions/problems/6312-curl-command-not-found - set PATH=C:\Program Files\Git\mingw64\bin;%PATH% - set PATH=C:\Users\appveyor\AppData\Roaming\local\bin;%PATH% - # install CUDA-9.0 - - appveyor DownloadFile "https://developer.nvidia.com/compute/cuda/9.0/Prod/network_installers/cuda_9.0.176_windows_network-exe" -FileName install_cuda.exe - - install_cuda.exe -s compiler_9.0 cudart_9.0 cublas_9.0 cublas_dev_9.0 cufft_9.0 cufft_dev_9.0 cusolver_9.0 cusolver_dev_9.0 cusparse_9.0 cusparse_dev_9.0 - - set PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v9.0\nvvm\bin;%PATH% - - set PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v9.0\bin;%PATH% + # install CUDA-13.0 + - appveyor DownloadFile "https://developer.download.nvidia.com/compute/cuda/13.0.2/network_installers/cuda_13.0.2_windows_network.exe" -FileName install_cuda.exe + - install_cuda.exe -s crt_13.0 cudart_13.0 nvcc_13.0 + - set PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v13.0\nvvm\bin;%PATH% + - set PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v13.0\bin;%PATH% - nvcc --version # CUDA refuses to install the driver if no compatible GPU can be found, so # copy these .dll files manually - - appveyor DownloadFile "https://drive.google.com/uc?export=download&id=14x0RX8QlHQ6vKhimbR4FDRgfP7EoHfgc" -FileName nvdriver-9.0.176.7z - - 7z x nvdriver-9.0.176.7z -oC:\Windows\System32 + # - appveyor DownloadFile "https://drive.google.com/uc?export=download&id=14x0RX8QlHQ6vKhimbR4FDRgfP7EoHfgc" -FileName nvdriver-9.0.176.7z + # - 7z x nvdriver-9.0.176.7z -oC:\Windows\System32 # install stack - appveyor DownloadFile "https://www.stackage.org/stack/windows-x86_64" -FileName stack.zip diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml index 7eea073..e19a820 100644 --- a/.github/workflows/ci-linux.yml +++ b/.github/workflows/ci-linux.yml @@ -7,6 +7,7 @@ on: push: paths: - '.github/workflows/ci-linux.yml' + - 'Setup.hs' - 'stack*.yaml' - '*.cabal' - '*/src/**' @@ -18,20 +19,20 @@ jobs: strategy: matrix: ghc: - - "8.10" - - "8.8" - - "8.6" - - "8.4" - - "8.2" - - "8.0" - - "7.8" - cuda: - - "10.2" - - "10.1" - - "10.0" + - "9.10" + - "9.8" + - "9.6" + - "9.4" - "9.2" - - "9.1" - "9.0" + # - "8.10" # save some resources + # - "8.8" + # - "8.6" + - "8.4" + cuda: + - "13.0" + - "12.9" + # - "12.5" # save some resources # include: # - os: windows-latest @@ -43,16 +44,16 @@ jobs: HADDOCK_FLAGS: "--haddock --no-haddock-deps --no-haddock-hyperlink-source --haddock-arguments=\"--no-print-missing-docs\"" steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v5 - run: ln -s stack-${{ matrix.ghc }}.yaml stack.yaml - - uses: actions/cache@v2 + - uses: actions/cache@v4 with: path: snapshot.pkgdb key: ${{ runner.os }}-${{ matrix.ghc }}-${{ matrix.cuda }}-snapshot.pkgdb - - uses: actions/cache@v2 + - uses: actions/cache@v4 with: path: | ~/.local/bin @@ -61,7 +62,6 @@ jobs: .stack-work key: ${{ runner.os }}-${{ matrix.ghc }}-${{ matrix.cuda }}-${{ hashFiles('stack.yaml') }}-${{ hashFiles('snapshot.pkgdb') }} restore-keys: | - ${{ runner.os }}-${{ matrix.ghc }}-${{ matrix.cuda }}-${{ hashFiles('stack.yaml') }}-${{ hashFiles('snapshot.pkgdb') }} ${{ runner.os }}-${{ matrix.ghc }}-${{ matrix.cuda }}-${{ hashFiles('stack.yaml') }}- ${{ runner.os }}-${{ matrix.ghc }}-${{ matrix.cuda }}- @@ -80,12 +80,11 @@ jobs: - name: Install CUDA run: | MATRIX_CUDA=${{ matrix.cuda }} - wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin - sudo mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600 - sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub - sudo add-apt-repository "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /" + UBUNTUVER=$(sed -n '/^DISTRIB_RELEASE=/ { s/.*=//; s/\.//; p; q; }' /etc/lsb-release) + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu$UBUNTUVER/x86_64/cuda-keyring_1.1-1_all.deb + sudo dpkg -i cuda-keyring_1.1-1_all.deb sudo apt-get update - sudo apt-get -y install cuda-${MATRIX_CUDA/./-} + sudo apt-get -y install cuda-{runtime,compiler,libraries,libraries-dev}-${MATRIX_CUDA/./-} echo "CUDA_HOME=/usr/local/cuda-${MATRIX_CUDA}" >> $GITHUB_ENV echo "LD_LIBRARY_PATH=/usr/local/cuda-${MATRIX_CUDA}/lib64:$(stack exec ghc -- --print-libdir)/rts:/usr/local/cuda-${MATRIX_CUDA}/nvvm/lib64:${LD_LIBRARY_PATH}" >> $GITHUB_ENV echo "/usr/local/cuda-${MATRIX_CUDA}/bin" >> $GITHUB_PATH diff --git a/CHANGELOG.md b/CHANGELOG.md index fe21a06..ddd1119 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,11 +5,19 @@ Notable changes to the project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/). **NOTE:** The version numbers of this package roughly align to the latest -version of the CUDA API this package is built against This means that this +version of the CUDA API this package is built against. This means that this package _DOES NOT_ follow the PVP, or indeed any sensible version scheme, because NVIDIA are A-OK introducing breaking changes in minor updates. +## [0.13.0.0] - ??? +### Added + * Support for CUDA-13 + +### Removed + * A number of fields from DeviceProperties, as they have been removed from + `cudaDeviceProp`. Use `Foreign.CUDA.Driver.Device.attribute` to query them. + ## [0.12.8.0] - 2025-08-21 ### Added * Support for CUDA-12 diff --git a/README.md b/README.md index 67b5996..8fa919f 100644 --- a/README.md +++ b/README.md @@ -150,10 +150,13 @@ Here is an incomplete historical list of missing bindings. Pull requests welcome - cuGraphMemAllocNodeGetParams - cuGraphMemFreeNodeGetParams -### CUDA-12 +### CUDA >= 12 A lot. PRs welcome. +- CUDA-12.3 + - Edge data in the driver Graph API (`cuGraphAddDependencies_v2` etc.) + # Old compatibility notes diff --git a/Setup.hs b/Setup.hs index a5f79c9..54accf4 100644 --- a/Setup.hs +++ b/Setup.hs @@ -1,14 +1,22 @@ +-- Decouple from GHC's default language setting, so that it's easier +-- to maintain compatibility with old GHCs. +{-# LANGUAGE Haskell2010 #-} +{-# OPTIONS_GHC -Wall #-} + +{-# LANGUAGE ConstraintKinds #-} {-# LANGUAGE CPP #-} {-# LANGUAGE DataKinds #-} +{-# LANGUAGE KindSignatures #-} {-# LANGUAGE QuasiQuotes #-} {-# LANGUAGE TemplateHaskell #-} +{-# LANGUAGE TupleSections #-} -- The MIN_VERSION_Cabal macro was introduced with Cabal-1.24 (??) #ifndef MIN_VERSION_Cabal #define MIN_VERSION_Cabal(major1,major2,minor) 0 #endif -import Distribution.PackageDescription +import Distribution.PackageDescription hiding ( Flag ) import Distribution.Simple import Distribution.Simple.BuildPaths import Distribution.Simple.Command @@ -45,7 +53,6 @@ import Distribution.Simple.PackageDescription import Distribution.Utils.Path (SymbolicPath, FileOrDir(File, Dir), Lib, Include, Pkg, CWD, makeSymbolicPath, interpretSymbolicPath, makeRelativePathEx) import qualified Distribution.Types.LocalBuildConfig as LBC #else -import Data.Kind (Constraint) #endif import Control.Exception @@ -249,7 +256,9 @@ cudaLibraryPaths (Platform arch os) installPath = [ installPath path | path (Windows, X86_64) -> ["lib/x64"] (OSX, _) -> ["lib"] -- MacOS does not distinguish 32- vs. 64-bit paths (_, X86_64) -> ["lib64", "lib"] -- prefer lib64 for 64-bit systems +#if MIN_VERSION_Cabal(2,4,0) (_, AArch64) -> ["lib64", "lib"] +#endif _ -> ["lib"] -- otherwise @@ -734,7 +743,6 @@ die' _ = die -- Compatibility across Cabal 3.14 symbolic paths. -- If we want to drop pre-Cabal-3.14 compatibility at some point, this should all be merged in above. -workingDirFlag :: HasCommonFlags flags => flags -> Flag CWDPath lbiCWD :: LocalBuildInfo -> Maybe CWDPath #if MIN_VERSION_Cabal(3,14,0) @@ -745,6 +753,7 @@ type CWDPath = SymbolicPath CWD ('Dir Pkg) regVerbosity :: RegisterFlags -> Flag Verbosity regVerbosity = setupVerbosity . registerCommonFlags +workingDirFlag :: HasCommonFlags flags => flags -> Flag CWDPath workingDirFlag = setupWorkingDir . getCommonFlags lbiCWD = flagToMaybe . setupWorkingDir . configCommonFlags . LBC.configFlags . LBC.packageBuildDescr . localBuildDescr @@ -772,6 +781,7 @@ type CWDPath = () -- regVerbosity is still present as an actual field in Cabal 3.12 +workingDirFlag :: flags -> Flag CWDPath workingDirFlag _ = NoFlag lbiCWD _ = Nothing @@ -785,10 +795,6 @@ makeRelativePathEx = id interpretSymbolicPath :: Maybe CWDPath -> FilePath -> FilePath interpretSymbolicPath _ = id -type HasCommonFlags flags = () :: Constraint -getCommonFlags :: flags -> () -getCommonFlags _ = () - readHookedBuildInfoWithCWD :: Verbosity -> Maybe CWDPath -> FilePath -> IO HookedBuildInfo readHookedBuildInfoWithCWD verb _ path = readHookedBuildInfo verb path #endif diff --git a/cbits/stubs.c b/cbits/stubs.c index 67f9280..b62d8db 100644 --- a/cbits/stubs.c +++ b/cbits/stubs.c @@ -3,6 +3,7 @@ */ #include "cbits/stubs.h" +#include // memset #if CUDART_VERSION >= 7000 cudaError_t cudaLaunchKernel_simple(const void *func, unsigned int gridX, unsigned int gridY, unsigned int gridZ, unsigned int blockX, unsigned int blockY, unsigned int blockZ, void **args, size_t sharedMem, cudaStream_t stream) @@ -196,7 +197,13 @@ CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev) CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev) { +#if CUDA_VERSION >= 13000 + CUctxCreateParams params; + memset(¶ms, 0, sizeof params); + return cuCtxCreate_v4(pctx, ¶ms, flags, dev); +#else return cuCtxCreate_v2(pctx, flags, dev); +#endif } CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name) @@ -424,3 +431,18 @@ CUresult CUDAAPI cuGraphExecKernelNodeSetParams_simple(CUgraphExec hGraphExec, C } #endif +#if CUDA_VERSION >= 13000 +// This is the signature of the CUDA <=12 version; much easier to shim here than in Haskell. +CUresult cuMemAdvise_device(CUdeviceptr dptr, size_t count, CUmem_advise advice, CUdevice device) +{ + return cuMemAdvise(dptr, count, advice, (CUmemLocation){.id = device, .type = CU_MEM_LOCATION_TYPE_DEVICE}); +} + +// This is the signature of the CUDA <=12 version; much easier to shim here than in Haskell. +CUresult cuMemPrefetchAsync_device(CUdeviceptr dptr, size_t count, CUdevice device, CUstream hStream) +{ + // flags is reserved and must be 0 in CUDA 13 + return cuMemPrefetchAsync(dptr, count, (CUmemLocation){.id = device, .type = CU_MEM_LOCATION_TYPE_DEVICE}, 0, hStream); +} +#endif + diff --git a/cbits/stubs.h b/cbits/stubs.h index 45219d7..9d2fbd0 100644 --- a/cbits/stubs.h +++ b/cbits/stubs.h @@ -184,6 +184,11 @@ CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags); CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags); #endif +#if CUDA_VERSION >= 13000 +CUresult cuMemAdvise_device(CUdeviceptr dptr, size_t count, CUmem_advise advice, CUdevice device); +CUresult cuMemPrefetchAsync_device(CUdeviceptr dptr, size_t count, CUdevice device, CUstream hStream); +#endif + #ifdef __cplusplus } #endif diff --git a/cuda.cabal b/cuda.cabal index 7bab5cf..b7af9db 100644 --- a/cuda.cabal +++ b/cuda.cabal @@ -1,7 +1,7 @@ cabal-version: 1.24 Name: cuda -Version: 0.12.8.0 +Version: 0.13.0.0 Synopsis: FFI binding to the CUDA interface for programming NVIDIA GPUs Description: The CUDA library provides a direct, general purpose C-like SPMD programming @@ -30,7 +30,7 @@ Description: . * "Foreign.CUDA.Runtime" . - Tested with library versions up to CUDA-12.8. See also the + Tested with library versions up to CUDA-13.0. See also the build matrix for version compatibility. . @@ -177,6 +177,6 @@ source-repository head source-repository this type: git location: https://github.com/tmcdonell/cuda - tag: v0.12.8.0 + tag: v0.13.0.0 -- vim: nospell diff --git a/examples/src/deviceQueryDrv/DeviceQuery.hs b/examples/src/deviceQueryDrv/DeviceQuery.hs index cb51865..9975b5e 100644 --- a/examples/src/deviceQueryDrv/DeviceQuery.hs +++ b/examples/src/deviceQueryDrv/DeviceQuery.hs @@ -5,6 +5,7 @@ module Main where import Control.Monad +import Foreign.Marshal.Utils ( toBool ) import Numeric import Prelude hiding ( (<>) ) import Text.PrettyPrint @@ -15,6 +16,32 @@ import Foreign.CUDA.Analysis as CUDA import qualified Foreign.CUDA.Driver as CUDA +-- In CUDA 13, a number of device properties became things only queryable using +-- cu(da)DeviceGetAttribute. This data type captures those. +$(if CUDA.libraryVersion >= 13000 then [d| + data AttrProperties = AttrProperties + { clockRate :: !Int -- ^ Clock frequency in kilohertz + , memClockRate :: !Int -- ^ Peak memory clock frequency in kilohertz + , computeMode :: !ComputeMode + , kernelExecTimeoutEnabled :: !Bool -- ^ Whether there is a runtime limit on kernels + , singleToDoublePerfRatio :: !Int -- ^ Ratio of single precision performance (in floating-point operations per second) to double precision performance + } + getAttrProperties :: Device -> IO AttrProperties + getAttrProperties d = do + clockRate <- CUDA.attribute d CUDA.ClockRate + memClockRate <- CUDA.attribute d CUDA.MemoryClockRate + computeMode <- toEnum <$> CUDA.attribute d CUDA.ComputeMode + kernelExecTimeoutEnabled <- toBool <$> CUDA.attribute d CUDA.KernelExecTimeout + singleToDoublePerfRatio <- CUDA.attribute d CUDA.SingleToDoublePrecisionPerfRatio + return AttrProperties{..} + |] else [d| + -- make it a record to ensure the {..} syntax is accepted + data AttrProperties = AttrProperties { _dummyAttrProperty :: () } + getAttrProperties :: Device -> IO AttrProperties + getAttrProperties _ = return (AttrProperties ()) + |]) + + main :: IO () main = do version <- CUDA.driverVersion @@ -32,16 +59,17 @@ main = do infos <- forM [0 .. numDevices-1] $ \n -> do dev <- CUDA.device n prp <- CUDA.props dev - return (n, dev, prp) + prp2 <- getAttrProperties dev + return (n, dev, prp, prp2) - forM_ infos $ \(n, dev, prp) -> do + forM_ infos $ \(n, dev, prp, prp2) -> do p2p <- statP2P dev prp infos - printf "\nDevice %d: %s\n%s\n" n (deviceName prp) (statDevice prp) + printf "\nDevice %d: %s\n%s\n" n (deviceName prp) (statDevice prp prp2) unless (null p2p) $ printf "%s\n" p2p -statDevice :: DeviceProperties -> String -statDevice dev@DeviceProperties{..} = +statDevice :: DeviceProperties -> AttrProperties -> String +statDevice dev@DeviceProperties{..} AttrProperties{..} = let DeviceResources{..} = deviceResources dev @@ -69,22 +97,34 @@ statDevice dev@DeviceProperties{..} = ,(" 2D:", grid maxTextureDim2D) ,(" 3D:", cube maxTextureDim3D) ,("Texture alignment:", text $ showBytes textureAlignment) - ,("Maximum memory pitch:", text $ showBytes memPitch) - ,("Concurrent kernel execution:", bool concurrentKernels) - ,("Concurrent copy and execution:", bool deviceOverlap <> text (printf ", with %d copy engine%s" asyncEngineCount (if asyncEngineCount > 1 then "s" else ""))) - ,("Runtime limit on kernel execution:", bool kernelExecTimeoutEnabled) + ,("Maximum memory pitch:", text $ showBytes memPitch)]++ + + $(if CUDA.libraryVersion >= 13000 then [| + [("Concurrent copy and kernel execution:", bool concurrentKernels <> text (printf " with %d copy engine%s" asyncEngineCount (if asyncEngineCount > 1 then "s" else "")))] + |] else [| + [("Concurrent kernel execution:", bool concurrentKernels) + ,("Concurrent copy and execution:", bool deviceOverlap <> text (printf ", with %d copy engine%s" asyncEngineCount (if asyncEngineCount > 1 then "s" else "")))] + |])++ + + [("Runtime limit on kernel execution:", bool kernelExecTimeoutEnabled) ,("Integrated GPU sharing host memory:", bool integrated) ,("Host page-locked memory mapping:", bool canMapHostMemory) ,("ECC memory support:", bool eccEnabled) ,("Unified addressing (UVA):", bool unifiedAddressing)]++ -#if __GLASGOW_HASKELL__ > 710 + $(if CUDA.libraryVersion >= 8000 then [| [("Single to double precision performance:", text $ printf "%d : 1" singleToDoublePerfRatio) - ,("Supports compute pre-emption:", bool preemption)]|] else [|[]|])++ + ,("Supports compute pre-emption:", bool preemption)] + |] else [|[]|])++ + $(if CUDA.libraryVersion >= 9000 then [| - [("Supports cooperative launch:", bool cooperativeLaunch) - ,("Supports multi-device cooperative launch:", bool cooperativeLaunchMultiDevice)]|] else [|[]|])++ -#endif + [("Supports cooperative launch:", bool cooperativeLaunch)] + |] else [|[]|])++ + + $(if CUDA.libraryVersion >= 9000 && CUDA.libraryVersion < 13000 then [| + [("Supports multi-device cooperative launch:", bool cooperativeLaunchMultiDevice)] + |] else [|[]|])++ + [("PCI bus/location:", int (busID pciInfo) <> char '/' <> int (deviceID pciInfo)) ,("Compute mode:", text (show computeMode)) ] @@ -94,7 +134,7 @@ statDevice dev@DeviceProperties{..} = $ text (describe computeMode) -statP2P :: Device -> DeviceProperties -> [(Int, Device, DeviceProperties)] -> IO String +statP2P :: Device -> DeviceProperties -> [(Int, Device, DeviceProperties, AttrProperties)] -> IO String statP2P dev prp infos | CUDA.libraryVersion < 4000 = return [] @@ -103,7 +143,7 @@ statP2P dev prp infos | otherwise = let go [] = return [] - go ((m, peer, pp):is) = + go ((m, peer, pp, _):is) = if dev == peer then go is else do diff --git a/src/Foreign/CUDA/Analysis/Device.chs b/src/Foreign/CUDA/Analysis/Device.chs index 4a44e8b..049bad6 100644 --- a/src/Foreign/CUDA/Analysis/Device.chs +++ b/src/Foreign/CUDA/Analysis/Device.chs @@ -32,13 +32,19 @@ import Debug.Trace -- | -- The compute mode the device is currently in -- +#if CUDA_VERSION < 13000 {# enum CUcomputemode as ComputeMode { underscoreToCase } with prefix="CU_COMPUTEMODE" deriving (Eq, Show) #} +#else +{# enum cudaComputeMode as ComputeMode + { } + with prefix="cudaComputeMode" deriving (Eq, Show) #} +#endif instance Describe ComputeMode where describe Default = "Multiple contexts are allowed on the device simultaneously" -#if CUDA_VERSION < 8000 +#if CUDA_VERSION < 8000 || CUDA_VERSION >= 13000 describe Exclusive = "Only one context used by a single thread can be present on this device at a time" #endif describe Prohibited = "No contexts can be created on this device at this time" @@ -69,7 +75,10 @@ cap a b = let a' = fromIntegral a in --} -- | --- The properties of a compute device +-- The properties of a compute device, mirroring @struct cudaDeviceProp@ in CUDA. +-- +-- In CUDA-13.0, a number of fields were removed from this struct and are now +-- only available by querying individual attributes on the device. -- data DeviceProperties = DeviceProperties { @@ -91,16 +100,22 @@ data DeviceProperties = DeviceProperties , maxTextureDim2D :: !(Int,Int) , maxTextureDim3D :: !(Int,Int,Int) #endif +#if CUDA_VERSION < 13000 , clockRate :: !Int -- ^ Clock frequency in kilohertz +#endif , multiProcessorCount :: !Int -- ^ Number of multiprocessors on the device , memPitch :: !Int64 -- ^ Maximum pitch in bytes allowed by memory copies #if CUDA_VERSION >= 4000 , memBusWidth :: !Int -- ^ Global memory bus width in bits +#if CUDA_VERSION < 13000 , memClockRate :: !Int -- ^ Peak memory clock frequency in kilohertz +#endif #endif , textureAlignment :: !Int64 -- ^ Alignment requirement for textures +#if CUDA_VERSION < 13000 , computeMode :: !ComputeMode , deviceOverlap :: !Bool -- ^ Device can concurrently copy memory and execute a kernel +#endif #if CUDA_VERSION >= 3000 , concurrentKernels :: !Bool -- ^ Device can possibly execute multiple kernels concurrently , eccEnabled :: !Bool -- ^ Device supports and has enabled error correction @@ -111,7 +126,9 @@ data DeviceProperties = DeviceProperties , pciInfo :: !PCI -- ^ PCI device information for the device , tccDriverEnabled :: !Bool -- ^ Whether this is a Tesla device using the TCC driver #endif +#if CUDA_VERSION < 13000 , kernelExecTimeoutEnabled :: !Bool -- ^ Whether there is a runtime limit on kernels +#endif , integrated :: !Bool -- ^ As opposed to discrete , canMapHostMemory :: !Bool -- ^ Device can use pinned memory #if CUDA_VERSION >= 4000 @@ -129,11 +146,15 @@ data DeviceProperties = DeviceProperties #endif #if CUDA_VERSION >= 8000 , preemption :: !Bool -- ^ Device supports compute pre-emption +#if CUDA_VERSION < 13000 , singleToDoublePerfRatio :: !Int -- ^ Ratio of single precision performance (in floating-point operations per second) to double precision performance #endif +#endif #if CUDA_VERSION >= 9000 , cooperativeLaunch :: !Bool -- ^ Device supports launching cooperative kernels +#if CUDA_VERSION < 13000 , cooperativeLaunchMultiDevice :: !Bool -- ^ Device can participate in cooperative multi-device kernels +#endif #endif } deriving (Show) diff --git a/src/Foreign/CUDA/Driver/Device.chs b/src/Foreign/CUDA/Driver/Device.chs index 8498653..7080c4e 100644 --- a/src/Foreign/CUDA/Driver/Device.chs +++ b/src/Foreign/CUDA/Driver/Device.chs @@ -246,7 +246,10 @@ attribute !d !a = cuDeviceGetAttribute a d peekS s _ = peekCString s --- | Returns a UUID for the device +-- | Returns a UUID for the device. +-- +-- Since CUDA-13: If the device is in MIG mode, this function returns its MIG +-- UUID which uniquely identifies the subscribed MIG compute instance. -- -- Requires CUDA-9.2 -- @@ -265,10 +268,17 @@ uuid !dev = unpack ptr where {-# INLINE cuDeviceGetUuid #-} +#if CUDA_VERSION < 13000 {# fun unsafe cuDeviceGetUuid { `Ptr ()' , useDevice `Device' } -> `()' checkStatus*- #} +#else + {# fun unsafe cuDeviceGetUuid_v2 as cuDeviceGetUuid + { `Ptr ()' + , useDevice `Device' + } -> `()' checkStatus*- #} +#endif {-# INLINE unpack #-} unpack :: Ptr () -> IO UUID @@ -319,7 +329,9 @@ props !d = do sharedMemPerBlock <- fromIntegral <$> attribute d SharedMemoryPerBlock memPitch <- fromIntegral <$> attribute d MaxPitch textureAlignment <- fromIntegral <$> attribute d TextureAlignment +#if CUDA_VERSION < 13000 clockRate <- attribute d ClockRate +#endif warpSize <- attribute d WarpSize regsPerBlock <- attribute d RegistersPerBlock maxThreadsPerBlock <- attribute d MaxThreadsPerBlock @@ -333,9 +345,11 @@ props !d = do computeCapability <- capability d totalGlobalMem <- totalMem d multiProcessorCount <- attribute d MultiprocessorCount +#if CUDA_VERSION < 13000 computeMode <- toEnum <$> attribute d ComputeMode deviceOverlap <- toBool <$> attribute d GpuOverlap kernelExecTimeoutEnabled <- toBool <$> attribute d KernelExecTimeout +#endif integrated <- toBool <$> attribute d Integrated canMapHostMemory <- toBool <$> attribute d CanMapHostMemory #if CUDA_VERSION >= 3000 @@ -350,7 +364,9 @@ props !d = do cacheMemL2 <- attribute d L2CacheSize maxThreadsPerMultiProcessor <- attribute d MaxThreadsPerMultiprocessor memBusWidth <- attribute d GlobalMemoryBusWidth +#if CUDA_VERSION < 13000 memClockRate <- attribute d MemoryClockRate +#endif pciInfo <- PCI <$> attribute d PciBusId <*> attribute d PciDeviceId <*> attribute d PciDomainId unifiedAddressing <- toBool <$> attribute d UnifiedAddressing tccDriverEnabled <- toBool <$> attribute d TccDriver @@ -367,11 +383,15 @@ props !d = do #endif #if CUDA_VERSION >= 8000 preemption <- toBool <$> attribute d ComputePreemptionSupported +#if CUDA_VERSION < 13000 singleToDoublePerfRatio <- attribute d SingleToDoublePrecisionPerfRatio #endif +#endif #if CUDA_VERSION >= 9000 cooperativeLaunch <- toBool <$> attribute d CooperativeLaunch +#if CUDA_VERSION < 13000 cooperativeLaunchMultiDevice <- toBool <$> attribute d CooperativeMultiDeviceLaunch +#endif #endif return DeviceProperties{..} diff --git a/src/Foreign/CUDA/Driver/Event.chs b/src/Foreign/CUDA/Driver/Event.chs index 16f6293..00b7e7f 100644 --- a/src/Foreign/CUDA/Driver/Event.chs +++ b/src/Foreign/CUDA/Driver/Event.chs @@ -114,10 +114,17 @@ elapsedTime :: Event -> Event -> IO Float elapsedTime !ev1 !ev2 = resultIfOk =<< cuEventElapsedTime ev1 ev2 {-# INLINE cuEventElapsedTime #-} +#if CUDA_VERSION < 13000 {# fun unsafe cuEventElapsedTime { alloca- `Float' peekFloatConv* , useEvent `Event' , useEvent `Event' } -> `Status' cToEnum #} +#else +{# fun unsafe cuEventElapsedTime_v2 as cuEventElapsedTime + { alloca- `Float' peekFloatConv* + , useEvent `Event' + , useEvent `Event' } -> `Status' cToEnum #} +#endif -- | diff --git a/src/Foreign/CUDA/Driver/Graph/Build.chs b/src/Foreign/CUDA/Driver/Graph/Build.chs index 833ce52..d115753 100644 --- a/src/Foreign/CUDA/Driver/Graph/Build.chs +++ b/src/Foreign/CUDA/Driver/Graph/Build.chs @@ -204,12 +204,23 @@ addDependencies !g !deps = cuGraphAddDependencies g from to where (from, to) = unzip deps +#if CUDA_VERSION < 13000 {# fun unsafe cuGraphAddDependencies { useGraph `Graph' , withNodeArray* `[Node]' , withNodeArrayLen* `[Node]'& } -> `()' checkStatus*- #} +#else + cuGraphAddDependencies g' from' to' = cuGraphAddDependencies_v2 g' from' to' (length deps) + {# fun unsafe cuGraphAddDependencies_v2 + { useGraph `Graph' + , withNodeArray* `[Node]' + , withNodeArray* `[Node]' + , withNullEdgeDataLen* `Int'& + } + -> `()' checkStatus*- #} +#endif #endif @@ -230,12 +241,23 @@ removeDependencies !g !deps = cuGraphRemoveDependencies g from to where (from, to) = unzip deps +#if CUDA_VERSION < 13000 {# fun unsafe cuGraphRemoveDependencies { useGraph `Graph' , withNodeArray* `[Node]' , withNodeArrayLen* `[Node]'& } -> `()' checkStatus*- #} +#else + cuGraphRemoveDependencies g' from' to' = cuGraphRemoveDependencies_v2 g' from' to' (length deps) + {# fun unsafe cuGraphRemoveDependencies_v2 + { useGraph `Graph' + , withNodeArray* `[Node]' + , withNodeArray* `[Node]' + , withNullEdgeDataLen* `Int'& + } + -> `()' checkStatus*- #} +#endif #endif @@ -507,6 +529,7 @@ getEdges !g = to <- peekArray count p_to return $ zip from to where +#if CUDA_VERSION < 13000 {# fun unsafe cuGraphGetEdges { useGraph `Graph' , castPtr `Ptr Node' @@ -514,6 +537,17 @@ getEdges !g = , id `Ptr CULong' } -> `()' checkStatus*- #} +#else + cuGraphGetEdges g' f t c = cuGraphGetEdges_v2 g' f t nullPtr c + {# fun unsafe cuGraphGetEdges_v2 + { useGraph `Graph' + , castPtr `Ptr Node' + , castPtr `Ptr Node' + , castPtr `Ptr edgeData' + , id `Ptr CULong' + } + -> `()' checkStatus*- #} +#endif #endif @@ -598,12 +632,23 @@ getDependencies !n = cuGraphNodeGetDependencies n p_deps p_count peekArray count p_deps where +#if CUDA_VERSION < 13000 {# fun unsafe cuGraphNodeGetDependencies { useNode `Node' , castPtr `Ptr Node' , id `Ptr CULong' } -> `()' checkStatus*- #} +#else + cuGraphNodeGetDependencies n' d c = cuGraphNodeGetDependencies_v2 n' d nullPtr c + {# fun unsafe cuGraphNodeGetDependencies_v2 + { useNode `Node' + , castPtr `Ptr Node' + , castPtr `Ptr edgeData' + , id `Ptr CULong' + } + -> `()' checkStatus*- #} +#endif #endif @@ -628,12 +673,23 @@ getDependents n = cuGraphNodeGetDependentNodes n p_deps p_count peekArray count p_deps where +#if CUDA_VERSION < 13000 {# fun unsafe cuGraphNodeGetDependentNodes { useNode `Node' , castPtr `Ptr Node' , id `Ptr CULong' } -> `()' checkStatus*- #} +#else + cuGraphNodeGetDependentNodes n' d c = cuGraphNodeGetDependentNodes_v2 n' d nullPtr c + {# fun unsafe cuGraphNodeGetDependentNodes_v2 + { useNode `Node' + , castPtr `Ptr Node' + , castPtr `Ptr edgeData' + , id `Ptr CULong' + } + -> `()' checkStatus*- #} +#endif #endif @@ -679,5 +735,9 @@ withNodeArray ns f = withArray ns (f . castPtr) {-# INLINE withNodeArrayLen #-} withNodeArrayLen :: [Node] -> ((Ptr {# type CUgraphNode #}, CULong) -> IO a) -> IO a withNodeArrayLen ns f = withArrayLen ns $ \i p -> f (castPtr p, cIntConv i) + +{-# INLINE withNullEdgeDataLen #-} +withNullEdgeDataLen :: Int -> ((Ptr (), CULong) -> IO a) -> IO a +withNullEdgeDataLen len f = f (nullPtr, cIntConv len) #endif diff --git a/src/Foreign/CUDA/Driver/Graph/Capture.chs b/src/Foreign/CUDA/Driver/Graph/Capture.chs index 160ff70..070391f 100644 --- a/src/Foreign/CUDA/Driver/Graph/Capture.chs +++ b/src/Foreign/CUDA/Driver/Graph/Capture.chs @@ -143,6 +143,10 @@ status = requireSDK 'status 10.0 -- | Query the capture status of a stream and get an id for the capture -- sequence, which is unique over the lifetime of the process. -- +-- Since CUDA-13, "edge data" can be associated with an edge in the graph. This +-- function assumes no such data is present (if there is, a CUDA error +-- (@CUDA_ERROR_LOSSY_QUERY@) will be raised). +-- -- Requires CUDA-10.1 -- -- @@ -152,24 +156,38 @@ status = requireSDK 'status 10.0 #if CUDA_VERSION < 10010 info :: Stream -> IO (Status, Int64) info = requireSDK 'info 10.1 -#elif CUDA_VERSION < 12000 +-- Not another elif because c2hs seems to be buggy +#else +#if CUDA_VERSION < 12000 {# fun unsafe cuStreamGetCaptureInfo as info { useStream `Stream' , alloca- `Status' peekEnum* , alloca- `Int64' peekIntConv* } -> `()' checkStatus*- #} -#else +#elif CUDA_VERSION < 13000 {# fun unsafe cuStreamGetCaptureInfo_v2 as info { useStream `Stream' , alloca- `Status' peekEnum* , alloca- `Int64' peekIntConv* - , alloca- `Graph' - , alloca- `Node' - , alloca- `CSize' + , withNullPtr- `Graph' + , withNullPtr- `Node' + , withNullPtr- `CSize' + } + -> `()' checkStatus*- #} +#else +{# fun unsafe cuStreamGetCaptureInfo_v3 as info + { useStream `Stream' + , alloca- `Status' peekEnum* + , alloca- `Int64' peekIntConv* + , withNullPtr- `Graph' + , withNullPtr- `()' + , withNullPtr- `Node' + , withNullPtr- `CSize' } -> `()' checkStatus*- #} #endif +#endif -- | Set the stream capture interaction mode for this thread. Return the previous value. @@ -201,3 +219,5 @@ peekGraph :: Ptr {# type CUgraph #} -> IO Graph peekGraph = liftM Graph . peek #endif +withNullPtr :: (Ptr a -> r) -> r +withNullPtr f = f nullPtr diff --git a/src/Foreign/CUDA/Driver/Marshal.chs b/src/Foreign/CUDA/Driver/Marshal.chs index bf6c4ba..8d69d75 100644 --- a/src/Foreign/CUDA/Driver/Marshal.chs +++ b/src/Foreign/CUDA/Driver/Marshal.chs @@ -358,6 +358,7 @@ prefetchArrayAsync ptr n mdev mst = go undefined ptr go x _ = nothingIfOk =<< cuMemPrefetchAsync ptr (n * sizeOf x) (maybe (-1) useDevice mdev) (fromMaybe defaultStream mst) {-# INLINE cuMemPrefetchAsync #-} +#if CUDA_VERSION < 13000 {# fun unsafe cuMemPrefetchAsync { useDeviceHandle `DevicePtr a' , `Int' @@ -365,6 +366,15 @@ prefetchArrayAsync ptr n mdev mst = go undefined ptr , useStream `Stream' } -> `Status' cToEnum #} +#else +{# fun unsafe cuMemPrefetchAsync_device as cuMemPrefetchAsync + { useDeviceHandle `DevicePtr a' + , `Int' + , id `CInt' + , useStream `Stream' + } + -> `Status' cToEnum #} +#endif #endif diff --git a/src/Foreign/CUDA/Driver/Unified.chs b/src/Foreign/CUDA/Driver/Unified.chs index 4dbc576..b94cb46 100644 --- a/src/Foreign/CUDA/Driver/Unified.chs +++ b/src/Foreign/CUDA/Driver/Unified.chs @@ -258,6 +258,7 @@ advise ptr n a mdev = go undefined ptr go x _ = nothingIfOk =<< cuMemAdvise ptr (n * sizeOf x) a (maybe (-1) useDevice mdev) {-# INLINE cuMemAdvise #-} +#if CUDA_VERSION < 13000 {# fun unsafe cuMemAdvise { useHandle `Ptr a' , `Int' @@ -265,7 +266,15 @@ advise ptr n a mdev = go undefined ptr , `CInt' } -> `Status' cToEnum #} +#else +{# fun unsafe cuMemAdvise_device as cuMemAdvise + { useHandle `Ptr a' + , `Int' + , cFromEnum `Advice' + , `CInt' + } + -> `Status' cToEnum #} +#endif where useHandle = fromIntegral . ptrToIntPtr #endif - diff --git a/src/Foreign/CUDA/Runtime/Device.chs b/src/Foreign/CUDA/Runtime/Device.chs index 90e2089..942bd63 100644 --- a/src/Foreign/CUDA/Runtime/Device.chs +++ b/src/Foreign/CUDA/Runtime/Device.chs @@ -98,15 +98,23 @@ instance Storable DeviceProperties where warpSize <- cIntConv <$> {#get cudaDeviceProp.warpSize#} p memPitch <- cIntConv <$> {#get cudaDeviceProp.memPitch#} p maxThreadsPerBlock <- cIntConv <$> {#get cudaDeviceProp.maxThreadsPerBlock#} p +#if CUDART_VERSION < 13000 clockRate <- cIntConv <$> {#get cudaDeviceProp.clockRate#} p +#endif totalConstMem <- cIntConv <$> {#get cudaDeviceProp.totalConstMem#} p textureAlignment <- cIntConv <$> {#get cudaDeviceProp.textureAlignment#} p +#if CUDART_VERSION < 13000 deviceOverlap <- cToBool <$> {#get cudaDeviceProp.deviceOverlap#} p +#endif multiProcessorCount <- cIntConv <$> {#get cudaDeviceProp.multiProcessorCount#} p +#if CUDART_VERSION < 13000 kernelExecTimeoutEnabled <- cToBool <$> {#get cudaDeviceProp.kernelExecTimeoutEnabled#} p +#endif integrated <- cToBool <$> {#get cudaDeviceProp.integrated#} p canMapHostMemory <- cToBool <$> {#get cudaDeviceProp.canMapHostMemory#} p +#if CUDART_VERSION < 13000 computeMode <- cToEnum <$> {#get cudaDeviceProp.computeMode#} p +#endif #if CUDART_VERSION >= 3000 concurrentKernels <- cToBool <$> {#get cudaDeviceProp.concurrentKernels#} p maxTextureDim1D <- cIntConv <$> {#get cudaDeviceProp.maxTexture1D#} p @@ -123,7 +131,9 @@ instance Storable DeviceProperties where cacheMemL2 <- cIntConv <$> {#get cudaDeviceProp.l2CacheSize#} p maxThreadsPerMultiProcessor <- cIntConv <$> {#get cudaDeviceProp.maxThreadsPerMultiProcessor#} p memBusWidth <- cIntConv <$> {#get cudaDeviceProp.memoryBusWidth#} p +#if CUDART_VERSION < 13000 memClockRate <- cIntConv <$> {#get cudaDeviceProp.memoryClockRate#} p +#endif pciInfo <- PCI <$> (cIntConv <$> {#get cudaDeviceProp.pciBusID#} p) <*> (cIntConv <$> {#get cudaDeviceProp.pciDeviceID#} p) <*> (cIntConv <$> {#get cudaDeviceProp.pciDomainID#} p) @@ -157,11 +167,15 @@ instance Storable DeviceProperties where #else preemption <- cToBool <$> {#get cudaDeviceProp.computePreemptionSupported#} p #endif +#if CUDART_VERSION < 13000 singleToDoublePerfRatio <- cIntConv <$> {#get cudaDeviceProp.singleToDoublePrecisionPerfRatio#} p #endif +#endif #if CUDART_VERSION >= 9000 cooperativeLaunch <- cToBool <$> {#get cudaDeviceProp.cooperativeLaunch#} p +#if CUDART_VERSION < 13000 cooperativeLaunchMultiDevice <- cToBool <$> {#get cudaDeviceProp.cooperativeMultiDeviceLaunch#} p +#endif #endif return DeviceProperties{..} @@ -219,7 +233,7 @@ props :: Device -> IO DeviceProperties props !n = resultIfOk =<< cudaGetDeviceProperties n {-# INLINE cudaGetDeviceProperties #-} -#if CUDA_VERSION < 12000 +#if CUDA_VERSION < 12000 || CUDA_VERSION >= 13000 {# fun unsafe cudaGetDeviceProperties { alloca- `DeviceProperties' peek* , `Int' } -> `Status' cToEnum #} diff --git a/stack-7.10.yaml b/stack-7.10.yaml index fcb1f6f..300f80f 100644 --- a/stack-7.10.yaml +++ b/stack-7.10.yaml @@ -8,6 +8,8 @@ packages: extra-deps: - Cabal-1.24.2.0 +- c2hs-0.28.8@rev:3 +- language-c-0.10.0@rev:1 # Override default flag values for local packages and extra-deps # flags: {} diff --git a/stack-7.8.yaml b/stack-7.8.yaml index 4a4d803..c5f0131 100644 --- a/stack-7.8.yaml +++ b/stack-7.8.yaml @@ -8,6 +8,8 @@ packages: extra-deps: - Cabal-1.24.2.0 +- c2hs-0.28.8@rev:3 +- language-c-0.10.0@rev:1 # Override default flag values for local packages and extra-deps # flags: {} diff --git a/stack-8.0.yaml b/stack-8.0.yaml index 811282b..a20bdfe 100644 --- a/stack-8.0.yaml +++ b/stack-8.0.yaml @@ -6,7 +6,9 @@ resolver: lts-9.21 packages: - . -# extra-deps: [] +extra-deps: +- c2hs-0.28.8@rev:3 +- language-c-0.10.0@rev:1 # Override default flag values for local packages and extra-deps # flags: {} diff --git a/stack-8.10.yaml b/stack-8.10.yaml index 9c0429a..bb6eb46 100644 --- a/stack-8.10.yaml +++ b/stack-8.10.yaml @@ -6,7 +6,9 @@ resolver: lts-18.28 packages: - . -# extra-deps: [] +extra-deps: +- c2hs-0.28.8@rev:3 +- language-c-0.10.0@rev:1 # Override default flag values for local packages and extra-deps # flags: {} diff --git a/stack-8.2.yaml b/stack-8.2.yaml index e79be67..9c978d7 100644 --- a/stack-8.2.yaml +++ b/stack-8.2.yaml @@ -6,7 +6,9 @@ resolver: lts-11.22 packages: - . -# extra-deps: [] +extra-deps: +- c2hs-0.28.8@rev:3 +- language-c-0.10.0@rev:1 # Override default flag values for local packages and extra-deps # flags: {} diff --git a/stack-8.4.yaml b/stack-8.4.yaml index fc2099a..c69e569 100644 --- a/stack-8.4.yaml +++ b/stack-8.4.yaml @@ -6,7 +6,9 @@ resolver: lts-12.26 packages: - . -# extra-deps: [] +extra-deps: +- c2hs-0.28.8@rev:3 +- language-c-0.10.0@rev:1 # Override default flag values for local packages and extra-deps # flags: {} diff --git a/stack-8.6.yaml b/stack-8.6.yaml index 1b11651..6dc8625 100644 --- a/stack-8.6.yaml +++ b/stack-8.6.yaml @@ -6,7 +6,9 @@ resolver: lts-14.27 packages: - . -# extra-deps: [] +extra-deps: +- c2hs-0.28.8@rev:3 +- language-c-0.10.0@rev:1 # Override default flag values for local packages and extra-deps # flags: {} diff --git a/stack-8.8.yaml b/stack-8.8.yaml index beac0ec..8720a64 100644 --- a/stack-8.8.yaml +++ b/stack-8.8.yaml @@ -6,7 +6,9 @@ resolver: lts-16.31 packages: - . -# extra-deps: [] +extra-deps: +- c2hs-0.28.8@rev:3 +- language-c-0.10.0@rev:1 # Override default flag values for local packages and extra-deps # flags: {} diff --git a/stack-9.0.yaml b/stack-9.0.yaml index 24c8a4a..96919ce 100644 --- a/stack-9.0.yaml +++ b/stack-9.0.yaml @@ -7,7 +7,9 @@ resolver: lts-19.33 packages: - . -# extra-deps: [] +extra-deps: +- c2hs-0.28.8@rev:3 +- language-c-0.10.0@rev:1 # Override default flag values for local packages and extra-deps # flags: {} diff --git a/stack-9.10.yaml b/stack-9.10.yaml new file mode 100644 index 0000000..baa3418 --- /dev/null +++ b/stack-9.10.yaml @@ -0,0 +1,4 @@ +resolver: lts-24.10 + +packages: +- . diff --git a/stack-9.2.yaml b/stack-9.2.yaml new file mode 100644 index 0000000..4525636 --- /dev/null +++ b/stack-9.2.yaml @@ -0,0 +1,8 @@ +resolver: lts-20.26 + +packages: +- . + +extra-deps: +- c2hs-0.28.8@rev:3 +- language-c-0.10.0@rev:1 diff --git a/stack-9.4.yaml b/stack-9.4.yaml new file mode 100644 index 0000000..55e05c7 --- /dev/null +++ b/stack-9.4.yaml @@ -0,0 +1,8 @@ +resolver: lts-21.25 + +packages: +- . + +extra-deps: +- c2hs-0.28.8@rev:3 +- language-c-0.10.0@rev:1 diff --git a/stack-9.6.yaml b/stack-9.6.yaml new file mode 100644 index 0000000..924586a --- /dev/null +++ b/stack-9.6.yaml @@ -0,0 +1,8 @@ +resolver: lts-22.44 + +packages: +- . + +extra-deps: +- c2hs-0.28.8@rev:3 +- language-c-0.10.0@rev:1 diff --git a/stack-9.8.yaml b/stack-9.8.yaml new file mode 100644 index 0000000..0d3d1d5 --- /dev/null +++ b/stack-9.8.yaml @@ -0,0 +1,4 @@ +resolver: lts-23.28 + +packages: +- .