diff --git a/CHANGELOG.md b/CHANGELOG.md index fe21a06..ddd1119 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,11 +5,19 @@ Notable changes to the project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/). **NOTE:** The version numbers of this package roughly align to the latest -version of the CUDA API this package is built against This means that this +version of the CUDA API this package is built against. This means that this package _DOES NOT_ follow the PVP, or indeed any sensible version scheme, because NVIDIA are A-OK introducing breaking changes in minor updates. +## [0.13.0.0] - ??? +### Added + * Support for CUDA-13 + +### Removed + * A number of fields from DeviceProperties, as they have been removed from + `cudaDeviceProp`. Use `Foreign.CUDA.Driver.Device.attribute` to query them. + ## [0.12.8.0] - 2025-08-21 ### Added * Support for CUDA-12 diff --git a/README.md b/README.md index 67b5996..8fa919f 100644 --- a/README.md +++ b/README.md @@ -150,10 +150,13 @@ Here is an incomplete historical list of missing bindings. Pull requests welcome - cuGraphMemAllocNodeGetParams - cuGraphMemFreeNodeGetParams -### CUDA-12 +### CUDA >= 12 A lot. PRs welcome. +- CUDA-12.3 + - Edge data in the driver Graph API (`cuGraphAddDependencies_v2` etc.) 
+ # Old compatibility notes diff --git a/cbits/stubs.c b/cbits/stubs.c index 67f9280..b62d8db 100644 --- a/cbits/stubs.c +++ b/cbits/stubs.c @@ -3,6 +3,7 @@ */ #include "cbits/stubs.h" +#include <string.h> // memset #if CUDART_VERSION >= 7000 cudaError_t cudaLaunchKernel_simple(const void *func, unsigned int gridX, unsigned int gridY, unsigned int gridZ, unsigned int blockX, unsigned int blockY, unsigned int blockZ, void **args, size_t sharedMem, cudaStream_t stream) @@ -196,7 +197,13 @@ CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev) CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev) { +#if CUDA_VERSION >= 13000 + CUctxCreateParams params; + memset(&params, 0, sizeof params); + return cuCtxCreate_v4(pctx, &params, flags, dev); +#else return cuCtxCreate_v2(pctx, flags, dev); +#endif } CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name) @@ -424,3 +431,18 @@ CUresult CUDAAPI cuGraphExecKernelNodeSetParams_simple(CUgraphExec hGraphExec, C } #endif +#if CUDA_VERSION >= 13000 +// This is the signature of the CUDA <=12 version; much easier to shim here than in Haskell. +CUresult cuMemAdvise_device(CUdeviceptr dptr, size_t count, CUmem_advise advice, CUdevice device) +{ + return cuMemAdvise(dptr, count, advice, (CUmemLocation){.id = device, .type = CU_MEM_LOCATION_TYPE_DEVICE}); +} + +// This is the signature of the CUDA <=12 version; much easier to shim here than in Haskell. 
+CUresult cuMemPrefetchAsync_device(CUdeviceptr dptr, size_t count, CUdevice device, CUstream hStream) +{ + // flags is reserved and must be 0 in CUDA 13 + return cuMemPrefetchAsync(dptr, count, (CUmemLocation){.id = device, .type = CU_MEM_LOCATION_TYPE_DEVICE}, 0, hStream); +} +#endif + diff --git a/cbits/stubs.h b/cbits/stubs.h index 45219d7..9d2fbd0 100644 --- a/cbits/stubs.h +++ b/cbits/stubs.h @@ -184,6 +184,11 @@ CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags); CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags); #endif +#if CUDA_VERSION >= 13000 +CUresult cuMemAdvise_device(CUdeviceptr dptr, size_t count, CUmem_advise advice, CUdevice device); +CUresult cuMemPrefetchAsync_device(CUdeviceptr dptr, size_t count, CUdevice device, CUstream hStream); +#endif + #ifdef __cplusplus } #endif diff --git a/cuda.cabal b/cuda.cabal index 7bab5cf..b7af9db 100644 --- a/cuda.cabal +++ b/cuda.cabal @@ -1,7 +1,7 @@ cabal-version: 1.24 Name: cuda -Version: 0.12.8.0 +Version: 0.13.0.0 Synopsis: FFI binding to the CUDA interface for programming NVIDIA GPUs Description: The CUDA library provides a direct, general purpose C-like SPMD programming @@ -30,7 +30,7 @@ Description: . * "Foreign.CUDA.Runtime" . - Tested with library versions up to CUDA-12.8. See also the + Tested with library versions up to CUDA-13.0. See also the build matrix for version compatibility. . 
@@ -177,6 +177,6 @@ source-repository head source-repository this type: git location: https://github.com/tmcdonell/cuda - tag: v0.12.8.0 + tag: v0.13.0.0 -- vim: nospell diff --git a/examples/src/deviceQueryDrv/DeviceQuery.hs b/examples/src/deviceQueryDrv/DeviceQuery.hs index cb51865..15cba2f 100644 --- a/examples/src/deviceQueryDrv/DeviceQuery.hs +++ b/examples/src/deviceQueryDrv/DeviceQuery.hs @@ -5,6 +5,7 @@ module Main where import Control.Monad +import Foreign.Marshal.Utils ( toBool ) import Numeric import Prelude hiding ( (<>) ) import Text.PrettyPrint @@ -15,6 +16,31 @@ import Foreign.CUDA.Analysis as CUDA import qualified Foreign.CUDA.Driver as CUDA +-- In CUDA 13, a number of device properties became things only queryable using +-- cu(da)DeviceGetAttribute. This data type captures those. +$(if CUDA.libraryVersion >= 13000 then [d| + data AttrProperties = AttrProperties + { clockRate :: !Int -- ^ Clock frequency in kilohertz + , memClockRate :: !Int -- ^ Peak memory clock frequency in kilohertz + , computeMode :: !ComputeMode + , kernelExecTimeoutEnabled :: !Bool -- ^ Whether there is a runtime limit on kernels + , singleToDoublePerfRatio :: !Int -- ^ Ratio of single precision performance (in floating-point operations per second) to double precision performance + } + getAttrProperties :: Device -> IO AttrProperties + getAttrProperties d = do + clockRate <- CUDA.attribute d CUDA.ClockRate + memClockRate <- CUDA.attribute d CUDA.MemoryClockRate + computeMode <- toEnum <$> CUDA.attribute d CUDA.ComputeMode + kernelExecTimeoutEnabled <- toBool <$> CUDA.attribute d CUDA.KernelExecTimeout + singleToDoublePerfRatio <- CUDA.attribute d CUDA.SingleToDoublePrecisionPerfRatio + return AttrProperties{..} + |] else [d| + data AttrProperties = AttrProperties {} + getAttrProperties :: Device -> IO AttrProperties + getAttrProperties _ = return AttrProperties + |]) + + main :: IO () main = do version <- CUDA.driverVersion @@ -32,16 +58,17 @@ main = do infos <- forM [0 .. 
numDevices-1] $ \n -> do dev <- CUDA.device n prp <- CUDA.props dev - return (n, dev, prp) + prp2 <- getAttrProperties dev + return (n, dev, prp, prp2) - forM_ infos $ \(n, dev, prp) -> do + forM_ infos $ \(n, dev, prp, prp2) -> do p2p <- statP2P dev prp infos - printf "\nDevice %d: %s\n%s\n" n (deviceName prp) (statDevice prp) + printf "\nDevice %d: %s\n%s\n" n (deviceName prp) (statDevice prp prp2) unless (null p2p) $ printf "%s\n" p2p -statDevice :: DeviceProperties -> String -statDevice dev@DeviceProperties{..} = +statDevice :: DeviceProperties -> AttrProperties -> String +statDevice dev@DeviceProperties{..} AttrProperties{..} = let DeviceResources{..} = deviceResources dev @@ -69,22 +96,34 @@ statDevice dev@DeviceProperties{..} = ,(" 2D:", grid maxTextureDim2D) ,(" 3D:", cube maxTextureDim3D) ,("Texture alignment:", text $ showBytes textureAlignment) - ,("Maximum memory pitch:", text $ showBytes memPitch) - ,("Concurrent kernel execution:", bool concurrentKernels) - ,("Concurrent copy and execution:", bool deviceOverlap <> text (printf ", with %d copy engine%s" asyncEngineCount (if asyncEngineCount > 1 then "s" else ""))) - ,("Runtime limit on kernel execution:", bool kernelExecTimeoutEnabled) + ,("Maximum memory pitch:", text $ showBytes memPitch)]++ + + $(if CUDA.libraryVersion >= 13000 then [| + [("Concurrent copy and kernel execution:", bool concurrentKernels <> text (printf " with %d copy engine%s" asyncEngineCount (if asyncEngineCount > 1 then "s" else "")))] + |] else [| + [("Concurrent kernel execution:", bool concurrentKernels) + ,("Concurrent copy and execution:", bool deviceOverlap <> text (printf ", with %d copy engine%s" asyncEngineCount (if asyncEngineCount > 1 then "s" else "")))] + |])++ + + [("Runtime limit on kernel execution:", bool kernelExecTimeoutEnabled) ,("Integrated GPU sharing host memory:", bool integrated) ,("Host page-locked memory mapping:", bool canMapHostMemory) ,("ECC memory support:", bool eccEnabled) ,("Unified addressing 
(UVA):", bool unifiedAddressing)]++ -#if __GLASGOW_HASKELL__ > 710 + $(if CUDA.libraryVersion >= 8000 then [| [("Single to double precision performance:", text $ printf "%d : 1" singleToDoublePerfRatio) - ,("Supports compute pre-emption:", bool preemption)]|] else [|[]|])++ + ,("Supports compute pre-emption:", bool preemption)] + |] else [|[]|])++ + $(if CUDA.libraryVersion >= 9000 then [| - [("Supports cooperative launch:", bool cooperativeLaunch) - ,("Supports multi-device cooperative launch:", bool cooperativeLaunchMultiDevice)]|] else [|[]|])++ -#endif + [("Supports cooperative launch:", bool cooperativeLaunch)] + |] else [|[]|])++ + + $(if CUDA.libraryVersion >= 9000 && CUDA.libraryVersion < 13000 then [| + [("Supports multi-device cooperative launch:", bool cooperativeLaunchMultiDevice)] + |] else [|[]|])++ + [("PCI bus/location:", int (busID pciInfo) <> char '/' <> int (deviceID pciInfo)) ,("Compute mode:", text (show computeMode)) ] @@ -94,7 +133,7 @@ statDevice dev@DeviceProperties{..} = $ text (describe computeMode) -statP2P :: Device -> DeviceProperties -> [(Int, Device, DeviceProperties)] -> IO String +statP2P :: Device -> DeviceProperties -> [(Int, Device, DeviceProperties, AttrProperties)] -> IO String statP2P dev prp infos | CUDA.libraryVersion < 4000 = return [] @@ -103,7 +142,7 @@ statP2P dev prp infos | otherwise = let go [] = return [] - go ((m, peer, pp):is) = + go ((m, peer, pp, _):is) = if dev == peer then go is else do diff --git a/src/Foreign/CUDA/Analysis/Device.chs b/src/Foreign/CUDA/Analysis/Device.chs index 4a44e8b..049bad6 100644 --- a/src/Foreign/CUDA/Analysis/Device.chs +++ b/src/Foreign/CUDA/Analysis/Device.chs @@ -32,13 +32,19 @@ import Debug.Trace -- | -- The compute mode the device is currently in -- +#if CUDA_VERSION < 13000 {# enum CUcomputemode as ComputeMode { underscoreToCase } with prefix="CU_COMPUTEMODE" deriving (Eq, Show) #} +#else +{# enum cudaComputeMode as ComputeMode + { } + with prefix="cudaComputeMode" deriving (Eq, 
Show) #} +#endif instance Describe ComputeMode where describe Default = "Multiple contexts are allowed on the device simultaneously" -#if CUDA_VERSION < 8000 +#if CUDA_VERSION < 8000 || CUDA_VERSION >= 13000 describe Exclusive = "Only one context used by a single thread can be present on this device at a time" #endif describe Prohibited = "No contexts can be created on this device at this time" @@ -69,7 +75,10 @@ cap a b = let a' = fromIntegral a in --} -- | --- The properties of a compute device +-- The properties of a compute device, mirroring @struct cudaDeviceProp@ in CUDA. +-- +-- In CUDA-13.0, a number of fields were removed from this struct and are now +-- only available by querying individual attributes on the device. -- data DeviceProperties = DeviceProperties { @@ -91,16 +100,22 @@ data DeviceProperties = DeviceProperties , maxTextureDim2D :: !(Int,Int) , maxTextureDim3D :: !(Int,Int,Int) #endif +#if CUDA_VERSION < 13000 , clockRate :: !Int -- ^ Clock frequency in kilohertz +#endif , multiProcessorCount :: !Int -- ^ Number of multiprocessors on the device , memPitch :: !Int64 -- ^ Maximum pitch in bytes allowed by memory copies #if CUDA_VERSION >= 4000 , memBusWidth :: !Int -- ^ Global memory bus width in bits +#if CUDA_VERSION < 13000 , memClockRate :: !Int -- ^ Peak memory clock frequency in kilohertz +#endif #endif , textureAlignment :: !Int64 -- ^ Alignment requirement for textures +#if CUDA_VERSION < 13000 , computeMode :: !ComputeMode , deviceOverlap :: !Bool -- ^ Device can concurrently copy memory and execute a kernel +#endif #if CUDA_VERSION >= 3000 , concurrentKernels :: !Bool -- ^ Device can possibly execute multiple kernels concurrently , eccEnabled :: !Bool -- ^ Device supports and has enabled error correction @@ -111,7 +126,9 @@ data DeviceProperties = DeviceProperties , pciInfo :: !PCI -- ^ PCI device information for the device , tccDriverEnabled :: !Bool -- ^ Whether this is a Tesla device using the TCC driver #endif +#if CUDA_VERSION < 
13000 , kernelExecTimeoutEnabled :: !Bool -- ^ Whether there is a runtime limit on kernels +#endif , integrated :: !Bool -- ^ As opposed to discrete , canMapHostMemory :: !Bool -- ^ Device can use pinned memory #if CUDA_VERSION >= 4000 @@ -129,11 +146,15 @@ data DeviceProperties = DeviceProperties #endif #if CUDA_VERSION >= 8000 , preemption :: !Bool -- ^ Device supports compute pre-emption +#if CUDA_VERSION < 13000 , singleToDoublePerfRatio :: !Int -- ^ Ratio of single precision performance (in floating-point operations per second) to double precision performance #endif +#endif #if CUDA_VERSION >= 9000 , cooperativeLaunch :: !Bool -- ^ Device supports launching cooperative kernels +#if CUDA_VERSION < 13000 , cooperativeLaunchMultiDevice :: !Bool -- ^ Device can participate in cooperative multi-device kernels +#endif #endif } deriving (Show) diff --git a/src/Foreign/CUDA/Driver/Device.chs b/src/Foreign/CUDA/Driver/Device.chs index 8498653..7080c4e 100644 --- a/src/Foreign/CUDA/Driver/Device.chs +++ b/src/Foreign/CUDA/Driver/Device.chs @@ -246,7 +246,10 @@ attribute !d !a = cuDeviceGetAttribute a d peekS s _ = peekCString s --- | Returns a UUID for the device +-- | Returns a UUID for the device. +-- +-- Since CUDA-13: If the device is in MIG mode, this function returns its MIG +-- UUID which uniquely identifies the subscribed MIG compute instance. 
-- -- Requires CUDA-9.2 -- @@ -265,10 +268,17 @@ uuid !dev = unpack ptr where {-# INLINE cuDeviceGetUuid #-} +#if CUDA_VERSION < 13000 {# fun unsafe cuDeviceGetUuid { `Ptr ()' , useDevice `Device' } -> `()' checkStatus*- #} +#else + {# fun unsafe cuDeviceGetUuid_v2 as cuDeviceGetUuid + { `Ptr ()' + , useDevice `Device' + } -> `()' checkStatus*- #} +#endif {-# INLINE unpack #-} unpack :: Ptr () -> IO UUID @@ -319,7 +329,9 @@ props !d = do sharedMemPerBlock <- fromIntegral <$> attribute d SharedMemoryPerBlock memPitch <- fromIntegral <$> attribute d MaxPitch textureAlignment <- fromIntegral <$> attribute d TextureAlignment +#if CUDA_VERSION < 13000 clockRate <- attribute d ClockRate +#endif warpSize <- attribute d WarpSize regsPerBlock <- attribute d RegistersPerBlock maxThreadsPerBlock <- attribute d MaxThreadsPerBlock @@ -333,9 +345,11 @@ props !d = do computeCapability <- capability d totalGlobalMem <- totalMem d multiProcessorCount <- attribute d MultiprocessorCount +#if CUDA_VERSION < 13000 computeMode <- toEnum <$> attribute d ComputeMode deviceOverlap <- toBool <$> attribute d GpuOverlap kernelExecTimeoutEnabled <- toBool <$> attribute d KernelExecTimeout +#endif integrated <- toBool <$> attribute d Integrated canMapHostMemory <- toBool <$> attribute d CanMapHostMemory #if CUDA_VERSION >= 3000 @@ -350,7 +364,9 @@ props !d = do cacheMemL2 <- attribute d L2CacheSize maxThreadsPerMultiProcessor <- attribute d MaxThreadsPerMultiprocessor memBusWidth <- attribute d GlobalMemoryBusWidth +#if CUDA_VERSION < 13000 memClockRate <- attribute d MemoryClockRate +#endif pciInfo <- PCI <$> attribute d PciBusId <*> attribute d PciDeviceId <*> attribute d PciDomainId unifiedAddressing <- toBool <$> attribute d UnifiedAddressing tccDriverEnabled <- toBool <$> attribute d TccDriver @@ -367,11 +383,15 @@ props !d = do #endif #if CUDA_VERSION >= 8000 preemption <- toBool <$> attribute d ComputePreemptionSupported +#if CUDA_VERSION < 13000 singleToDoublePerfRatio <- attribute d 
SingleToDoublePrecisionPerfRatio #endif +#endif #if CUDA_VERSION >= 9000 cooperativeLaunch <- toBool <$> attribute d CooperativeLaunch +#if CUDA_VERSION < 13000 cooperativeLaunchMultiDevice <- toBool <$> attribute d CooperativeMultiDeviceLaunch +#endif #endif return DeviceProperties{..} diff --git a/src/Foreign/CUDA/Driver/Event.chs b/src/Foreign/CUDA/Driver/Event.chs index 16f6293..00b7e7f 100644 --- a/src/Foreign/CUDA/Driver/Event.chs +++ b/src/Foreign/CUDA/Driver/Event.chs @@ -114,10 +114,17 @@ elapsedTime :: Event -> Event -> IO Float elapsedTime !ev1 !ev2 = resultIfOk =<< cuEventElapsedTime ev1 ev2 {-# INLINE cuEventElapsedTime #-} +#if CUDA_VERSION < 13000 {# fun unsafe cuEventElapsedTime { alloca- `Float' peekFloatConv* , useEvent `Event' , useEvent `Event' } -> `Status' cToEnum #} +#else +{# fun unsafe cuEventElapsedTime_v2 as cuEventElapsedTime + { alloca- `Float' peekFloatConv* + , useEvent `Event' + , useEvent `Event' } -> `Status' cToEnum #} +#endif -- | diff --git a/src/Foreign/CUDA/Driver/Graph/Build.chs b/src/Foreign/CUDA/Driver/Graph/Build.chs index 833ce52..d115753 100644 --- a/src/Foreign/CUDA/Driver/Graph/Build.chs +++ b/src/Foreign/CUDA/Driver/Graph/Build.chs @@ -204,12 +204,23 @@ addDependencies !g !deps = cuGraphAddDependencies g from to where (from, to) = unzip deps +#if CUDA_VERSION < 13000 {# fun unsafe cuGraphAddDependencies { useGraph `Graph' , withNodeArray* `[Node]' , withNodeArrayLen* `[Node]'& } -> `()' checkStatus*- #} +#else + cuGraphAddDependencies g' from' to' = cuGraphAddDependencies_v2 g' from' to' (length deps) + {# fun unsafe cuGraphAddDependencies_v2 + { useGraph `Graph' + , withNodeArray* `[Node]' + , withNodeArray* `[Node]' + , withNullEdgeDataLen* `Int'& + } + -> `()' checkStatus*- #} +#endif #endif @@ -230,12 +241,23 @@ removeDependencies !g !deps = cuGraphRemoveDependencies g from to where (from, to) = unzip deps +#if CUDA_VERSION < 13000 {# fun unsafe cuGraphRemoveDependencies { useGraph `Graph' , withNodeArray* 
`[Node]' , withNodeArrayLen* `[Node]'& } -> `()' checkStatus*- #} +#else + cuGraphRemoveDependencies g' from' to' = cuGraphRemoveDependencies_v2 g' from' to' (length deps) + {# fun unsafe cuGraphRemoveDependencies_v2 + { useGraph `Graph' + , withNodeArray* `[Node]' + , withNodeArray* `[Node]' + , withNullEdgeDataLen* `Int'& + } + -> `()' checkStatus*- #} +#endif #endif @@ -507,6 +529,7 @@ getEdges !g = to <- peekArray count p_to return $ zip from to where +#if CUDA_VERSION < 13000 {# fun unsafe cuGraphGetEdges { useGraph `Graph' , castPtr `Ptr Node' @@ -514,6 +537,17 @@ getEdges !g = , id `Ptr CULong' } -> `()' checkStatus*- #} +#else + cuGraphGetEdges g' f t c = cuGraphGetEdges_v2 g' f t nullPtr c + {# fun unsafe cuGraphGetEdges_v2 + { useGraph `Graph' + , castPtr `Ptr Node' + , castPtr `Ptr Node' + , castPtr `Ptr edgeData' + , id `Ptr CULong' + } + -> `()' checkStatus*- #} +#endif #endif @@ -598,12 +632,23 @@ getDependencies !n = cuGraphNodeGetDependencies n p_deps p_count peekArray count p_deps where +#if CUDA_VERSION < 13000 {# fun unsafe cuGraphNodeGetDependencies { useNode `Node' , castPtr `Ptr Node' , id `Ptr CULong' } -> `()' checkStatus*- #} +#else + cuGraphNodeGetDependencies n' d c = cuGraphNodeGetDependencies_v2 n' d nullPtr c + {# fun unsafe cuGraphNodeGetDependencies_v2 + { useNode `Node' + , castPtr `Ptr Node' + , castPtr `Ptr edgeData' + , id `Ptr CULong' + } + -> `()' checkStatus*- #} +#endif #endif @@ -628,12 +673,23 @@ getDependents n = cuGraphNodeGetDependentNodes n p_deps p_count peekArray count p_deps where +#if CUDA_VERSION < 13000 {# fun unsafe cuGraphNodeGetDependentNodes { useNode `Node' , castPtr `Ptr Node' , id `Ptr CULong' } -> `()' checkStatus*- #} +#else + cuGraphNodeGetDependentNodes n' d c = cuGraphNodeGetDependentNodes_v2 n' d nullPtr c + {# fun unsafe cuGraphNodeGetDependentNodes_v2 + { useNode `Node' + , castPtr `Ptr Node' + , castPtr `Ptr edgeData' + , id `Ptr CULong' + } + -> `()' checkStatus*- #} +#endif #endif @@ -679,5 
+735,9 @@ withNodeArray ns f = withArray ns (f . castPtr) {-# INLINE withNodeArrayLen #-} withNodeArrayLen :: [Node] -> ((Ptr {# type CUgraphNode #}, CULong) -> IO a) -> IO a withNodeArrayLen ns f = withArrayLen ns $ \i p -> f (castPtr p, cIntConv i) + +{-# INLINE withNullEdgeDataLen #-} +withNullEdgeDataLen :: Int -> ((Ptr (), CULong) -> IO a) -> IO a +withNullEdgeDataLen len f = f (nullPtr, cIntConv len) #endif diff --git a/src/Foreign/CUDA/Driver/Graph/Capture.chs b/src/Foreign/CUDA/Driver/Graph/Capture.chs index 160ff70..070391f 100644 --- a/src/Foreign/CUDA/Driver/Graph/Capture.chs +++ b/src/Foreign/CUDA/Driver/Graph/Capture.chs @@ -143,6 +143,10 @@ status = requireSDK 'status 10.0 -- | Query the capture status of a stream and get an id for the capture -- sequence, which is unique over the lifetime of the process. -- +-- Since CUDA-13, "edge data" can be associated with an edge in the graph. This +-- function assumes no such data is present (if there is, a CUDA error +-- (@CUDA_ERROR_LOSSY_QUERY@) will be raised). 
+-- -- Requires CUDA-10.1 -- -- @@ -152,24 +156,38 @@ status = requireSDK 'status 10.0 #if CUDA_VERSION < 10010 info :: Stream -> IO (Status, Int64) info = requireSDK 'info 10.1 -#elif CUDA_VERSION < 12000 +-- Not another elif because c2hs seems to be buggy +#else +#if CUDA_VERSION < 12000 {# fun unsafe cuStreamGetCaptureInfo as info { useStream `Stream' , alloca- `Status' peekEnum* , alloca- `Int64' peekIntConv* } -> `()' checkStatus*- #} -#else +#elif CUDA_VERSION < 13000 {# fun unsafe cuStreamGetCaptureInfo_v2 as info { useStream `Stream' , alloca- `Status' peekEnum* , alloca- `Int64' peekIntConv* - , alloca- `Graph' - , alloca- `Node' - , alloca- `CSize' + , withNullPtr- `Graph' + , withNullPtr- `Node' + , withNullPtr- `CSize' + } + -> `()' checkStatus*- #} +#else +{# fun unsafe cuStreamGetCaptureInfo_v3 as info + { useStream `Stream' + , alloca- `Status' peekEnum* + , alloca- `Int64' peekIntConv* + , withNullPtr- `Graph' + , withNullPtr- `()' + , withNullPtr- `Node' + , withNullPtr- `CSize' } -> `()' checkStatus*- #} #endif +#endif -- | Set the stream capture interaction mode for this thread. Return the previous value. @@ -201,3 +219,5 @@ peekGraph :: Ptr {# type CUgraph #} -> IO Graph peekGraph = liftM Graph . 
peek #endif +withNullPtr :: (Ptr a -> r) -> r +withNullPtr f = f nullPtr diff --git a/src/Foreign/CUDA/Driver/Marshal.chs b/src/Foreign/CUDA/Driver/Marshal.chs index bf6c4ba..8d69d75 100644 --- a/src/Foreign/CUDA/Driver/Marshal.chs +++ b/src/Foreign/CUDA/Driver/Marshal.chs @@ -358,6 +358,7 @@ prefetchArrayAsync ptr n mdev mst = go undefined ptr go x _ = nothingIfOk =<< cuMemPrefetchAsync ptr (n * sizeOf x) (maybe (-1) useDevice mdev) (fromMaybe defaultStream mst) {-# INLINE cuMemPrefetchAsync #-} +#if CUDA_VERSION < 13000 {# fun unsafe cuMemPrefetchAsync { useDeviceHandle `DevicePtr a' , `Int' @@ -365,6 +366,15 @@ prefetchArrayAsync ptr n mdev mst = go undefined ptr , useStream `Stream' } -> `Status' cToEnum #} +#else +{# fun unsafe cuMemPrefetchAsync_device as cuMemPrefetchAsync + { useDeviceHandle `DevicePtr a' + , `Int' + , id `CInt' + , useStream `Stream' + } + -> `Status' cToEnum #} +#endif #endif diff --git a/src/Foreign/CUDA/Driver/Unified.chs b/src/Foreign/CUDA/Driver/Unified.chs index 4dbc576..b94cb46 100644 --- a/src/Foreign/CUDA/Driver/Unified.chs +++ b/src/Foreign/CUDA/Driver/Unified.chs @@ -258,6 +258,7 @@ advise ptr n a mdev = go undefined ptr go x _ = nothingIfOk =<< cuMemAdvise ptr (n * sizeOf x) a (maybe (-1) useDevice mdev) {-# INLINE cuMemAdvise #-} +#if CUDA_VERSION < 13000 {# fun unsafe cuMemAdvise { useHandle `Ptr a' , `Int' @@ -265,7 +266,15 @@ advise ptr n a mdev = go undefined ptr , `CInt' } -> `Status' cToEnum #} +#else +{# fun unsafe cuMemAdvise_device as cuMemAdvise + { useHandle `Ptr a' + , `Int' + , cFromEnum `Advice' + , `CInt' + } + -> `Status' cToEnum #} +#endif where useHandle = fromIntegral . 
ptrToIntPtr #endif - diff --git a/src/Foreign/CUDA/Runtime/Device.chs b/src/Foreign/CUDA/Runtime/Device.chs index 90e2089..942bd63 100644 --- a/src/Foreign/CUDA/Runtime/Device.chs +++ b/src/Foreign/CUDA/Runtime/Device.chs @@ -98,15 +98,23 @@ instance Storable DeviceProperties where warpSize <- cIntConv <$> {#get cudaDeviceProp.warpSize#} p memPitch <- cIntConv <$> {#get cudaDeviceProp.memPitch#} p maxThreadsPerBlock <- cIntConv <$> {#get cudaDeviceProp.maxThreadsPerBlock#} p +#if CUDART_VERSION < 13000 clockRate <- cIntConv <$> {#get cudaDeviceProp.clockRate#} p +#endif totalConstMem <- cIntConv <$> {#get cudaDeviceProp.totalConstMem#} p textureAlignment <- cIntConv <$> {#get cudaDeviceProp.textureAlignment#} p +#if CUDART_VERSION < 13000 deviceOverlap <- cToBool <$> {#get cudaDeviceProp.deviceOverlap#} p +#endif multiProcessorCount <- cIntConv <$> {#get cudaDeviceProp.multiProcessorCount#} p +#if CUDART_VERSION < 13000 kernelExecTimeoutEnabled <- cToBool <$> {#get cudaDeviceProp.kernelExecTimeoutEnabled#} p +#endif integrated <- cToBool <$> {#get cudaDeviceProp.integrated#} p canMapHostMemory <- cToBool <$> {#get cudaDeviceProp.canMapHostMemory#} p +#if CUDART_VERSION < 13000 computeMode <- cToEnum <$> {#get cudaDeviceProp.computeMode#} p +#endif #if CUDART_VERSION >= 3000 concurrentKernels <- cToBool <$> {#get cudaDeviceProp.concurrentKernels#} p maxTextureDim1D <- cIntConv <$> {#get cudaDeviceProp.maxTexture1D#} p @@ -123,7 +131,9 @@ instance Storable DeviceProperties where cacheMemL2 <- cIntConv <$> {#get cudaDeviceProp.l2CacheSize#} p maxThreadsPerMultiProcessor <- cIntConv <$> {#get cudaDeviceProp.maxThreadsPerMultiProcessor#} p memBusWidth <- cIntConv <$> {#get cudaDeviceProp.memoryBusWidth#} p +#if CUDART_VERSION < 13000 memClockRate <- cIntConv <$> {#get cudaDeviceProp.memoryClockRate#} p +#endif pciInfo <- PCI <$> (cIntConv <$> {#get cudaDeviceProp.pciBusID#} p) <*> (cIntConv <$> {#get cudaDeviceProp.pciDeviceID#} p) <*> (cIntConv <$> {#get 
cudaDeviceProp.pciDomainID#} p) @@ -157,11 +167,15 @@ instance Storable DeviceProperties where #else preemption <- cToBool <$> {#get cudaDeviceProp.computePreemptionSupported#} p #endif +#if CUDART_VERSION < 13000 singleToDoublePerfRatio <- cIntConv <$> {#get cudaDeviceProp.singleToDoublePrecisionPerfRatio#} p #endif +#endif #if CUDART_VERSION >= 9000 cooperativeLaunch <- cToBool <$> {#get cudaDeviceProp.cooperativeLaunch#} p +#if CUDART_VERSION < 13000 cooperativeLaunchMultiDevice <- cToBool <$> {#get cudaDeviceProp.cooperativeMultiDeviceLaunch#} p +#endif #endif return DeviceProperties{..} @@ -219,7 +233,7 @@ props :: Device -> IO DeviceProperties props !n = resultIfOk =<< cudaGetDeviceProperties n {-# INLINE cudaGetDeviceProperties #-} -#if CUDA_VERSION < 12000 +#if CUDA_VERSION < 12000 || CUDA_VERSION >= 13000 {# fun unsafe cudaGetDeviceProperties { alloca- `DeviceProperties' peek* , `Int' } -> `Status' cToEnum #}