diff --git a/CHANGELOG.md b/CHANGELOG.md index fe21a06..ddd1119 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,11 +5,19 @@ Notable changes to the project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/). **NOTE:** The version numbers of this package roughly align to the latest -version of the CUDA API this package is built against This means that this +version of the CUDA API this package is built against. This means that this package _DOES NOT_ follow the PVP, or indeed any sensible version scheme, because NVIDIA are A-OK introducing breaking changes in minor updates. +## [0.13.0.0] - ??? +### Added + * Support for CUDA-13 + +### Removed + * A number of fields from DeviceProperties, as they have been removed from + `cudaDeviceProp`. Use `Foreign.CUDA.Driver.Device.attribute` to query them. + ## [0.12.8.0] - 2025-08-21 ### Added * Support for CUDA-12 diff --git a/README.md b/README.md index 67b5996..8fa919f 100644 --- a/README.md +++ b/README.md @@ -150,10 +150,13 @@ Here is an incomplete historical list of missing bindings. Pull requests welcome - cuGraphMemAllocNodeGetParams - cuGraphMemFreeNodeGetParams -### CUDA-12 +### CUDA >= 12 A lot. PRs welcome. +- CUDA-12.3 + - Edge data in the driver Graph API (`cuGraphAddDependencies_v2` etc.) 
+ # Old compatibility notes diff --git a/cbits/stubs.c b/cbits/stubs.c index 67f9280..b62d8db 100644 --- a/cbits/stubs.c +++ b/cbits/stubs.c @@ -3,6 +3,7 @@ */ #include "cbits/stubs.h" +#include <string.h> // memset #if CUDART_VERSION >= 7000 cudaError_t cudaLaunchKernel_simple(const void *func, unsigned int gridX, unsigned int gridY, unsigned int gridZ, unsigned int blockX, unsigned int blockY, unsigned int blockZ, void **args, size_t sharedMem, cudaStream_t stream) @@ -196,7 +197,13 @@ CUresult CUDAAPI cuDeviceTotalMem(size_t *bytes, CUdevice dev) CUresult CUDAAPI cuCtxCreate(CUcontext *pctx, unsigned int flags, CUdevice dev) { +#if CUDA_VERSION >= 13000 + CUctxCreateParams params; + memset(&params, 0, sizeof params); + return cuCtxCreate_v4(pctx, &params, flags, dev); +#else return cuCtxCreate_v2(pctx, flags, dev); +#endif } CUresult CUDAAPI cuModuleGetGlobal(CUdeviceptr *dptr, size_t *bytes, CUmodule hmod, const char *name) @@ -424,3 +431,18 @@ CUresult CUDAAPI cuGraphExecKernelNodeSetParams_simple(CUgraphExec hGraphExec, C } #endif +#if CUDA_VERSION >= 13000 +// This is the signature of the CUDA <=12 version; much easier to shim here than in Haskell. +CUresult cuMemAdvise_device(CUdeviceptr dptr, size_t count, CUmem_advise advice, CUdevice device) +{ + return cuMemAdvise(dptr, count, advice, (CUmemLocation){.id = device, .type = CU_MEM_LOCATION_TYPE_DEVICE}); +} + +// This is the signature of the CUDA <=12 version; much easier to shim here than in Haskell. 
+CUresult cuMemPrefetchAsync_device(CUdeviceptr dptr, size_t count, CUdevice device, CUstream hStream) +{ + // flags is reserved and must be 0 in CUDA 13 + return cuMemPrefetchAsync(dptr, count, (CUmemLocation){.id = device, .type = CU_MEM_LOCATION_TYPE_DEVICE}, 0, hStream); +} +#endif + diff --git a/cbits/stubs.h b/cbits/stubs.h index 45219d7..9d2fbd0 100644 --- a/cbits/stubs.h +++ b/cbits/stubs.h @@ -184,6 +184,11 @@ CUresult CUDAAPI cuDevicePrimaryCtxSetFlags(CUdevice dev, unsigned int flags); CUresult CUDAAPI cuIpcOpenMemHandle(CUdeviceptr *pdptr, CUipcMemHandle handle, unsigned int Flags); #endif +#if CUDA_VERSION >= 13000 +CUresult cuMemAdvise_device(CUdeviceptr dptr, size_t count, CUmem_advise advice, CUdevice device); +CUresult cuMemPrefetchAsync_device(CUdeviceptr dptr, size_t count, CUdevice device, CUstream hStream); +#endif + #ifdef __cplusplus } #endif diff --git a/cuda.cabal b/cuda.cabal index 7bab5cf..b7af9db 100644 --- a/cuda.cabal +++ b/cuda.cabal @@ -1,7 +1,7 @@ cabal-version: 1.24 Name: cuda -Version: 0.12.8.0 +Version: 0.13.0.0 Synopsis: FFI binding to the CUDA interface for programming NVIDIA GPUs Description: The CUDA library provides a direct, general purpose C-like SPMD programming @@ -30,7 +30,7 @@ Description: . * "Foreign.CUDA.Runtime" . - Tested with library versions up to CUDA-12.8. See also the + Tested with library versions up to CUDA-13.0. See also the build matrix for version compatibility. . 
@@ -177,6 +177,6 @@ source-repository head source-repository this type: git location: https://github.com/tmcdonell/cuda - tag: v0.12.8.0 + tag: v0.13.0.0 -- vim: nospell diff --git a/examples/src/deviceQueryDrv/DeviceQuery.hs b/examples/src/deviceQueryDrv/DeviceQuery.hs index cb51865..15cba2f 100644 --- a/examples/src/deviceQueryDrv/DeviceQuery.hs +++ b/examples/src/deviceQueryDrv/DeviceQuery.hs @@ -5,6 +5,7 @@ module Main where import Control.Monad +import Foreign.Marshal.Utils ( toBool ) import Numeric import Prelude hiding ( (<>) ) import Text.PrettyPrint @@ -15,6 +16,31 @@ import Foreign.CUDA.Analysis as CUDA import qualified Foreign.CUDA.Driver as CUDA +-- In CUDA 13, a number of device properties became things only queryable using +-- cu(da)DeviceGetAttribute. This data type captures those. +$(if CUDA.libraryVersion >= 13000 then [d| + data AttrProperties = AttrProperties + { clockRate :: !Int -- ^ Clock frequency in kilohertz + , memClockRate :: !Int -- ^ Peak memory clock frequency in kilohertz + , computeMode :: !ComputeMode + , kernelExecTimeoutEnabled :: !Bool -- ^ Whether there is a runtime limit on kernels + , singleToDoublePerfRatio :: !Int -- ^ Ratio of single precision performance (in floating-point operations per second) to double precision performance + } + getAttrProperties :: Device -> IO AttrProperties + getAttrProperties d = do + clockRate <- CUDA.attribute d CUDA.ClockRate + memClockRate <- CUDA.attribute d CUDA.MemoryClockRate + computeMode <- toEnum <$> CUDA.attribute d CUDA.ComputeMode + kernelExecTimeoutEnabled <- toBool <$> CUDA.attribute d CUDA.KernelExecTimeout + singleToDoublePerfRatio <- CUDA.attribute d CUDA.SingleToDoublePrecisionPerfRatio + return AttrProperties{..} + |] else [d| + data AttrProperties = AttrProperties {} + getAttrProperties :: Device -> IO AttrProperties + getAttrProperties _ = return AttrProperties + |]) + + main :: IO () main = do version <- CUDA.driverVersion @@ -32,16 +58,17 @@ main = do infos <- forM [0 .. 
numDevices-1] $ \n -> do dev <- CUDA.device n prp <- CUDA.props dev - return (n, dev, prp) + prp2 <- getAttrProperties dev + return (n, dev, prp, prp2) - forM_ infos $ \(n, dev, prp) -> do + forM_ infos $ \(n, dev, prp, prp2) -> do p2p <- statP2P dev prp infos - printf "\nDevice %d: %s\n%s\n" n (deviceName prp) (statDevice prp) + printf "\nDevice %d: %s\n%s\n" n (deviceName prp) (statDevice prp prp2) unless (null p2p) $ printf "%s\n" p2p -statDevice :: DeviceProperties -> String -statDevice dev@DeviceProperties{..} = +statDevice :: DeviceProperties -> AttrProperties -> String +statDevice dev@DeviceProperties{..} AttrProperties{..} = let DeviceResources{..} = deviceResources dev @@ -69,22 +96,34 @@ statDevice dev@DeviceProperties{..} = ,(" 2D:", grid maxTextureDim2D) ,(" 3D:", cube maxTextureDim3D) ,("Texture alignment:", text $ showBytes textureAlignment) - ,("Maximum memory pitch:", text $ showBytes memPitch) - ,("Concurrent kernel execution:", bool concurrentKernels) - ,("Concurrent copy and execution:", bool deviceOverlap <> text (printf ", with %d copy engine%s" asyncEngineCount (if asyncEngineCount > 1 then "s" else ""))) - ,("Runtime limit on kernel execution:", bool kernelExecTimeoutEnabled) + ,("Maximum memory pitch:", text $ showBytes memPitch)]++ + + $(if CUDA.libraryVersion >= 13000 then [| + [("Concurrent copy and kernel execution:", bool concurrentKernels <> text (printf " with %d copy engine%s" asyncEngineCount (if asyncEngineCount > 1 then "s" else "")))] + |] else [| + [("Concurrent kernel execution:", bool concurrentKernels) + ,("Concurrent copy and execution:", bool deviceOverlap <> text (printf ", with %d copy engine%s" asyncEngineCount (if asyncEngineCount > 1 then "s" else "")))] + |])++ + + [("Runtime limit on kernel execution:", bool kernelExecTimeoutEnabled) ,("Integrated GPU sharing host memory:", bool integrated) ,("Host page-locked memory mapping:", bool canMapHostMemory) ,("ECC memory support:", bool eccEnabled) ,("Unified addressing 
(UVA):", bool unifiedAddressing)]++ -#if __GLASGOW_HASKELL__ > 710 + $(if CUDA.libraryVersion >= 8000 then [| [("Single to double precision performance:", text $ printf "%d : 1" singleToDoublePerfRatio) - ,("Supports compute pre-emption:", bool preemption)]|] else [|[]|])++ + ,("Supports compute pre-emption:", bool preemption)] + |] else [|[]|])++ + $(if CUDA.libraryVersion >= 9000 then [| - [("Supports cooperative launch:", bool cooperativeLaunch) - ,("Supports multi-device cooperative launch:", bool cooperativeLaunchMultiDevice)]|] else [|[]|])++ -#endif + [("Supports cooperative launch:", bool cooperativeLaunch)] + |] else [|[]|])++ + + $(if CUDA.libraryVersion >= 9000 && CUDA.libraryVersion < 13000 then [| + [("Supports multi-device cooperative launch:", bool cooperativeLaunchMultiDevice)] + |] else [|[]|])++ + [("PCI bus/location:", int (busID pciInfo) <> char '/' <> int (deviceID pciInfo)) ,("Compute mode:", text (show computeMode)) ] @@ -94,7 +133,7 @@ statDevice dev@DeviceProperties{..} = $ text (describe computeMode) -statP2P :: Device -> DeviceProperties -> [(Int, Device, DeviceProperties)] -> IO String +statP2P :: Device -> DeviceProperties -> [(Int, Device, DeviceProperties, AttrProperties)] -> IO String statP2P dev prp infos | CUDA.libraryVersion < 4000 = return [] @@ -103,7 +142,7 @@ statP2P dev prp infos | otherwise = let go [] = return [] - go ((m, peer, pp):is) = + go ((m, peer, pp, _):is) = if dev == peer then go is else do diff --git a/src/Foreign/CUDA/Analysis/Device.chs b/src/Foreign/CUDA/Analysis/Device.chs index 4a44e8b..049bad6 100644 --- a/src/Foreign/CUDA/Analysis/Device.chs +++ b/src/Foreign/CUDA/Analysis/Device.chs @@ -32,13 +32,19 @@ import Debug.Trace -- | -- The compute mode the device is currently in -- +#if CUDA_VERSION < 13000 {# enum CUcomputemode as ComputeMode { underscoreToCase } with prefix="CU_COMPUTEMODE" deriving (Eq, Show) #} +#else +{# enum cudaComputeMode as ComputeMode + { } + with prefix="cudaComputeMode" deriving (Eq, 
Show) #} +#endif instance Describe ComputeMode where describe Default = "Multiple contexts are allowed on the device simultaneously" -#if CUDA_VERSION < 8000 +#if CUDA_VERSION < 8000 || CUDA_VERSION >= 13000 describe Exclusive = "Only one context used by a single thread can be present on this device at a time" #endif describe Prohibited = "No contexts can be created on this device at this time" @@ -69,7 +75,10 @@ cap a b = let a' = fromIntegral a in --} -- | --- The properties of a compute device +-- The properties of a compute device, mirroring @struct cudaDeviceProp@ in CUDA. +-- +-- In CUDA-13.0, a number of fields were removed from this struct and are now +-- only available by querying individual attributes on the device. -- data DeviceProperties = DeviceProperties { @@ -91,16 +100,22 @@ data DeviceProperties = DeviceProperties , maxTextureDim2D :: !(Int,Int) , maxTextureDim3D :: !(Int,Int,Int) #endif +#if CUDA_VERSION < 13000 , clockRate :: !Int -- ^ Clock frequency in kilohertz +#endif , multiProcessorCount :: !Int -- ^ Number of multiprocessors on the device , memPitch :: !Int64 -- ^ Maximum pitch in bytes allowed by memory copies #if CUDA_VERSION >= 4000 , memBusWidth :: !Int -- ^ Global memory bus width in bits +#if CUDA_VERSION < 13000 , memClockRate :: !Int -- ^ Peak memory clock frequency in kilohertz +#endif #endif , textureAlignment :: !Int64 -- ^ Alignment requirement for textures +#if CUDA_VERSION < 13000 , computeMode :: !ComputeMode , deviceOverlap :: !Bool -- ^ Device can concurrently copy memory and execute a kernel +#endif #if CUDA_VERSION >= 3000 , concurrentKernels :: !Bool -- ^ Device can possibly execute multiple kernels concurrently , eccEnabled :: !Bool -- ^ Device supports and has enabled error correction @@ -111,7 +126,9 @@ data DeviceProperties = DeviceProperties , pciInfo :: !PCI -- ^ PCI device information for the device , tccDriverEnabled :: !Bool -- ^ Whether this is a Tesla device using the TCC driver #endif +#if CUDA_VERSION < 
13000 , kernelExecTimeoutEnabled :: !Bool -- ^ Whether there is a runtime limit on kernels +#endif , integrated :: !Bool -- ^ As opposed to discrete , canMapHostMemory :: !Bool -- ^ Device can use pinned memory #if CUDA_VERSION >= 4000 @@ -129,11 +146,15 @@ data DeviceProperties = DeviceProperties #endif #if CUDA_VERSION >= 8000 , preemption :: !Bool -- ^ Device supports compute pre-emption +#if CUDA_VERSION < 13000 , singleToDoublePerfRatio :: !Int -- ^ Ratio of single precision performance (in floating-point operations per second) to double precision performance #endif +#endif #if CUDA_VERSION >= 9000 , cooperativeLaunch :: !Bool -- ^ Device supports launching cooperative kernels +#if CUDA_VERSION < 13000 , cooperativeLaunchMultiDevice :: !Bool -- ^ Device can participate in cooperative multi-device kernels +#endif #endif } deriving (Show) diff --git a/src/Foreign/CUDA/Driver/Device.chs b/src/Foreign/CUDA/Driver/Device.chs index 8498653..7080c4e 100644 --- a/src/Foreign/CUDA/Driver/Device.chs +++ b/src/Foreign/CUDA/Driver/Device.chs @@ -246,7 +246,10 @@ attribute !d !a = cuDeviceGetAttribute a d peekS s _ = peekCString s --- | Returns a UUID for the device +-- | Returns a UUID for the device. +-- +-- Since CUDA-13: If the device is in MIG mode, this function returns its MIG +-- UUID which uniquely identifies the subscribed MIG compute instance. 
-- -- Requires CUDA-9.2 -- @@ -265,10 +268,17 @@ uuid !dev = unpack ptr where {-# INLINE cuDeviceGetUuid #-} +#if CUDA_VERSION < 13000 {# fun unsafe cuDeviceGetUuid { `Ptr ()' , useDevice `Device' } -> `()' checkStatus*- #} +#else + {# fun unsafe cuDeviceGetUuid_v2 as cuDeviceGetUuid + { `Ptr ()' + , useDevice `Device' + } -> `()' checkStatus*- #} +#endif {-# INLINE unpack #-} unpack :: Ptr () -> IO UUID @@ -319,7 +329,9 @@ props !d = do sharedMemPerBlock <- fromIntegral <$> attribute d SharedMemoryPerBlock memPitch <- fromIntegral <$> attribute d MaxPitch textureAlignment <- fromIntegral <$> attribute d TextureAlignment +#if CUDA_VERSION < 13000 clockRate <- attribute d ClockRate +#endif warpSize <- attribute d WarpSize regsPerBlock <- attribute d RegistersPerBlock maxThreadsPerBlock <- attribute d MaxThreadsPerBlock @@ -333,9 +345,11 @@ props !d = do computeCapability <- capability d totalGlobalMem <- totalMem d multiProcessorCount <- attribute d MultiprocessorCount +#if CUDA_VERSION < 13000 computeMode <- toEnum <$> attribute d ComputeMode deviceOverlap <- toBool <$> attribute d GpuOverlap kernelExecTimeoutEnabled <- toBool <$> attribute d KernelExecTimeout +#endif integrated <- toBool <$> attribute d Integrated canMapHostMemory <- toBool <$> attribute d CanMapHostMemory #if CUDA_VERSION >= 3000 @@ -350,7 +364,9 @@ props !d = do cacheMemL2 <- attribute d L2CacheSize maxThreadsPerMultiProcessor <- attribute d MaxThreadsPerMultiprocessor memBusWidth <- attribute d GlobalMemoryBusWidth +#if CUDA_VERSION < 13000 memClockRate <- attribute d MemoryClockRate +#endif pciInfo <- PCI <$> attribute d PciBusId <*> attribute d PciDeviceId <*> attribute d PciDomainId unifiedAddressing <- toBool <$> attribute d UnifiedAddressing tccDriverEnabled <- toBool <$> attribute d TccDriver @@ -367,11 +383,15 @@ props !d = do #endif #if CUDA_VERSION >= 8000 preemption <- toBool <$> attribute d ComputePreemptionSupported +#if CUDA_VERSION < 13000 singleToDoublePerfRatio <- attribute d 
SingleToDoublePrecisionPerfRatio #endif +#endif #if CUDA_VERSION >= 9000 cooperativeLaunch <- toBool <$> attribute d CooperativeLaunch +#if CUDA_VERSION < 13000 cooperativeLaunchMultiDevice <- toBool <$> attribute d CooperativeMultiDeviceLaunch +#endif #endif return DeviceProperties{..} diff --git a/src/Foreign/CUDA/Driver/Event.chs b/src/Foreign/CUDA/Driver/Event.chs index 16f6293..00b7e7f 100644 --- a/src/Foreign/CUDA/Driver/Event.chs +++ b/src/Foreign/CUDA/Driver/Event.chs @@ -114,10 +114,17 @@ elapsedTime :: Event -> Event -> IO Float elapsedTime !ev1 !ev2 = resultIfOk =<< cuEventElapsedTime ev1 ev2 {-# INLINE cuEventElapsedTime #-} +#if CUDA_VERSION < 13000 {# fun unsafe cuEventElapsedTime { alloca- `Float' peekFloatConv* , useEvent `Event' , useEvent `Event' } -> `Status' cToEnum #} +#else +{# fun unsafe cuEventElapsedTime_v2 as cuEventElapsedTime + { alloca- `Float' peekFloatConv* + , useEvent `Event' + , useEvent `Event' } -> `Status' cToEnum #} +#endif -- | diff --git a/src/Foreign/CUDA/Driver/Graph/Build.chs b/src/Foreign/CUDA/Driver/Graph/Build.chs index 833ce52..d115753 100644 --- a/src/Foreign/CUDA/Driver/Graph/Build.chs +++ b/src/Foreign/CUDA/Driver/Graph/Build.chs @@ -204,12 +204,23 @@ addDependencies !g !deps = cuGraphAddDependencies g from to where (from, to) = unzip deps +#if CUDA_VERSION < 13000 {# fun unsafe cuGraphAddDependencies { useGraph `Graph' , withNodeArray* `[Node]' , withNodeArrayLen* `[Node]'& } -> `()' checkStatus*- #} +#else + cuGraphAddDependencies g' from' to' = cuGraphAddDependencies_v2 g' from' to' (length deps) + {# fun unsafe cuGraphAddDependencies_v2 + { useGraph `Graph' + , withNodeArray* `[Node]' + , withNodeArray* `[Node]' + , withNullEdgeDataLen* `Int'& + } + -> `()' checkStatus*- #} +#endif #endif @@ -230,12 +241,23 @@ removeDependencies !g !deps = cuGraphRemoveDependencies g from to where (from, to) = unzip deps +#if CUDA_VERSION < 13000 {# fun unsafe cuGraphRemoveDependencies { useGraph `Graph' , withNodeArray* 
`[Node]' , withNodeArrayLen* `[Node]'& } -> `()' checkStatus*- #} +#else + cuGraphRemoveDependencies g' from' to' = cuGraphRemoveDependencies_v2 g' from' to' (length deps) + {# fun unsafe cuGraphRemoveDependencies_v2 + { useGraph `Graph' + , withNodeArray* `[Node]' + , withNodeArray* `[Node]' + , withNullEdgeDataLen* `Int'& + } + -> `()' checkStatus*- #} +#endif #endif @@ -507,6 +529,7 @@ getEdges !g = to <- peekArray count p_to return $ zip from to where +#if CUDA_VERSION < 13000 {# fun unsafe cuGraphGetEdges { useGraph `Graph' , castPtr `Ptr Node' @@ -514,6 +537,17 @@ getEdges !g = , id `Ptr CULong' } -> `()' checkStatus*- #} +#else + cuGraphGetEdges g' f t c = cuGraphGetEdges_v2 g' f t nullPtr c + {# fun unsafe cuGraphGetEdges_v2 + { useGraph `Graph' + , castPtr `Ptr Node' + , castPtr `Ptr Node' + , castPtr `Ptr edgeData' + , id `Ptr CULong' + } + -> `()' checkStatus*- #} +#endif #endif @@ -598,12 +632,23 @@ getDependencies !n = cuGraphNodeGetDependencies n p_deps p_count peekArray count p_deps where +#if CUDA_VERSION < 13000 {# fun unsafe cuGraphNodeGetDependencies { useNode `Node' , castPtr `Ptr Node' , id `Ptr CULong' } -> `()' checkStatus*- #} +#else + cuGraphNodeGetDependencies n' d c = cuGraphNodeGetDependencies_v2 n' d nullPtr c + {# fun unsafe cuGraphNodeGetDependencies_v2 + { useNode `Node' + , castPtr `Ptr Node' + , castPtr `Ptr edgeData' + , id `Ptr CULong' + } + -> `()' checkStatus*- #} +#endif #endif @@ -628,12 +673,23 @@ getDependents n = cuGraphNodeGetDependentNodes n p_deps p_count peekArray count p_deps where +#if CUDA_VERSION < 13000 {# fun unsafe cuGraphNodeGetDependentNodes { useNode `Node' , castPtr `Ptr Node' , id `Ptr CULong' } -> `()' checkStatus*- #} +#else + cuGraphNodeGetDependentNodes n' d c = cuGraphNodeGetDependentNodes_v2 n' d nullPtr c + {# fun unsafe cuGraphNodeGetDependentNodes_v2 + { useNode `Node' + , castPtr `Ptr Node' + , castPtr `Ptr edgeData' + , id `Ptr CULong' + } + -> `()' checkStatus*- #} +#endif #endif @@ -679,5 
+735,9 @@ withNodeArray ns f = withArray ns (f . castPtr) {-# INLINE withNodeArrayLen #-} withNodeArrayLen :: [Node] -> ((Ptr {# type CUgraphNode #}, CULong) -> IO a) -> IO a withNodeArrayLen ns f = withArrayLen ns $ \i p -> f (castPtr p, cIntConv i) + +{-# INLINE withNullEdgeDataLen #-} +withNullEdgeDataLen :: Int -> ((Ptr (), CULong) -> IO a) -> IO a +withNullEdgeDataLen len f = f (nullPtr, cIntConv len) #endif diff --git a/src/Foreign/CUDA/Driver/Graph/Capture.chs b/src/Foreign/CUDA/Driver/Graph/Capture.chs index 160ff70..070391f 100644 --- a/src/Foreign/CUDA/Driver/Graph/Capture.chs +++ b/src/Foreign/CUDA/Driver/Graph/Capture.chs @@ -143,6 +143,10 @@ status = requireSDK 'status 10.0 -- | Query the capture status of a stream and get an id for the capture -- sequence, which is unique over the lifetime of the process. -- +-- Since CUDA-13, "edge data" can be associated with an edge in the graph. This +-- function assumes no such data is present (if there is, a CUDA error +-- (@CUDA_ERROR_LOSSY_QUERY@) will be raised). 
+-- -- Requires CUDA-10.1 -- -- @@ -152,24 +156,38 @@ status = requireSDK 'status 10.0 #if CUDA_VERSION < 10010 info :: Stream -> IO (Status, Int64) info = requireSDK 'info 10.1 -#elif CUDA_VERSION < 12000 +-- Not another elif because c2hs seems to be buggy +#else +#if CUDA_VERSION < 12000 {# fun unsafe cuStreamGetCaptureInfo as info { useStream `Stream' , alloca- `Status' peekEnum* , alloca- `Int64' peekIntConv* } -> `()' checkStatus*- #} -#else +#elif CUDA_VERSION < 13000 {# fun unsafe cuStreamGetCaptureInfo_v2 as info { useStream `Stream' , alloca- `Status' peekEnum* , alloca- `Int64' peekIntConv* - , alloca- `Graph' - , alloca- `Node' - , alloca- `CSize' + , withNullPtr- `Graph' + , withNullPtr- `Node' + , withNullPtr- `CSize' + } + -> `()' checkStatus*- #} +#else +{# fun unsafe cuStreamGetCaptureInfo_v3 as info + { useStream `Stream' + , alloca- `Status' peekEnum* + , alloca- `Int64' peekIntConv* + , withNullPtr- `Graph' + , withNullPtr- `()' + , withNullPtr- `Node' + , withNullPtr- `CSize' } -> `()' checkStatus*- #} #endif +#endif -- | Set the stream capture interaction mode for this thread. Return the previous value. @@ -201,3 +219,5 @@ peekGraph :: Ptr {# type CUgraph #} -> IO Graph peekGraph = liftM Graph . 
peek #endif +withNullPtr :: (Ptr a -> r) -> r +withNullPtr f = f nullPtr diff --git a/src/Foreign/CUDA/Driver/Marshal.chs b/src/Foreign/CUDA/Driver/Marshal.chs index bf6c4ba..8d69d75 100644 --- a/src/Foreign/CUDA/Driver/Marshal.chs +++ b/src/Foreign/CUDA/Driver/Marshal.chs @@ -358,6 +358,7 @@ prefetchArrayAsync ptr n mdev mst = go undefined ptr go x _ = nothingIfOk =<< cuMemPrefetchAsync ptr (n * sizeOf x) (maybe (-1) useDevice mdev) (fromMaybe defaultStream mst) {-# INLINE cuMemPrefetchAsync #-} +#if CUDA_VERSION < 13000 {# fun unsafe cuMemPrefetchAsync { useDeviceHandle `DevicePtr a' , `Int' @@ -365,6 +366,15 @@ prefetchArrayAsync ptr n mdev mst = go undefined ptr , useStream `Stream' } -> `Status' cToEnum #} +#else +{# fun unsafe cuMemPrefetchAsync_device as cuMemPrefetchAsync + { useDeviceHandle `DevicePtr a' + , `Int' + , id `CInt' + , useStream `Stream' + } + -> `Status' cToEnum #} +#endif #endif diff --git a/src/Foreign/CUDA/Driver/Unified.chs b/src/Foreign/CUDA/Driver/Unified.chs index 4dbc576..b94cb46 100644 --- a/src/Foreign/CUDA/Driver/Unified.chs +++ b/src/Foreign/CUDA/Driver/Unified.chs @@ -258,6 +258,7 @@ advise ptr n a mdev = go undefined ptr go x _ = nothingIfOk =<< cuMemAdvise ptr (n * sizeOf x) a (maybe (-1) useDevice mdev) {-# INLINE cuMemAdvise #-} +#if CUDA_VERSION < 13000 {# fun unsafe cuMemAdvise { useHandle `Ptr a' , `Int' @@ -265,7 +266,15 @@ advise ptr n a mdev = go undefined ptr , `CInt' } -> `Status' cToEnum #} +#else +{# fun unsafe cuMemAdvise_device as cuMemAdvise + { useHandle `Ptr a' + , `Int' + , cFromEnum `Advice' + , `CInt' + } + -> `Status' cToEnum #} +#endif where useHandle = fromIntegral . 
ptrToIntPtr #endif - diff --git a/src/Foreign/CUDA/Runtime/Device.chs b/src/Foreign/CUDA/Runtime/Device.chs index 90e2089..942bd63 100644 --- a/src/Foreign/CUDA/Runtime/Device.chs +++ b/src/Foreign/CUDA/Runtime/Device.chs @@ -98,15 +98,23 @@ instance Storable DeviceProperties where warpSize <- cIntConv <$> {#get cudaDeviceProp.warpSize#} p memPitch <- cIntConv <$> {#get cudaDeviceProp.memPitch#} p maxThreadsPerBlock <- cIntConv <$> {#get cudaDeviceProp.maxThreadsPerBlock#} p +#if CUDART_VERSION < 13000 clockRate <- cIntConv <$> {#get cudaDeviceProp.clockRate#} p +#endif totalConstMem <- cIntConv <$> {#get cudaDeviceProp.totalConstMem#} p textureAlignment <- cIntConv <$> {#get cudaDeviceProp.textureAlignment#} p +#if CUDART_VERSION < 13000 deviceOverlap <- cToBool <$> {#get cudaDeviceProp.deviceOverlap#} p +#endif multiProcessorCount <- cIntConv <$> {#get cudaDeviceProp.multiProcessorCount#} p +#if CUDART_VERSION < 13000 kernelExecTimeoutEnabled <- cToBool <$> {#get cudaDeviceProp.kernelExecTimeoutEnabled#} p +#endif integrated <- cToBool <$> {#get cudaDeviceProp.integrated#} p canMapHostMemory <- cToBool <$> {#get cudaDeviceProp.canMapHostMemory#} p +#if CUDART_VERSION < 13000 computeMode <- cToEnum <$> {#get cudaDeviceProp.computeMode#} p +#endif #if CUDART_VERSION >= 3000 concurrentKernels <- cToBool <$> {#get cudaDeviceProp.concurrentKernels#} p maxTextureDim1D <- cIntConv <$> {#get cudaDeviceProp.maxTexture1D#} p @@ -123,7 +131,9 @@ instance Storable DeviceProperties where cacheMemL2 <- cIntConv <$> {#get cudaDeviceProp.l2CacheSize#} p maxThreadsPerMultiProcessor <- cIntConv <$> {#get cudaDeviceProp.maxThreadsPerMultiProcessor#} p memBusWidth <- cIntConv <$> {#get cudaDeviceProp.memoryBusWidth#} p +#if CUDART_VERSION < 13000 memClockRate <- cIntConv <$> {#get cudaDeviceProp.memoryClockRate#} p +#endif pciInfo <- PCI <$> (cIntConv <$> {#get cudaDeviceProp.pciBusID#} p) <*> (cIntConv <$> {#get cudaDeviceProp.pciDeviceID#} p) <*> (cIntConv <$> {#get 
cudaDeviceProp.pciDomainID#} p) @@ -157,11 +167,15 @@ instance Storable DeviceProperties where #else preemption <- cToBool <$> {#get cudaDeviceProp.computePreemptionSupported#} p #endif +#if CUDART_VERSION < 13000 singleToDoublePerfRatio <- cIntConv <$> {#get cudaDeviceProp.singleToDoublePrecisionPerfRatio#} p #endif +#endif #if CUDART_VERSION >= 9000 cooperativeLaunch <- cToBool <$> {#get cudaDeviceProp.cooperativeLaunch#} p +#if CUDART_VERSION < 13000 cooperativeLaunchMultiDevice <- cToBool <$> {#get cudaDeviceProp.cooperativeMultiDeviceLaunch#} p +#endif #endif return DeviceProperties{..} @@ -219,7 +233,7 @@ props :: Device -> IO DeviceProperties props !n = resultIfOk =<< cudaGetDeviceProperties n {-# INLINE cudaGetDeviceProperties #-} -#if CUDA_VERSION < 12000 +#if CUDA_VERSION < 12000 || CUDA_VERSION >= 13000 {# fun unsafe cudaGetDeviceProperties { alloca- `DeviceProperties' peek* , `Int' } -> `Status' cToEnum #}